@vectororm/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2493 @@
1
+ /**
2
+ * Represents a vector record in the database.
3
+ *
4
+ * This is the fundamental unit of storage in Glyph, containing:
5
+ * - Unique identifier
6
+ * - Embedding vector
7
+ * - Metadata (including V/H/S fields)
8
+ * - Optional text and score
9
+ */
10
+ interface VectorRecord {
11
+ /** Unique identifier for this record */
12
+ id: string;
13
+ /** Embedding vector (dimensionality depends on embedding model) */
14
+ embedding: number[];
15
+ /**
16
+ * Metadata fields including:
17
+ * - Vertical fields (__v_*): Document-level metadata
18
+ * - Horizontal fields (__h_*): Theme/section metadata
19
+ * - Structural fields (__s_*): Position/hierarchy metadata
20
+ * - Custom user fields
21
+ */
22
+ metadata: Record<string, any>;
23
+ /** Optional text content of this chunk */
24
+ text?: string;
25
+ /** Optional similarity score (populated during search) */
26
+ score?: number;
27
+ }
28
+
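For orientation, a minimal sketch of a populated record, assuming a 1536-dimensional embedding model and that `VectorRecord` is exported from the package root; all literal values are illustrative.

```typescript
// A minimal, illustrative record; assumes VectorRecord is exported from the package root.
import type { VectorRecord } from '@vectororm/core';

const record: VectorRecord = {
  id: 'doc123-chunk-0',
  // Placeholder vector; a real embedding would come from an Embedder implementation.
  embedding: new Array(1536).fill(0),
  metadata: {
    __v_doc_id: 'doc123',      // vertical: which document this chunk belongs to
    __h_theme: 'pricing',      // horizontal: what the chunk is about
    __s_chunk_index: 0,        // structural: where the chunk sits in the document
    author: 'Jane Doe',        // custom user field, stored without a prefix
  },
  text: 'Our standard rates are...',
};
```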
29
+ /**
30
+ * Result from a vector search operation.
31
+ */
32
+ interface SearchResult {
33
+ /** Matching vector records */
34
+ records: VectorRecord[];
35
+ /** Total count of matches (if available from DB) */
36
+ totalCount?: number;
37
+ /** Cursor for pagination (if supported by DB) */
38
+ nextCursor?: string;
39
+ }
40
+
41
+ /**
42
+ * Metadata field prefixes for the three axes of Glyph's schema.
43
+ *
44
+ * These prefixes separate framework fields from user-defined metadata:
45
+ * - __v_: Vertical axis (document identity)
46
+ * - __h_: Horizontal axis (content/theme identity)
47
+ * - __s_: Structural axis (position/hierarchy)
48
+ */
49
+ declare const METADATA_PREFIXES: {
50
+ readonly VERTICAL: "__v_";
51
+ readonly HORIZONTAL: "__h_";
52
+ readonly STRUCTURAL: "__s_";
53
+ };
54
+ /**
55
+ * Vertical axis fields - identify WHICH document a chunk belongs to.
56
+ */
57
+ declare const VerticalFields: {
58
+ /** Unique document identifier */
59
+ readonly DOC_ID: "__v_doc_id";
60
+ /** Original source path/URL */
61
+ readonly SOURCE: "__v_source";
62
+ /** Logical partition key (for filtering by document subsets) */
63
+ readonly PARTITION: "__v_partition";
64
+ /** Document type classification */
65
+ readonly DOC_TYPE: "__v_doc_type";
66
+ /** Arbitrary vertical tags */
67
+ readonly TAGS: "__v_tags";
68
+ };
69
+ /**
70
+ * Horizontal axis fields - identify WHAT topic/theme a chunk covers.
71
+ */
72
+ declare const HorizontalFields: {
73
+ /** Primary theme classification */
74
+ readonly THEME: "__h_theme";
75
+ /** Multiple themes (if applicable) */
76
+ readonly THEMES: "__h_themes";
77
+ /** Classification confidence score */
78
+ readonly THEME_CONFIDENCE: "__h_theme_confidence";
79
+ /** Hierarchical section path (e.g., "Chapter 3/Pricing/Rates") */
80
+ readonly SECTION_PATH: "__h_section_path";
81
+ /** Depth level in hierarchy (0 = root) */
82
+ readonly SECTION_LEVEL: "__h_section_level";
83
+ /** Section header text */
84
+ readonly SECTION_TITLE: "__h_section_title";
85
+ };
86
+ /**
87
+ * Structural axis fields - track chunk position and relationships.
88
+ */
89
+ declare const StructuralFields: {
90
+ /** Position in document (0-indexed) */
91
+ readonly CHUNK_INDEX: "__s_chunk_index";
92
+ /** Parent chunk ID (for hierarchical chunking) */
93
+ readonly PARENT_ID: "__s_parent_id";
94
+ /** Whether this chunk has children */
95
+ readonly HAS_CHILDREN: "__s_has_children";
96
+ /** Total chunks in this document */
97
+ readonly TOTAL_CHUNKS: "__s_total_chunks";
98
+ };
99
+
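A hedged sketch of using these constants instead of raw string literals when building metadata, assuming they are exported from the package root; the values are illustrative.

```typescript
// Assumes these constants are exported from the package root.
import { VerticalFields, HorizontalFields, StructuralFields } from '@vectororm/core';

// Computed keys avoid hard-coding the '__v_' / '__h_' / '__s_' prefixes by hand.
const metadata = {
  [VerticalFields.DOC_ID]: 'doc123',            // '__v_doc_id'
  [VerticalFields.SOURCE]: 'contracts/q3.pdf',  // '__v_source'
  [HorizontalFields.THEME]: 'pricing',          // '__h_theme'
  [StructuralFields.CHUNK_INDEX]: 0,            // '__s_chunk_index'
};
```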
100
+ /**
101
+ * Type-safe metadata field names.
102
+ *
103
+ * Use these instead of string literals to get autocomplete and catch typos.
104
+ */
105
+ /** Type for vertical field keys */
106
+ type VerticalFieldKey = 'docId' | 'source' | 'partition' | 'docType' | 'tags';
107
+ /** Type for horizontal field keys */
108
+ type HorizontalFieldKey = 'theme' | 'themes' | 'themeConfidence' | 'sectionPath' | 'sectionLevel' | 'sectionTitle';
109
+ /** Type for structural field keys */
110
+ type StructuralFieldKey = 'chunkIndex' | 'parentId' | 'hasChildren' | 'totalChunks';
111
+
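To make the camelCase keys concrete, a sketch of their correspondence to the prefixed field names, inferred from the `VerticalFields` constants above and assuming the key type is exported.

```typescript
import type { VerticalFieldKey } from '@vectororm/core'; // assumed export

// Illustrative correspondence between the camelCase keys and the stored
// field names, inferred from the VerticalFields constants above.
const verticalFieldNames: Record<VerticalFieldKey, string> = {
  docId: '__v_doc_id',
  source: '__v_source',
  partition: '__v_partition',
  docType: '__v_doc_type',
  tags: '__v_tags',
};
```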
112
+ /**
113
+ * MetadataBuilder provides a fluent API for constructing metadata objects
114
+ * with proper V/H/S prefixes and type safety.
115
+ *
116
+ * Example:
117
+ * ```typescript
118
+ * const metadata = new MetadataBuilder()
119
+ * .vertical({ doc_id: 'doc123', source: 'file.pdf' })
120
+ * .horizontal({ theme: 'pricing' })
121
+ * .structural({ chunk_index: 0, total_chunks: 10 })
122
+ * .custom({ author: 'John Doe' })
123
+ * .build();
124
+ * ```
125
+ *
126
+ * Features:
127
+ * - Fluent chaining API
128
+ * - Automatic prefix application
129
+ * - Skips undefined values
130
+ * - Returns immutable copy on build()
131
+ */
132
+ declare class MetadataBuilder {
133
+ private metadata;
134
+ /**
135
+ * Add vertical axis metadata (document identity).
136
+ * Automatically prefixes fields with '__v_'.
137
+ *
138
+ * @param fields - Vertical metadata fields (doc_id, source, partition, etc.)
139
+ * @returns This builder for chaining
140
+ */
141
+ vertical(fields: Record<string, any>): this;
142
+ /**
143
+ * Add horizontal axis metadata (theme/section identity).
144
+ * Automatically prefixes fields with '__h_'.
145
+ *
146
+ * @param fields - Horizontal metadata fields (theme, section_path, etc.)
147
+ * @returns This builder for chaining
148
+ */
149
+ horizontal(fields: Record<string, any>): this;
150
+ /**
151
+ * Add structural axis metadata (position/hierarchy).
152
+ * Automatically prefixes fields with '__s_'.
153
+ *
154
+ * @param fields - Structural metadata fields (chunk_index, parent_id, etc.)
155
+ * @returns This builder for chaining
156
+ */
157
+ structural(fields: Record<string, any>): this;
158
+ /**
159
+ * Add custom user-defined metadata.
160
+ * Fields are added as-is without any prefix.
161
+ *
162
+ * @param fields - Custom metadata fields
163
+ * @returns This builder for chaining
164
+ */
165
+ custom(fields: Record<string, any>): this;
166
+ /**
167
+ * Build and return the complete metadata object.
168
+ * Returns a copy to prevent external modification.
169
+ *
170
+ * @returns Immutable copy of the metadata object
171
+ */
172
+ build(): Record<string, any>;
173
+ }
174
+
175
+ /**
176
+ * Universal filter language for database-agnostic queries.
177
+ *
178
+ * Filters are expressed in a standard format, then translated
179
+ * to native database syntax by each adapter.
180
+ */
181
+ /**
182
+ * Supported filter operators.
183
+ */
184
+ type FilterOperator = 'eq' | 'neq' | 'in' | 'nin' | 'gt' | 'gte' | 'lt' | 'lte' | 'contains' | 'exists';
185
+ /**
186
+ * Basic filter condition.
187
+ */
188
+ interface FilterCondition {
189
+ field: string;
190
+ op: FilterOperator;
191
+ value: any;
192
+ }
193
+ /**
194
+ * Compound AND filter (all conditions must match).
195
+ */
196
+ interface AndFilter {
197
+ and: UniversalFilter[];
198
+ }
199
+ /**
200
+ * Compound OR filter (at least one condition must match).
201
+ */
202
+ interface OrFilter {
203
+ or: UniversalFilter[];
204
+ }
205
+ /**
206
+ * Universal filter - can be a simple condition or compound.
207
+ */
208
+ type UniversalFilter = FilterCondition | AndFilter | OrFilter;
209
+ /**
210
+ * Shorthand filter format (user-friendly).
211
+ *
212
+ * Examples:
213
+ * - {region: "ny"} → {field: "region", op: "eq", value: "ny"}
214
+ * - {year__gte: 2023} → {field: "year", op: "gte", value: 2023}
215
+ * - {region: "ny", year__gte: 2023} → {and: [...]}
216
+ */
217
+ type ShorthandFilter = Record<string, any>;
218
+
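To make the two formats concrete, a sketch of the same query written once in the standard format and once in shorthand, assuming the filter types are exported; the field names are illustrative.

```typescript
import type { UniversalFilter, ShorthandFilter } from '@vectororm/core'; // assumed exports

// Standard format: explicit conditions combined with AND.
const standard: UniversalFilter = {
  and: [
    { field: 'region', op: 'eq', value: 'ny' },
    { field: 'year', op: 'gte', value: 2023 },
  ],
};

// Shorthand format: the same intent using the field__op suffix syntax.
const shorthand: ShorthandFilter = { region: 'ny', year__gte: 2023 };
```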
219
+ /**
220
+ * Translates filters between formats and validates structure.
221
+ */
222
+ declare class FilterTranslator {
223
+ /**
224
+ * Normalize any filter input to standard UniversalFilter format.
225
+ *
226
+ * Handles:
227
+ * - Standard format (pass through)
228
+ * - Shorthand format (convert to standard)
229
+ * - Operator suffixes (field__op syntax)
230
+ */
231
+ static normalize(input: ShorthandFilter | UniversalFilter): UniversalFilter;
232
+ /**
233
+ * Validate filter structure and operators.
234
+ *
235
+ * Throws error if filter is invalid.
236
+ */
237
+ static validate(filter: UniversalFilter): void;
238
+ /**
239
+ * Check if filter is compound (AND/OR).
240
+ */
241
+ static isCompound(filter: UniversalFilter): boolean;
242
+ /**
243
+ * Check if input is already in standard format.
244
+ */
245
+ private static isStandardFormat;
246
+ /**
247
+ * Convert shorthand format to standard.
248
+ */
249
+ private static fromShorthand;
250
+ }
251
+
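Based on the documented behavior, a hedged sketch of normalizing a shorthand filter and validating the result; the expected output shape is inferred from the shorthand examples above, not observed.

```typescript
import { FilterTranslator } from '@vectororm/core'; // assumed export

// Shorthand in, standard UniversalFilter out (shape inferred from the examples above):
// { and: [ { field: 'region', op: 'eq', value: 'ny' },
//          { field: 'year',   op: 'gte', value: 2023 } ] }
const filter = FilterTranslator.normalize({ region: 'ny', year__gte: 2023 });

// Throws if the structure or an operator is invalid.
FilterTranslator.validate(filter);
```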
252
+ /**
253
+ * Types for vector database adapters.
254
+ *
255
+ * These types define the common interface elements that all
256
+ * adapters must support or return.
257
+ */
258
+ /**
259
+ * Statistics about a vector collection.
260
+ */
261
+ interface CollectionStats {
262
+ /** Total number of vectors in the collection */
263
+ vectorCount: number;
264
+ /** Dimension of vectors in this collection */
265
+ dimension: number;
266
+ /** Distance metric used (cosine, euclidean, etc.) */
267
+ metric: DistanceMetric;
268
+ /** Additional DB-specific stats (optional) */
269
+ [key: string]: any;
270
+ }
271
+ /**
272
+ * Metadata update operation.
273
+ *
274
+ * Used for efficient metadata enrichment without re-uploading vectors.
275
+ */
276
+ interface MetadataUpdate {
277
+ /** ID of the record to update */
278
+ id: string;
279
+ /** Metadata fields to set/update */
280
+ metadata: Record<string, any>;
281
+ }
282
+ /**
283
+ * Distance metric for vector similarity.
284
+ */
285
+ type DistanceMetric = 'cosine' | 'euclidean' | 'dotProduct';
286
+
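A hedged sketch of the enrichment-oriented update payload these types describe, assuming `MetadataUpdate` is exported; IDs and values are illustrative.

```typescript
import type { MetadataUpdate } from '@vectororm/core'; // assumed export

// Attach theme metadata to vectors that are already stored, without re-sending embeddings.
const updates: MetadataUpdate[] = [
  { id: 'doc123-chunk-0', metadata: { __h_theme: 'pricing', __h_theme_confidence: 0.91 } },
  { id: 'doc123-chunk-1', metadata: { __h_theme: 'legal' } },
];
```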
287
+ /**
288
+ * Abstract base class for all vector database adapters.
289
+ *
290
+ * This is the KEY abstraction that enables database-agnostic operations.
291
+ * Each database (Pinecone, Chroma, Qdrant, etc.) implements this interface,
292
+ * allowing the SDK to work with any vector database.
293
+ *
294
+ * Design principles:
295
+ * 1. All methods are abstract (must be implemented by subclasses)
296
+ * 2. Capability flags have default implementations (can be overridden)
297
+ * 3. Universal filter translation is adapter-specific
298
+ * 4. Async iteration enables efficient enrichment pipelines
299
+ *
300
+ * @abstract
301
+ */
302
+ declare abstract class VectorDBAdapter {
303
+ /**
304
+ * Connect to the vector database.
305
+ *
306
+ * Initialize client, authenticate, verify connection.
307
+ */
308
+ abstract connect(): Promise<void>;
309
+ /**
310
+ * Disconnect from the vector database.
311
+ *
312
+ * Clean up resources, close connections.
313
+ */
314
+ abstract disconnect(): Promise<void>;
315
+ /**
316
+ * Check if currently connected to the database.
317
+ */
318
+ abstract isConnected(): Promise<boolean>;
319
+ /**
320
+ * Create a new vector collection.
321
+ *
322
+ * @param name - Collection name
323
+ * @param dimension - Vector dimension
324
+ * @param metric - Distance metric (default: cosine)
325
+ */
326
+ abstract createCollection(name: string, dimension: number, metric?: DistanceMetric): Promise<void>;
327
+ /**
328
+ * Delete a collection and all its vectors.
329
+ *
330
+ * @param name - Collection name
331
+ */
332
+ abstract deleteCollection(name: string): Promise<void>;
333
+ /**
334
+ * Check if a collection exists.
335
+ *
336
+ * @param name - Collection name
337
+ */
338
+ abstract collectionExists(name: string): Promise<boolean>;
339
+ /**
340
+ * Get statistics about a collection.
341
+ *
342
+ * @param name - Collection name
343
+ */
344
+ abstract getCollectionStats(name: string): Promise<CollectionStats>;
345
+ /**
346
+ * Upsert (insert or update) vector records.
347
+ *
348
+ * This is the primary method for adding vectors to the database.
349
+ * If a record with the same ID exists, it is updated.
350
+ *
351
+ * @param collection - Collection name
352
+ * @param records - Vector records to upsert
353
+ */
354
+ abstract upsert(collection: string, records: VectorRecord[]): Promise<void>;
355
+ /**
356
+ * Fetch vector records by ID.
357
+ *
358
+ * @param collection - Collection name
359
+ * @param ids - Record IDs to fetch
360
+ * @returns Array of matching records (may be empty)
361
+ */
362
+ abstract fetch(collection: string, ids: string[]): Promise<VectorRecord[]>;
363
+ /**
364
+ * Delete vector records by ID.
365
+ *
366
+ * @param collection - Collection name
367
+ * @param ids - Record IDs to delete
368
+ */
369
+ abstract delete(collection: string, ids: string[]): Promise<void>;
370
+ /**
371
+ * Update metadata for existing records without re-uploading vectors.
372
+ *
373
+ * This is CRITICAL for enrichment pipelines where we need to:
374
+ * 1. Insert initial vectors with basic metadata
375
+ * 2. Later enrich with vertical/horizontal metadata
376
+ * 3. Avoid re-uploading large embedding vectors
377
+ *
378
+ * @param collection - Collection name
379
+ * @param updates - Metadata updates to apply
380
+ */
381
+ abstract updateMetadata(collection: string, updates: MetadataUpdate[]): Promise<void>;
382
+ /**
383
+ * Search for similar vectors.
384
+ *
385
+ * @param collection - Collection name
386
+ * @param queryVector - Query vector to search with
387
+ * @param options - Search options
388
+ * @returns Search results
389
+ */
390
+ abstract search(collection: string, queryVector: number[], options?: {
391
+ topK?: number;
392
+ filter?: UniversalFilter;
393
+ includeMetadata?: boolean;
394
+ includeValues?: boolean;
395
+ }): Promise<SearchResult>;
396
+ /**
397
+ * Translate universal filter to database-specific filter format.
398
+ *
399
+ * This is the KEY method that enables database-agnostic filtering.
400
+ * Each adapter translates the universal filter to its native format:
401
+ *
402
+ * - Pinecone: {field: {$eq: value}}
403
+ * - Qdrant: {must: [{key: field, match: {value}}]}
404
+ * - Chroma: {field: value}
405
+ *
406
+ * @param filter - Universal filter
407
+ * @returns Database-specific filter object
408
+ */
409
+ abstract translateFilter(filter: UniversalFilter): any;
410
+ /**
411
+ * Iterate over all vectors in a collection in batches.
412
+ *
413
+ * This enables efficient enrichment pipelines:
414
+ * 1. Fetch vectors in batches
415
+ * 2. Enrich each batch with metadata
416
+ * 3. Update metadata back to DB
417
+ *
418
+ * @param collection - Collection name
419
+ * @param options - Iteration options
420
+ * @yields Batches of vector records
421
+ */
422
+ abstract iterate(collection: string, options?: {
423
+ batchSize?: number;
424
+ filter?: UniversalFilter;
425
+ }): AsyncIterableIterator<VectorRecord[]>;
426
+ /**
427
+ * Whether this adapter supports metadata updates without re-uploading vectors.
428
+ *
429
+ * Default: false (must re-upload entire record)
430
+ * Override to return true if your DB supports partial updates.
431
+ */
432
+ supportsMetadataUpdate(): boolean;
433
+ /**
434
+ * Whether this adapter supports filtering during search.
435
+ *
436
+ * Default: false (no filtering support)
437
+ * Override to return true if your DB supports metadata filtering.
438
+ */
439
+ supportsFiltering(): boolean;
440
+ /**
441
+ * Whether this adapter supports batch operations efficiently.
442
+ *
443
+ * Default: false (single operations only)
444
+ * Override to return true if your DB supports batch upsert/delete.
445
+ */
446
+ supportsBatchOperations(): boolean;
447
+ }
448
+
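The `iterate()` / `updateMetadata()` pair above implies a typical enrichment loop. A minimal sketch under stated assumptions: the adapter instance comes from some concrete subclass not shown here, the types are exported from the package root, and the assigned theme is a placeholder rather than a real classification.

```typescript
import type { VectorDBAdapter, MetadataUpdate } from '@vectororm/core'; // assumed exports

// Hypothetical enrichment pass: stream a collection in batches, compute new
// metadata for each record, and write it back without re-uploading vectors.
async function enrichThemes(adapter: VectorDBAdapter, collection: string): Promise<void> {
  if (!adapter.supportsMetadataUpdate()) {
    throw new Error('This adapter cannot update metadata in place');
  }
  for await (const batch of adapter.iterate(collection, { batchSize: 100 })) {
    const updates: MetadataUpdate[] = batch.map((record) => ({
      id: record.id,
      // Placeholder value; a real pipeline would classify record.text first.
      metadata: { __h_theme: 'pricing' },
    }));
    await adapter.updateMetadata(collection, updates);
  }
}
```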
449
+ /**
450
+ * Query Composition Layer - Retrieval Types and Interfaces
451
+ *
452
+ * Defines the core interfaces for retrieval operations in Glyph.
453
+ * These types abstract query parameters and results across different
454
+ * vector database adapters.
455
+ */
456
+
457
+ /**
458
+ * Parameters for a retrieval operation.
459
+ *
460
+ * Combines query text, collection targeting, and optional filters
461
+ * for both vertical (document-level) and horizontal (theme-level) filtering.
462
+ */
463
+ interface RetrievalParams {
464
+ /** The search query text to embed and search for */
465
+ query: string;
466
+ /** Target collection to search in */
467
+ collection: string;
468
+ /** Number of results to return */
469
+ topK: number;
470
+ /** Optional document-level filters (e.g., filter by doc_id, region, year) */
471
+ verticalFilters?: UniversalFilter;
472
+ /** Optional theme/section-level filters (e.g., filter by theme, section) */
473
+ horizontalFilters?: UniversalFilter;
474
+ /** Optional additional user-defined filters */
475
+ customFilters?: UniversalFilter;
476
+ /** Whether to include embedding vectors in results (default: false) */
477
+ includeEmbeddings?: boolean;
478
+ }
479
+ /**
480
+ * Result of a retrieval operation.
481
+ *
482
+ * Contains the retrieved records, original query, and information
483
+ * about which filters were applied.
484
+ */
485
+ interface RetrievalResult {
486
+ /** The retrieved vector records */
487
+ records: VectorRecord[];
488
+ /** The original query text */
489
+ query: string;
490
+ /** Information about which filters were applied */
491
+ filtersApplied: {
492
+ vertical?: UniversalFilter;
493
+ horizontal?: UniversalFilter;
494
+ custom?: UniversalFilter;
495
+ };
496
+ }
497
+ /**
498
+ * Options for a search operation at the adapter level.
499
+ *
500
+ * These are lower-level options used by adapters to perform
501
+ * the actual vector search.
502
+ */
503
+ interface SearchOptions {
504
+ /** Number of results to return */
505
+ topK: number;
506
+ /**
507
+ * Optional universal filter for the search.
508
+ * This is NOT yet translated - adapters will translate it to their native format.
509
+ * See VectorDBAdapter.translateFilter() for translation logic.
510
+ */
511
+ filter?: UniversalFilter;
512
+ /** Whether to include embedding vectors in results */
513
+ includeEmbeddings?: boolean;
514
+ }
515
+ /**
516
+ * Results grouped by different dimensions.
517
+ *
518
+ * Used for organizing search results by vertical (document)
519
+ * or horizontal (theme) dimensions.
520
+ *
521
+ * **How Map keys are determined:**
522
+ * - Vertical: Keys are extracted from the `__v_doc_id` field in record metadata
523
+ * - Horizontal: Keys are extracted from the `__h_theme` field in record metadata
524
+ *
525
+ * **Handling missing metadata:**
526
+ * - If a record is missing `__v_doc_id`, it will NOT appear in the vertical Map
527
+ * - If a record is missing `__h_theme`, it will NOT appear in the horizontal Map
528
+ * - Records can be excluded from both Maps if they lack the required metadata fields
529
+ *
530
+ * **Grouping behavior:**
531
+ * - Each record appears in AT MOST ONE group per dimension (based on its metadata value)
532
+ * - A record with `__v_doc_id: "doc1"` will appear in `vertical.get("doc1")`
533
+ * - A record with `__h_theme: "legal"` will appear in `horizontal.get("legal")`
534
+ * - Records cannot appear in multiple groups within the same dimension
535
+ */
536
+ interface GroupedResults {
537
+ /** Records grouped by document ID (__v_doc_id) */
538
+ vertical: Map<string, VectorRecord[]>;
539
+ /** Records grouped by theme (__h_theme) */
540
+ horizontal: Map<string, VectorRecord[]>;
541
+ }
542
+
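A hedged sketch of consuming the grouped Maps described above, assuming `GroupedResults` is exported; the logging is illustrative.

```typescript
import type { GroupedResults } from '@vectororm/core'; // assumed export

function summarize(grouped: GroupedResults): void {
  // Records grouped by document (__v_doc_id).
  for (const [docId, records] of grouped.vertical) {
    console.log(`doc ${docId}: ${records.length} matching chunks`);
  }
  // Records grouped by theme (__h_theme).
  for (const [theme, records] of grouped.horizontal) {
    console.log(`theme ${theme}: ${records.length} matching chunks`);
  }
}
```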
543
+ /**
544
+ * FilterBuilder - Utility for combining multiple filters with fluent API.
545
+ *
546
+ * Provides a convenient way to combine vertical, horizontal, and custom filters
547
+ * into a single UniversalFilter with AND logic.
548
+ *
549
+ * @example
550
+ * ```typescript
551
+ * const filter = new FilterBuilder()
552
+ * .withVerticalFilter({ field: 'doc_id', op: 'eq', value: 'doc123' })
553
+ * .withHorizontalFilter({ field: 'theme', op: 'eq', value: 'legal' })
554
+ * .build();
555
+ * ```
556
+ */
557
+ declare class FilterBuilder {
558
+ private verticalFilter?;
559
+ private horizontalFilter?;
560
+ private customFilter?;
561
+ /**
562
+ * Add a vertical (document-level) filter.
563
+ *
564
+ * @param filter - The vertical filter to add (standard or shorthand format)
565
+ * @returns This builder for method chaining
566
+ */
567
+ withVerticalFilter(filter: UniversalFilter | Record<string, any>): this;
568
+ /**
569
+ * Add a horizontal (theme-level) filter.
570
+ *
571
+ * @param filter - The horizontal filter to add (standard or shorthand format)
572
+ * @returns This builder for method chaining
573
+ */
574
+ withHorizontalFilter(filter: UniversalFilter | Record<string, any>): this;
575
+ /**
576
+ * Add a custom user-defined filter.
577
+ *
578
+ * @param filter - The custom filter to add (standard or shorthand format)
579
+ * @returns This builder for method chaining
580
+ */
581
+ withCustomFilter(filter: UniversalFilter | Record<string, any>): this;
582
+ /**
583
+ * Build the combined filter.
584
+ *
585
+ * Combination logic:
586
+ * - If no filters: returns undefined
587
+ * - If single filter: returns it directly
588
+ * - If multiple filters: combines with AND logic
589
+ *
590
+ * @returns The combined filter, or undefined if no filters were added
591
+ */
592
+ build(): UniversalFilter | undefined;
593
+ }
594
+
595
+ /**
596
+ * Abstract base class for text embedding models.
597
+ *
598
+ * This abstraction allows the VectorORM to work with any embedding provider
599
+ * (OpenAI, Cohere, HuggingFace, etc.) by implementing a consistent interface.
600
+ *
601
+ * Implementations must provide:
602
+ * - `embed()`: Convert a single text string into a vector embedding
603
+ * - `embedBatch()`: Convert multiple texts into embeddings efficiently
604
+ * - `dimensions`: The size of the embedding vectors produced
605
+ * - `modelName`: Identifier for the embedding model being used
606
+ *
607
+ * @example
608
+ * ```typescript
609
+ * class OpenAIEmbedder extends Embedder {
610
+ * get dimensions(): number { return 1536; }
611
+ * get modelName(): string { return 'text-embedding-ada-002'; }
612
+ *
613
+ * async embed(text: string): Promise<number[]> {
614
+ * // Call OpenAI API
615
+ * }
616
+ *
617
+ * async embedBatch(texts: string[]): Promise<number[][]> {
618
+ * // Batch call to OpenAI API
619
+ * }
620
+ * }
621
+ * ```
622
+ */
623
+ declare abstract class Embedder {
624
+ /**
625
+ * The dimensionality of embeddings produced by this model.
626
+ * Must be consistent across all embeddings from the same model.
627
+ */
628
+ abstract get dimensions(): number;
629
+ /**
630
+ * Identifier for the embedding model.
631
+ * Used for tracking which model generated embeddings.
632
+ */
633
+ abstract get modelName(): string;
634
+ /**
635
+ * Embed a single text string into a vector.
636
+ *
637
+ * @param text - The text to embed
638
+ * @returns A promise that resolves to a number array representing the embedding
639
+ */
640
+ abstract embed(text: string): Promise<number[]>;
641
+ /**
642
+ * Embed multiple texts into vectors efficiently.
643
+ * Implementations should maintain the order of input texts in the output.
644
+ *
645
+ * @param texts - Array of texts to embed
646
+ * @returns A promise that resolves to an array of embeddings, one per input text
647
+ */
648
+ abstract embedBatch(texts: string[]): Promise<number[][]>;
649
+ /**
650
+ * Constructor is protected to prevent direct instantiation of abstract class.
651
+ * Subclasses can call super() in their constructors.
652
+ */
653
+ protected constructor();
654
+ }
655
+
656
+ /**
657
+ * RAGQueryComposer - Main orchestrator for retrieval operations.
658
+ *
659
+ * Coordinates between embedder and vector database adapter to perform
660
+ * semantic search with filtering. Provides specialized methods for
661
+ * grouping results by vertical (document) or horizontal (theme) dimensions.
662
+ *
663
+ * @example
664
+ * ```typescript
665
+ * const composer = new RAGQueryComposer(adapter, embedder);
666
+ *
667
+ * // Basic retrieval
668
+ * const result = await composer.retrieve({
669
+ * query: 'pricing information',
670
+ * collection: 'documents',
671
+ * topK: 10
672
+ * });
673
+ *
674
+ * // Retrieval with filters
675
+ * const filtered = await composer.retrieve({
676
+ * query: 'pricing information',
677
+ * collection: 'documents',
678
+ * topK: 10,
679
+ * verticalFilters: { doc_id: 'contract-123' },
680
+ * horizontalFilters: { theme: 'legal' }
681
+ * });
682
+ *
683
+ * // Grouped by document
684
+ * const byDocument = await composer.retrieveVertical({
685
+ * query: 'pricing information',
686
+ * collection: 'documents',
687
+ * topK: 10
688
+ * });
689
+ * ```
690
+ */
691
+ declare class RAGQueryComposer {
692
+ private readonly adapter;
693
+ private readonly embedder;
694
+ /**
695
+ * Create a new RAGQueryComposer.
696
+ *
697
+ * @param adapter - Vector database adapter for search operations
698
+ * @param embedder - Embedder for converting text queries to vectors
699
+ */
700
+ constructor(adapter: VectorDBAdapter, embedder: Embedder);
701
+ /**
702
+ * Main retrieval method.
703
+ *
704
+ * Performs semantic search with optional filtering:
705
+ * 1. Embeds query text using embedder
706
+ * 2. Builds combined filter using FilterBuilder
707
+ * 3. Calls adapter.search() with query vector and filter
708
+ * 4. Returns results with filter information
709
+ *
710
+ * @param params - Retrieval parameters
711
+ * @returns Retrieval result with records and filter information
712
+ */
713
+ retrieve(params: RetrievalParams): Promise<RetrievalResult>;
714
+ /**
715
+ * Retrieve and group results by document ID.
716
+ *
717
+ * Calls retrieve() and organizes results into a Map keyed by __v_doc_id.
718
+ * Records without a doc_id are excluded.
719
+ *
720
+ * @param params - Retrieval parameters
721
+ * @returns Map of document ID to array of records
722
+ */
723
+ retrieveVertical(params: RetrievalParams): Promise<Map<string, VectorRecord[]>>;
724
+ /**
725
+ * Retrieve and group results by theme.
726
+ *
727
+ * Calls retrieve() and organizes results into a Map keyed by __h_theme.
728
+ * Records without a theme are excluded.
729
+ *
730
+ * @param params - Retrieval parameters
731
+ * @returns Map of theme to array of records
732
+ */
733
+ retrieveHorizontal(params: RetrievalParams): Promise<Map<string, VectorRecord[]>>;
734
+ }
735
+
736
+ /**
737
+ * Options for LLM text generation.
738
+ *
739
+ * These options control how the LLM generates text,
740
+ * allowing fine-grained control over the output behavior.
741
+ */
742
+ interface GenerateOptions {
743
+ /**
744
+ * Controls randomness in generation.
745
+ * Higher values (e.g., 1.0) make output more random.
746
+ * Lower values (e.g., 0.1) make output more deterministic.
747
+ * Range: 0.0 to 2.0
748
+ */
749
+ temperature?: number;
750
+ /**
751
+ * Maximum number of tokens to generate.
752
+ * Limits the length of the generated output.
753
+ */
754
+ maxTokens?: number;
755
+ /**
756
+ * System prompt to set context for the LLM.
757
+ * Used to guide the model's behavior and personality.
758
+ */
759
+ systemPrompt?: string;
760
+ /**
761
+ * Sequences where the LLM should stop generating.
762
+ * When encountered, generation stops immediately.
763
+ */
764
+ stopSequences?: string[];
765
+ }
766
+
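A hedged sketch of these options in use with the `LLMClient` abstraction declared next, assuming `GenerateOptions` is exported; the specific values are illustrative.

```typescript
import type { GenerateOptions } from '@vectororm/core'; // assumed export

const options: GenerateOptions = {
  temperature: 0.2,                              // mostly deterministic output
  maxTokens: 256,                                // cap the response length
  systemPrompt: 'You are a concise assistant.',
  stopSequences: ['\n\n'],                       // stop at the first blank line
};

// With an LLMClient instance (see the abstract class below):
// const answer = await llm.generate('Summarize the pricing section.', options);
```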
767
+ /**
768
+ * Abstract base class for LLM (Large Language Model) clients.
769
+ *
770
+ * This abstraction allows the VectorORM to work with any LLM provider
771
+ * (OpenAI, Anthropic, Google, etc.) by implementing a consistent interface.
772
+ *
773
+ * Implementations must provide:
774
+ * - `generate()`: Generate text from a prompt
775
+ * - `generateJSON<T>()`: Generate structured JSON output
776
+ * - `generateBatch()`: Generate multiple responses efficiently
777
+ * - `modelName`: Identifier for the LLM model being used
778
+ * - `provider`: Name of the LLM provider
779
+ *
780
+ * @example
781
+ * ```typescript
782
+ * class OpenAIClient extends LLMClient {
783
+ * get modelName(): string { return 'gpt-4'; }
784
+ * get provider(): string { return 'openai'; }
785
+ *
786
+ * async generate(prompt: string, options?: GenerateOptions): Promise<string> {
787
+ * // Call OpenAI API
788
+ * }
789
+ *
790
+ * async generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T> {
791
+ * // Call OpenAI API with JSON mode
792
+ * }
793
+ *
794
+ * async generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]> {
795
+ * // Batch call to OpenAI API
796
+ * }
797
+ * }
798
+ * ```
799
+ */
800
+ declare abstract class LLMClient {
801
+ /**
802
+ * Identifier for the LLM model.
803
+ * Used for tracking which model generated responses.
804
+ */
805
+ abstract get modelName(): string;
806
+ /**
807
+ * Name of the LLM provider.
808
+ * Examples: 'openai', 'anthropic', 'google', 'mock'
809
+ */
810
+ abstract get provider(): string;
811
+ /**
812
+ * Generate text from a prompt.
813
+ *
814
+ * @param prompt - The text prompt to send to the LLM
815
+ * @param options - Optional generation parameters
816
+ * @returns A promise that resolves to the generated text
817
+ */
818
+ abstract generate(prompt: string, options?: GenerateOptions): Promise<string>;
819
+ /**
820
+ * Generate structured JSON output from a prompt.
821
+ * The LLM will be instructed to return valid JSON that matches type T.
822
+ *
823
+ * @param prompt - The text prompt to send to the LLM
824
+ * @param options - Optional generation parameters
825
+ * @returns A promise that resolves to the parsed JSON object
826
+ */
827
+ abstract generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T>;
828
+ /**
829
+ * Generate multiple responses efficiently.
830
+ * Implementations should maintain the order of input prompts in the output.
831
+ *
832
+ * @param prompts - Array of prompts to process
833
+ * @param options - Optional generation parameters
834
+ * @returns A promise that resolves to an array of responses, one per input prompt
835
+ */
836
+ abstract generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]>;
837
+ /**
838
+ * Constructor is protected to prevent direct instantiation of abstract class.
839
+ * Subclasses can call super() in their constructors.
840
+ */
841
+ protected constructor();
842
+ }
843
+
844
+ /**
845
+ * MockLLM for testing purposes only.
846
+ * Returns canned responses that can be set programmatically.
847
+ *
848
+ * @example
849
+ * ```typescript
850
+ * const llm = new MockLLM();
851
+ * llm.setResponse('Hello, world!');
852
+ * const result = await llm.generate('Say hello'); // Returns 'Hello, world!'
853
+ * ```
854
+ */
855
+ declare class MockLLM extends LLMClient {
856
+ private _response;
857
+ constructor();
858
+ get modelName(): string;
859
+ get provider(): string;
860
+ /**
861
+ * Set the canned response that will be returned by generate methods.
862
+ *
863
+ * @param response - The response text to return
864
+ */
865
+ setResponse(response: string): void;
866
+ generate(prompt: string, options?: GenerateOptions): Promise<string>;
867
+ generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T>;
868
+ generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]>;
869
+ }
870
+
871
+ /**
872
+ * Theme classification result containing the identified theme and confidence score.
873
+ *
874
+ * @property theme - The identified theme label (e.g., 'technology', 'business', 'science')
875
+ * @property confidence - Confidence score between 0 and 1 indicating classification certainty
876
+ * @property allScores - Optional map of all theme labels to their respective confidence scores
877
+ */
878
+ interface ThemeClassification {
879
+ /**
880
+ * The identified theme label.
881
+ * Examples: 'technology', 'business', 'science', 'healthcare', 'education', etc.
882
+ */
883
+ theme: string;
884
+ /**
885
+ * Confidence score between 0 and 1 indicating classification certainty.
886
+ * Higher values indicate greater confidence in the classification.
887
+ */
888
+ confidence: number;
889
+ /**
890
+ * Optional map of all theme labels to their respective confidence scores.
891
+ * Useful for understanding alternative themes and their relative probabilities.
892
+ *
893
+ * @example
894
+ * ```typescript
895
+ * {
896
+ * 'technology': 0.85,
897
+ * 'business': 0.10,
898
+ * 'science': 0.05
899
+ * }
900
+ * ```
901
+ */
902
+ allScores?: Record<string, number>;
903
+ }
904
+ /**
905
+ * Interface for theme classification strategies.
906
+ *
907
+ * Theme classifiers identify the primary theme or topic of text content.
908
+ * Different implementations can use various strategies:
909
+ *
910
+ * 1. **Keyword-based Classification**: Uses predefined keyword lists to match themes
911
+ * - Fast and deterministic
912
+ * - Good for well-defined domains with clear vocabulary
913
+ * - Example: Medical texts with specific terminology
914
+ *
915
+ * 2. **Zero-shot Classification**: Uses pre-trained models without fine-tuning
916
+ * - No training data required
917
+ * - Good for general-purpose classification
918
+ * - Example: Hugging Face zero-shot classification models
919
+ *
920
+ * 3. **Embedding-based Classification**: Uses vector similarity between text and theme embeddings
921
+ * - Semantic understanding of themes
922
+ * - Can find nuanced thematic relationships
923
+ * - Example: Comparing document embeddings to theme prototype embeddings
924
+ *
925
+ * 4. **LLM-based Classification**: Uses language models for theme identification
926
+ * - Most flexible and powerful
927
+ * - Can understand complex, nuanced themes
928
+ * - Example: GPT-4, Claude, or other LLMs with structured output
929
+ *
930
+ * Implementations should:
931
+ * - Return confidence scores between 0 and 1
932
+ * - Handle empty or invalid input gracefully
933
+ * - Maintain consistent theme labels across calls
934
+ * - Optionally provide all theme scores for transparency
935
+ *
936
+ * @example
937
+ * ```typescript
938
+ * class KeywordThemeClassifier implements ThemeClassifier {
939
+ * async classify(text: string): Promise<ThemeClassification> {
940
+ * // Keyword matching logic
941
+ * const theme = 'technology';
942
+ * const confidence = 0.92;
943
+ * return { theme, confidence };
944
+ * }
945
+ *
946
+ * async classifyBatch(texts: string[]): Promise<ThemeClassification[]> {
947
+ * return Promise.all(texts.map(text => this.classify(text)));
948
+ * }
949
+ * }
950
+ * ```
951
+ *
952
+ * @example
953
+ * ```typescript
954
+ * class LLMThemeClassifier implements ThemeClassifier {
955
+ * constructor(private llm: LLMClient, private themes: string[]) {}
956
+ *
957
+ * async classify(text: string): Promise<ThemeClassification> {
958
+ * const prompt = `Classify the following text into one of these themes: ${this.themes.join(', ')}
959
+ *
960
+ * Text: ${text}
961
+ *
962
+ * Return JSON with: theme (string), confidence (number 0-1), allScores (object)`;
963
+ *
964
+ * const result = await this.llm.generateJSON<ThemeClassification>(prompt);
965
+ * return result;
966
+ * }
967
+ *
968
+ * async classifyBatch(texts: string[]): Promise<ThemeClassification[]> {
969
+ * // Efficient batch processing
970
+ * return Promise.all(texts.map(text => this.classify(text)));
971
+ * }
972
+ * }
973
+ * ```
974
+ */
975
+ interface ThemeClassifier {
976
+ /**
977
+ * Classify a single text and return the identified theme with confidence score.
978
+ *
979
+ * @param text - The text content to classify
980
+ * @returns A promise that resolves to the theme classification result
981
+ *
982
+ * @example
983
+ * ```typescript
984
+ * const classifier = new KeywordThemeClassifier();
985
+ * const result = await classifier.classify('Machine learning is transforming AI');
986
+ * console.log(result.theme); // 'technology'
987
+ * console.log(result.confidence); // 0.92
988
+ * ```
989
+ */
990
+ classify(text: string): Promise<ThemeClassification>;
991
+ /**
992
+ * Classify multiple texts efficiently and return their theme classifications.
993
+ *
994
+ * Implementations should maintain the order of input texts in the output array.
995
+ * May use parallel processing or batching for efficiency.
996
+ *
997
+ * @param texts - Array of text contents to classify
998
+ * @returns A promise that resolves to an array of theme classifications
999
+ *
1000
+ * @example
1001
+ * ```typescript
1002
+ * const classifier = new KeywordThemeClassifier();
1003
+ * const texts = [
1004
+ * 'Machine learning is transforming AI',
1005
+ * 'The stock market reached new highs',
1006
+ * 'New cancer treatment shows promise'
1007
+ * ];
1008
+ * const results = await classifier.classifyBatch(texts);
1009
+ * // results[0].theme === 'technology'
1010
+ * // results[1].theme === 'business'
1011
+ * // results[2].theme === 'healthcare'
1012
+ * ```
1013
+ */
1014
+ classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
1015
+ }
1016
+
1017
+ /**
1018
+ * Types and interfaces for document enrichment operations.
1019
+ *
1020
+ * This module defines the configuration interfaces for various enrichment strategies:
1021
+ * - Vertical enrichment: Classify documents into business verticals
1022
+ * - Theme enrichment: Add thematic tags to documents
1023
+ * - Section enrichment: Structure documents into logical sections
1024
+ */
1025
+
1026
+ /**
1027
+ * Progress callback function for tracking enrichment operations.
1028
+ *
1029
+ * @param stats - Current enrichment statistics
1030
+ *
1031
+ * @example
1032
+ * ```typescript
1033
+ * const onProgress: ProgressCallback = (stats) => {
1034
+ *   console.log(`Processed: ${stats.recordsProcessed}, skipped: ${stats.recordsSkipped}`);

1035
+ * console.log(`Updated: ${stats.recordsUpdated}`);
1036
+ * };
1037
+ * ```
1038
+ */
1039
+ type ProgressCallback = (stats: EnrichmentStats) => void;
1040
+ /**
1041
+ * Statistics for an enrichment operation.
1042
+ *
1043
+ * Tracks the progress and outcome of enrichment operations,
1044
+ * including records processed, updated, skipped, and any errors encountered.
1045
+ *
1046
+ * @property recordsProcessed - Total number of records processed
1047
+ * @property recordsUpdated - Number of records successfully updated
1048
+ * @property recordsSkipped - Number of records skipped (e.g., filtered out)
1049
+ * @property timeMs - Total time taken in milliseconds
1050
+ * @property errors - Optional array of error messages encountered during enrichment
1051
+ *
1052
+ * @example
1053
+ * ```typescript
1054
+ * const stats: EnrichmentStats = {
1055
+ * recordsProcessed: 100,
1056
+ * recordsUpdated: 95,
1057
+ * recordsSkipped: 5,
1058
+ * timeMs: 1250,
1059
+ * errors: ['Failed to classify record 42']
1060
+ * };
1061
+ * ```
1062
+ */
1063
+ interface EnrichmentStats {
1064
+ /**
1065
+ * Total number of records processed.
1066
+ */
1067
+ recordsProcessed: number;
1068
+ /**
1069
+ * Number of records successfully updated with enrichment data.
1070
+ */
1071
+ recordsUpdated: number;
1072
+ /**
1073
+ * Number of records skipped (e.g., filtered out or already enriched).
1074
+ */
1075
+ recordsSkipped: number;
1076
+ /**
1077
+ * Total time taken in milliseconds.
1078
+ */
1079
+ timeMs: number;
1080
+ /**
1081
+ * Optional array of error messages encountered during enrichment.
1082
+ */
1083
+ errors?: string[];
1084
+ }
1085
+ /**
1086
+ * Configuration for field mapping-based vertical enrichment.
1087
+ *
1088
+ * Maps values from an existing field to vertical classifications.
1089
+ * This is the simplest enrichment strategy, useful when vertical
1090
+ * information is already present in a different field.
1091
+ *
1092
+ * @property mapping - Map of source field values to vertical labels
1093
+ * @property filter - Optional filter to select which records to enrich
1094
+ * @property batchSize - Optional batch size for processing (default: 100)
1095
+ *
1096
+ * @example
1097
+ * ```typescript
1098
+ * const config: FieldMappingConfig = {
1099
+ * mapping: {
1100
+ * 'tech': 'technology',
1101
+ * 'healthcare': 'medical',
1102
+ * 'fin': 'finance'
1103
+ * },
1104
+ * filter: { field: 'category', op: 'exists', value: true },
1105
+ * batchSize: 50
1106
+ * };
1107
+ * ```
1108
+ */
1109
+ interface FieldMappingConfig {
1110
+ /**
1111
+ * Map of source field values to vertical labels.
1112
+ *
1113
+ * @example
1114
+ * ```typescript
1115
+ * {
1116
+ * 'tech': 'technology',
1117
+ * 'healthcare': 'medical',
1118
+ * 'finance': 'finance'
1119
+ * }
1120
+ * ```
1121
+ */
1122
+ mapping: Record<string, string>;
1123
+ /**
1124
+ * Optional filter to select which records to enrich.
1125
+ */
1126
+ filter?: UniversalFilter;
1127
+ /**
1128
+ * Optional batch size for processing (default: 100).
1129
+ */
1130
+ batchSize?: number;
1131
+ }
1132
+ /**
1133
+ * Configuration for custom extractor function-based vertical enrichment.
1134
+ *
1135
+ * Uses a custom function to extract vertical classifications from documents.
1136
+ * This provides maximum flexibility for complex extraction logic.
1137
+ *
1138
+ * @property extractor - Function that extracts vertical label from a document
1139
+ * @property filter - Optional filter to select which records to enrich
1140
+ * @property batchSize - Optional batch size for processing (default: 100)
1141
+ *
1142
+ * @example
1143
+ * ```typescript
1144
+ * const config: ExtractorConfig = {
1145
+ * extractor: async (doc) => {
1146
+ * if (doc.content.includes('machine learning')) return 'technology';
1147
+ * if (doc.content.includes('stock market')) return 'finance';
1148
+ * return 'general';
1149
+ * },
1150
+ * filter: { field: 'content', op: 'exists', value: true },
1151
+ * batchSize: 25
1152
+ * };
1153
+ * ```
1154
+ */
1155
+ interface ExtractorConfig {
1156
+ /**
1157
+ * Function that extracts vertical label from a document.
1158
+ *
1159
+ * @param document - The document to extract vertical from
1160
+ * @returns Promise resolving to the vertical label
1161
+ */
1162
+ extractor: (document: any) => Promise<string>;
1163
+ /**
1164
+ * Optional filter to select which records to enrich.
1165
+ */
1166
+ filter?: UniversalFilter;
1167
+ /**
1168
+ * Optional batch size for processing (default: 100).
1169
+ */
1170
+ batchSize?: number;
1171
+ }
1172
+ /**
1173
+ * Configuration for automatic LLM-based vertical enrichment.
1174
+ *
1175
+ * Uses a language model to automatically classify documents into verticals.
1176
+ * Can use predefined field mappings or automatic extraction from text.
1177
+ *
1178
+ * @property llm - The LLM client to use for classification
1179
+ * @property fields - Array of vertical labels to classify into
1180
+ * @property promptTemplate - Optional custom prompt template for the LLM
1181
+ * @property textField - Optional field name containing text to classify (default: 'content')
1182
+ * @property filter - Optional filter to select which records to enrich
1183
+ * @property batchSize - Optional batch size for processing (default: 10)
1184
+ *
1185
+ * @example
1186
+ * ```typescript
1187
+ * const config: AutomaticExtractionConfig = {
1188
+ * automatic: {
1189
+ * llm: myLLMClient,
1190
+ * fields: ['technology', 'finance', 'healthcare', 'retail'],
1191
+ * promptTemplate: 'Classify this text into one of: {fields}\n\nText: {text}',
1192
+ * textField: 'description'
1193
+ * },
1194
+ * filter: { field: 'vertical', op: 'eq', value: null },
1195
+ * batchSize: 5
1196
+ * };
1197
+ * ```
1198
+ */
1199
+ interface AutomaticExtractionConfig {
1200
+ /**
1201
+ * Automatic extraction settings using an LLM.
1202
+ */
1203
+ automatic: {
1204
+ /**
1205
+ * The LLM client to use for classification.
1206
+ */
1207
+ llm: LLMClient;
1208
+ /**
1209
+ * Array of vertical labels to classify into.
1210
+ *
1211
+ * @example
1212
+ * ['technology', 'finance', 'healthcare', 'retail']
1213
+ */
1214
+ fields: string[];
1215
+ /**
1216
+ * Optional custom prompt template for the LLM.
1217
+ * Use {fields} for the list of verticals and {text} for the document text.
1218
+ *
1219
+ * @example
1220
+ * 'Classify this text into one of: {fields}\n\nText: {text}'
1221
+ */
1222
+ promptTemplate?: string;
1223
+ /**
1224
+ * Optional field name containing text to classify (default: 'content').
1225
+ */
1226
+ textField?: string;
1227
+ };
1228
+ /**
1229
+ * Optional filter to select which records to enrich.
1230
+ */
1231
+ filter?: UniversalFilter;
1232
+ /**
1233
+ * Optional batch size for processing (default: 10).
1234
+ */
1235
+ batchSize?: number;
1236
+ }
1237
+ /**
1238
+ * Configuration for vertical enrichment operations.
1239
+ *
1240
+ * Vertical enrichment classifies documents into business verticals
1241
+ * (e.g., technology, finance, healthcare). Three strategies are supported:
1242
+ *
1243
+ * 1. **Field Mapping**: Map existing field values to verticals
1244
+ * 2. **Custom Extractor**: Use a custom function to extract verticals
1245
+ * 3. **Automatic Extraction**: Use an LLM to automatically classify documents
1246
+ *
1247
+ * @example
1248
+ * ```typescript
1249
+ * // Field mapping
1250
+ * const config1: VerticalEnrichmentConfig = {
1251
+ * mapping: { 'tech': 'technology', 'hc': 'healthcare' }
1252
+ * };
1253
+ *
1254
+ * // Custom extractor
1255
+ * const config2: VerticalEnrichmentConfig = {
1256
+ * extractor: async (doc) => extractVertical(doc)
1257
+ * };
1258
+ *
1259
+ * // Automatic extraction
1260
+ * const config3: VerticalEnrichmentConfig = {
1261
+ * automatic: {
1262
+ * llm: myLLMClient,
1263
+ * fields: ['technology', 'finance', 'healthcare']
1264
+ * }
1265
+ * };
1266
+ * ```
1267
+ */
1268
+ type VerticalEnrichmentConfig = FieldMappingConfig | ExtractorConfig | AutomaticExtractionConfig;
1269
+ /**
1270
+ * Configuration for theme enrichment operations.
1271
+ *
1272
+ * Theme enrichment adds thematic tags to documents using a theme classifier.
1273
+ * Supports confidence thresholds, multi-theme tagging, and custom text fields.
1274
+ *
1275
+ * @property themes - Array of theme labels to classify into
1276
+ * @property classifier - The theme classifier to use for classification
1277
+ * @property textField - Optional field name containing text to classify (default: 'content')
1278
+ * @property confidenceThreshold - Optional minimum confidence threshold (default: 0.0)
1279
+ * @property multiTheme - Optional flag to allow multiple themes per document (default: false)
1280
+ * @property filter - Optional filter to select which records to enrich
1281
+ * @property batchSize - Optional batch size for processing (default: 100)
1282
+ * @property onProgress - Optional callback for tracking progress
1283
+ *
1284
+ * @example
1285
+ * ```typescript
1286
+ * const config: ThemeEnrichmentConfig = {
1287
+ * themes: ['technology', 'business', 'science', 'healthcare'],
1288
+ * classifier: new KeywordThemeClassifier(),
1289
+ * textField: 'description',
1290
+ * confidenceThreshold: 0.7,
1291
+ * multiTheme: true,
1292
+ * filter: { field: 'themes', op: 'eq', value: null },
1293
+ * batchSize: 50,
1294
+ * onProgress: (stats) => console.log(`Processed: ${stats.recordsProcessed}`)
1295
+ * };
1296
+ * ```
1297
+ */
1298
+ interface ThemeEnrichmentConfig {
1299
+ /**
1300
+ * Array of theme labels to classify into.
1301
+ *
1302
+ * @example
1303
+ * ['technology', 'business', 'science', 'healthcare']
1304
+ */
1305
+ themes: string[];
1306
+ /**
1307
+ * The theme classifier to use for classification.
1308
+ */
1309
+ classifier: ThemeClassifier;
1310
+ /**
1311
+ * Optional field name containing text to classify (default: 'content').
1312
+ */
1313
+ textField?: string;
1314
+ /**
1315
+ * Optional minimum confidence threshold (default: 0.0).
1316
+ * Only themes with confidence >= this value will be assigned.
1317
+ */
1318
+ confidenceThreshold?: number;
1319
+ /**
1320
+ * Optional flag to allow multiple themes per document (default: false).
1321
+ * When true, all themes above the confidence threshold are assigned.
1322
+ */
1323
+ multiTheme?: boolean;
1324
+ /**
1325
+ * Optional filter to select which records to enrich.
1326
+ */
1327
+ filter?: UniversalFilter;
1328
+ /**
1329
+ * Optional batch size for processing (default: 100).
1330
+ */
1331
+ batchSize?: number;
1332
+ /**
1333
+ * Optional callback for tracking progress.
1334
+ */
1335
+ onProgress?: ProgressCallback;
1336
+ }
1337
+ /**
1338
+ * Configuration for section enrichment operations.
1339
+ *
1340
+ * Section enrichment structures documents into logical sections
1341
+ * (e.g., introduction, methodology, results, conclusion).
1342
+ * Can use existing section markers or automatically detect sections.
1343
+ *
1344
+ * @property existingField - Optional field name containing existing section markers
1345
+ * @property autoDetect - Optional flag to automatically detect sections (default: false)
1346
+ * @property filter - Optional filter to select which records to enrich
1347
+ * @property batchSize - Optional batch size for processing (default: 100)
1348
+ *
1349
+ * @example
1350
+ * ```typescript
1351
+ * // Use existing section markers
1352
+ * const config1: SectionEnrichmentConfig = {
1353
+ * existingField: 'raw_sections',
1354
+ * filter: { field: 'sections', op: 'eq', value: null }
1355
+ * };
1356
+ *
1357
+ * // Auto-detect sections
1358
+ * const config2: SectionEnrichmentConfig = {
1359
+ * autoDetect: true,
1360
+ * batchSize: 25
1361
+ * };
1362
+ * ```
1363
+ */
1364
+ interface SectionEnrichmentConfig {
1365
+ /**
1366
+ * Optional field name containing existing section markers.
1367
+ * If provided, sections will be extracted from this field.
1368
+ */
1369
+ existingField?: string;
1370
+ /**
1371
+ * Optional flag to automatically detect sections (default: false).
1372
+ * When true, sections will be detected using heuristics (headers, paragraphs, etc.).
1373
+ */
1374
+ autoDetect?: boolean;
1375
+ /**
1376
+ * Optional filter to select which records to enrich.
1377
+ */
1378
+ filter?: UniversalFilter;
1379
+ /**
1380
+ * Optional batch size for processing (default: 100).
1381
+ */
1382
+ batchSize?: number;
1383
+ }
1384
+ /**
1385
+ * Configuration for enriching all aspects of documents.
1386
+ *
1387
+ * Combines vertical, theme, and section enrichment into a single operation.
1388
+ * Allows running multiple enrichment strategies in sequence with shared settings.
1389
+ *
1390
+ * @property vertical - Optional vertical enrichment configuration
1391
+ * @property themes - Optional theme enrichment configuration
1392
+ * @property sections - Optional section enrichment configuration
1393
+ * @property filter - Optional global filter applied to all enrichment operations
1394
+ * @property batchSize - Optional global batch size for all operations (default: 100)
1395
+ * @property onProgress - Optional global progress callback for all operations
1396
+ *
1397
+ * @example
1398
+ * ```typescript
1399
+ * const config: EnrichAllConfig = {
1400
+ * vertical: {
1401
+ * automatic: {
1402
+ * llm: myLLMClient,
1403
+ * fields: ['technology', 'finance', 'healthcare']
1404
+ * }
1405
+ * },
1406
+ * themes: {
1407
+ * themes: ['innovation', 'research', 'product'],
1408
+ * classifier: new KeywordThemeClassifier(),
1409
+ * confidenceThreshold: 0.8
1410
+ * },
1411
+ * sections: {
1412
+ * autoDetect: true
1413
+ * },
1414
+ * filter: { field: 'status', op: 'eq', value: 'pending' },
1415
+ * batchSize: 50,
1416
+ * onProgress: (stats) => console.log(`Progress: ${stats.recordsProcessed}`)
1417
+ * };
1418
+ * ```
1419
+ */
1420
+ interface EnrichAllConfig {
1421
+ /**
1422
+ * Optional vertical enrichment configuration.
1423
+ */
1424
+ vertical?: VerticalEnrichmentConfig;
1425
+ /**
1426
+ * Optional theme enrichment configuration.
1427
+ */
1428
+ themes?: ThemeEnrichmentConfig;
1429
+ /**
1430
+ * Optional section enrichment configuration.
1431
+ */
1432
+ sections?: SectionEnrichmentConfig;
1433
+ /**
1434
+ * Optional global filter applied to all enrichment operations.
1435
+ * This filter is combined with individual operation filters using AND logic.
1436
+ */
1437
+ filter?: UniversalFilter;
1438
+ /**
1439
+ * Optional global batch size for all operations (default: 100).
1440
+ * Individual operation batch sizes override this value.
1441
+ */
1442
+ batchSize?: number;
1443
+ /**
1444
+ * Optional global progress callback for all operations.
1445
+ * Called after each enrichment operation completes.
1446
+ */
1447
+ onProgress?: ProgressCallback;
1448
+ }
1449
+
1450
+ /**
1451
+ * Fast, deterministic keyword-based theme classifier
1452
+ * Uses precompiled regex patterns with word boundaries for efficient matching
1453
+ */
1454
+ interface ThemeClassificationResult {
1455
+ theme: string;
1456
+ confidence: number;
1457
+ allScores?: Record<string, number>;
1458
+ }
1459
+ declare class KeywordThemeClassifier {
1460
+ private themes;
1461
+ private caseSensitive;
1462
+ private patterns;
1463
+ private keywordCounts;
1464
+ /**
1465
+ * Creates a new KeywordThemeClassifier
1466
+ * @param themes - Array of theme names
1467
+ * @param keywords - Map of theme names to their keyword arrays
1468
+ * @param caseSensitive - Whether matching should be case sensitive (default: false)
1469
+ */
1470
+ constructor(themes: string[], keywords: Record<string, string[]>, caseSensitive?: boolean);
1471
+ /**
1472
+ * Classify a single text
1473
+ * @param text - Text to classify
1474
+ * @returns Classification result with theme, confidence, and all scores
1475
+ */
1476
+ classify(text: string): ThemeClassificationResult;
1477
+ /**
1478
+ * Classify multiple texts in batch
1479
+ * @param texts - Array of texts to classify
1480
+ * @returns Array of classification results
1481
+ */
1482
+ classifyBatch(texts: string[]): ThemeClassificationResult[];
1483
+ /**
1484
+ * Escape special regex characters in a string
1485
+ * @param str - String to escape
1486
+ * @returns Escaped string safe for use in regex
1487
+ */
1488
+ private escapeRegex;
1489
+ }
1490
+
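Since the constructor takes the theme list and keyword map directly, a hedged usage sketch, assuming the class is exported from the package root; the themes and keyword lists are illustrative.

```typescript
import { KeywordThemeClassifier } from '@vectororm/core'; // assumed export

const classifier = new KeywordThemeClassifier(
  ['technology', 'finance'],
  {
    technology: ['machine learning', 'software', 'ai'],
    finance: ['stock', 'market', 'revenue'],
  },
);

// classify() is synchronous here, per the declaration above.
const result = classifier.classify('Machine learning is reshaping software teams');
console.log(result.theme);      // likely 'technology' given the keyword lists above
console.log(result.confidence); // score derived from keyword matches
```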
1491
+ /**
1492
+ * Zero-shot theme classifier using Transformers.js
1493
+ * Uses pre-trained models without requiring fine-tuning or training data
1494
+ */
1495
+
1496
+ /**
1497
+ * Zero-shot classification using pre-trained transformer models.
1498
+ *
1499
+ * This classifier uses Hugging Face's zero-shot classification pipeline
1500
+ * to classify text into themes without requiring training data or fine-tuning.
1501
+ * The model is loaded lazily on the first classify() call to improve startup time.
1502
+ *
1503
+ * Features:
1504
+ * - No training data required
1505
+ * - Works with any set of theme labels
1506
+ * - Lazy model loading (loads on first classification)
1507
+ * - Sequential batch processing to avoid memory issues
1508
+ * - Handles empty text with uniform scores
1509
+ *
1510
+ * @example
1511
+ * ```typescript
1512
+ * const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'business']);
1513
+ * const result = await classifier.classify('Machine learning is transforming AI');
1514
+ * console.log(result.theme); // 'technology'
1515
+ * console.log(result.confidence); // 0.95
1516
+ * ```
1517
+ */
1518
+ declare class ZeroShotThemeClassifier implements ThemeClassifier {
1519
+ private model;
1520
+ private modelName;
1521
+ private themes;
1522
+ /**
1523
+ * Creates a new ZeroShotThemeClassifier
1524
+ *
1525
+ * @param themes - Array of theme labels to classify into
1526
+ * @param modelName - Name of the Hugging Face model to use (default: 'Xenova/distilbert-base-uncased-mnli')
1527
+ *
1528
+ * @example
1529
+ * ```typescript
1530
+ * // Use default model
1531
+ * const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
1532
+ *
1533
+ * // Use custom model
1534
+ * const classifier = new ZeroShotThemeClassifier(
1535
+ * ['positive', 'negative'],
1536
+ * 'Xenova/distilbert-base-uncased-mnli'
1537
+ * );
1538
+ * ```
1539
+ */
1540
+ constructor(themes: string[], modelName?: string);
1541
+ /**
1542
+ * Lazy loads the zero-shot classification model
1543
+ * Only loads once on first call, subsequent calls reuse the loaded model
1544
+ *
1545
+ * @returns Promise that resolves to the loaded pipeline
1546
+ */
1547
+ private ensureModelLoaded;
1548
+ /**
1549
+ * Classify a single text into one of the provided themes
1550
+ *
1551
+ * @param text - The text content to classify
1552
+ * @returns A promise that resolves to the theme classification result
1553
+ *
1554
+ * @example
1555
+ * ```typescript
1556
+ * const classifier = new ZeroShotThemeClassifier(['technology', 'sports']);
1557
+ * const result = await classifier.classify('Machine learning and AI');
1558
+ * console.log(result.theme); // 'technology'
1559
+ * console.log(result.confidence); // 0.92
1560
+ * console.log(result.allScores); // { technology: 0.92, sports: 0.08 }
1561
+ * ```
1562
+ */
1563
+ classify(text: string): Promise<ThemeClassification>;
1564
+ /**
1565
+ * Classify multiple texts efficiently
1566
+ *
1567
+ * Processes texts sequentially to avoid memory issues with large batches.
1568
+ * The model is loaded once and reused for all texts.
1569
+ *
1570
+ * @param texts - Array of text contents to classify
1571
+ * @returns A promise that resolves to an array of theme classifications
1572
+ *
1573
+ * @example
1574
+ * ```typescript
1575
+ * const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
1576
+ * const results = await classifier.classifyBatch([
1577
+ * 'Machine learning is transforming AI',
1578
+ * 'The football team won the championship',
1579
+ * 'Stock market hits record high'
1580
+ * ]);
1581
+ * // results[0].theme === 'technology'
1582
+ * // results[1].theme === 'sports'
1583
+ * // results[2].theme === 'finance'
1584
+ * ```
1585
+ */
1586
+ classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
1587
+ }
1588
+
1589
+ /**
1590
+ * Embedding-based theme classifier using cosine similarity
1591
+ * Computes similarity between text embeddings and theme embeddings
1592
+ */
1593
+
1594
+ /**
1595
+ * Embedding-based classification using cosine similarity.
1596
+ *
1597
+ * This classifier computes embeddings for text and themes, then uses cosine
1598
+ * similarity to determine which theme is most similar to the text. Theme
1599
+ * embeddings are computed lazily on the first classify() call, or can be
1600
+ * provided precomputed in the constructor.
1601
+ *
1602
+ * Features:
1603
+ * - Lazy initialization: theme embeddings computed on first classify()
1604
+ * - Optional precomputed embeddings for faster startup
1605
+ * - Cosine similarity: dotProduct / (normA * normB)
1606
+ * - Normalizes similarity from [-1, 1] to confidence [0, 1]
1607
+ * - Handles empty text with uniform scores
1608
+ *
1609
+ * @example
1610
+ * ```typescript
1611
+ * const embedder = new OpenAIEmbedder();
1612
+ * const classifier = new EmbeddingThemeClassifier(['technology', 'sports', 'finance'], embedder);
1613
+ * const result = await classifier.classify('Machine learning is transforming AI');
1614
+ * console.log(result.theme); // 'technology'
1615
+ * console.log(result.confidence); // 0.89
1616
+ * ```
1617
+ */
1618
+ declare class EmbeddingThemeClassifier implements ThemeClassifier {
1619
+ private themeEmbeddings;
1620
+ private embedder;
1621
+ private themes;
1622
+ /**
1623
+ * Creates a new EmbeddingThemeClassifier
1624
+ *
1625
+ * @param themes - Array of theme labels to classify into
1626
+ * @param embedder - Embedder instance to use for generating embeddings
1627
+ * @param precomputedEmbeddings - Optional precomputed theme embeddings for faster startup
1628
+ *
1629
+ * @example
1630
+ * ```typescript
1631
+ * // Lazy initialization
1632
+ * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
1633
+ *
1634
+ * // With precomputed embeddings
1635
+ * const themeEmbeddings = {
1636
+ * technology: await embedder.embed('technology'),
1637
+ * sports: await embedder.embed('sports')
1638
+ * };
1639
+ * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder, themeEmbeddings);
1640
+ * ```
1641
+ */
1642
+ constructor(themes: string[], embedder: Embedder, precomputedEmbeddings?: Record<string, number[]>);
1643
+ /**
1644
+ * Lazy loads theme embeddings on first use
1645
+ * Computes embeddings for all theme labels if not already computed
1646
+ *
1647
+ * @returns Promise that resolves to the theme embeddings map
1648
+ */
1649
+ private ensureThemeEmbeddings;
1650
+ /**
1651
+ * Compute cosine similarity between two vectors
1652
+ *
1653
+ * Cosine similarity = dotProduct / (normA * normB)
1654
+ * Returns value in range [-1, 1] where:
1655
+ * - 1 means vectors point in the same direction
1656
+ * - 0 means vectors are orthogonal
1657
+ * - -1 means vectors point in opposite directions
1658
+ *
1659
+ * @param a - First vector
1660
+ * @param b - Second vector
1661
+ * @returns Cosine similarity between the vectors
1662
+ */
1663
+ private cosineSimilarity;
1664
+ /**
1665
+ * Normalize cosine similarity from [-1, 1] to confidence score [0, 1]
1666
+ *
1667
+ * Uses linear transformation: (similarity + 1) / 2
1668
+ *
1669
+ * @param similarity - Cosine similarity value in range [-1, 1]
1670
+ * @returns Confidence score in range [0, 1]
1671
+ */
1672
+ private normalizeToConfidence;
1673
+ /**
1674
+ * Classify a single text into one of the provided themes
1675
+ *
1676
+ * @param text - The text content to classify
1677
+ * @returns A promise that resolves to the theme classification result
1678
+ *
1679
+ * @example
1680
+ * ```typescript
1681
+ * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
1682
+ * const result = await classifier.classify('Machine learning and AI');
1683
+ * console.log(result.theme); // 'technology'
1684
+ * console.log(result.confidence); // 0.92
1685
+ * console.log(result.allScores); // { technology: 0.92, sports: 0.45 }
1686
+ * ```
1687
+ */
1688
+ classify(text: string): Promise<ThemeClassification>;
1689
+ /**
1690
+ * Classify multiple texts efficiently
1691
+ *
1692
+ * Ensures theme embeddings are loaded once, then processes all texts.
1693
+ * Text embeddings are computed in batch for efficiency.
1694
+ *
1695
+ * @param texts - Array of text contents to classify
1696
+ * @returns A promise that resolves to an array of theme classifications
1697
+ *
1698
+ * @example
1699
+ * ```typescript
1700
+ * const classifier = new EmbeddingThemeClassifier(['technology', 'sports', 'finance'], embedder);
1701
+ * const results = await classifier.classifyBatch([
1702
+ * 'Machine learning is transforming AI',
1703
+ * 'The football team won the championship',
1704
+ * 'Stock market hits record high'
1705
+ * ]);
1706
+ * // results[0].theme === 'technology'
1707
+ * // results[1].theme === 'sports'
1708
+ * // results[2].theme === 'finance'
1709
+ * ```
1710
+ */
1711
+ classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
1712
+ }
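+
+ /**
+ * Sketch of the similarity math documented on cosineSimilarity() and
+ * normalizeToConfidence() above. The private implementations are not part
+ * of these declarations, so this only restates the documented formulas as
+ * standalone functions; it is not the package's actual code.
+ *
+ * ```typescript
+ * // cosine similarity = dotProduct / (normA * normB), result in [-1, 1]
+ * function cosineSimilarity(a: number[], b: number[]): number {
+ *   let dot = 0, normA = 0, normB = 0;
+ *   for (let i = 0; i < a.length; i++) {
+ *     dot += a[i] * b[i];
+ *     normA += a[i] * a[i];
+ *     normB += b[i] * b[i];
+ *   }
+ *   return dot / (Math.sqrt(normA) * Math.sqrt(normB));
+ * }
+ *
+ * // linear map from [-1, 1] to a [0, 1] confidence: (similarity + 1) / 2
+ * function normalizeToConfidence(similarity: number): number {
+ *   return (similarity + 1) / 2;
+ * }
+ * ```
+ */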
1713
+
1714
+ /**
1715
+ * LLM-based theme classifier.
1716
+ * Delegates to a language model for the most flexible and accurate theme classification.
1717
+ */
1718
+
1719
+ /**
1720
+ * LLM-based theme classification using language models.
1721
+ *
1722
+ * This classifier uses LLMs to provide the highest quality theme classification
1723
+ * with semantic understanding and nuanced reasoning. It supports custom prompt
1724
+ * templates for domain-specific classification needs.
1725
+ *
1726
+ * Features:
1727
+ * - Default prompt template with {themes} and {text} placeholders
1728
+ * - Custom prompt template support for specialized domains
1729
+ * - Structured JSON output via the LLM client's generateJSON<T>() method
1730
+ * - Sequential batch processing to avoid rate limits
1731
+ * - Comprehensive error handling with cause chain
1732
+ * - Empty text handling with uniform scores
1733
+ *
1734
+ * @example
1735
+ * ```typescript
1736
+ * const llm = new OpenAIClient('gpt-4');
1737
+ * const classifier = new LLMThemeClassifier(
1738
+ * ['technology', 'sports', 'finance'],
1739
+ * llm
1740
+ * );
1741
+ * const result = await classifier.classify('Machine learning is transforming AI');
1742
+ * console.log(result.theme); // 'technology'
1743
+ * console.log(result.confidence); // 0.95
1744
+ * ```
1745
+ *
1746
+ * @example Custom prompt template
1747
+ * ```typescript
1748
+ * const customTemplate = `Classify this medical text: {text}
1749
+ * Themes: {themes}
1750
+ * Return JSON with theme, confidence, allScores.`;
1751
+ *
1752
+ * const classifier = new LLMThemeClassifier(
1753
+ * ['cardiology', 'neurology', 'oncology'],
1754
+ * llm,
1755
+ * customTemplate
1756
+ * );
1757
+ * ```
1758
+ */
1759
+ declare class LLMThemeClassifier implements ThemeClassifier {
1760
+ private themes;
1761
+ private llm;
1762
+ private promptTemplate;
1763
+ /**
1764
+ * Creates a new LLMThemeClassifier
1765
+ *
1766
+ * @param themes - Array of theme labels to classify into
1767
+ * @param llm - LLM client instance to use for classification
1768
+ * @param promptTemplate - Optional custom prompt template with {themes} and {text} placeholders
1769
+ *
1770
+ * @example
1771
+ * ```typescript
1772
+ * const classifier = new LLMThemeClassifier(
1773
+ * ['technology', 'sports', 'finance'],
1774
+ * llm
1775
+ * );
1776
+ * ```
1777
+ *
1778
+ * @example With custom prompt
1779
+ * ```typescript
1780
+ * const customTemplate = `Classify: {text}\nThemes: {themes}\nReturn JSON.`;
1781
+ * const classifier = new LLMThemeClassifier(
1782
+ * ['technology', 'sports'],
1783
+ * llm,
1784
+ * customTemplate
1785
+ * );
1786
+ * ```
1787
+ */
1788
+ constructor(themes: string[], llm: LLMClient, promptTemplate?: string);
1789
+ /**
1790
+ * Build the classification prompt by replacing placeholders
1791
+ *
1792
+ * @param text - The text to classify
1793
+ * @returns The complete prompt with placeholders replaced
1794
+ */
1795
+ private buildPrompt;
1796
+ /**
1797
+ * Classify a single text into one of the provided themes
1798
+ *
1799
+ * @param text - The text content to classify
1800
+ * @returns A promise that resolves to the theme classification result
1801
+ *
1802
+ * @example
1803
+ * ```typescript
1804
+ * const classifier = new LLMThemeClassifier(['technology', 'sports'], llm);
1805
+ * const result = await classifier.classify('Machine learning and AI');
1806
+ * console.log(result.theme); // 'technology'
1807
+ * console.log(result.confidence); // 0.95
1808
+ * console.log(result.allScores); // { technology: 0.95, sports: 0.05 }
1809
+ * ```
1810
+ */
1811
+ classify(text: string): Promise<ThemeClassification>;
1812
+ /**
1813
+ * Classify multiple texts sequentially
1814
+ *
1815
+ * Processes texts one at a time, which keeps behavior predictable, simplifies
1816
+ * error handling, and helps stay within provider rate limits.
1817
+ *
1818
+ * @param texts - Array of text contents to classify
1819
+ * @returns A promise that resolves to an array of theme classifications
1820
+ *
1821
+ * @example
1822
+ * ```typescript
1823
+ * const classifier = new LLMThemeClassifier(['technology', 'sports', 'finance'], llm);
1824
+ * const results = await classifier.classifyBatch([
1825
+ * 'Machine learning is transforming AI',
1826
+ * 'The football team won the championship',
1827
+ * 'Stock market hits record high'
1828
+ * ]);
1829
+ * // results[0].theme === 'technology'
1830
+ * // results[1].theme === 'sports'
1831
+ * // results[2].theme === 'finance'
1832
+ * ```
1833
+ */
1834
+ classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
1835
+ }
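+
+ /**
+ * Sketch of the {themes}/{text} placeholder substitution that buildPrompt()
+ * is documented to perform. The real default template wording is not part
+ * of these declarations, so the template below is a stand-in, and
+ * fillTemplate is a hypothetical helper, not an exported function.
+ *
+ * ```typescript
+ * const template =
+ *   'Classify the text into one of these themes: {themes}\nText: {text}\nReturn JSON.';
+ *
+ * function fillTemplate(tpl: string, themes: string[], text: string): string {
+ *   return tpl.replace('{themes}', themes.join(', ')).replace('{text}', text);
+ * }
+ *
+ * const prompt = fillTemplate(template, ['technology', 'sports'], 'AI beats humans at chess');
+ * // The classifier then asks the LLM for structured JSON
+ * // (theme, confidence, allScores) built from a prompt like this.
+ * ```
+ */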
1836
+
1837
+ /**
1838
+ * Enrichment pipeline for adding metadata to vector records.
1839
+ *
1840
+ * This class provides the main enrichment functionality:
1841
+ * - Vertical enrichment: Classify documents into business verticals
1842
+ * - Theme enrichment: Add thematic tags to documents
1843
+ * - Section enrichment: Structure documents into logical sections
1844
+ * - Batch processing: Efficiently process large collections
1845
+ *
1846
+ * Design principles:
1847
+ * 1. Database-agnostic: Works with any VectorDBAdapter
1848
+ * 2. Strategy pattern: Multiple enrichment strategies per operation
1849
+ * 3. Batch processing: Efficient iteration and bulk updates
1850
+ * 4. Error resilience: Continue processing despite individual failures
1851
+ */
1852
+
1853
+ /**
1854
+ * EnrichmentPipeline provides methods to enrich vector records with metadata.
1855
+ *
1856
+ * The pipeline supports three types of enrichment:
1857
+ * 1. Vertical enrichment: Classify into business verticals (technology, finance, etc.)
1858
+ * 2. Theme enrichment: Add thematic tags (innovation, research, etc.)
1859
+ * 3. Section enrichment: Structure into logical sections
1860
+ *
1861
+ * Each enrichment type supports multiple strategies for maximum flexibility.
1862
+ *
1863
+ * @example
1864
+ * ```typescript
1865
+ * const pipeline = new EnrichmentPipeline(adapter, embedder, llm);
1866
+ *
1867
+ * // Enrich using field mapping
1868
+ * await pipeline.enrichVertical('my-collection', {
1869
+ * mapping: { 'tech': 'technology', 'hc': 'healthcare' }
1870
+ * });
1871
+ *
1872
+ * // Enrich using custom extractor
1873
+ * await pipeline.enrichVertical('my-collection', {
1874
+ * extractor: async (doc) => extractVertical(doc)
1875
+ * });
1876
+ *
1877
+ * // Enrich using LLM
1878
+ * await pipeline.enrichVertical('my-collection', {
1879
+ * automatic: {
1880
+ * llm: myLLMClient,
1881
+ * fields: ['technology', 'finance', 'healthcare']
1882
+ * }
1883
+ * });
1884
+ * ```
1885
+ */
1886
+ declare class EnrichmentPipeline {
1887
+ private adapter;
1888
+ private embedder?;
1889
+ private llm?;
1890
+ /**
1891
+ * Create a new enrichment pipeline.
1892
+ *
1893
+ * @param adapter - Vector database adapter for reading/writing records
1894
+ * @param embedder - Optional embedder for embedding-based enrichment
1895
+ * @param llm - Optional LLM client for automatic enrichment
1896
+ */
1897
+ constructor(adapter: VectorDBAdapter, embedder?: any | undefined, llm?: any | undefined);
1898
+ /**
1899
+ * Enrich records with vertical classifications.
1900
+ *
1901
+ * Supports three strategies:
1902
+ * 1. Field mapping: Map existing field values to verticals
1903
+ * 2. Custom extractor: Use a custom function to extract verticals
1904
+ * 3. Automatic LLM: Use an LLM to classify documents
1905
+ *
1906
+ * @param collection - Name of the collection to enrich
1907
+ * @param config - Vertical enrichment configuration
1908
+ * @returns Statistics about the enrichment operation
1909
+ *
1910
+ * @example
1911
+ * ```typescript
1912
+ * // Field mapping
1913
+ * await pipeline.enrichVertical('docs', {
1914
+ * mapping: { 'tech': 'technology' }
1915
+ * });
1916
+ *
1917
+ * // Custom extractor
1918
+ * await pipeline.enrichVertical('docs', {
1919
+ * extractor: async (doc) => 'technology'
1920
+ * });
1921
+ *
1922
+ * // Automatic LLM
1923
+ * await pipeline.enrichVertical('docs', {
1924
+ * automatic: {
1925
+ * llm: myLLMClient,
1926
+ * fields: ['technology', 'finance']
1927
+ * }
1928
+ * });
1929
+ * ```
1930
+ */
1931
+ enrichVertical(collection: string, config: VerticalEnrichmentConfig): Promise<EnrichmentStats>;
1932
+ /**
1933
+ * Enrich records using field mapping strategy.
1934
+ *
1935
+ * Maps values from an existing field to vertical classifications.
1936
+ *
1937
+ * @param collection - Collection name
1938
+ * @param config - Field mapping configuration
1939
+ * @param stats - Statistics object to update
1940
+ */
1941
+ private enrichWithFieldMapping;
1942
+ /**
1943
+ * Apply field mapping to extract vertical from a record.
1944
+ *
1945
+ * @param record - Vector record
1946
+ * @param mapping - Field mapping configuration
1947
+ * @returns Vertical label or null if no match
1948
+ */
1949
+ private applyFieldMapping;
1950
+ /**
1951
+ * Enrich records using custom extractor strategy.
1952
+ *
1953
+ * Calls the provided extractor function for each record.
1954
+ *
1955
+ * @param collection - Collection name
1956
+ * @param config - Extractor configuration
1957
+ * @param stats - Statistics object to update
1958
+ */
1959
+ private enrichWithExtractor;
1960
+ /**
1961
+ * Enrich records using automatic LLM strategy.
1962
+ *
1963
+ * Uses a language model to classify documents into verticals.
1964
+ *
1965
+ * @param collection - Collection name
1966
+ * @param config - Automatic extraction configuration
1967
+ * @param stats - Statistics object to update
1968
+ */
1969
+ private enrichWithLLM;
1970
+ /**
1971
+ * Extract vertical classification using LLM.
1972
+ *
1973
+ * @param record - Vector record
1974
+ * @param llm - LLM client
1975
+ * @param fields - Available vertical fields
1976
+ * @param textField - Field name containing text to classify
1977
+ * @param promptTemplate - Optional custom prompt template
1978
+ * @returns Vertical label
1979
+ */
1980
+ private extractWithLLM;
1981
+ /**
1982
+ * Enrich records with theme classifications.
1983
+ *
1984
+ * Uses a theme classifier to identify themes in text content and updates
1985
+ * record metadata with theme information. Supports single and multi-theme
1986
+ * classification with configurable confidence thresholds.
1987
+ *
1988
+ * @param collection - Name of the collection to enrich
1989
+ * @param config - Theme enrichment configuration
1990
+ * @returns Statistics about the enrichment operation
1991
+ *
1992
+ * @example
1993
+ * ```typescript
1994
+ * // Single theme classification
1995
+ * await pipeline.enrichThemes('docs', {
1996
+ * themes: ['technology', 'business', 'science'],
1997
+ * classifier: new KeywordThemeClassifier(['technology', 'business', 'science'], keywords),
1998
+ * confidenceThreshold: 0.7
1999
+ * });
2000
+ *
2001
+ * // Multi-theme classification
2002
+ * await pipeline.enrichThemes('docs', {
2003
+ * themes: ['technology', 'business', 'science'],
2004
+ * classifier: new LLMThemeClassifier(['technology', 'business', 'science'], llm),
2005
+ * multiTheme: true,
2006
+ * confidenceThreshold: 0.5
2007
+ * });
2008
+ * ```
2009
+ */
2010
+ enrichThemes(collection: string, config: ThemeEnrichmentConfig): Promise<EnrichmentStats>;
2011
+ /**
2012
+ * Enrich records using theme classifier.
2013
+ *
2014
+ * @param collection - Collection name
2015
+ * @param config - Theme enrichment configuration
2016
+ * @param stats - Statistics object to update
2017
+ */
2018
+ private enrichWithThemeClassifier;
2019
+ /**
2020
+ * Enrich records with section structure.
2021
+ *
2022
+ * Extracts section metadata from documents using either existing field mappings
2023
+ * or automatic detection strategies (markdown, HTML, or pattern-based).
2024
+ *
2025
+ * @param collection - Name of the collection to enrich
2026
+ * @param config - Section enrichment configuration
2027
+ * @returns Statistics about the enrichment operation
2028
+ *
2029
+ * @example
2030
+ * ```typescript
2031
+ * // Use existing section field
2032
+ * await pipeline.enrichSections('docs', {
2033
+ * existingField: 'section_path'
2034
+ * });
2035
+ *
2036
+ * // Auto-detect sections
2037
+ * await pipeline.enrichSections('docs', {
2038
+ * autoDetect: true
2039
+ * });
2040
+ * ```
2041
+ */
2042
+ enrichSections(collection: string, config: SectionEnrichmentConfig): Promise<EnrichmentStats>;
2043
+ /**
2044
+ * Enrich records with all enrichment types.
2045
+ *
2046
+ * Runs vertical, theme, and section enrichment sequentially with shared
2047
+ * configuration. Global filters and batch sizes apply to all operations.
2048
+ *
2049
+ * @param collection - Name of the collection to enrich
2050
+ * @param config - Combined enrichment configuration
2051
+ * @returns Statistics about the enrichment operation
2052
+ *
2053
+ * @example
2054
+ * ```typescript
2055
+ * await pipeline.enrichAll('docs', {
2056
+ * vertical: { mapping: { tech: 'technology' } },
2057
+ * themes: { themes: ['innovation'], classifier },
2058
+ * sections: { autoDetect: true },
2059
+ * filter: { field: 'status', op: 'eq', value: 'pending' },
2060
+ * batchSize: 50
2061
+ * });
2062
+ * ```
2063
+ */
2064
+ enrichAll(collection: string, config: EnrichAllConfig): Promise<EnrichmentStats>;
2065
+ /**
2066
+ * Apply global configuration to individual enrichment configs.
2067
+ *
2068
+ * @param individualConfig - Configuration for a specific enrichment type
2069
+ * @param globalConfig - Global configuration
2070
+ * @returns Merged configuration
2071
+ */
2072
+ private applyGlobalConfig;
2073
+ /**
2074
+ * Merge stats from an enrichment operation into aggregate stats.
2075
+ *
2076
+ * @param aggregate - Aggregate stats to update
2077
+ * @param stats - Stats from a single operation
2078
+ */
2079
+ private mergeStats;
2080
+ /**
2081
+ * Enrich records using section detection.
2082
+ *
2083
+ * @param collection - Collection name
2084
+ * @param config - Section enrichment configuration
2085
+ * @param stats - Statistics object to update
2086
+ */
2087
+ private enrichWithSectionDetection;
2088
+ /**
2089
+ * Extract section metadata from an existing field value.
2090
+ *
2091
+ * @param sectionPath - Section path string (e.g., "introduction/overview")
2092
+ * @returns Section metadata or null
2093
+ */
2094
+ private extractSectionMetadata;
2095
+ /**
2096
+ * Detect sections in text using heuristics.
2097
+ *
2098
+ * @param text - Text content to analyze
2099
+ * @returns Section metadata or null
2100
+ */
2101
+ private detectSections;
2102
+ /**
2103
+ * Detect markdown headers (# Header).
2104
+ *
2105
+ * @param text - Text content
2106
+ * @returns Section metadata or null
2107
+ */
2108
+ private detectMarkdownSections;
2109
+ /**
2110
+ * Detect HTML headers (<h1>Header</h1>).
2111
+ *
2112
+ * @param text - Text content
2113
+ * @returns Section metadata or null
2114
+ */
2115
+ private detectHtmlSections;
2116
+ /**
2117
+ * Detect sections using common patterns (SECTION: Title).
2118
+ *
2119
+ * @param text - Text content
2120
+ * @returns Section metadata or null
2121
+ */
2122
+ private detectPatternSections;
2123
+ }
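+
+ /**
+ * Sketch of the kind of heuristic detectMarkdownSections() describes above
+ * (scanning for "# Header" lines). The pipeline's private implementation is
+ * not exposed here, so this only illustrates the documented idea; the field
+ * names come from HorizontalFields, and the level numbering is an assumption.
+ *
+ * ```typescript
+ * function sketchDetectMarkdownSection(text: string): Record<string, any> | null {
+ *   // match the first ATX-style markdown header, e.g. "## Pricing"
+ *   const match = text.match(/^(#{1,6})\s+(.+)$/m);
+ *   if (!match) return null;
+ *   return {
+ *     __h_section_title: match[2].trim(),
+ *     __h_section_level: match[1].length - 1  // assumes "#" maps to level 0 (root)
+ *   };
+ * }
+ * ```
+ */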
2124
+
2125
+ /**
2126
+ * Loaded document with extracted text and metadata.
2127
+ */
2128
+ interface Document {
2129
+ /** Full document text */
2130
+ text: string;
2131
+ /** File path or source identifier */
2132
+ source: string;
2133
+ /** File type/extension (pdf, txt, docx, html) */
2134
+ type: string;
2135
+ /** Optional user-provided or loader-extracted metadata */
2136
+ metadata?: Record<string, any>;
2137
+ }
2138
+ /**
2139
+ * Statistics returned by ingestion operations.
2140
+ */
2141
+ interface IngestionStats {
2142
+ documentsProcessed: number;
2143
+ documentsSucceeded: number;
2144
+ documentsFailed: number;
2145
+ chunksCreated: number;
2146
+ chunksUpserted: number;
2147
+ timeMs: number;
2148
+ errors?: Array<{
2149
+ source: string;
2150
+ stage: 'load' | 'chunk' | 'embed' | 'upsert';
2151
+ error: Error;
2152
+ }>;
2153
+ }
2154
+ /**
2155
+ * Configuration for ingestion operations.
2156
+ */
2157
+ interface IngestionConfig {
2158
+ chunkSize?: number;
2159
+ chunkOverlap?: number;
2160
+ chunker?: any;
2161
+ metadata?: Record<string, any>;
2162
+ metadataExtractor?: (doc: Document) => Record<string, any>;
2163
+ batchSize?: number;
2164
+ concurrency?: number;
2165
+ onProgress?: (progress: ProgressInfo) => void;
2166
+ onDocumentLoaded?: (doc: Document) => void;
2167
+ onChunksCreated?: (chunks: any[]) => void;
2168
+ }
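+
+ /**
+ * Illustrative IngestionConfig literal (sample values only), showing the
+ * optional chunking, metadata, and progress-callback fields declared above.
+ *
+ * ```typescript
+ * const config: IngestionConfig = {
+ *   chunkSize: 500,
+ *   chunkOverlap: 50,
+ *   metadata: { project: 'demo' },
+ *   metadataExtractor: (doc) => ({ loadedFrom: doc.source }),
+ *   batchSize: 100,
+ *   onProgress: (p) => {
+ *     console.log(`${p.stage}: ${p.documentsProcessed}/${p.totalDocuments} documents`);
+ *   }
+ * };
+ * ```
+ */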
2169
+ /**
2170
+ * Progress information during ingestion.
2171
+ */
2172
+ interface ProgressInfo {
2173
+ stage: 'loading' | 'chunking' | 'embedding' | 'upserting';
2174
+ documentsProcessed: number;
2175
+ totalDocuments: number;
2176
+ chunksProcessed: number;
2177
+ totalChunks?: number;
2178
+ currentDocument?: string;
2179
+ }
2180
+ /**
2181
+ * Text chunk with position metadata.
2182
+ */
2183
+ interface TextChunk {
2184
+ text: string;
2185
+ index: number;
2186
+ metadata: {
2187
+ source: string;
2188
+ chunkIndex: number;
2189
+ totalChunks: number;
2190
+ startChar: number;
2191
+ endChar: number;
2192
+ };
2193
+ }
2194
+ /**
2195
+ * Configuration for chunking operations.
2196
+ */
2197
+ interface ChunkConfig {
2198
+ chunkSize?: number;
2199
+ chunkOverlap?: number;
2200
+ }
2201
+
2202
+ /**
2203
+ * Abstract interface for document loaders.
2204
+ * Implementations load specific file formats and return standardized Document objects.
2205
+ */
2206
+ interface DocumentLoader {
2207
+ /**
2208
+ * Check if this loader can handle the given file.
2209
+ * @param filePath - Path to the file
2210
+ * @returns true if loader can handle this file type
2211
+ */
2212
+ canHandle(filePath: string): boolean;
2213
+ /**
2214
+ * Load a document from the given file path.
2215
+ * @param filePath - Path to the file to load
2216
+ * @returns Promise resolving to Document
2217
+ */
2218
+ load(filePath: string): Promise<Document>;
2219
+ }
2220
+
2221
+ /**
2222
+ * Registry for document loaders.
2223
+ * Manages loaders and routes each file to the correct loader based on its extension.
2224
+ */
2225
+ declare class LoaderRegistry {
2226
+ private loaders;
2227
+ constructor();
2228
+ /**
2229
+ * Register a custom document loader.
2230
+ * @param loader - Loader to register
2231
+ */
2232
+ register(loader: DocumentLoader): void;
2233
+ /**
2234
+ * Check if any loader can handle this file.
2235
+ * @param filePath - Path to check
2236
+ * @returns true if a loader exists for this file type
2237
+ */
2238
+ canLoad(filePath: string): boolean;
2239
+ /**
2240
+ * Load a document using the appropriate loader.
2241
+ * @param filePath - Path to the file to load
2242
+ * @returns Promise resolving to Document
2243
+ * @throws Error if no loader found for file type
2244
+ */
2245
+ load(filePath: string): Promise<Document>;
2246
+ }
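+
+ /**
+ * Sketch of registering a custom loader (illustrative; JSONLoader is a
+ * made-up example, not a loader shipped by this package). It relies only on
+ * the DocumentLoader interface and the LoaderRegistry methods above.
+ *
+ * ```typescript
+ * import { promises as fs } from 'fs';
+ *
+ * class JSONLoader implements DocumentLoader {
+ *   canHandle(filePath: string): boolean {
+ *     return filePath.toLowerCase().endsWith('.json');
+ *   }
+ *   async load(filePath: string): Promise<Document> {
+ *     const raw = await fs.readFile(filePath, 'utf-8');
+ *     return { text: raw, source: filePath, type: 'json' };
+ *   }
+ * }
+ *
+ * const registry = new LoaderRegistry();
+ * registry.register(new JSONLoader());
+ * registry.canLoad('notes.json');          // true
+ * const doc = await registry.load('notes.json');
+ * ```
+ */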
2247
+
2248
+ /**
2249
+ * Abstract interface for text chunking strategies.
2250
+ * Implementations split text into chunks with different algorithms.
2251
+ */
2252
+ interface TextChunker {
2253
+ /**
2254
+ * Chunk text into smaller pieces.
2255
+ * @param text - Text to chunk
2256
+ * @param config - Optional chunking configuration
2257
+ * @returns Array of text chunks with position metadata
2258
+ */
2259
+ chunk(text: string, config?: ChunkConfig): TextChunk[];
2260
+ }
2261
+ /**
2262
+ * Default chunk size in tokens (approximate).
2263
+ */
2264
+ declare const DEFAULT_CHUNK_SIZE = 500;
2265
+ /**
2266
+ * Default chunk overlap in tokens (approximate).
2267
+ */
2268
+ declare const DEFAULT_CHUNK_OVERLAP = 50;
2269
+ /**
2270
+ * Estimate token count from character count.
2271
+ * Simple heuristic: 1 token ≈ 4 characters for English text.
2272
+ */
2273
+ declare function estimateTokens(text: string): number;
2274
+ /**
2275
+ * Estimate character count from token count.
2276
+ */
2277
+ declare function estimateChars(tokens: number): number;
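+
+ /**
+ * Illustration of the 1 token ≈ 4 characters heuristic stated above
+ * (approximate values; the exact rounding used internally is not specified
+ * in these declarations).
+ *
+ * ```typescript
+ * const text = 'Glyph separates vertical, horizontal, and structural metadata.';
+ * estimateTokens(text);               // roughly text.length / 4, i.e. about 15-16 tokens
+ * estimateChars(DEFAULT_CHUNK_SIZE);  // roughly 500 * 4 = 2000 characters
+ * ```
+ */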
2278
+
2279
+ /**
2280
+ * Main ingestion pipeline orchestrator.
2281
+ * Coordinates loading, chunking, embedding, and upserting documents.
2282
+ */
2283
+ declare class IngestionPipeline {
2284
+ private adapter;
2285
+ private embedder;
2286
+ private loaderRegistry;
2287
+ private defaultChunker;
2288
+ constructor(adapter: VectorDBAdapter, embedder: Embedder, loaderRegistry: LoaderRegistry, chunker?: TextChunker);
2289
+ /**
2290
+ * Ingest documents into a vector database collection.
2291
+ * @param sources - A single file path or an array of file paths
2292
+ * @param collection - Target collection name
2293
+ * @param config - Optional ingestion configuration
2294
+ * @returns Statistics about the ingestion operation
2295
+ */
2296
+ ingest(sources: string | string[], collection: string, config?: IngestionConfig): Promise<IngestionStats>;
2297
+ private ingestFile;
2298
+ private buildMetadata;
2299
+ }
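+
+ /**
+ * Usage sketch for the ingestion pipeline (illustrative; `adapter` and
+ * `embedder` are assumed to be configured instances, and the file paths are
+ * sample values).
+ *
+ * ```typescript
+ * const pipeline = new IngestionPipeline(adapter, embedder, new LoaderRegistry());
+ * const stats = await pipeline.ingest(
+ *   ['docs/handbook.pdf', 'docs/faq.md'],
+ *   'my-docs',
+ *   { chunkSize: 500, chunkOverlap: 50 }
+ * );
+ * console.log(`${stats.chunksUpserted} chunks upserted in ${stats.timeMs} ms`);
+ * ```
+ */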
2300
+
2301
+ /**
2302
+ * Loader for plain text files (.txt, .md).
2303
+ * No external dependencies, uses Node.js built-in fs.
2304
+ */
2305
+ declare class TextLoader implements DocumentLoader {
2306
+ canHandle(filePath: string): boolean;
2307
+ load(filePath: string): Promise<Document>;
2308
+ }
2309
+
2310
+ /**
2311
+ * Loader for PDF files using pdf-parse library.
2312
+ * Extracts text from all pages and includes PDF metadata.
2313
+ */
2314
+ declare class PDFLoader implements DocumentLoader {
2315
+ canHandle(filePath: string): boolean;
2316
+ load(filePath: string): Promise<Document>;
2317
+ }
2318
+
2319
+ /**
2320
+ * Loader for DOCX files using mammoth library.
2321
+ * Converts DOCX to plain text, preserves paragraph structure.
2322
+ */
2323
+ declare class DOCXLoader implements DocumentLoader {
2324
+ canHandle(filePath: string): boolean;
2325
+ load(filePath: string): Promise<Document>;
2326
+ }
2327
+
2328
+ /**
2329
+ * Loader for HTML files using cheerio library.
2330
+ * Strips tags, extracts visible text, removes scripts/styles.
2331
+ */
2332
+ declare class HTMLLoader implements DocumentLoader {
2333
+ canHandle(filePath: string): boolean;
2334
+ load(filePath: string): Promise<Document>;
2335
+ }
2336
+
2337
+ /**
2338
+ * Recursive text chunker that applies a hierarchy of separators:
2339
+ * it splits by paragraphs first, then sentences, then words, then characters.
2340
+ */
2341
+ declare class RecursiveChunker implements TextChunker {
2342
+ private readonly separators;
2343
+ chunk(text: string, config?: ChunkConfig): TextChunk[];
2344
+ private recursiveSplit;
2345
+ private addOverlap;
2346
+ }
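+
+ /**
+ * Usage sketch for the recursive chunker (illustrative; `longDocumentText`
+ * is assumed to be a string already in scope, and the config values are
+ * arbitrary).
+ *
+ * ```typescript
+ * const chunker = new RecursiveChunker();
+ * const chunks = chunker.chunk(longDocumentText, { chunkSize: 300, chunkOverlap: 30 });
+ * for (const c of chunks) {
+ *   // each chunk carries position metadata: chunkIndex, startChar, endChar, totalChunks
+ *   console.log(c.index, c.metadata.startChar, c.metadata.endChar);
+ * }
+ * ```
+ */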
2347
+
2348
+ /**
2349
+ * Fixed-size text chunker that splits at exact character boundaries.
2350
+ * Fast and predictable, but may split mid-sentence or mid-word.
2351
+ */
2352
+ declare class FixedChunker implements TextChunker {
2353
+ chunk(text: string, config?: ChunkConfig): TextChunk[];
2354
+ }
2355
+
2356
+ /**
2357
+ * Sentence-aware chunker that splits on sentence boundaries.
2358
+ * Uses a simple regex-based sentence splitter for portability.
2359
+ */
2360
+ declare class SentenceChunker implements TextChunker {
2361
+ chunk(text: string, config?: ChunkConfig): TextChunk[];
2362
+ private splitSentences;
2363
+ private addSentenceOverlap;
2364
+ }
2365
+
2366
+ /**
2367
+ * Configuration for RAGClient.
2368
+ */
2369
+ interface RAGClientConfig {
2370
+ /** Vector database adapter */
2371
+ adapter: VectorDBAdapter;
2372
+ /** Embedding model */
2373
+ embedder: Embedder;
2374
+ /** Optional LLM client (required for query()) */
2375
+ llm?: LLMClient;
2376
+ /** Default collection name */
2377
+ defaultCollection?: string;
2378
+ /** Default number of results to return (default: 10) */
2379
+ defaultTopK?: number;
2380
+ }
2381
+ /**
2382
+ * Options for retrieval operations.
2383
+ */
2384
+ interface RetrieveOptions {
2385
+ /** Override defaultCollection */
2386
+ collection?: string;
2387
+ /** Override defaultTopK */
2388
+ topK?: number;
2389
+ /** Custom filter */
2390
+ filter?: UniversalFilter;
2391
+ /** Shorthand for vertical filter on __v_partition */
2392
+ partition?: string;
2393
+ /** Shorthand for horizontal filter on __h_theme */
2394
+ theme?: string;
2395
+ /** Group results by document or theme */
2396
+ groupBy?: 'document' | 'theme';
2397
+ }
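+
+ /**
+ * Sketch of the partition/theme shorthands and groupBy option declared
+ * above, as they might be passed to RAGClient.retrieve() further below
+ * (`client` is assumed to be an already-configured RAGClient; the filter
+ * values are sample data).
+ *
+ * ```typescript
+ * const result = await client.retrieve('pricing terms', {
+ *   partition: 'contracts-2024',  // shorthand for a __v_partition filter
+ *   theme: 'pricing',             // shorthand for a __h_theme filter
+ *   topK: 5,
+ *   groupBy: 'document'
+ * });
+ * ```
+ */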
2398
+ /**
2399
+ * Options for full RAG query operations.
2400
+ */
2401
+ interface QueryOptions extends RetrieveOptions {
2402
+ /** Override default RAG system prompt */
2403
+ systemPrompt?: string;
2404
+ /** LLM temperature */
2405
+ temperature?: number;
2406
+ /** LLM max tokens */
2407
+ maxTokens?: number;
2408
+ }
2409
+ /**
2410
+ * Response from a full RAG query.
2411
+ */
2412
+ interface RAGResponse {
2413
+ /** LLM-generated answer */
2414
+ answer: string;
2415
+ /** Retrieved context chunks used to generate the answer */
2416
+ sources: VectorRecord[];
2417
+ /** Original question */
2418
+ query: string;
2419
+ /** Full retrieval details */
2420
+ retrievalResult: RetrievalResult;
2421
+ }
2422
+
2423
+ /**
2424
+ * RAGClient - Unified facade for all Glyph VectorORM operations.
2425
+ *
2426
+ * Ties together adapter, embedder, LLM, ingestion, enrichment, and query
2427
+ * into a single developer-facing API.
2428
+ *
2429
+ * @example
2430
+ * ```typescript
2431
+ * const client = new RAGClient({
2432
+ * adapter: new ChromaAdapter(),
2433
+ * embedder: new OpenAIEmbedder(),
2434
+ * llm: new OpenAIClient(),
2435
+ * defaultCollection: 'my-docs'
2436
+ * });
2437
+ *
2438
+ * // Ingest documents
2439
+ * await client.ingest(['docs/*.pdf']);
2440
+ *
2441
+ * // Retrieve
2442
+ * const result = await client.retrieve('pricing info');
2443
+ *
2444
+ * // Full RAG query
2445
+ * const response = await client.query('What are the pricing terms?');
2446
+ * console.log(response.answer);
2447
+ * ```
2448
+ */
2449
+ declare class RAGClient {
2450
+ private readonly adapter;
2451
+ private readonly embedder;
2452
+ private readonly llm?;
2453
+ private readonly defaultCollection?;
2454
+ private readonly defaultTopK;
2455
+ private readonly queryComposer;
2456
+ private readonly ingestionPipeline;
2457
+ private readonly enrichmentPipeline;
2458
+ constructor(config: RAGClientConfig);
2459
+ /**
2460
+ * Create a new vector collection.
2461
+ * Dimension defaults to embedder.dimensions if not specified.
2462
+ */
2463
+ createCollection(name: string, dimension?: number, metric?: DistanceMetric): Promise<void>;
2464
+ /**
2465
+ * Delete a collection.
2466
+ */
2467
+ deleteCollection(name: string): Promise<void>;
2468
+ /**
2469
+ * Check if a collection exists.
2470
+ */
2471
+ collectionExists(name: string): Promise<boolean>;
2472
+ /**
2473
+ * Ingest documents into a collection.
2474
+ * Collection defaults to defaultCollection if not specified.
2475
+ */
2476
+ ingest(sources: string | string[], collection?: string, config?: IngestionConfig): Promise<IngestionStats>;
2477
+ /**
2478
+ * Retrieve relevant chunks for a query.
2479
+ * Supports filter shorthands (partition, theme) and groupBy.
2480
+ */
2481
+ retrieve(query: string, options?: RetrieveOptions): Promise<RetrievalResult>;
2482
+ /**
2483
+ * Enrich a collection with vertical, theme, and/or section metadata.
2484
+ */
2485
+ enrich(collection: string, config: EnrichAllConfig): Promise<EnrichmentStats>;
2486
+ /**
2487
+ * Full RAG: retrieve relevant context and generate an answer using LLM.
2488
+ * Requires an LLM client to be provided in the constructor config.
2489
+ */
2490
+ query(question: string, options?: QueryOptions): Promise<RAGResponse>;
2491
+ }
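+
+ /**
+ * Additional query() sketch: overriding the system prompt and reading the
+ * fields declared on RAGResponse (illustrative values; `client` is assumed
+ * to have been constructed with an LLM, as in the example above).
+ *
+ * ```typescript
+ * const response = await client.query('What are the pricing terms?', {
+ *   theme: 'pricing',
+ *   systemPrompt: 'Answer strictly from the provided context.',
+ *   temperature: 0.2,
+ *   maxTokens: 512
+ * });
+ * console.log(response.answer);
+ * for (const source of response.sources) {
+ *   console.log(source.id, source.score);
+ * }
+ * ```
+ */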
2492
+
2493
+ export { type AndFilter, type AutomaticExtractionConfig, type ChunkConfig, type CollectionStats, DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, DOCXLoader, type DistanceMetric, type Document, type DocumentLoader, Embedder, EmbeddingThemeClassifier, type EnrichAllConfig, EnrichmentPipeline, type EnrichmentStats, type ExtractorConfig, type FieldMappingConfig, FilterBuilder, type FilterCondition, type FilterOperator, FilterTranslator, FixedChunker, type GenerateOptions, type GroupedResults, HTMLLoader, type HorizontalFieldKey, HorizontalFields, type IngestionConfig, IngestionPipeline, type IngestionStats, KeywordThemeClassifier, LLMClient, LLMThemeClassifier, LoaderRegistry, METADATA_PREFIXES, MetadataBuilder, type MetadataUpdate, MockLLM, type OrFilter, PDFLoader, type ProgressCallback, type ProgressInfo, type QueryOptions, RAGClient, type RAGClientConfig, RAGQueryComposer, type RAGResponse, RecursiveChunker, type RetrievalParams, type RetrievalResult, type RetrieveOptions, type SearchOptions, type SearchResult, type SectionEnrichmentConfig, SentenceChunker, type ShorthandFilter, type StructuralFieldKey, StructuralFields, type TextChunk, type TextChunker, TextLoader, type ThemeClassification, type ThemeClassificationResult, type ThemeClassifier, type ThemeEnrichmentConfig, type UniversalFilter, VectorDBAdapter, type VectorRecord, type VerticalEnrichmentConfig, type VerticalFieldKey, VerticalFields, ZeroShotThemeClassifier, estimateChars, estimateTokens };