@vectororm/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -0
- package/dist/index.d.mts +2493 -0
- package/dist/index.d.ts +2493 -0
- package/dist/index.js +2508 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +2441 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +65 -0
package/dist/index.d.ts
ADDED
@@ -0,0 +1,2493 @@
/**
 * Represents a vector record in the database.
 *
 * This is the fundamental unit of storage in Glyph, containing:
 * - Unique identifier
 * - Embedding vector
 * - Metadata (including V/H/S fields)
 * - Optional text and score
 */
interface VectorRecord {
    /** Unique identifier for this record */
    id: string;
    /** Embedding vector (dimensionality depends on embedding model) */
    embedding: number[];
    /**
     * Metadata fields including:
     * - Vertical fields (__v_*): Document-level metadata
     * - Horizontal fields (__h_*): Theme/section metadata
     * - Structural fields (__s_*): Position/hierarchy metadata
     * - Custom user fields
     */
    metadata: Record<string, any>;
    /** Optional text content of this chunk */
    text?: string;
    /** Optional similarity score (populated during search) */
    score?: number;
}

/**
 * Result from a vector search operation.
 */
interface SearchResult {
    /** Matching vector records */
    records: VectorRecord[];
    /** Total count of matches (if available from DB) */
    totalCount?: number;
    /** Cursor for pagination (if supported by DB) */
    nextCursor?: string;
}

/**
 * Metadata field prefixes for the three axes of Glyph's schema.
 *
 * These prefixes separate framework fields from user-defined metadata:
 * - __v_: Vertical axis (document identity)
 * - __h_: Horizontal axis (content/theme identity)
 * - __s_: Structural axis (position/hierarchy)
 */
declare const METADATA_PREFIXES: {
    readonly VERTICAL: "__v_";
    readonly HORIZONTAL: "__h_";
    readonly STRUCTURAL: "__s_";
};
/**
 * Vertical axis fields - identify WHICH document a chunk belongs to.
 */
declare const VerticalFields: {
    /** Unique document identifier */
    readonly DOC_ID: "__v_doc_id";
    /** Original source path/URL */
    readonly SOURCE: "__v_source";
    /** Logical partition key (for filtering by document subsets) */
    readonly PARTITION: "__v_partition";
    /** Document type classification */
    readonly DOC_TYPE: "__v_doc_type";
    /** Arbitrary vertical tags */
    readonly TAGS: "__v_tags";
};
/**
 * Horizontal axis fields - identify WHAT topic/theme a chunk covers.
 */
declare const HorizontalFields: {
    /** Primary theme classification */
    readonly THEME: "__h_theme";
    /** Multiple themes (if applicable) */
    readonly THEMES: "__h_themes";
    /** Classification confidence score */
    readonly THEME_CONFIDENCE: "__h_theme_confidence";
    /** Hierarchical section path (e.g., "Chapter 3/Pricing/Rates") */
    readonly SECTION_PATH: "__h_section_path";
    /** Depth level in hierarchy (0 = root) */
    readonly SECTION_LEVEL: "__h_section_level";
    /** Section header text */
    readonly SECTION_TITLE: "__h_section_title";
};
/**
 * Structural axis fields - track chunk position and relationships.
 */
declare const StructuralFields: {
    /** Position in document (0-indexed) */
    readonly CHUNK_INDEX: "__s_chunk_index";
    /** Parent chunk ID (for hierarchical chunking) */
    readonly PARENT_ID: "__s_parent_id";
    /** Whether this chunk has children */
    readonly HAS_CHILDREN: "__s_has_children";
    /** Total chunks in this document */
    readonly TOTAL_CHUNKS: "__s_total_chunks";
};
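
/*
 * Illustrative sketch (not part of the published declarations): using the
 * V/H/S field constants instead of raw string literals when building a
 * filter. It assumes these names are exported from the package entry point;
 * the concrete values ('contract-123', 'pricing') are made-up placeholders.
 *
 * ```typescript
 * import { VerticalFields, HorizontalFields, type UniversalFilter } from '@vectororm/core';
 *
 * // Match chunks from one document that were classified under one theme.
 * const filter: UniversalFilter = {
 *     and: [
 *         { field: VerticalFields.DOC_ID, op: 'eq', value: 'contract-123' },
 *         { field: HorizontalFields.THEME, op: 'eq', value: 'pricing' },
 *     ],
 * };
 * ```
 */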

/**
 * Type-safe metadata field names.
 *
 * Use these instead of string literals to get autocomplete and catch typos.
 */
/** Type for vertical field keys */
type VerticalFieldKey = 'docId' | 'source' | 'partition' | 'docType' | 'tags';
/** Type for horizontal field keys */
type HorizontalFieldKey = 'theme' | 'themes' | 'themeConfidence' | 'sectionPath' | 'sectionLevel' | 'sectionTitle';
/** Type for structural field keys */
type StructuralFieldKey = 'chunkIndex' | 'parentId' | 'hasChildren' | 'totalChunks';

/**
 * MetadataBuilder provides a fluent API for constructing metadata objects
 * with proper V/H/S prefixes and type safety.
 *
 * Example:
 * ```typescript
 * const metadata = new MetadataBuilder()
 *     .vertical({ doc_id: 'doc123', source: 'file.pdf' })
 *     .horizontal({ theme: 'pricing' })
 *     .structural({ chunk_index: 0, total_chunks: 10 })
 *     .custom({ author: 'John Doe' })
 *     .build();
 * ```
 *
 * Features:
 * - Fluent chaining API
 * - Automatic prefix application
 * - Skips undefined values
 * - Returns immutable copy on build()
 */
declare class MetadataBuilder {
    private metadata;
    /**
     * Add vertical axis metadata (document identity).
     * Automatically prefixes fields with '__v_'.
     *
     * @param fields - Vertical metadata fields (doc_id, source, partition, etc.)
     * @returns This builder for chaining
     */
    vertical(fields: Record<string, any>): this;
    /**
     * Add horizontal axis metadata (theme/section identity).
     * Automatically prefixes fields with '__h_'.
     *
     * @param fields - Horizontal metadata fields (theme, section_path, etc.)
     * @returns This builder for chaining
     */
    horizontal(fields: Record<string, any>): this;
    /**
     * Add structural axis metadata (position/hierarchy).
     * Automatically prefixes fields with '__s_'.
     *
     * @param fields - Structural metadata fields (chunk_index, parent_id, etc.)
     * @returns This builder for chaining
     */
    structural(fields: Record<string, any>): this;
    /**
     * Add custom user-defined metadata.
     * Fields are added as-is without any prefix.
     *
     * @param fields - Custom metadata fields
     * @returns This builder for chaining
     */
    custom(fields: Record<string, any>): this;
    /**
     * Build and return the complete metadata object.
     * Returns a copy to prevent external modification.
     *
     * @returns Immutable copy of the metadata object
     */
    build(): Record<string, any>;
}
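
/*
 * Illustrative sketch (not part of the published declarations): attaching
 * MetadataBuilder output to a VectorRecord before upserting. It only relies
 * on the builder behavior documented above; the IDs, text, `embedder`, and
 * `adapter` variables are placeholders for any Embedder / VectorDBAdapter
 * implementation.
 *
 * ```typescript
 * const text = 'Pricing is tiered by usage.';
 * const record: VectorRecord = {
 *     id: 'doc123-chunk-0',
 *     embedding: await embedder.embed(text),
 *     text,
 *     metadata: new MetadataBuilder()
 *         .vertical({ doc_id: 'doc123', source: 'file.pdf' })
 *         .horizontal({ theme: 'pricing' })
 *         .structural({ chunk_index: 0, total_chunks: 10 })
 *         .build(),
 * };
 * await adapter.upsert('documents', [record]);
 * ```
 */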

/**
 * Universal filter language for database-agnostic queries.
 *
 * Filters are expressed in a standard format, then translated
 * to native database syntax by each adapter.
 */
/**
 * Supported filter operators.
 */
type FilterOperator = 'eq' | 'neq' | 'in' | 'nin' | 'gt' | 'gte' | 'lt' | 'lte' | 'contains' | 'exists';
/**
 * Basic filter condition.
 */
interface FilterCondition {
    field: string;
    op: FilterOperator;
    value: any;
}
/**
 * Compound AND filter (all conditions must match).
 */
interface AndFilter {
    and: UniversalFilter[];
}
/**
 * Compound OR filter (at least one condition must match).
 */
interface OrFilter {
    or: UniversalFilter[];
}
/**
 * Universal filter - can be a simple condition or compound.
 */
type UniversalFilter = FilterCondition | AndFilter | OrFilter;
/**
 * Shorthand filter format (user-friendly).
 *
 * Examples:
 * - {region: "ny"} → {field: "region", op: "eq", value: "ny"}
 * - {year__gte: 2023} → {field: "year", op: "gte", value: 2023}
 * - {region: "ny", year__gte: 2023} → {and: [...]}
 */
type ShorthandFilter = Record<string, any>;
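
/*
 * Illustrative sketch (not part of the published declarations): the same
 * constraint written in shorthand and in the standard format it expands to,
 * following the field__op convention documented above.
 *
 * ```typescript
 * const shorthand: ShorthandFilter = { region: 'ny', year__gte: 2023 };
 *
 * const standard: UniversalFilter = {
 *     and: [
 *         { field: 'region', op: 'eq', value: 'ny' },
 *         { field: 'year', op: 'gte', value: 2023 },
 *     ],
 * };
 * ```
 */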

/**
 * Translates filters between formats and validates structure.
 */
declare class FilterTranslator {
    /**
     * Normalize any filter input to standard UniversalFilter format.
     *
     * Handles:
     * - Standard format (pass through)
     * - Shorthand format (convert to standard)
     * - Operator suffixes (field__op syntax)
     */
    static normalize(input: ShorthandFilter | UniversalFilter): UniversalFilter;
    /**
     * Validate filter structure and operators.
     *
     * Throws error if filter is invalid.
     */
    static validate(filter: UniversalFilter): void;
    /**
     * Check if filter is compound (AND/OR).
     */
    static isCompound(filter: UniversalFilter): boolean;
    /**
     * Check if input is already in standard format.
     */
    private static isStandardFormat;
    /**
     * Convert shorthand format to standard.
     */
    private static fromShorthand;
}
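
/*
 * Illustrative sketch (not part of the published declarations): normalizing
 * and validating a shorthand filter before handing it to an adapter. It
 * assumes FilterTranslator is exported from the package entry point and
 * behaves as documented above.
 *
 * ```typescript
 * const normalized = FilterTranslator.normalize({ region: 'ny', year__gte: 2023 });
 * FilterTranslator.validate(normalized);                 // throws if malformed
 * const compound = FilterTranslator.isCompound(normalized); // true: two conditions were AND-ed
 * ```
 */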

/**
 * Types for vector database adapters.
 *
 * These types define the common interface elements that all
 * adapters must support or return.
 */
/**
 * Statistics about a vector collection.
 */
interface CollectionStats {
    /** Total number of vectors in the collection */
    vectorCount: number;
    /** Dimension of vectors in this collection */
    dimension: number;
    /** Distance metric used (cosine, euclidean, etc.) */
    metric: DistanceMetric;
    /** Additional DB-specific stats (optional) */
    [key: string]: any;
}
/**
 * Metadata update operation.
 *
 * Used for efficient metadata enrichment without re-uploading vectors.
 */
interface MetadataUpdate {
    /** ID of the record to update */
    id: string;
    /** Metadata fields to set/update */
    metadata: Record<string, any>;
}
/**
 * Distance metric for vector similarity.
 */
type DistanceMetric = 'cosine' | 'euclidean' | 'dotProduct';

/**
 * Abstract base class for all vector database adapters.
 *
 * This is the KEY abstraction that enables database-agnostic operations.
 * Each database (Pinecone, Chroma, Qdrant, etc.) implements this interface,
 * allowing the SDK to work with any vector database.
 *
 * Design principles:
 * 1. All methods are abstract (must be implemented by subclasses)
 * 2. Capability flags have default implementations (can be overridden)
 * 3. Universal filter translation is adapter-specific
 * 4. Async iteration enables efficient enrichment pipelines
 *
 * @abstract
 */
declare abstract class VectorDBAdapter {
    /**
     * Connect to the vector database.
     *
     * Initialize client, authenticate, verify connection.
     */
    abstract connect(): Promise<void>;
    /**
     * Disconnect from the vector database.
     *
     * Clean up resources, close connections.
     */
    abstract disconnect(): Promise<void>;
    /**
     * Check if currently connected to the database.
     */
    abstract isConnected(): Promise<boolean>;
    /**
     * Create a new vector collection.
     *
     * @param name - Collection name
     * @param dimension - Vector dimension
     * @param metric - Distance metric (default: cosine)
     */
    abstract createCollection(name: string, dimension: number, metric?: DistanceMetric): Promise<void>;
    /**
     * Delete a collection and all its vectors.
     *
     * @param name - Collection name
     */
    abstract deleteCollection(name: string): Promise<void>;
    /**
     * Check if a collection exists.
     *
     * @param name - Collection name
     */
    abstract collectionExists(name: string): Promise<boolean>;
    /**
     * Get statistics about a collection.
     *
     * @param name - Collection name
     */
    abstract getCollectionStats(name: string): Promise<CollectionStats>;
    /**
     * Upsert (insert or update) vector records.
     *
     * This is the primary method for adding vectors to the database.
     * If a record with the same ID exists, it is updated.
     *
     * @param collection - Collection name
     * @param records - Vector records to upsert
     */
    abstract upsert(collection: string, records: VectorRecord[]): Promise<void>;
    /**
     * Fetch vector records by ID.
     *
     * @param collection - Collection name
     * @param ids - Record IDs to fetch
     * @returns Array of matching records (may be empty)
     */
    abstract fetch(collection: string, ids: string[]): Promise<VectorRecord[]>;
    /**
     * Delete vector records by ID.
     *
     * @param collection - Collection name
     * @param ids - Record IDs to delete
     */
    abstract delete(collection: string, ids: string[]): Promise<void>;
    /**
     * Update metadata for existing records without re-uploading vectors.
     *
     * This is CRITICAL for enrichment pipelines where we need to:
     * 1. Insert initial vectors with basic metadata
     * 2. Later enrich with vertical/horizontal metadata
     * 3. Avoid re-uploading large embedding vectors
     *
     * @param collection - Collection name
     * @param updates - Metadata updates to apply
     */
    abstract updateMetadata(collection: string, updates: MetadataUpdate[]): Promise<void>;
    /**
     * Search for similar vectors.
     *
     * @param collection - Collection name
     * @param queryVector - Query vector to search with
     * @param options - Search options
     * @returns Search results
     */
    abstract search(collection: string, queryVector: number[], options?: {
        topK?: number;
        filter?: UniversalFilter;
        includeMetadata?: boolean;
        includeValues?: boolean;
    }): Promise<SearchResult>;
    /**
     * Translate universal filter to database-specific filter format.
     *
     * This is the KEY method that enables database-agnostic filtering.
     * Each adapter translates the universal filter to its native format:
     *
     * - Pinecone: {field: {$eq: value}}
     * - Qdrant: {must: [{key: field, match: {value}}]}
     * - Chroma: {field: value}
     *
     * @param filter - Universal filter
     * @returns Database-specific filter object
     */
    abstract translateFilter(filter: UniversalFilter): any;
    /**
     * Iterate over all vectors in a collection in batches.
     *
     * This enables efficient enrichment pipelines:
     * 1. Fetch vectors in batches
     * 2. Enrich each batch with metadata
     * 3. Update metadata back to DB
     *
     * @param collection - Collection name
     * @param options - Iteration options
     * @yields Batches of vector records
     */
    abstract iterate(collection: string, options?: {
        batchSize?: number;
        filter?: UniversalFilter;
    }): AsyncIterableIterator<VectorRecord[]>;
    /**
     * Whether this adapter supports metadata updates without re-uploading vectors.
     *
     * Default: false (must re-upload entire record)
     * Override to return true if your DB supports partial updates.
     */
    supportsMetadataUpdate(): boolean;
    /**
     * Whether this adapter supports filtering during search.
     *
     * Default: false (no filtering support)
     * Override to return true if your DB supports metadata filtering.
     */
    supportsFiltering(): boolean;
    /**
     * Whether this adapter supports batch operations efficiently.
     *
     * Default: false (single operations only)
     * Override to return true if your DB supports batch upsert/delete.
     */
    supportsBatchOperations(): boolean;
}
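
/*
 * Illustrative sketch (not part of the published declarations): the typical
 * lifecycle against any VectorDBAdapter implementation. `MyAdapter`,
 * `records`, and `queryVector` are hypothetical placeholders; the collection
 * name and dimension are made-up values.
 *
 * ```typescript
 * const adapter: VectorDBAdapter = new MyAdapter(); // hypothetical subclass
 * await adapter.connect();
 * if (!(await adapter.collectionExists('documents'))) {
 *     await adapter.createCollection('documents', 1536, 'cosine');
 * }
 * await adapter.upsert('documents', records);
 * const result = await adapter.search('documents', queryVector, {
 *     topK: 5,
 *     filter: { field: '__v_doc_id', op: 'eq', value: 'doc123' },
 *     includeMetadata: true,
 * });
 * await adapter.disconnect();
 * ```
 */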

/**
 * Query Composition Layer - Retrieval Types and Interfaces
 *
 * Defines the core interfaces for retrieval operations in Glyph.
 * These types abstract query parameters and results across different
 * vector database adapters.
 */

/**
 * Parameters for a retrieval operation.
 *
 * Combines query text, collection targeting, and optional filters
 * for both vertical (document-level) and horizontal (theme-level) filtering.
 */
interface RetrievalParams {
    /** The search query text to embed and search for */
    query: string;
    /** Target collection to search in */
    collection: string;
    /** Number of results to return */
    topK: number;
    /** Optional document-level filters (e.g., filter by doc_id, region, year) */
    verticalFilters?: UniversalFilter;
    /** Optional theme/section-level filters (e.g., filter by theme, section) */
    horizontalFilters?: UniversalFilter;
    /** Optional additional user-defined filters */
    customFilters?: UniversalFilter;
    /** Whether to include embedding vectors in results (default: false) */
    includeEmbeddings?: boolean;
}
/**
 * Result of a retrieval operation.
 *
 * Contains the retrieved records, original query, and information
 * about which filters were applied.
 */
interface RetrievalResult {
    /** The retrieved vector records */
    records: VectorRecord[];
    /** The original query text */
    query: string;
    /** Information about which filters were applied */
    filtersApplied: {
        vertical?: UniversalFilter;
        horizontal?: UniversalFilter;
        custom?: UniversalFilter;
    };
}
/**
 * Options for a search operation at the adapter level.
 *
 * These are lower-level options used by adapters to perform
 * the actual vector search.
 */
interface SearchOptions {
    /** Number of results to return */
    topK: number;
    /**
     * Optional universal filter for the search.
     * This is NOT yet translated - adapters will translate it to their native format.
     * See VectorDBAdapter.translateFilter() for translation logic.
     */
    filter?: UniversalFilter;
    /** Whether to include embedding vectors in results */
    includeEmbeddings?: boolean;
}
/**
 * Results grouped by different dimensions.
 *
 * Used for organizing search results by vertical (document)
 * or horizontal (theme) dimensions.
 *
 * **How Map keys are determined:**
 * - Vertical: Keys are extracted from the `__v_doc_id` field in record metadata
 * - Horizontal: Keys are extracted from the `__h_theme` field in record metadata
 *
 * **Handling missing metadata:**
 * - If a record is missing `__v_doc_id`, it will NOT appear in the vertical Map
 * - If a record is missing `__h_theme`, it will NOT appear in the horizontal Map
 * - Records can be excluded from both Maps if they lack the required metadata fields
 *
 * **Grouping behavior:**
 * - Each record appears in AT MOST ONE group per dimension (based on its metadata value)
 * - A record with `__v_doc_id: "doc1"` will appear in `vertical.get("doc1")`
 * - A record with `__h_theme: "legal"` will appear in `horizontal.get("legal")`
 * - Records cannot appear in multiple groups within the same dimension
 */
interface GroupedResults {
    /** Records grouped by document ID (__v_doc_id) */
    vertical: Map<string, VectorRecord[]>;
    /** Records grouped by theme (__h_theme) */
    horizontal: Map<string, VectorRecord[]>;
}
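
/*
 * Illustrative sketch (not part of the published declarations): consuming a
 * GroupedResults value. `groupRecords` is a hypothetical helper that builds
 * the Maps from retrieved records using the metadata keys documented above.
 *
 * ```typescript
 * const grouped: GroupedResults = groupRecords(result.records); // hypothetical helper
 *
 * for (const [docId, chunks] of grouped.vertical) {
 *     console.log(`Document ${docId}: ${chunks.length} matching chunk(s)`);
 * }
 * const legalChunks = grouped.horizontal.get('legal') ?? [];
 * ```
 */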

/**
 * FilterBuilder - Utility for combining multiple filters with fluent API.
 *
 * Provides a convenient way to combine vertical, horizontal, and custom filters
 * into a single UniversalFilter with AND logic.
 *
 * @example
 * ```typescript
 * const filter = new FilterBuilder()
 *     .withVerticalFilter({ field: 'doc_id', op: 'eq', value: 'doc123' })
 *     .withHorizontalFilter({ field: 'theme', op: 'eq', value: 'legal' })
 *     .build();
 * ```
 */
declare class FilterBuilder {
    private verticalFilter?;
    private horizontalFilter?;
    private customFilter?;
    /**
     * Add a vertical (document-level) filter.
     *
     * @param filter - The vertical filter to add (standard or shorthand format)
     * @returns This builder for method chaining
     */
    withVerticalFilter(filter: UniversalFilter | Record<string, any>): this;
    /**
     * Add a horizontal (theme-level) filter.
     *
     * @param filter - The horizontal filter to add (standard or shorthand format)
     * @returns This builder for method chaining
     */
    withHorizontalFilter(filter: UniversalFilter | Record<string, any>): this;
    /**
     * Add a custom user-defined filter.
     *
     * @param filter - The custom filter to add (standard or shorthand format)
     * @returns This builder for method chaining
     */
    withCustomFilter(filter: UniversalFilter | Record<string, any>): this;
    /**
     * Build the combined filter.
     *
     * Combination logic:
     * - If no filters: returns undefined
     * - If single filter: returns it directly
     * - If multiple filters: combines with AND logic
     *
     * @returns The combined filter, or undefined if no filters were added
     */
    build(): UniversalFilter | undefined;
}
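
/*
 * Illustrative sketch (not part of the published declarations): the builder
 * also accepts shorthand objects, and combining two filters should yield an
 * AND compound per the combination logic documented above. The expected
 * output shape is an assumption based on the shorthand rules earlier in this
 * file, not a guaranteed literal result.
 *
 * ```typescript
 * const combined = new FilterBuilder()
 *     .withVerticalFilter({ doc_id: 'doc123' })   // shorthand
 *     .withHorizontalFilter({ theme: 'legal' })   // shorthand
 *     .build();
 * // Expected shape:
 * // { and: [ { field: 'doc_id', op: 'eq', value: 'doc123' },
 * //          { field: 'theme', op: 'eq', value: 'legal' } ] }
 * ```
 */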

/**
 * Abstract base class for text embedding models.
 *
 * This abstraction allows the VectorORM to work with any embedding provider
 * (OpenAI, Cohere, HuggingFace, etc.) by implementing a consistent interface.
 *
 * Implementations must provide:
 * - `embed()`: Convert a single text string into a vector embedding
 * - `embedBatch()`: Convert multiple texts into embeddings efficiently
 * - `dimensions`: The size of the embedding vectors produced
 * - `modelName`: Identifier for the embedding model being used
 *
 * @example
 * ```typescript
 * class OpenAIEmbedder extends Embedder {
 *     get dimensions(): number { return 1536; }
 *     get modelName(): string { return 'text-embedding-ada-002'; }
 *
 *     async embed(text: string): Promise<number[]> {
 *         // Call OpenAI API
 *     }
 *
 *     async embedBatch(texts: string[]): Promise<number[][]> {
 *         // Batch call to OpenAI API
 *     }
 * }
 * ```
 */
declare abstract class Embedder {
    /**
     * The dimensionality of embeddings produced by this model.
     * Must be consistent across all embeddings from the same model.
     */
    abstract get dimensions(): number;
    /**
     * Identifier for the embedding model.
     * Used for tracking which model generated embeddings.
     */
    abstract get modelName(): string;
    /**
     * Embed a single text string into a vector.
     *
     * @param text - The text to embed
     * @returns A promise that resolves to a number array representing the embedding
     */
    abstract embed(text: string): Promise<number[]>;
    /**
     * Embed multiple texts into vectors efficiently.
     * Implementations should maintain the order of input texts in the output.
     *
     * @param texts - Array of texts to embed
     * @returns A promise that resolves to an array of embeddings, one per input text
     */
    abstract embedBatch(texts: string[]): Promise<number[][]>;
    /**
     * Constructor is protected to prevent direct instantiation of abstract class.
     * Subclasses can call super() in their constructors.
     */
    protected constructor();
}
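
/*
 * Illustrative sketch (not part of the published declarations): a minimal
 * deterministic Embedder for tests, hashing characters into a small
 * fixed-size vector. Not suitable for real similarity search; it only
 * satisfies the abstract contract above.
 *
 * ```typescript
 * class HashEmbedder extends Embedder {
 *     get dimensions(): number { return 8; }
 *     get modelName(): string { return 'hash-embedder-test'; }
 *
 *     async embed(text: string): Promise<number[]> {
 *         const vec = new Array(this.dimensions).fill(0);
 *         for (let i = 0; i < text.length; i++) {
 *             vec[i % this.dimensions] += text.charCodeAt(i) / 255;
 *         }
 *         return vec;
 *     }
 *
 *     async embedBatch(texts: string[]): Promise<number[][]> {
 *         return Promise.all(texts.map((t) => this.embed(t)));
 *     }
 * }
 * ```
 */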

/**
 * RAGQueryComposer - Main orchestrator for retrieval operations.
 *
 * Coordinates between embedder and vector database adapter to perform
 * semantic search with filtering. Provides specialized methods for
 * grouping results by vertical (document) or horizontal (theme) dimensions.
 *
 * @example
 * ```typescript
 * const composer = new RAGQueryComposer(adapter, embedder);
 *
 * // Basic retrieval
 * const result = await composer.retrieve({
 *     query: 'pricing information',
 *     collection: 'documents',
 *     topK: 10
 * });
 *
 * // Retrieval with filters
 * const filtered = await composer.retrieve({
 *     query: 'pricing information',
 *     collection: 'documents',
 *     topK: 10,
 *     verticalFilters: { doc_id: 'contract-123' },
 *     horizontalFilters: { theme: 'legal' }
 * });
 *
 * // Grouped by document
 * const byDocument = await composer.retrieveVertical({
 *     query: 'pricing information',
 *     collection: 'documents',
 *     topK: 10
 * });
 * ```
 */
declare class RAGQueryComposer {
    private readonly adapter;
    private readonly embedder;
    /**
     * Create a new RAGQueryComposer.
     *
     * @param adapter - Vector database adapter for search operations
     * @param embedder - Embedder for converting text queries to vectors
     */
    constructor(adapter: VectorDBAdapter, embedder: Embedder);
    /**
     * Main retrieval method.
     *
     * Performs semantic search with optional filtering:
     * 1. Embeds query text using embedder
     * 2. Builds combined filter using FilterBuilder
     * 3. Calls adapter.search() with query vector and filter
     * 4. Returns results with filter information
     *
     * @param params - Retrieval parameters
     * @returns Retrieval result with records and filter information
     */
    retrieve(params: RetrievalParams): Promise<RetrievalResult>;
    /**
     * Retrieve and group results by document ID.
     *
     * Calls retrieve() and organizes results into a Map keyed by __v_doc_id.
     * Records without a doc_id are excluded.
     *
     * @param params - Retrieval parameters
     * @returns Map of document ID to array of records
     */
    retrieveVertical(params: RetrievalParams): Promise<Map<string, VectorRecord[]>>;
    /**
     * Retrieve and group results by theme.
     *
     * Calls retrieve() and organizes results into a Map keyed by __h_theme.
     * Records without a theme are excluded.
     *
     * @param params - Retrieval parameters
     * @returns Map of theme to array of records
     */
    retrieveHorizontal(params: RetrievalParams): Promise<Map<string, VectorRecord[]>>;
}

/**
 * Options for LLM text generation.
 *
 * These options control how the LLM generates text,
 * allowing fine-grained control over the output behavior.
 */
interface GenerateOptions {
    /**
     * Controls randomness in generation.
     * Higher values (e.g., 1.0) make output more random.
     * Lower values (e.g., 0.1) make output more deterministic.
     * Range: 0.0 to 2.0
     */
    temperature?: number;
    /**
     * Maximum number of tokens to generate.
     * Limits the length of the generated output.
     */
    maxTokens?: number;
    /**
     * System prompt to set context for the LLM.
     * Used to guide the model's behavior and personality.
     */
    systemPrompt?: string;
    /**
     * Sequences where the LLM should stop generating.
     * When encountered, generation stops immediately.
     */
    stopSequences?: string[];
}

/**
 * Abstract base class for LLM (Large Language Model) clients.
 *
 * This abstraction allows the VectorORM to work with any LLM provider
 * (OpenAI, Anthropic, Google, etc.) by implementing a consistent interface.
 *
 * Implementations must provide:
 * - `generate()`: Generate text from a prompt
 * - `generateJSON<T>()`: Generate structured JSON output
 * - `generateBatch()`: Generate multiple responses efficiently
 * - `modelName`: Identifier for the LLM model being used
 * - `provider`: Name of the LLM provider
 *
 * @example
 * ```typescript
 * class OpenAIClient extends LLMClient {
 *     get modelName(): string { return 'gpt-4'; }
 *     get provider(): string { return 'openai'; }
 *
 *     async generate(prompt: string, options?: GenerateOptions): Promise<string> {
 *         // Call OpenAI API
 *     }
 *
 *     async generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T> {
 *         // Call OpenAI API with JSON mode
 *     }
 *
 *     async generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]> {
 *         // Batch call to OpenAI API
 *     }
 * }
 * ```
 */
declare abstract class LLMClient {
    /**
     * Identifier for the LLM model.
     * Used for tracking which model generated responses.
     */
    abstract get modelName(): string;
    /**
     * Name of the LLM provider.
     * Examples: 'openai', 'anthropic', 'google', 'mock'
     */
    abstract get provider(): string;
    /**
     * Generate text from a prompt.
     *
     * @param prompt - The text prompt to send to the LLM
     * @param options - Optional generation parameters
     * @returns A promise that resolves to the generated text
     */
    abstract generate(prompt: string, options?: GenerateOptions): Promise<string>;
    /**
     * Generate structured JSON output from a prompt.
     * The LLM will be instructed to return valid JSON that matches type T.
     *
     * @param prompt - The text prompt to send to the LLM
     * @param options - Optional generation parameters
     * @returns A promise that resolves to the parsed JSON object
     */
    abstract generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T>;
    /**
     * Generate multiple responses efficiently.
     * Implementations should maintain the order of input prompts in the output.
     *
     * @param prompts - Array of prompts to process
     * @param options - Optional generation parameters
     * @returns A promise that resolves to an array of responses, one per input prompt
     */
    abstract generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]>;
    /**
     * Constructor is protected to prevent direct instantiation of abstract class.
     * Subclasses can call super() in their constructors.
     */
    protected constructor();
}

/**
 * MockLLM for testing purposes only.
 * Returns canned responses that can be set programmatically.
 *
 * @example
 * ```typescript
 * const llm = new MockLLM();
 * llm.setResponse('Hello, world!');
 * const result = await llm.generate('Say hello'); // Returns 'Hello, world!'
 * ```
 */
declare class MockLLM extends LLMClient {
    private _response;
    constructor();
    get modelName(): string;
    get provider(): string;
    /**
     * Set the canned response that will be returned by generate methods.
     *
     * @param response - The response text to return
     */
    setResponse(response: string): void;
    generate(prompt: string, options?: GenerateOptions): Promise<string>;
    generateJSON<T>(prompt: string, options?: GenerateOptions): Promise<T>;
    generateBatch(prompts: string[], options?: GenerateOptions): Promise<string[]>;
}
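
/*
 * Illustrative sketch (not part of the published declarations): using MockLLM
 * to exercise structured-output code paths in a test. It assumes
 * generateJSON() parses the canned response as JSON, which is not stated in
 * the declarations above.
 *
 * ```typescript
 * const llm = new MockLLM();
 * llm.setResponse('{"theme":"technology","confidence":0.92}');
 *
 * const parsed = await llm.generateJSON<{ theme: string; confidence: number }>(
 *     'Classify: "Machine learning is transforming AI"'
 * );
 * // parsed.theme === 'technology' if the mock round-trips JSON as assumed
 * ```
 */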

/**
 * Theme classification result containing the identified theme and confidence score.
 *
 * @property theme - The identified theme label (e.g., 'technology', 'business', 'science')
 * @property confidence - Confidence score between 0 and 1 indicating classification certainty
 * @property allScores - Optional map of all theme labels to their respective confidence scores
 */
interface ThemeClassification {
    /**
     * The identified theme label.
     * Examples: 'technology', 'business', 'science', 'healthcare', 'education', etc.
     */
    theme: string;
    /**
     * Confidence score between 0 and 1 indicating classification certainty.
     * Higher values indicate greater confidence in the classification.
     */
    confidence: number;
    /**
     * Optional map of all theme labels to their respective confidence scores.
     * Useful for understanding alternative themes and their relative probabilities.
     *
     * @example
     * ```typescript
     * {
     *     'technology': 0.85,
     *     'business': 0.10,
     *     'science': 0.05
     * }
     * ```
     */
    allScores?: Record<string, number>;
}
/**
 * Interface for theme classification strategies.
 *
 * Theme classifiers identify the primary theme or topic of text content.
 * Different implementations can use various strategies:
 *
 * 1. **Keyword-based Classification**: Uses predefined keyword lists to match themes
 *    - Fast and deterministic
 *    - Good for well-defined domains with clear vocabulary
 *    - Example: Medical texts with specific terminology
 *
 * 2. **Zero-shot Classification**: Uses pre-trained models without fine-tuning
 *    - No training data required
 *    - Good for general-purpose classification
 *    - Example: Hugging Face zero-shot classification models
 *
 * 3. **Embedding-based Classification**: Uses vector similarity between text and theme embeddings
 *    - Semantic understanding of themes
 *    - Can find nuanced thematic relationships
 *    - Example: Comparing document embeddings to theme prototype embeddings
 *
 * 4. **LLM-based Classification**: Uses language models for theme identification
 *    - Most flexible and powerful
 *    - Can understand complex, nuanced themes
 *    - Example: GPT-4, Claude, or other LLMs with structured output
 *
 * Implementations should:
 * - Return confidence scores between 0 and 1
 * - Handle empty or invalid input gracefully
 * - Maintain consistent theme labels across calls
 * - Optionally provide all theme scores for transparency
 *
 * @example
 * ```typescript
 * class KeywordThemeClassifier implements ThemeClassifier {
 *     async classify(text: string): Promise<ThemeClassification> {
 *         // Keyword matching logic
 *         const theme = 'technology';
 *         const confidence = 0.92;
 *         return { theme, confidence };
 *     }
 *
 *     async classifyBatch(texts: string[]): Promise<ThemeClassification[]> {
 *         return Promise.all(texts.map(text => this.classify(text)));
 *     }
 * }
 * ```
 *
 * @example
 * ```typescript
 * class LLMThemeClassifier implements ThemeClassifier {
 *     constructor(private llm: LLMClient, private themes: string[]) {}
 *
 *     async classify(text: string): Promise<ThemeClassification> {
 *         const prompt = `Classify the following text into one of these themes: ${this.themes.join(', ')}
 *
 * Text: ${text}
 *
 * Return JSON with: theme (string), confidence (number 0-1), allScores (object)`;
 *
 *         const result = await this.llm.generateJSON<ThemeClassification>(prompt);
 *         return result;
 *     }
 *
 *     async classifyBatch(texts: string[]): Promise<ThemeClassification[]> {
 *         // Efficient batch processing
 *         return Promise.all(texts.map(text => this.classify(text)));
 *     }
 * }
 * ```
 */
interface ThemeClassifier {
    /**
     * Classify a single text and return the identified theme with confidence score.
     *
     * @param text - The text content to classify
     * @returns A promise that resolves to the theme classification result
     *
     * @example
     * ```typescript
     * const classifier = new KeywordThemeClassifier();
     * const result = await classifier.classify('Machine learning is transforming AI');
     * console.log(result.theme); // 'technology'
     * console.log(result.confidence); // 0.92
     * ```
     */
    classify(text: string): Promise<ThemeClassification>;
    /**
     * Classify multiple texts efficiently and return their theme classifications.
     *
     * Implementations should maintain the order of input texts in the output array.
     * May use parallel processing or batching for efficiency.
     *
     * @param texts - Array of text contents to classify
     * @returns A promise that resolves to an array of theme classifications
     *
     * @example
     * ```typescript
     * const classifier = new KeywordThemeClassifier();
     * const texts = [
     *     'Machine learning is transforming AI',
     *     'The stock market reached new highs',
     *     'New cancer treatment shows promise'
     * ];
     * const results = await classifier.classifyBatch(texts);
     * // results[0].theme === 'technology'
     * // results[1].theme === 'business'
     * // results[2].theme === 'healthcare'
     * ```
     */
    classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
}
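
/*
 * Illustrative sketch (not part of the published declarations): a small,
 * self-contained keyword-counting ThemeClassifier. The keyword lists and the
 * 'general' fallback theme are made-up; scoring is a simple normalized hit
 * count, shown only to make the interface contract concrete.
 *
 * ```typescript
 * class SimpleKeywordClassifier implements ThemeClassifier {
 *     private keywords: Record<string, string[]> = {
 *         technology: ['machine learning', 'ai', 'software'],
 *         business: ['stock', 'market', 'revenue'],
 *         healthcare: ['cancer', 'treatment', 'patient'],
 *     };
 *
 *     async classify(text: string): Promise<ThemeClassification> {
 *         const lower = text.toLowerCase();
 *         const allScores: Record<string, number> = {};
 *         for (const [theme, words] of Object.entries(this.keywords)) {
 *             const hits = words.filter((w) => lower.includes(w)).length;
 *             allScores[theme] = hits / words.length;
 *         }
 *         const [theme, confidence] = Object.entries(allScores)
 *             .sort((a, b) => b[1] - a[1])[0] ?? ['general', 0];
 *         return { theme, confidence, allScores };
 *     }
 *
 *     async classifyBatch(texts: string[]): Promise<ThemeClassification[]> {
 *         return Promise.all(texts.map((t) => this.classify(t)));
 *     }
 * }
 * ```
 */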

/**
 * Types and interfaces for document enrichment operations.
 *
 * This module defines the configuration interfaces for various enrichment strategies:
 * - Vertical enrichment: Classify documents into business verticals
 * - Theme enrichment: Add thematic tags to documents
 * - Section enrichment: Structure documents into logical sections
 */

/**
 * Progress callback function for tracking enrichment operations.
 *
 * @param stats - Current enrichment statistics
 *
 * @example
 * ```typescript
 * const onProgress: ProgressCallback = (stats) => {
 *     console.log(`Processed: ${stats.recordsProcessed}/${stats.recordsProcessed + stats.recordsSkipped}`);
 *     console.log(`Updated: ${stats.recordsUpdated}`);
 * };
 * ```
 */
type ProgressCallback = (stats: EnrichmentStats) => void;
/**
 * Statistics for an enrichment operation.
 *
 * Tracks the progress and outcome of enrichment operations,
 * including records processed, updated, skipped, and any errors encountered.
 *
 * @property recordsProcessed - Total number of records processed
 * @property recordsUpdated - Number of records successfully updated
 * @property recordsSkipped - Number of records skipped (e.g., filtered out)
 * @property timeMs - Total time taken in milliseconds
 * @property errors - Optional array of error messages encountered during enrichment
 *
 * @example
 * ```typescript
 * const stats: EnrichmentStats = {
 *     recordsProcessed: 100,
 *     recordsUpdated: 95,
 *     recordsSkipped: 5,
 *     timeMs: 1250,
 *     errors: ['Failed to classify record 42']
 * };
 * ```
 */
interface EnrichmentStats {
    /**
     * Total number of records processed.
     */
    recordsProcessed: number;
    /**
     * Number of records successfully updated with enrichment data.
     */
    recordsUpdated: number;
    /**
     * Number of records skipped (e.g., filtered out or already enriched).
     */
    recordsSkipped: number;
    /**
     * Total time taken in milliseconds.
     */
    timeMs: number;
    /**
     * Optional array of error messages encountered during enrichment.
     */
    errors?: string[];
}
/**
 * Configuration for field mapping-based vertical enrichment.
 *
 * Maps values from an existing field to vertical classifications.
 * This is the simplest enrichment strategy, useful when vertical
 * information is already present in a different field.
 *
 * @property mapping - Map of source field values to vertical labels
 * @property filter - Optional filter to select which records to enrich
 * @property batchSize - Optional batch size for processing (default: 100)
 *
 * @example
 * ```typescript
 * const config: FieldMappingConfig = {
 *     mapping: {
 *         'tech': 'technology',
 *         'healthcare': 'medical',
 *         'fin': 'finance'
 *     },
 *     filter: { field: 'category', op: 'exists', value: true },
 *     batchSize: 50
 * };
 * ```
 */
interface FieldMappingConfig {
    /**
     * Map of source field values to vertical labels.
     *
     * @example
     * ```typescript
     * {
     *     'tech': 'technology',
     *     'healthcare': 'medical',
     *     'finance': 'finance'
     * }
     * ```
     */
    mapping: Record<string, string>;
    /**
     * Optional filter to select which records to enrich.
     */
    filter?: UniversalFilter;
    /**
     * Optional batch size for processing (default: 100).
     */
    batchSize?: number;
}
/**
 * Configuration for custom extractor function-based vertical enrichment.
 *
 * Uses a custom function to extract vertical classifications from documents.
 * This provides maximum flexibility for complex extraction logic.
 *
 * @property extractor - Function that extracts vertical label from a document
 * @property filter - Optional filter to select which records to enrich
 * @property batchSize - Optional batch size for processing (default: 100)
 *
 * @example
 * ```typescript
 * const config: ExtractorConfig = {
 *     extractor: async (doc) => {
 *         if (doc.content.includes('machine learning')) return 'technology';
 *         if (doc.content.includes('stock market')) return 'finance';
 *         return 'general';
 *     },
 *     filter: { field: 'content', op: 'exists', value: true },
 *     batchSize: 25
 * };
 * ```
 */
interface ExtractorConfig {
    /**
     * Function that extracts vertical label from a document.
     *
     * @param document - The document to extract vertical from
     * @returns Promise resolving to the vertical label
     */
    extractor: (document: any) => Promise<string>;
    /**
     * Optional filter to select which records to enrich.
     */
    filter?: UniversalFilter;
    /**
     * Optional batch size for processing (default: 100).
     */
    batchSize?: number;
}
/**
 * Configuration for automatic LLM-based vertical enrichment.
 *
 * Uses a language model to automatically classify documents into verticals.
 * Can use predefined field mappings or automatic extraction from text.
 *
 * @property llm - The LLM client to use for classification
 * @property fields - Array of vertical labels to classify into
 * @property promptTemplate - Optional custom prompt template for the LLM
 * @property textField - Optional field name containing text to classify (default: 'content')
 * @property filter - Optional filter to select which records to enrich
 * @property batchSize - Optional batch size for processing (default: 10)
 *
 * @example
 * ```typescript
 * const config: AutomaticExtractionConfig = {
 *     automatic: {
 *         llm: myLLMClient,
 *         fields: ['technology', 'finance', 'healthcare', 'retail'],
 *         promptTemplate: 'Classify this text into one of: {fields}\n\nText: {text}',
 *         textField: 'description'
 *     },
 *     filter: { field: 'vertical', op: 'eq', value: null },
 *     batchSize: 5
 * };
 * ```
 */
interface AutomaticExtractionConfig {
    /**
     * Automatic extraction settings using an LLM.
     */
    automatic: {
        /**
         * The LLM client to use for classification.
         */
        llm: LLMClient;
        /**
         * Array of vertical labels to classify into.
         *
         * @example
         * ['technology', 'finance', 'healthcare', 'retail']
         */
        fields: string[];
        /**
         * Optional custom prompt template for the LLM.
         * Use {fields} for the list of verticals and {text} for the document text.
         *
         * @example
         * 'Classify this text into one of: {fields}\n\nText: {text}'
         */
        promptTemplate?: string;
        /**
         * Optional field name containing text to classify (default: 'content').
         */
        textField?: string;
    };
    /**
     * Optional filter to select which records to enrich.
     */
    filter?: UniversalFilter;
    /**
     * Optional batch size for processing (default: 10).
     */
    batchSize?: number;
}
/**
 * Configuration for vertical enrichment operations.
 *
 * Vertical enrichment classifies documents into business verticals
 * (e.g., technology, finance, healthcare). Three strategies are supported:
 *
 * 1. **Field Mapping**: Map existing field values to verticals
 * 2. **Custom Extractor**: Use a custom function to extract verticals
 * 3. **Automatic Extraction**: Use an LLM to automatically classify documents
 *
 * @example
 * ```typescript
 * // Field mapping
 * const config1: VerticalEnrichmentConfig = {
 *     mapping: { 'tech': 'technology', 'hc': 'healthcare' }
 * };
 *
 * // Custom extractor
 * const config2: VerticalEnrichmentConfig = {
 *     extractor: async (doc) => extractVertical(doc)
 * };
 *
 * // Automatic extraction
 * const config3: VerticalEnrichmentConfig = {
 *     automatic: {
 *         llm: myLLMClient,
 *         fields: ['technology', 'finance', 'healthcare']
 *     }
 * };
 * ```
 */
type VerticalEnrichmentConfig = FieldMappingConfig | ExtractorConfig | AutomaticExtractionConfig;
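
/*
 * Illustrative sketch (not part of the published declarations): narrowing a
 * VerticalEnrichmentConfig to one of its three variants. The guard functions
 * are hypothetical helpers, keyed off the discriminating properties of the
 * three config interfaces above.
 *
 * ```typescript
 * function isFieldMapping(c: VerticalEnrichmentConfig): c is FieldMappingConfig {
 *     return 'mapping' in c;
 * }
 * function isExtractor(c: VerticalEnrichmentConfig): c is ExtractorConfig {
 *     return 'extractor' in c;
 * }
 * function isAutomatic(c: VerticalEnrichmentConfig): c is AutomaticExtractionConfig {
 *     return 'automatic' in c;
 * }
 * ```
 */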
|
|
1269
|
+
/**
|
|
1270
|
+
* Configuration for theme enrichment operations.
|
|
1271
|
+
*
|
|
1272
|
+
* Theme enrichment adds thematic tags to documents using a theme classifier.
|
|
1273
|
+
* Supports confidence thresholds, multi-theme tagging, and custom text fields.
|
|
1274
|
+
*
|
|
1275
|
+
* @property themes - Array of theme labels to classify into
|
|
1276
|
+
* @property classifier - The theme classifier to use for classification
|
|
1277
|
+
* @property textField - Optional field name containing text to classify (default: 'content')
|
|
1278
|
+
* @property confidenceThreshold - Optional minimum confidence threshold (default: 0.0)
|
|
1279
|
+
* @property multiTheme - Optional flag to allow multiple themes per document (default: false)
|
|
1280
|
+
* @property filter - Optional filter to select which records to enrich
|
|
1281
|
+
* @property batchSize - Optional batch size for processing (default: 100)
|
|
1282
|
+
* @property onProgress - Optional callback for tracking progress
|
|
1283
|
+
*
|
|
1284
|
+
* @example
|
|
1285
|
+
* ```typescript
|
|
1286
|
+
* const config: ThemeEnrichmentConfig = {
|
|
1287
|
+
* themes: ['technology', 'business', 'science', 'healthcare'],
|
|
1288
|
+
* classifier: new KeywordThemeClassifier(),
|
|
1289
|
+
* textField: 'description',
|
|
1290
|
+
* confidenceThreshold: 0.7,
|
|
1291
|
+
* multiTheme: true,
|
|
1292
|
+
* filter: { field: 'themes', op: 'eq', value: null },
|
|
1293
|
+
* batchSize: 50,
|
|
1294
|
+
* onProgress: (stats) => console.log(`Processed: ${stats.recordsProcessed}`)
|
|
1295
|
+
* };
|
|
1296
|
+
* ```
|
|
1297
|
+
*/
|
|
1298
|
+
interface ThemeEnrichmentConfig {
|
|
1299
|
+
/**
|
|
1300
|
+
* Array of theme labels to classify into.
|
|
1301
|
+
*
|
|
1302
|
+
* @example
|
|
1303
|
+
* ['technology', 'business', 'science', 'healthcare']
|
|
1304
|
+
*/
|
|
1305
|
+
themes: string[];
|
|
1306
|
+
/**
|
|
1307
|
+
* The theme classifier to use for classification.
|
|
1308
|
+
*/
|
|
1309
|
+
classifier: ThemeClassifier;
|
|
1310
|
+
/**
|
|
1311
|
+
* Optional field name containing text to classify (default: 'content').
|
|
1312
|
+
*/
|
|
1313
|
+
textField?: string;
|
|
1314
|
+
/**
|
|
1315
|
+
* Optional minimum confidence threshold (default: 0.0).
|
|
1316
|
+
* Only themes with confidence >= this value will be assigned.
|
|
1317
|
+
*/
|
|
1318
|
+
confidenceThreshold?: number;
|
|
1319
|
+
/**
|
|
1320
|
+
* Optional flag to allow multiple themes per document (default: false).
|
|
1321
|
+
* When true, all themes above the confidence threshold are assigned.
|
|
1322
|
+
*/
|
|
1323
|
+
multiTheme?: boolean;
|
|
1324
|
+
/**
|
|
1325
|
+
* Optional filter to select which records to enrich.
|
|
1326
|
+
*/
|
|
1327
|
+
filter?: UniversalFilter;
|
|
1328
|
+
/**
|
|
1329
|
+
* Optional batch size for processing (default: 100).
|
|
1330
|
+
*/
|
|
1331
|
+
batchSize?: number;
|
|
1332
|
+
/**
|
|
1333
|
+
* Optional callback for tracking progress.
|
|
1334
|
+
*/
|
|
1335
|
+
onProgress?: ProgressCallback;
|
|
1336
|
+
}
/**
* Configuration for section enrichment operations.
*
* Section enrichment structures documents into logical sections
* (e.g., introduction, methodology, results, conclusion).
* Can use existing section markers or automatically detect sections.
*
* @property existingField - Optional field name containing existing section markers
* @property autoDetect - Optional flag to automatically detect sections (default: false)
* @property filter - Optional filter to select which records to enrich
* @property batchSize - Optional batch size for processing (default: 100)
*
* @example
* ```typescript
* // Use existing section markers
* const config1: SectionEnrichmentConfig = {
* existingField: 'raw_sections',
* filter: { field: 'sections', op: 'eq', value: null }
* };
*
* // Auto-detect sections
* const config2: SectionEnrichmentConfig = {
* autoDetect: true,
* batchSize: 25
* };
* ```
*/
interface SectionEnrichmentConfig {
/**
* Optional field name containing existing section markers.
* If provided, sections will be extracted from this field.
*/
existingField?: string;
/**
* Optional flag to automatically detect sections (default: false).
* When true, sections will be detected using heuristics (headers, paragraphs, etc.).
*/
autoDetect?: boolean;
/**
* Optional filter to select which records to enrich.
*/
filter?: UniversalFilter;
/**
* Optional batch size for processing (default: 100).
*/
batchSize?: number;
}
/**
* Configuration for enriching all aspects of documents.
*
* Combines vertical, theme, and section enrichment into a single operation.
* Allows running multiple enrichment strategies in sequence with shared settings.
*
* @property vertical - Optional vertical enrichment configuration
* @property themes - Optional theme enrichment configuration
* @property sections - Optional section enrichment configuration
* @property filter - Optional global filter applied to all enrichment operations
* @property batchSize - Optional global batch size for all operations (default: 100)
* @property onProgress - Optional global progress callback for all operations
*
* @example
* ```typescript
* const config: EnrichAllConfig = {
* vertical: {
* automatic: {
* llm: myLLMClient,
* fields: ['technology', 'finance', 'healthcare']
* }
* },
* themes: {
* themes: ['innovation', 'research', 'product'],
* classifier: new KeywordThemeClassifier(),
* confidenceThreshold: 0.8
* },
* sections: {
* autoDetect: true
* },
* filter: { field: 'status', op: 'eq', value: 'pending' },
* batchSize: 50,
* onProgress: (stats) => console.log(`Progress: ${stats.recordsProcessed}`)
* };
* ```
*/
interface EnrichAllConfig {
/**
* Optional vertical enrichment configuration.
*/
vertical?: VerticalEnrichmentConfig;
/**
* Optional theme enrichment configuration.
*/
themes?: ThemeEnrichmentConfig;
/**
* Optional section enrichment configuration.
*/
sections?: SectionEnrichmentConfig;
/**
* Optional global filter applied to all enrichment operations.
* This filter is combined with individual operation filters using AND logic.
*/
filter?: UniversalFilter;
/**
* Optional global batch size for all operations (default: 100).
* Individual operation batch sizes override this value.
*/
batchSize?: number;
/**
* Optional global progress callback for all operations.
* Called after each enrichment operation completes.
*/
onProgress?: ProgressCallback;
}

/**
* Fast, deterministic keyword-based theme classifier
* Uses precompiled regex patterns with word boundaries for efficient matching
*/
interface ThemeClassificationResult {
theme: string;
confidence: number;
allScores?: Record<string, number>;
}
declare class KeywordThemeClassifier {
private themes;
private caseSensitive;
private patterns;
private keywordCounts;
/**
* Creates a new KeywordThemeClassifier
* @param themes - Array of theme names
* @param keywords - Map of theme names to their keyword arrays
* @param caseSensitive - Whether matching should be case sensitive (default: false)
*/
constructor(themes: string[], keywords: Record<string, string[]>, caseSensitive?: boolean);
/**
* Classify a single text
* @param text - Text to classify
* @returns Classification result with theme, confidence, and all scores
*/
classify(text: string): ThemeClassificationResult;
/**
* Classify multiple texts in batch
* @param texts - Array of texts to classify
* @returns Array of classification results
*/
classifyBatch(texts: string[]): ThemeClassificationResult[];
/**
* Escape special regex characters in a string
* @param str - String to escape
* @returns Escaped string safe for use in regex
*/
private escapeRegex;
}
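
// Usage sketch (illustrative only): constructing the keyword classifier with the
// declared (themes, keywords, caseSensitive?) signature and classifying one text.
const keywordClassifier = new KeywordThemeClassifier(
  ['technology', 'sports'],
  { technology: ['software', 'ai', 'cloud'], sports: ['match', 'team', 'league'] }
);
const kwResult = keywordClassifier.classify('Cloud software and AI platforms');
// kwResult.theme, kwResult.confidence and kwResult.allScores follow ThemeClassificationResult.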

/**
* Zero-shot theme classifier using Transformers.js
* Uses pre-trained models without requiring fine-tuning or training data
*/

/**
* Zero-shot classification using pre-trained transformer models.
*
* This classifier uses Hugging Face's zero-shot classification pipeline
* to classify text into themes without requiring training data or fine-tuning.
* The model is loaded lazily on the first classify() call to improve startup time.
*
* Features:
* - No training data required
* - Works with any set of theme labels
* - Lazy model loading (loads on first classification)
* - Sequential batch processing to avoid memory issues
* - Handles empty text with uniform scores
*
* @example
* ```typescript
* const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'business']);
* const result = await classifier.classify('Machine learning is transforming AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.95
* ```
*/
declare class ZeroShotThemeClassifier implements ThemeClassifier {
private model;
private modelName;
private themes;
/**
* Creates a new ZeroShotThemeClassifier
*
* @param themes - Array of theme labels to classify into
* @param modelName - Name of the Hugging Face model to use (default: 'Xenova/distilbert-base-uncased-mnli')
*
* @example
* ```typescript
* // Use default model
* const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
*
* // Use custom model
* const classifier = new ZeroShotThemeClassifier(
* ['positive', 'negative'],
* 'Xenova/distilbert-base-uncased-mnli'
* );
* ```
*/
constructor(themes: string[], modelName?: string);
/**
* Lazy loads the zero-shot classification model
* Only loads once on first call, subsequent calls reuse the loaded model
*
* @returns Promise that resolves to the loaded pipeline
*/
private ensureModelLoaded;
/**
* Classify a single text into one of the provided themes
*
* @param text - The text content to classify
* @returns A promise that resolves to the theme classification result
*
* @example
* ```typescript
* const classifier = new ZeroShotThemeClassifier(['technology', 'sports']);
* const result = await classifier.classify('Machine learning and AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.92
* console.log(result.allScores); // { technology: 0.92, sports: 0.08 }
* ```
*/
classify(text: string): Promise<ThemeClassification>;
/**
* Classify multiple texts efficiently
*
* Processes texts sequentially to avoid memory issues with large batches.
* The model is loaded once and reused for all texts.
*
* @param texts - Array of text contents to classify
* @returns A promise that resolves to an array of theme classifications
*
* @example
* ```typescript
* const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
* const results = await classifier.classifyBatch([
* 'Machine learning is transforming AI',
* 'The football team won the championship',
* 'Stock market hits record high'
* ]);
* // results[0].theme === 'technology'
* // results[1].theme === 'sports'
* // results[2].theme === 'finance'
* ```
*/
classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
}
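
// Sketch of multi-theme selection on top of a classifier result (illustrative only);
// assumes the resolved ThemeClassification exposes `allScores` as the @example above shows.
const zeroShot = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
const zsResult = await zeroShot.classify('Stock markets react to new AI chips');
const selectedThemes = Object.entries(zsResult.allScores ?? {})
  .filter(([, score]) => score >= 0.5) // mirrors confidenceThreshold + multiTheme semantics
  .map(([theme]) => theme);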

/**
* Embedding-based theme classifier using cosine similarity
* Computes similarity between text embeddings and theme embeddings
*/

/**
* Embedding-based classification using cosine similarity.
*
* This classifier computes embeddings for text and themes, then uses cosine
* similarity to determine which theme is most similar to the text. Theme
* embeddings are computed lazily on the first classify() call, or can be
* provided precomputed in the constructor.
*
* Features:
* - Lazy initialization: theme embeddings computed on first classify()
* - Optional precomputed embeddings for faster startup
* - Cosine similarity: dotProduct / (normA * normB)
* - Normalize similarity [-1,1] to confidence [0,1]
* - Handles empty text with uniform scores
*
* @example
* ```typescript
* const embedder = new OpenAIEmbedder();
* const classifier = new EmbeddingThemeClassifier(['technology', 'sports', 'finance'], embedder);
* const result = await classifier.classify('Machine learning is transforming AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.89
* ```
*/
declare class EmbeddingThemeClassifier implements ThemeClassifier {
private themeEmbeddings;
private embedder;
private themes;
/**
* Creates a new EmbeddingThemeClassifier
*
* @param themes - Array of theme labels to classify into
* @param embedder - Embedder instance to use for generating embeddings
* @param precomputedEmbeddings - Optional precomputed theme embeddings for faster startup
*
* @example
* ```typescript
* // Lazy initialization
* const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
*
* // With precomputed embeddings
* const themeEmbeddings = {
* technology: await embedder.embed('technology'),
* sports: await embedder.embed('sports')
* };
* const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder, themeEmbeddings);
* ```
*/
constructor(themes: string[], embedder: Embedder, precomputedEmbeddings?: Record<string, number[]>);
/**
* Lazy loads theme embeddings on first use
* Computes embeddings for all theme labels if not already computed
*
* @returns Promise that resolves to the theme embeddings map
*/
private ensureThemeEmbeddings;
/**
* Compute cosine similarity between two vectors
*
* Cosine similarity = dotProduct / (normA * normB)
* Returns value in range [-1, 1] where:
* - 1 means vectors point in the same direction
* - 0 means vectors are orthogonal
* - -1 means vectors point in opposite directions
*
* @param a - First vector
* @param b - Second vector
* @returns Cosine similarity between the vectors
*/
private cosineSimilarity;
/**
* Normalize cosine similarity from [-1, 1] to confidence score [0, 1]
*
* Uses linear transformation: (similarity + 1) / 2
*
* @param similarity - Cosine similarity value in range [-1, 1]
* @returns Confidence score in range [0, 1]
*/
private normalizeToConfidence;
/**
* Classify a single text into one of the provided themes
*
* @param text - The text content to classify
* @returns A promise that resolves to the theme classification result
*
* @example
* ```typescript
* const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
* const result = await classifier.classify('Machine learning and AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.92
* console.log(result.allScores); // { technology: 0.92, sports: 0.45 }
* ```
*/
classify(text: string): Promise<ThemeClassification>;
/**
* Classify multiple texts efficiently
*
* Ensures theme embeddings are loaded once, then processes all texts.
* Text embeddings are computed in batch for efficiency.
*
* @param texts - Array of text contents to classify
* @returns A promise that resolves to an array of theme classifications
*
* @example
* ```typescript
* const classifier = new EmbeddingThemeClassifier(['technology', 'sports', 'finance'], embedder);
* const results = await classifier.classifyBatch([
* 'Machine learning is transforming AI',
* 'The football team won the championship',
* 'Stock market hits record high'
* ]);
* // results[0].theme === 'technology'
* // results[1].theme === 'sports'
* // results[2].theme === 'finance'
* ```
*/
classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
}
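
// Worked sketch of the similarity math documented above (illustrative only):
// cosine = dot(a, b) / (||a|| * ||b||), then confidence = (cosine + 1) / 2.
function cosine(a: number[], b: number[]): number {
  let dot = 0, normA = 0, normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
const confidenceExample = (cosine([1, 0], [0.6, 0.8]) + 1) / 2; // similarity 0.6 -> confidence 0.8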

/**
* LLM-based theme classifier using language models for high-quality classification
* Provides the most flexible and accurate theme classification using LLMs
*/

/**
* LLM-based theme classification using language models.
*
* This classifier uses LLMs to provide the highest quality theme classification
* with semantic understanding and nuanced reasoning. It supports custom prompt
* templates for domain-specific classification needs.
*
* Features:
* - Default prompt template with {themes} and {text} placeholders
* - Custom prompt template support for specialized domains
* - Structured JSON output using LLM.generateJSON<>
* - Sequential batch processing to avoid rate limits
* - Comprehensive error handling with cause chain
* - Empty text handling with uniform scores
*
* @example
* ```typescript
* const llm = new OpenAIClient('gpt-4');
* const classifier = new LLMThemeClassifier(
* ['technology', 'sports', 'finance'],
* llm
* );
* const result = await classifier.classify('Machine learning is transforming AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.95
* ```
*
* @example Custom prompt template
* ```typescript
* const customTemplate = `Classify this medical text: {text}
* Themes: {themes}
* Return JSON with theme, confidence, allScores.`;
*
* const classifier = new LLMThemeClassifier(
* ['cardiology', 'neurology', 'oncology'],
* llm,
* customTemplate
* );
* ```
*/
declare class LLMThemeClassifier implements ThemeClassifier {
private themes;
private llm;
private promptTemplate;
/**
* Creates a new LLMThemeClassifier
*
* @param themes - Array of theme labels to classify into
* @param llm - LLM client instance to use for classification
* @param promptTemplate - Optional custom prompt template with {themes} and {text} placeholders
*
* @example
* ```typescript
* const classifier = new LLMThemeClassifier(
* ['technology', 'sports', 'finance'],
* llm
* );
* ```
*
* @example With custom prompt
* ```typescript
* const customTemplate = `Classify: {text}\nThemes: {themes}\nReturn JSON.`;
* const classifier = new LLMThemeClassifier(
* ['technology', 'sports'],
* llm,
* customTemplate
* );
* ```
*/
constructor(themes: string[], llm: LLMClient, promptTemplate?: string);
/**
* Build the classification prompt by replacing placeholders
*
* @param text - The text to classify
* @returns The complete prompt with placeholders replaced
*/
private buildPrompt;
/**
* Classify a single text into one of the provided themes
*
* @param text - The text content to classify
* @returns A promise that resolves to the theme classification result
*
* @example
* ```typescript
* const classifier = new LLMThemeClassifier(['technology', 'sports'], llm);
* const result = await classifier.classify('Machine learning and AI');
* console.log(result.theme); // 'technology'
* console.log(result.confidence); // 0.95
* console.log(result.allScores); // { technology: 0.95, sports: 0.05 }
* ```
*/
classify(text: string): Promise<ThemeClassification>;
/**
* Classify multiple texts sequentially
*
* Processes texts one at a time to avoid rate limits and ensure predictable behavior.
* Sequential processing provides better error handling and rate limit compliance.
*
* @param texts - Array of text contents to classify
* @returns A promise that resolves to an array of theme classifications
*
* @example
* ```typescript
* const classifier = new LLMThemeClassifier(['technology', 'sports', 'finance'], llm);
* const results = await classifier.classifyBatch([
* 'Machine learning is transforming AI',
* 'The football team won the championship',
* 'Stock market hits record high'
* ]);
* // results[0].theme === 'technology'
* // results[1].theme === 'sports'
* // results[2].theme === 'finance'
* ```
*/
classifyBatch(texts: string[]): Promise<ThemeClassification[]>;
}
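
// Sketch of how a custom template with the documented {themes} and {text} placeholders
// expands (illustrative only; the class keeps the real buildPrompt logic private).
const promptTemplate = 'Classify: {text}\nThemes: {themes}\nReturn JSON with theme, confidence, allScores.';
const renderedPrompt = promptTemplate
  .replace('{themes}', ['technology', 'sports'].join(', '))
  .replace('{text}', 'Machine learning and AI');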

/**
* Enrichment pipeline for adding metadata to vector records.
*
* This class provides the main enrichment functionality:
* - Vertical enrichment: Classify documents into business verticals
* - Theme enrichment: Add thematic tags to documents
* - Section enrichment: Structure documents into logical sections
* - Batch processing: Efficiently process large collections
*
* Design principles:
* 1. Database-agnostic: Works with any VectorDBAdapter
* 2. Strategy pattern: Multiple enrichment strategies per operation
* 3. Batch processing: Efficient iteration and bulk updates
* 4. Error resilience: Continue processing despite individual failures
*/

/**
* EnrichmentPipeline provides methods to enrich vector records with metadata.
*
* The pipeline supports three types of enrichment:
* 1. Vertical enrichment: Classify into business verticals (technology, finance, etc.)
* 2. Theme enrichment: Add thematic tags (innovation, research, etc.)
* 3. Section enrichment: Structure into logical sections
*
* Each enrichment type supports multiple strategies for maximum flexibility.
*
* @example
* ```typescript
* const pipeline = new EnrichmentPipeline(adapter, embedder, llm);
*
* // Enrich using field mapping
* await pipeline.enrichVertical('my-collection', {
* mapping: { 'tech': 'technology', 'hc': 'healthcare' }
* });
*
* // Enrich using custom extractor
* await pipeline.enrichVertical('my-collection', {
* extractor: async (doc) => extractVertical(doc)
* });
*
* // Enrich using LLM
* await pipeline.enrichVertical('my-collection', {
* automatic: {
* llm: myLLMClient,
* fields: ['technology', 'finance', 'healthcare']
* }
* });
* ```
*/
declare class EnrichmentPipeline {
private adapter;
private embedder?;
private llm?;
/**
* Create a new enrichment pipeline.
*
* @param adapter - Vector database adapter for reading/writing records
* @param embedder - Optional embedder for embedding-based enrichment
* @param llm - Optional LLM client for automatic enrichment
*/
constructor(adapter: VectorDBAdapter, embedder?: any | undefined, llm?: any | undefined);
/**
* Enrich records with vertical classifications.
*
* Supports three strategies:
* 1. Field mapping: Map existing field values to verticals
* 2. Custom extractor: Use a custom function to extract verticals
* 3. Automatic LLM: Use an LLM to classify documents
*
* @param collection - Name of the collection to enrich
* @param config - Vertical enrichment configuration
* @returns Statistics about the enrichment operation
*
* @example
* ```typescript
* // Field mapping
* await pipeline.enrichVertical('docs', {
* mapping: { 'tech': 'technology' }
* });
*
* // Custom extractor
* await pipeline.enrichVertical('docs', {
* extractor: async (doc) => 'technology'
* });
*
* // Automatic LLM
* await pipeline.enrichVertical('docs', {
* automatic: {
* llm: myLLMClient,
* fields: ['technology', 'finance']
* }
* });
* ```
*/
enrichVertical(collection: string, config: VerticalEnrichmentConfig): Promise<EnrichmentStats>;
/**
* Enrich records using field mapping strategy.
*
* Maps values from an existing field to vertical classifications.
*
* @param collection - Collection name
* @param config - Field mapping configuration
* @param stats - Statistics object to update
*/
private enrichWithFieldMapping;
/**
* Apply field mapping to extract vertical from a record.
*
* @param record - Vector record
* @param mapping - Field mapping configuration
* @returns Vertical label or null if no match
*/
private applyFieldMapping;
/**
* Enrich records using custom extractor strategy.
*
* Calls the provided extractor function for each record.
*
* @param collection - Collection name
* @param config - Extractor configuration
* @param stats - Statistics object to update
*/
private enrichWithExtractor;
/**
* Enrich records using automatic LLM strategy.
*
* Uses a language model to classify documents into verticals.
*
* @param collection - Collection name
* @param config - Automatic extraction configuration
* @param stats - Statistics object to update
*/
private enrichWithLLM;
/**
* Extract vertical classification using LLM.
*
* @param record - Vector record
* @param llm - LLM client
* @param fields - Available vertical fields
* @param textField - Field name containing text to classify
* @param promptTemplate - Optional custom prompt template
* @returns Vertical label
*/
private extractWithLLM;
/**
* Enrich records with theme classifications.
*
* Uses a theme classifier to identify themes in text content and updates
* record metadata with theme information. Supports single and multi-theme
* classification with configurable confidence thresholds.
*
* @param collection - Name of the collection to enrich
* @param config - Theme enrichment configuration
* @returns Statistics about the enrichment operation
*
* @example
* ```typescript
* // Single theme classification
* await pipeline.enrichThemes('docs', {
* themes: ['technology', 'business', 'science'],
* classifier: new KeywordThemeClassifier(),
* confidenceThreshold: 0.7
* });
*
* // Multi-theme classification
* await pipeline.enrichThemes('docs', {
* themes: ['technology', 'business', 'science'],
* classifier: new LLMThemeClassifier(),
* multiTheme: true,
* confidenceThreshold: 0.5
* });
* ```
*/
enrichThemes(collection: string, config: ThemeEnrichmentConfig): Promise<EnrichmentStats>;
/**
* Enrich records using theme classifier.
*
* @param collection - Collection name
* @param config - Theme enrichment configuration
* @param stats - Statistics object to update
*/
private enrichWithThemeClassifier;
/**
* Enrich records with section structure.
*
* Extracts section metadata from documents using either existing field mappings
* or automatic detection strategies (markdown, HTML, or pattern-based).
*
* @param collection - Name of the collection to enrich
* @param config - Section enrichment configuration
* @returns Statistics about the enrichment operation
*
* @example
* ```typescript
* // Use existing section field
* await pipeline.enrichSections('docs', {
* existingField: 'section_path'
* });
*
* // Auto-detect sections
* await pipeline.enrichSections('docs', {
* autoDetect: true
* });
* ```
*/
enrichSections(collection: string, config: SectionEnrichmentConfig): Promise<EnrichmentStats>;
/**
* Enrich records with all enrichment types.
*
* Runs vertical, theme, and section enrichment sequentially with shared
* configuration. Global filters and batch sizes apply to all operations.
*
* @param collection - Name of the collection to enrich
* @param config - Combined enrichment configuration
* @returns Statistics about the enrichment operation
*
* @example
* ```typescript
* await pipeline.enrichAll('docs', {
* vertical: { mapping: { tech: 'technology' } },
* themes: { themes: ['innovation'], classifier },
* sections: { autoDetect: true },
* filter: { field: 'status', op: 'eq', value: 'pending' },
* batchSize: 50
* });
* ```
*/
enrichAll(collection: string, config: EnrichAllConfig): Promise<EnrichmentStats>;
/**
* Apply global configuration to individual enrichment configs.
*
* @param individualConfig - Configuration for a specific enrichment type
* @param globalConfig - Global configuration
* @returns Merged configuration
*/
private applyGlobalConfig;
/**
* Merge stats from an enrichment operation into aggregate stats.
*
* @param aggregate - Aggregate stats to update
* @param stats - Stats from a single operation
*/
private mergeStats;
/**
* Enrich records using section detection.
*
* @param collection - Collection name
* @param config - Section enrichment configuration
* @param stats - Statistics object to update
*/
private enrichWithSectionDetection;
/**
* Extract section metadata from an existing field value.
*
* @param sectionPath - Section path string (e.g., "introduction/overview")
* @returns Section metadata or null
*/
private extractSectionMetadata;
/**
* Detect sections in text using heuristics.
*
* @param text - Text content to analyze
* @returns Section metadata or null
*/
private detectSections;
/**
* Detect markdown headers (# Header).
*
* @param text - Text content
* @returns Section metadata or null
*/
private detectMarkdownSections;
/**
* Detect HTML headers (<h1>Header</h1>).
*
* @param text - Text content
* @returns Section metadata or null
*/
private detectHtmlSections;
/**
* Detect sections using common patterns (SECTION: Title).
*
* @param text - Text content
* @returns Section metadata or null
*/
private detectPatternSections;
}
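
// Usage sketch (illustrative only): running all three enrichment passes with a shared
// filter and progress callback, as the enrichAll() documentation above describes.
// `adapter`, `embedder`, `llm` and `classifier` are assumed to be set up elsewhere.
const enrichment = new EnrichmentPipeline(adapter, embedder, llm);
const enrichStats = await enrichment.enrichAll('docs', {
  vertical: { automatic: { llm, fields: ['technology', 'finance'] } },
  themes: { themes: ['innovation'], classifier },
  sections: { autoDetect: true },
  filter: { field: 'status', op: 'eq', value: 'pending' },
  batchSize: 50,
  onProgress: (stats) => console.log(stats.recordsProcessed)
});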

/**
* Loaded document with extracted text and metadata.
*/
interface Document {
/** Full document text */
text: string;
/** File path or source identifier */
source: string;
/** File type/extension (pdf, txt, docx, html) */
type: string;
/** Optional user-provided or loader-extracted metadata */
metadata?: Record<string, any>;
}
/**
* Statistics returned by ingestion operations.
*/
interface IngestionStats {
documentsProcessed: number;
documentsSucceeded: number;
documentsFailed: number;
chunksCreated: number;
chunksUpserted: number;
timeMs: number;
errors?: Array<{
source: string;
stage: 'load' | 'chunk' | 'embed' | 'upsert';
error: Error;
}>;
}
/**
* Configuration for ingestion operations.
*/
interface IngestionConfig {
chunkSize?: number;
chunkOverlap?: number;
chunker?: any;
metadata?: Record<string, any>;
metadataExtractor?: (doc: Document) => Record<string, any>;
batchSize?: number;
concurrency?: number;
onProgress?: (progress: ProgressInfo) => void;
onDocumentLoaded?: (doc: Document) => void;
onChunksCreated?: (chunks: any[]) => void;
}
/**
* Progress information during ingestion.
*/
interface ProgressInfo {
stage: 'loading' | 'chunking' | 'embedding' | 'upserting';
documentsProcessed: number;
totalDocuments: number;
chunksProcessed: number;
totalChunks?: number;
currentDocument?: string;
}
/**
* Text chunk with position metadata.
*/
interface TextChunk {
text: string;
index: number;
metadata: {
source: string;
chunkIndex: number;
totalChunks: number;
startChar: number;
endChar: number;
};
}
/**
* Configuration for chunking operations.
*/
interface ChunkConfig {
chunkSize?: number;
chunkOverlap?: number;
}
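
// Sketch of an ingestion progress callback typed against ProgressInfo (illustrative only).
const onIngestProgress = (p: ProgressInfo): void => {
  console.log(`[${p.stage}] ${p.documentsProcessed}/${p.totalDocuments} docs, ${p.chunksProcessed} chunks`);
};
const ingestConfig: IngestionConfig = { chunkSize: 500, chunkOverlap: 50, onProgress: onIngestProgress };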

/**
* Abstract interface for document loaders.
* Implementations load specific file formats and return standardized Document objects.
*/
interface DocumentLoader {
/**
* Check if this loader can handle the given file.
* @param filePath - Path to the file
* @returns true if loader can handle this file type
*/
canHandle(filePath: string): boolean;
/**
* Load a document from the given file path.
* @param filePath - Path to the file to load
* @returns Promise resolving to Document
*/
load(filePath: string): Promise<Document>;
}

/**
* Registry for document loaders.
* Manages loaders and routes files to correct loader based on extension.
*/
declare class LoaderRegistry {
private loaders;
constructor();
/**
* Register a custom document loader.
* @param loader - Loader to register
*/
register(loader: DocumentLoader): void;
/**
* Check if any loader can handle this file.
* @param filePath - Path to check
* @returns true if a loader exists for this file type
*/
canLoad(filePath: string): boolean;
/**
* Load a document using the appropriate loader.
* @param filePath - Path to the file to load
* @returns Promise resolving to Document
* @throws Error if no loader found for file type
*/
load(filePath: string): Promise<Document>;
}
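
// Sketch of a custom loader wired into the registry (illustrative only); this CSV
// loader is hypothetical and only demonstrates the DocumentLoader contract.
import { promises as fsPromises } from 'node:fs';
class CSVLoader implements DocumentLoader {
  canHandle(filePath: string): boolean {
    return filePath.toLowerCase().endsWith('.csv');
  }
  async load(filePath: string): Promise<Document> {
    const text = await fsPromises.readFile(filePath, 'utf8');
    return { text, source: filePath, type: 'csv' };
  }
}
const loaderRegistry = new LoaderRegistry();
loaderRegistry.register(new CSVLoader());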

/**
* Abstract interface for text chunking strategies.
* Implementations split text into chunks with different algorithms.
*/
interface TextChunker {
/**
* Chunk text into smaller pieces.
* @param text - Text to chunk
* @param config - Optional chunking configuration
* @returns Array of text chunks with position metadata
*/
chunk(text: string, config?: ChunkConfig): TextChunk[];
}
/**
* Default chunk size in tokens (approximate).
*/
declare const DEFAULT_CHUNK_SIZE = 500;
/**
* Default chunk overlap in tokens (approximate).
*/
declare const DEFAULT_CHUNK_OVERLAP = 50;
/**
* Estimate token count from character count.
* Simple heuristic: 1 token ≈ 4 characters for English text.
*/
declare function estimateTokens(text: string): number;
/**
* Estimate character count from token count.
*/
declare function estimateChars(tokens: number): number;
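
// Worked example of the documented 1 token ≈ 4 characters heuristic (illustrative only):
// a 2,000-character text maps to roughly 500 tokens, i.e. about one DEFAULT_CHUNK_SIZE chunk.
const approxTokens = estimateTokens('x'.repeat(2000)); // ≈ 2000 / 4 = 500
const approxChars = estimateChars(DEFAULT_CHUNK_SIZE); // ≈ 500 * 4 = 2000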

/**
* Main ingestion pipeline orchestrator.
* Coordinates loading, chunking, embedding, and upserting documents.
*/
declare class IngestionPipeline {
private adapter;
private embedder;
private loaderRegistry;
private defaultChunker;
constructor(adapter: VectorDBAdapter, embedder: Embedder, loaderRegistry: LoaderRegistry, chunker?: TextChunker);
/**
* Ingest documents into a vector database collection.
* @param sources - File paths
* @param collection - Target collection name
* @param config - Optional ingestion configuration
* @returns Statistics about the ingestion operation
*/
ingest(sources: string | string[], collection: string, config?: IngestionConfig): Promise<IngestionStats>;
private ingestFile;
private buildMetadata;
}
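
// Usage sketch (illustrative only): ingesting two files with explicit chunking settings.
// `adapter` and `embedder` are assumed to be configured elsewhere; the file paths are hypothetical.
const ingestion = new IngestionPipeline(adapter, embedder, new LoaderRegistry(), new RecursiveChunker());
const ingestStats = await ingestion.ingest(
  ['docs/guide.md', 'docs/spec.pdf'],
  'my-docs',
  { chunkSize: 500, chunkOverlap: 50 }
);
console.log(ingestStats.chunksUpserted, ingestStats.timeMs);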

/**
* Loader for plain text files (.txt, .md).
* No external dependencies, uses Node.js built-in fs.
*/
declare class TextLoader implements DocumentLoader {
canHandle(filePath: string): boolean;
load(filePath: string): Promise<Document>;
}

/**
* Loader for PDF files using pdf-parse library.
* Extracts text from all pages and includes PDF metadata.
*/
declare class PDFLoader implements DocumentLoader {
canHandle(filePath: string): boolean;
load(filePath: string): Promise<Document>;
}

/**
* Loader for DOCX files using mammoth library.
* Converts DOCX to plain text, preserves paragraph structure.
*/
declare class DOCXLoader implements DocumentLoader {
canHandle(filePath: string): boolean;
load(filePath: string): Promise<Document>;
}

/**
* Loader for HTML files using cheerio library.
* Strips tags, extracts visible text, removes scripts/styles.
*/
declare class HTMLLoader implements DocumentLoader {
canHandle(filePath: string): boolean;
load(filePath: string): Promise<Document>;
}

/**
* Recursive text chunker that tries different separators hierarchically.
* Tries to split by paragraphs first, then sentences, then words, then characters.
*/
declare class RecursiveChunker implements TextChunker {
private readonly separators;
chunk(text: string, config?: ChunkConfig): TextChunk[];
private recursiveSplit;
private addOverlap;
}

/**
* Fixed-size text chunker that splits at exact character boundaries.
* Fast and predictable, but may split mid-sentence or mid-word.
*/
declare class FixedChunker implements TextChunker {
chunk(text: string, config?: ChunkConfig): TextChunk[];
}

/**
* Sentence-aware chunker that splits on sentence boundaries.
* Uses a simple regex-based sentence splitter for portability.
*/
declare class SentenceChunker implements TextChunker {
chunk(text: string, config?: ChunkConfig): TextChunk[];
private splitSentences;
private addSentenceOverlap;
}
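
// Sketch comparing chunkers against the shared TextChunker contract (illustrative only);
// `longText` is an assumed string variable.
const chunker: TextChunker = new SentenceChunker(); // or RecursiveChunker / FixedChunker
const chunks = chunker.chunk(longText, { chunkSize: 300, chunkOverlap: 30 });
console.log(chunks[0]?.metadata.startChar, chunks[0]?.metadata.endChar);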

/**
* Configuration for RAGClient.
*/
interface RAGClientConfig {
/** Vector database adapter */
adapter: VectorDBAdapter;
/** Embedding model */
embedder: Embedder;
/** Optional LLM client (required for query()) */
llm?: LLMClient;
/** Default collection name */
defaultCollection?: string;
/** Default number of results to return (default: 10) */
defaultTopK?: number;
}
/**
* Options for retrieval operations.
*/
interface RetrieveOptions {
/** Override defaultCollection */
collection?: string;
/** Override defaultTopK */
topK?: number;
/** Custom filter */
filter?: UniversalFilter;
/** Shorthand for vertical filter on __v_partition */
partition?: string;
/** Shorthand for horizontal filter on __h_theme */
theme?: string;
/** Group results by document or theme */
groupBy?: 'document' | 'theme';
}
/**
* Options for full RAG query operations.
*/
interface QueryOptions extends RetrieveOptions {
/** Override default RAG system prompt */
systemPrompt?: string;
/** LLM temperature */
temperature?: number;
/** LLM max tokens */
maxTokens?: number;
}
/**
* Response from a full RAG query.
*/
interface RAGResponse {
/** LLM-generated answer */
answer: string;
/** Retrieved context chunks used to generate the answer */
sources: VectorRecord[];
/** Original question */
query: string;
/** Full retrieval details */
retrievalResult: RetrievalResult;
}
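
// Sketch of the retrieval shorthands documented above (illustrative only): `partition`
// and `theme` stand in for filters on __v_partition and __h_theme respectively.
const retrieveOptions: RetrieveOptions = {
  collection: 'my-docs',
  topK: 5,
  partition: 'contracts', // shorthand for a __v_partition filter
  theme: 'pricing',       // shorthand for a __h_theme filter
  groupBy: 'document'
};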

/**
* RAGClient - Unified facade for all Glyph VectorORM operations.
*
* Ties together adapter, embedder, LLM, ingestion, enrichment, and query
* into a single developer-facing API.
*
* @example
* ```typescript
* const client = new RAGClient({
* adapter: new ChromaAdapter(),
* embedder: new OpenAIEmbedder(),
* llm: new OpenAIClient(),
* defaultCollection: 'my-docs'
* });
*
* // Ingest documents
* await client.ingest(['docs/*.pdf']);
*
* // Retrieve
* const result = await client.retrieve('pricing info');
*
* // Full RAG query
* const response = await client.query('What are the pricing terms?');
* console.log(response.answer);
* ```
*/
declare class RAGClient {
private readonly adapter;
private readonly embedder;
private readonly llm?;
private readonly defaultCollection?;
private readonly defaultTopK;
private readonly queryComposer;
private readonly ingestionPipeline;
private readonly enrichmentPipeline;
constructor(config: RAGClientConfig);
/**
* Create a new vector collection.
* Dimension defaults to embedder.dimensions if not specified.
*/
createCollection(name: string, dimension?: number, metric?: DistanceMetric): Promise<void>;
/**
* Delete a collection.
*/
deleteCollection(name: string): Promise<void>;
/**
* Check if a collection exists.
*/
collectionExists(name: string): Promise<boolean>;
/**
* Ingest documents into a collection.
* Collection defaults to defaultCollection if not specified.
*/
ingest(sources: string | string[], collection?: string, config?: IngestionConfig): Promise<IngestionStats>;
/**
* Retrieve relevant chunks for a query.
* Supports filter shorthands (partition, theme) and groupBy.
*/
retrieve(query: string, options?: RetrieveOptions): Promise<RetrievalResult>;
/**
* Enrich a collection with vertical, theme, and/or section metadata.
*/
enrich(collection: string, config: EnrichAllConfig): Promise<EnrichmentStats>;
/**
* Full RAG: retrieve relevant context and generate an answer using LLM.
* Requires an LLM client to be provided in the constructor config.
*/
query(question: string, options?: QueryOptions): Promise<RAGResponse>;
}
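
// Usage sketch (illustrative only): a full RAG query with the QueryOptions declared above.
// `client` is assumed to be a RAGClient constructed with an LLM, as query() requires.
const ragResponse = await client.query('What are the pricing terms?', {
  topK: 5,
  theme: 'pricing',
  systemPrompt: 'Answer strictly from the provided context.',
  temperature: 0.2
});
console.log(ragResponse.answer, ragResponse.sources.length);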

export { type AndFilter, type AutomaticExtractionConfig, type ChunkConfig, type CollectionStats, DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE, DOCXLoader, type DistanceMetric, type Document, type DocumentLoader, Embedder, EmbeddingThemeClassifier, type EnrichAllConfig, EnrichmentPipeline, type EnrichmentStats, type ExtractorConfig, type FieldMappingConfig, FilterBuilder, type FilterCondition, type FilterOperator, FilterTranslator, FixedChunker, type GenerateOptions, type GroupedResults, HTMLLoader, type HorizontalFieldKey, HorizontalFields, type IngestionConfig, IngestionPipeline, type IngestionStats, KeywordThemeClassifier, LLMClient, LLMThemeClassifier, LoaderRegistry, METADATA_PREFIXES, MetadataBuilder, type MetadataUpdate, MockLLM, type OrFilter, PDFLoader, type ProgressCallback, type ProgressInfo, type QueryOptions, RAGClient, type RAGClientConfig, RAGQueryComposer, type RAGResponse, RecursiveChunker, type RetrievalParams, type RetrievalResult, type RetrieveOptions, type SearchOptions, type SearchResult, type SectionEnrichmentConfig, SentenceChunker, type ShorthandFilter, type StructuralFieldKey, StructuralFields, type TextChunk, type TextChunker, TextLoader, type ThemeClassification, type ThemeClassificationResult, type ThemeClassifier, type ThemeEnrichmentConfig, type UniversalFilter, VectorDBAdapter, type VectorRecord, type VerticalEnrichmentConfig, type VerticalFieldKey, VerticalFields, ZeroShotThemeClassifier, estimateChars, estimateTokens };