@vectororm/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs ADDED
@@ -0,0 +1,2441 @@
1
// src/metadata/constants.ts

// Prefix strings that namespace the three metadata axes.
var METADATA_PREFIXES = {
  /** Vertical axis (document identity) */
  VERTICAL: "__v_",
  /** Horizontal axis (theme/section identity) */
  HORIZONTAL: "__h_",
  /** Structural axis (position/hierarchy) */
  STRUCTURAL: "__s_"
};
// Well-known fully-prefixed field names for the vertical axis.
var VerticalFields = {
  /** Unique document identifier */
  DOC_ID: "__v_doc_id",
  /** Original source path/URL */
  SOURCE: "__v_source",
  /** Logical partition key (for filtering by document subsets) */
  PARTITION: "__v_partition",
  /** Document type classification */
  DOC_TYPE: "__v_doc_type",
  /** Arbitrary vertical tags */
  TAGS: "__v_tags"
};
// Well-known fully-prefixed field names for the horizontal axis.
var HorizontalFields = {
  /** Primary theme classification */
  THEME: "__h_theme",
  /** Multiple themes (if applicable) */
  THEMES: "__h_themes",
  /** Classification confidence score */
  THEME_CONFIDENCE: "__h_theme_confidence",
  /** Hierarchical section path (e.g., "Chapter 3/Pricing/Rates") */
  SECTION_PATH: "__h_section_path",
  /** Depth level in hierarchy (0 = root) */
  SECTION_LEVEL: "__h_section_level",
  /** Section header text */
  SECTION_TITLE: "__h_section_title"
};
// Well-known fully-prefixed field names for the structural axis.
var StructuralFields = {
  /** Position in document (0-indexed) */
  CHUNK_INDEX: "__s_chunk_index",
  /** Parent chunk ID (for hierarchical chunking) */
  PARENT_ID: "__s_parent_id",
  /** Whether this chunk has children */
  HAS_CHILDREN: "__s_has_children",
  /** Total chunks in this document */
  TOTAL_CHUNKS: "__s_total_chunks"
};

// src/metadata/builder.ts

/**
 * Fluent builder that assembles a flat metadata object spanning the three
 * axes (vertical, horizontal, structural) plus free-form custom fields.
 * Axis methods auto-apply their prefix; build() returns a defensive copy.
 */
var MetadataBuilder = class {
  metadata = {};
  /**
   * Copy every defined entry of `fields` into the metadata map, prepending
   * `prefix` to each key. Entries whose value is `undefined` are skipped so
   * they never produce keys. Shared by the three axis methods (previously
   * the same loop was duplicated in each).
   *
   * @param prefix - Axis prefix from METADATA_PREFIXES
   * @param fields - Unprefixed field map
   * @returns This builder for chaining
   */
  #addPrefixed(prefix, fields) {
    for (const [key, value] of Object.entries(fields)) {
      if (value !== void 0) {
        this.metadata[`${prefix}${key}`] = value;
      }
    }
    return this;
  }
  /**
   * Add vertical axis metadata (document identity).
   * Automatically prefixes fields with '__v_'.
   *
   * @param fields - Vertical metadata fields (doc_id, source, partition, etc.)
   * @returns This builder for chaining
   */
  vertical(fields) {
    return this.#addPrefixed(METADATA_PREFIXES.VERTICAL, fields);
  }
  /**
   * Add horizontal axis metadata (theme/section identity).
   * Automatically prefixes fields with '__h_'.
   *
   * @param fields - Horizontal metadata fields (theme, section_path, etc.)
   * @returns This builder for chaining
   */
  horizontal(fields) {
    return this.#addPrefixed(METADATA_PREFIXES.HORIZONTAL, fields);
  }
  /**
   * Add structural axis metadata (position/hierarchy).
   * Automatically prefixes fields with '__s_'.
   *
   * @param fields - Structural metadata fields (chunk_index, parent_id, etc.)
   * @returns This builder for chaining
   */
  structural(fields) {
    return this.#addPrefixed(METADATA_PREFIXES.STRUCTURAL, fields);
  }
  /**
   * Add custom user-defined metadata. Fields are added as-is, without any
   * prefix; `undefined` values are skipped for consistency with the axis
   * methods.
   *
   * @param fields - Custom metadata fields
   * @returns This builder for chaining
   */
  custom(fields) {
    for (const [key, value] of Object.entries(fields)) {
      if (value !== void 0) {
        this.metadata[key] = value;
      }
    }
    return this;
  }
  /**
   * Build and return the complete metadata object.
   * Returns a shallow copy so callers cannot mutate the builder's state.
   *
   * @returns Copy of the accumulated metadata object
   */
  build() {
    return { ...this.metadata };
  }
};
117
+
118
// src/filters/translator.ts

// Every operator accepted in filter conditions (standard or shorthand form).
var VALID_OPERATORS = [
  "eq",
  "neq",
  "in",
  "nin",
  "gt",
  "gte",
  "lt",
  "lte",
  "contains",
  "exists"
];

/**
 * Translates and validates filters. Accepts either the standard form
 * ({ field, op, value } or { and/or: [...] }) or a shorthand object whose
 * keys may carry a `field__op` operator suffix.
 */
var FilterTranslator = class {
  /**
   * Normalize any filter input to the standard UniversalFilter format.
   * Standard-format inputs pass through untouched; anything else is treated
   * as shorthand and converted.
   */
  static normalize(input) {
    return this.isStandardFormat(input) ? input : this.fromShorthand(input);
  }
  /**
   * Validate filter structure and operators, recursing into compound
   * filters. Throws on the first problem found.
   */
  static validate(filter) {
    if (!this.isCompound(filter)) {
      const { field, op, value } = filter;
      if (!field || typeof field !== "string") {
        throw new Error("Filter field must be a non-empty string");
      }
      if (!VALID_OPERATORS.includes(op)) {
        throw new Error(`Invalid filter operator: ${op}`);
      }
      if (value === void 0) {
        throw new Error("Filter value is required");
      }
      return;
    }
    const branches = "and" in filter ? filter.and : filter.or;
    if (!Array.isArray(branches) || branches.length === 0) {
      throw new Error("Compound filter must have at least one condition");
    }
    for (const branch of branches) {
      this.validate(branch);
    }
  }
  /**
   * A filter is compound when it carries an AND or OR branch list.
   */
  static isCompound(filter) {
    return "and" in filter || "or" in filter;
  }
  /**
   * True when the input already looks like a standard filter: either a
   * compound node or a complete { field, op, value } condition.
   */
  static isStandardFormat(input) {
    if (!input || typeof input !== "object") {
      return false;
    }
    const looksCompound = "and" in input || "or" in input;
    const looksCondition = "field" in input && "op" in input && "value" in input;
    return looksCompound || looksCondition;
  }
  /**
   * Convert a shorthand object into a standard filter. Each key becomes one
   * condition; multiple conditions are AND-combined.
   */
  static fromShorthand(shorthand) {
    const entries = Object.entries(shorthand);
    if (entries.length === 0) {
      throw new Error("Cannot convert empty shorthand filter object");
    }
    const conditions = entries.map(([key, value]) => {
      // A "__" separator marks an operator suffix (field__op). Keys that
      // *start* with "__" (prefixed metadata fields) are taken verbatim.
      if (!key.includes("__") || key.startsWith("__")) {
        return { field: key, op: "eq", value };
      }
      const splitAt = key.lastIndexOf("__");
      const suffix = key.substring(splitAt + 2);
      if (!VALID_OPERATORS.includes(suffix)) {
        throw new Error(`Invalid filter operator in shorthand: ${suffix}`);
      }
      return { field: key.substring(0, splitAt), op: suffix, value };
    });
    return conditions.length === 1 ? conditions[0] : { and: conditions };
  }
};
225
+
226
// src/adapters/vector-db-adapter.ts

/**
 * Abstract base for vector database adapters. The capability flags below all
 * default to the most conservative answer (false); concrete adapters override
 * the ones their backend actually supports.
 */
var VectorDBAdapter = class {
  // ==========================================================================
  // CAPABILITY FLAGS (WITH DEFAULT IMPLEMENTATIONS)
  // ==========================================================================
  /**
   * Whether metadata can be updated without re-uploading vectors.
   * Default: false (callers must re-upload the entire record).
   */
  supportsMetadataUpdate() {
    return false;
  }
  /**
   * Whether metadata filtering is available during search.
   * Default: false (no filtering support).
   */
  supportsFiltering() {
    return false;
  }
  /**
   * Whether batch upsert/delete operations are efficient on this backend.
   * Default: false (single operations only).
   */
  supportsBatchOperations() {
    return false;
  }
};
259
+
260
// src/query/filter-builder.ts

/**
 * Collects up to three axis filters (vertical, horizontal, custom) and
 * combines them into a single filter. Each setter normalizes its input via
 * FilterTranslator, so both standard and shorthand formats are accepted.
 */
var FilterBuilder = class {
  verticalFilter;
  horizontalFilter;
  customFilter;
  /**
   * Set the vertical (document-level) filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withVerticalFilter(filter) {
    this.verticalFilter = FilterTranslator.normalize(filter);
    return this;
  }
  /**
   * Set the horizontal (theme-level) filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withHorizontalFilter(filter) {
    this.horizontalFilter = FilterTranslator.normalize(filter);
    return this;
  }
  /**
   * Set the custom user-defined filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withCustomFilter(filter) {
    this.customFilter = FilterTranslator.normalize(filter);
    return this;
  }
  /**
   * Combine whatever filters were set:
   * - none set  -> undefined
   * - one set   -> that filter, unwrapped
   * - several   -> AND of all of them
   *
   * @returns The combined filter, or undefined if no filters were added
   */
  build() {
    const active = [
      this.verticalFilter,
      this.horizontalFilter,
      this.customFilter
    ].filter(Boolean);
    if (active.length === 0) {
      return void 0;
    }
    return active.length === 1 ? active[0] : { and: active };
  }
};
325
+
326
// src/query/rag-query-composer.ts

/**
 * Composes RAG retrieval queries: embeds the query text, assembles the
 * combined metadata filter, and delegates the vector search to the adapter.
 */
var RAGQueryComposer = class {
  /**
   * @param adapter - Vector database adapter for search operations
   * @param embedder - Embedder for converting text queries to vectors
   */
  constructor(adapter, embedder) {
    this.adapter = adapter;
    this.embedder = embedder;
  }
  /**
   * Main retrieval method: embed the query, build the combined filter,
   * run the search, and report which filters were applied.
   *
   * @param params - Retrieval parameters
   * @returns Retrieval result with records and filter information
   */
  async retrieve(params) {
    const queryVector = await this.embedder.embed(params.query);
    const builder = new FilterBuilder();
    if (params.verticalFilters) {
      builder.withVerticalFilter(params.verticalFilters);
    }
    if (params.horizontalFilters) {
      builder.withHorizontalFilter(params.horizontalFilters);
    }
    if (params.customFilters) {
      builder.withCustomFilter(params.customFilters);
    }
    const searchResult = await this.adapter.search(params.collection, queryVector, {
      topK: params.topK,
      filter: builder.build(),
      includeMetadata: true,
      includeValues: params.includeEmbeddings
    });
    // Echo back only the filter axes the caller actually supplied.
    const appliedEntries = [
      ["vertical", params.verticalFilters],
      ["horizontal", params.horizontalFilters],
      ["custom", params.customFilters]
    ].filter(([, value]) => Boolean(value));
    return {
      records: searchResult.records,
      query: params.query,
      filtersApplied: Object.fromEntries(appliedEntries)
    };
  }
  /**
   * Retrieve and group results by document ID (__v_doc_id).
   * Records whose doc_id is not a string are excluded.
   *
   * @param params - Retrieval parameters
   * @returns Map of document ID to array of records
   */
  async retrieveVertical(params) {
    return this.#retrieveGroupedBy(params, VerticalFields.DOC_ID);
  }
  /**
   * Retrieve and group results by theme (__h_theme).
   * Records whose theme is not a string are excluded.
   *
   * @param params - Retrieval parameters
   * @returns Map of theme to array of records
   */
  async retrieveHorizontal(params) {
    return this.#retrieveGroupedBy(params, HorizontalFields.THEME);
  }
  /**
   * Shared grouping helper: run retrieve() and bucket records by the string
   * value found at `metadataKey`; non-string values are skipped.
   */
  async #retrieveGroupedBy(params, metadataKey) {
    const { records } = await this.retrieve(params);
    const grouped = /* @__PURE__ */ new Map();
    for (const record of records) {
      const bucket = record.metadata[metadataKey];
      if (typeof bucket !== "string") {
        continue;
      }
      const existing = grouped.get(bucket);
      if (existing) {
        existing.push(record);
      } else {
        grouped.set(bucket, [record]);
      }
    }
    return grouped;
  }
};
430
+
431
// src/embedders/embedder.ts

/**
 * Abstract base class defining the embedder contract. It carries no behavior
 * of its own; concrete embedders subclass it and implement the embedding
 * methods.
 */
var Embedder = class AbstractEmbedder {
  /**
   * Reject direct `new Embedder()` calls: construction is reserved for
   * subclasses, which reach this constructor via super().
   */
  constructor() {
    if (new.target === AbstractEmbedder) {
      throw new Error("Cannot instantiate abstract class Embedder directly");
    }
  }
};
443
+
444
// src/llm/llm-client.ts

/**
 * Abstract base class defining the LLM client contract. Concrete clients
 * subclass it; instantiating it directly is an error.
 */
var LLMClient = class AbstractLLMClient {
  /**
   * Reject direct `new LLMClient()` calls; subclasses reach this via super().
   */
  constructor() {
    if (new.target === AbstractLLMClient) {
      throw new Error("Cannot instantiate abstract class LLMClient directly");
    }
  }
};

// src/llm/mock-llm.ts

/**
 * Test double for LLMClient: every generate method returns a single canned
 * response configured via setResponse().
 */
var MockLLM = class extends LLMClient {
  // Canned response returned by all generate* methods.
  _response = "";
  get modelName() {
    return "mock-llm-v1";
  }
  get provider() {
    return "mock";
  }
  /**
   * Configure the canned response returned by the generate methods.
   *
   * @param response - The response text to return
   */
  setResponse(response) {
    this._response = response;
  }
  // Resolves with the canned response; prompt and options are ignored.
  async generate(prompt, options) {
    return this._response;
  }
  // Parses the canned response as JSON, wrapping parse failures.
  async generateJSON(prompt, options) {
    try {
      return JSON.parse(this._response);
    } catch (error) {
      const reason = error instanceof Error ? error.message : "unknown error";
      throw new Error(`Failed to parse mock response as JSON: ${reason}`);
    }
  }
  // One canned response per input prompt.
  async generateBatch(prompts, options) {
    return prompts.map(() => this._response);
  }
};
493
+
494
// src/enrichment/classifiers/keyword-classifier.ts

/**
 * Theme classifier based on keyword matching: each theme is scored by the
 * total number of whole-word keyword hits in the text, and the highest score
 * wins (ties go to the earlier theme in the list).
 */
var KeywordThemeClassifier = class {
  // theme -> compiled word-boundary regexes, one per keyword
  patterns;
  // theme -> number of keywords configured for that theme
  keywordCounts;
  /**
   * @param themes - Array of theme names
   * @param keywords - Map of theme names to their keyword arrays
   * @param caseSensitive - Whether matching is case sensitive (default: false)
   */
  constructor(themes, keywords, caseSensitive = false) {
    this.themes = themes;
    this.caseSensitive = caseSensitive;
    this.patterns = /* @__PURE__ */ new Map();
    this.keywordCounts = /* @__PURE__ */ new Map();
    const flags = caseSensitive ? "g" : "gi";
    for (const theme of themes) {
      const themeKeywords = keywords[theme] || [];
      this.keywordCounts.set(theme, themeKeywords.length);
      this.patterns.set(
        theme,
        themeKeywords.map(
          (keyword) => new RegExp(`\\b${this.escapeRegex(keyword)}\\b`, flags)
        )
      );
    }
  }
  /**
   * Classify a single text.
   *
   * @param text - Text to classify
   * @returns { theme, confidence, allScores }; theme is "unknown" with zero
   *          confidence for blank text or when no keyword matches at all
   */
  classify(text) {
    if (!text || text.trim().length === 0) {
      return { theme: "unknown", confidence: 0, allScores: {} };
    }
    const scores = {};
    let bestTheme = "unknown";
    let bestCount = 0;
    for (const theme of this.themes) {
      const count = (this.patterns.get(theme) || []).reduce(
        (total, pattern) => total + (text.match(pattern) || []).length,
        0
      );
      scores[theme] = count;
      // Strict '>' keeps the first theme on ties.
      if (count > bestCount) {
        bestCount = count;
        bestTheme = theme;
      }
    }
    if (bestCount === 0) {
      return { theme: "unknown", confidence: 0, allScores: scores };
    }
    const keywordTotal = this.keywordCounts.get(bestTheme) || 1;
    return {
      theme: bestTheme,
      // matches / configured keywords, capped at 1.0 (repeated hits can
      // otherwise push the ratio above 1)
      confidence: Math.min(bestCount / keywordTotal, 1),
      allScores: scores
    };
  }
  /**
   * Classify multiple texts in batch.
   *
   * @param texts - Array of texts to classify
   * @returns Array of classification results, one per input
   */
  classifyBatch(texts) {
    return texts.map((text) => this.classify(text));
  }
  /**
   * Escape special regex characters so a keyword matches literally.
   *
   * @param str - String to escape
   * @returns Escaped string safe for use in a RegExp
   */
  escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }
};
584
+
585
// src/enrichment/classifiers/zero-shot-classifier.ts

/**
 * Theme classifier backed by a Hugging Face zero-shot classification
 * pipeline (@xenova/transformers). The model is imported and loaded lazily
 * on first real classification, then cached for the lifetime of the
 * instance.
 */
var ZeroShotThemeClassifier = class {
  model = null;
  modelName;
  themes;
  /**
   * @param themes - Array of theme labels to classify into
   * @param modelName - Hugging Face model name
   *                    (default: 'Xenova/distilbert-base-uncased-mnli')
   */
  constructor(themes, modelName = "Xenova/distilbert-base-uncased-mnli") {
    this.themes = themes;
    this.modelName = modelName;
  }
  /**
   * Lazy-load the zero-shot classification pipeline. The dynamic import and
   * model download happen only on the first call; later calls reuse the
   * cached pipeline.
   *
   * @returns Promise resolving to the loaded pipeline
   */
  async ensureModelLoaded() {
    if (this.model) {
      return this.model;
    }
    const { pipeline } = await import("@xenova/transformers");
    this.model = await pipeline("zero-shot-classification", this.modelName);
    return this.model;
  }
  /**
   * Result used for blank input: the first theme with a uniform score over
   * all themes, so no model load is required. (private helper)
   */
  #uniformFallback() {
    const share = 1 / this.themes.length;
    const allScores = {};
    for (const theme of this.themes) {
      allScores[theme] = share;
    }
    return { theme: this.themes[0], confidence: share, allScores };
  }
  /**
   * Classify a single text into one of the configured themes.
   *
   * @param text - The text content to classify
   * @returns { theme, confidence, allScores }; blank input yields the
   *          uniform fallback without touching the model
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      return this.#uniformFallback();
    }
    const model = await this.ensureModelLoaded();
    const { labels, scores } = await model(text, this.themes);
    const allScores = {};
    labels.forEach((label, i) => {
      allScores[label] = scores[i];
    });
    // The pipeline returns labels sorted by score, best first.
    return { theme: labels[0], confidence: scores[0], allScores };
  }
  /**
   * Classify multiple texts, sequentially, reusing one loaded model.
   * Sequential processing keeps memory bounded for large batches.
   *
   * @param texts - Array of text contents to classify
   * @returns Array of theme classifications, one per input
   */
  async classifyBatch(texts) {
    await this.ensureModelLoaded();
    const results = [];
    for (const text of texts) {
      results.push(await this.classify(text));
    }
    return results;
  }
};
698
+
699
// src/enrichment/classifiers/embedding-classifier.ts

/**
 * Theme classifier based on embedding similarity: each theme label is
 * embedded once, and texts are assigned to the theme whose label embedding
 * has the highest cosine similarity to the text embedding.
 */
var EmbeddingThemeClassifier = class {
  // theme label -> embedding vector; null until computed or supplied
  themeEmbeddings = null;
  embedder;
  themes;
  /**
   * @param themes - Array of theme labels to classify into
   * @param embedder - Embedder instance used for all embeddings
   * @param precomputedEmbeddings - Optional precomputed theme embeddings for
   *                                faster startup
   */
  constructor(themes, embedder, precomputedEmbeddings) {
    this.themes = themes;
    this.embedder = embedder;
    this.themeEmbeddings = precomputedEmbeddings || null;
  }
  /**
   * Lazily compute embeddings for every theme label (one embedBatch call),
   * caching the result for all later classifications.
   *
   * BUG FIX: previously `this.themeEmbeddings` was set to an empty object
   * *before* awaiting embedBatch, so a failed (or concurrent) first load left
   * a truthy-but-empty map cached and every subsequent classify() crashed on
   * an undefined theme vector. The map is now built locally and published
   * only after the batch succeeds, so a failed load can be retried.
   *
   * @returns Promise resolving to the theme embeddings map
   */
  async ensureThemeEmbeddings() {
    if (!this.themeEmbeddings) {
      const embeddings = await this.embedder.embedBatch(this.themes);
      const computed = {};
      for (let i = 0; i < this.themes.length; i++) {
        computed[this.themes[i]] = embeddings[i];
      }
      this.themeEmbeddings = computed;
    }
    return this.themeEmbeddings;
  }
  /**
   * Cosine similarity between two vectors: dot(a,b) / (|a| * |b|),
   * in [-1, 1]. Returns 0 when either vector has zero magnitude.
   *
   * @param a - First vector
   * @param b - Second vector
   * @throws If the vectors differ in length
   */
  cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      throw new Error("Vectors must have the same length for cosine similarity");
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    if (normA === 0 || normB === 0) {
      return 0;
    }
    return dotProduct / (normA * normB);
  }
  /**
   * Map cosine similarity [-1, 1] to a confidence score [0, 1] via the
   * linear transform (similarity + 1) / 2.
   *
   * @param similarity - Cosine similarity value in [-1, 1]
   * @returns Confidence score in [0, 1]
   */
  normalizeToConfidence(similarity) {
    return (similarity + 1) / 2;
  }
  /**
   * Classify a single text into one of the configured themes.
   *
   * @param text - The text content to classify
   * @returns { theme, confidence, allScores }; blank input yields the first
   *          theme with a uniform score, without calling the embedder
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      const uniformScore = 1 / this.themes.length;
      const allScores2 = {};
      for (const theme of this.themes) {
        allScores2[theme] = uniformScore;
      }
      return {
        theme: this.themes[0],
        confidence: uniformScore,
        allScores: allScores2
      };
    }
    const themeEmbeddings = await this.ensureThemeEmbeddings();
    const textEmbedding = await this.embedder.embed(text);
    const similarities = {};
    let maxSimilarity = -Infinity;
    let winningTheme = this.themes[0];
    for (const theme of this.themes) {
      const similarity = this.cosineSimilarity(textEmbedding, themeEmbeddings[theme]);
      similarities[theme] = similarity;
      if (similarity > maxSimilarity) {
        maxSimilarity = similarity;
        winningTheme = theme;
      }
    }
    const allScores = {};
    for (const theme of this.themes) {
      allScores[theme] = this.normalizeToConfidence(similarities[theme]);
    }
    return {
      theme: winningTheme,
      confidence: this.normalizeToConfidence(maxSimilarity),
      allScores
    };
  }
  /**
   * Classify multiple texts. Theme embeddings are loaded once up front;
   * texts are then classified sequentially (each text's embedding is
   * computed individually via classify()).
   *
   * @param texts - Array of text contents to classify
   * @returns Array of theme classifications, one per input
   */
  async classifyBatch(texts) {
    await this.ensureThemeEmbeddings();
    const results = [];
    for (const text of texts) {
      const result = await this.classify(text);
      results.push(result);
    }
    return results;
  }
};
873
+
874
// src/enrichment/classifiers/llm-classifier.ts

// Default prompt; '{themes}' and '{text}' are substituted at classify time.
var DEFAULT_PROMPT_TEMPLATE = `You are a theme classification system. Classify the following text into one of the provided themes.

Available themes: {themes}

Text to classify:
{text}

Return a JSON object with the following structure:
- theme: the most appropriate theme from the list (string)
- confidence: confidence score between 0 and 1 (number)
- allScores: an object mapping each theme to its confidence score (object)

Return only valid JSON, no additional text.`;

/**
 * Theme classifier that delegates to an LLM: a prompt listing the candidate
 * themes and the text is sent to the client, which must answer with a JSON
 * classification result.
 */
var LLMThemeClassifier = class {
  themes;
  llm;
  promptTemplate;
  /**
   * @param themes - Array of theme labels to classify into
   * @param llm - LLM client used for classification
   * @param promptTemplate - Optional custom template with {themes} and {text}
   *                         placeholders (defaults to DEFAULT_PROMPT_TEMPLATE)
   */
  constructor(themes, llm, promptTemplate = DEFAULT_PROMPT_TEMPLATE) {
    this.themes = themes;
    this.llm = llm;
    this.promptTemplate = promptTemplate;
  }
  /**
   * Render the classification prompt: {themes} becomes a comma-separated
   * list and {text} becomes the input text.
   *
   * @param text - The text to classify
   * @returns The complete prompt
   */
  buildPrompt(text) {
    return this.promptTemplate
      .replace("{themes}", this.themes.join(", "))
      .replace("{text}", text);
  }
  /**
   * Classify a single text into one of the configured themes.
   *
   * @param text - The text content to classify
   * @returns { theme, confidence, allScores } as produced by the LLM; blank
   *          input short-circuits to the first theme with a uniform score
   * @throws Wrapped error (with `cause`) when the LLM call fails
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      const share = 1 / this.themes.length;
      const allScores = {};
      for (const theme of this.themes) {
        allScores[theme] = share;
      }
      return { theme: this.themes[0], confidence: share, allScores };
    }
    const prompt = this.buildPrompt(text);
    try {
      return await this.llm.generateJSON(prompt);
    } catch (error) {
      const reason = error instanceof Error ? error.message : "unknown error";
      const wrapped = new Error(`Failed to classify text with LLM: ${reason}`);
      if (error instanceof Error) {
        wrapped.cause = error;
      }
      throw wrapped;
    }
  }
  /**
   * Classify multiple texts one at a time. Sequential processing keeps
   * behavior predictable and avoids hammering provider rate limits.
   *
   * @param texts - Array of text contents to classify
   * @returns Array of theme classifications, one per input
   */
  async classifyBatch(texts) {
    const results = [];
    for (const text of texts) {
      results.push(await this.classify(text));
    }
    return results;
  }
};
1005
+
1006
+ // src/enrichment/enrichment-pipeline.ts
1007
var EnrichmentPipeline = class {
  /**
   * Create a new enrichment pipeline.
   *
   * @param adapter - Vector database adapter for reading/writing records
   * @param embedder - Optional embedder for embedding-based enrichment
   * @param llm - Optional LLM client for automatic enrichment
   */
  constructor(adapter, embedder, llm) {
    this.adapter = adapter;
    this.embedder = embedder;
    this.llm = llm;
  }
  /**
   * Enrich records with vertical classifications.
   *
   * Supports three strategies:
   * 1. Field mapping: Map existing field values to verticals
   * 2. Custom extractor: Use a custom function to extract verticals
   * 3. Automatic LLM: Use an LLM to classify documents
   *
   * Never throws: failures are accumulated in the returned stats' `errors`.
   *
   * @param collection - Name of the collection to enrich
   * @param config - Vertical enrichment configuration
   * @returns Statistics about the enrichment operation
   *
   * @example
   * ```typescript
   * // Field mapping
   * await pipeline.enrichVertical('docs', {
   *   mapping: { 'tech': 'technology' }
   * });
   *
   * // Custom extractor
   * await pipeline.enrichVertical('docs', {
   *   extractor: async (doc) => 'technology'
   * });
   *
   * // Automatic LLM
   * await pipeline.enrichVertical('docs', {
   *   automatic: {
   *     llm: myLLMClient,
   *     fields: ['technology', 'finance']
   *   }
   * });
   * ```
   */
  async enrichVertical(collection, config) {
    const startTime = Date.now();
    const stats = {
      recordsProcessed: 0,
      recordsUpdated: 0,
      recordsSkipped: 0,
      timeMs: 0,
      errors: []
    };
    try {
      // Strategy is selected by which key is present, checked in this
      // priority order. A config with none of the three keys is a no-op.
      if ("mapping" in config) {
        await this.enrichWithFieldMapping(collection, config, stats);
      } else if ("extractor" in config) {
        await this.enrichWithExtractor(collection, config, stats);
      } else if ("automatic" in config) {
        await this.enrichWithLLM(collection, config, stats);
      }
    } catch (error) {
      // Errors are recorded, not rethrown; callers inspect stats.errors.
      stats.errors?.push(
        `Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
      );
    }
    stats.timeMs = Date.now() - startTime;
    return stats;
  }
  /**
   * Enrich records using field mapping strategy.
   *
   * Maps values from an existing field to vertical classifications.
   * Records are read in batches; metadata updates are written once per batch.
   *
   * @param collection - Collection name
   * @param config - Field mapping configuration
   * @param stats - Statistics object to update
   */
  async enrichWithFieldMapping(collection, config, stats) {
    const batchSize = config.batchSize || 100;
    for await (const batch of this.adapter.iterate(collection, {
      batchSize,
      filter: config.filter
    })) {
      const updates = [];
      for (const record of batch) {
        stats.recordsProcessed++;
        try {
          const vertical = this.applyFieldMapping(record, config.mapping);
          if (vertical) {
            updates.push({
              id: record.id,
              metadata: { vertical }
            });
          } else {
            // No mapping matched this record's category.
            stats.recordsSkipped++;
          }
        } catch (error) {
          stats.recordsSkipped++;
          stats.errors?.push(
            `Error mapping record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      if (updates.length > 0) {
        try {
          await this.adapter.updateMetadata(collection, updates);
          stats.recordsUpdated += updates.length;
        } catch (error) {
          // A failed write loses the whole batch's updates but processing continues.
          stats.errors?.push(
            `Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
    }
  }
  /**
   * Apply field mapping to extract vertical from a record.
   *
   * NOTE(review): the source field is hardcoded to `metadata.category`;
   * records keyed on any other field are never matched — confirm intended.
   *
   * @param record - Vector record
   * @param mapping - Field mapping configuration
   * @returns Vertical label or null if no match
   */
  applyFieldMapping(record, mapping) {
    const category = record.metadata?.category;
    if (category && typeof category === "string" && category in mapping) {
      return mapping[category];
    }
    return null;
  }
  /**
   * Enrich records using custom extractor strategy.
   *
   * Calls the provided (possibly async) extractor function for each record;
   * a falsy return value skips the record.
   *
   * @param collection - Collection name
   * @param config - Extractor configuration
   * @param stats - Statistics object to update
   */
  async enrichWithExtractor(collection, config, stats) {
    const batchSize = config.batchSize || 100;
    for await (const batch of this.adapter.iterate(collection, {
      batchSize,
      filter: config.filter
    })) {
      const updates = [];
      for (const record of batch) {
        stats.recordsProcessed++;
        try {
          const vertical = await config.extractor(record);
          if (vertical) {
            updates.push({
              id: record.id,
              metadata: { vertical }
            });
          } else {
            stats.recordsSkipped++;
          }
        } catch (error) {
          // A throwing extractor skips only the offending record.
          stats.recordsSkipped++;
          stats.errors?.push(
            `Extractor error for record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      if (updates.length > 0) {
        try {
          await this.adapter.updateMetadata(collection, updates);
          stats.recordsUpdated += updates.length;
        } catch (error) {
          stats.errors?.push(
            `Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
    }
  }
  /**
   * Enrich records using automatic LLM strategy.
   *
   * Uses a language model to classify documents into verticals.
   * Default batch size is smaller (10) than the other strategies because
   * each record costs one LLM call.
   *
   * @param collection - Collection name
   * @param config - Automatic extraction configuration
   * @param stats - Statistics object to update
   */
  async enrichWithLLM(collection, config, stats) {
    const batchSize = config.batchSize || 10;
    const { llm, fields, promptTemplate, textField } = config.automatic;
    const fieldName = textField || "content";
    for await (const batch of this.adapter.iterate(collection, {
      batchSize,
      filter: config.filter
    })) {
      const updates = [];
      for (const record of batch) {
        stats.recordsProcessed++;
        try {
          const vertical = await this.extractWithLLM(
            record,
            llm,
            fields,
            fieldName,
            promptTemplate
          );
          if (vertical) {
            updates.push({
              id: record.id,
              metadata: { vertical }
            });
          } else {
            stats.recordsSkipped++;
          }
        } catch (error) {
          stats.recordsSkipped++;
          stats.errors?.push(
            `LLM extraction error for record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      if (updates.length > 0) {
        try {
          await this.adapter.updateMetadata(collection, updates);
          stats.recordsUpdated += updates.length;
        } catch (error) {
          stats.errors?.push(
            `Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
    }
  }
  /**
   * Extract vertical classification using LLM.
   *
   * Builds a prompt (custom template with `{fields}`/`{text}` placeholders,
   * or a default one) and returns the LLM's trimmed free-text answer.
   * NOTE(review): the answer is not validated against `fields` — the model
   * may return a label outside the allowed set; confirm downstream handling.
   *
   * @param record - Vector record
   * @param llm - LLM client
   * @param fields - Available vertical fields
   * @param textField - Field name containing text to classify
   * @param promptTemplate - Optional custom prompt template
   * @returns Vertical label
   * @throws Error when the record has no string value under `textField`
   */
  async extractWithLLM(record, llm, fields, textField, promptTemplate) {
    const text = record.metadata?.[textField];
    if (!text || typeof text !== "string") {
      throw new Error(`No text found in field '${textField}'`);
    }
    const prompt = promptTemplate ? promptTemplate.replace("{fields}", fields.join(", ")).replace("{text}", text) : `Classify the following text into one of these categories: ${fields.join(", ")}

Text: ${text}

Category:`;
    const result = await llm.generate(prompt);
    return result.trim();
  }
  /**
   * Enrich records with theme classifications.
   *
   * Uses a theme classifier to identify themes in text content and updates
   * record metadata with theme information. Supports single and multi-theme
   * classification with configurable confidence thresholds.
   *
   * Never throws: failures are accumulated in the returned stats' `errors`.
   *
   * @param collection - Name of the collection to enrich
   * @param config - Theme enrichment configuration
   * @returns Statistics about the enrichment operation
   *
   * @example
   * ```typescript
   * // Single theme classification
   * await pipeline.enrichThemes('docs', {
   *   themes: ['technology', 'business', 'science'],
   *   classifier: new KeywordThemeClassifier(),
   *   confidenceThreshold: 0.7
   * });
   *
   * // Multi-theme classification
   * await pipeline.enrichThemes('docs', {
   *   themes: ['technology', 'business', 'science'],
   *   classifier: new LLMThemeClassifier(),
   *   multiTheme: true,
   *   confidenceThreshold: 0.5
   * });
   * ```
   */
  async enrichThemes(collection, config) {
    const startTime = Date.now();
    const stats = {
      recordsProcessed: 0,
      recordsUpdated: 0,
      recordsSkipped: 0,
      timeMs: 0,
      errors: []
    };
    try {
      await this.enrichWithThemeClassifier(collection, config, stats);
    } catch (error) {
      stats.errors?.push(
        `Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
      );
    }
    stats.timeMs = Date.now() - startTime;
    return stats;
  }
  /**
   * Enrich records using theme classifier.
   *
   * Per batch: collects classifiable texts, classifies them in one
   * `classifyBatch` call (falling back to per-text `classify` calls if the
   * batch call throws), filters by confidence threshold, then writes the
   * `__h_theme` / `__h_theme_confidence` (and optionally `__h_themes`)
   * metadata in a single update.
   *
   * @param collection - Collection name
   * @param config - Theme enrichment configuration
   * @param stats - Statistics object to update
   */
  async enrichWithThemeClassifier(collection, config, stats) {
    const batchSize = config.batchSize || 100;
    const textField = config.textField || "content";
    // `??` (not `||`) so an explicit threshold of 0 is honored.
    const confidenceThreshold = config.confidenceThreshold ?? 0.5;
    const multiTheme = config.multiTheme || false;
    for await (const batch of this.adapter.iterate(collection, {
      batchSize,
      filter: config.filter
    })) {
      // Parallel arrays: textsToClassify[i] belongs to recordsToProcess[i].
      const textsToClassify = [];
      const recordsToProcess = [];
      for (const record of batch) {
        stats.recordsProcessed++;
        const text = record.text || record.metadata?.[textField];
        if (!text || typeof text !== "string" || text.trim() === "") {
          stats.recordsSkipped++;
          continue;
        }
        textsToClassify.push(text);
        recordsToProcess.push(record);
      }
      if (textsToClassify.length === 0) {
        continue;
      }
      let classifications;
      try {
        classifications = await config.classifier.classifyBatch(textsToClassify);
      } catch (error) {
        // Batch call failed: retry each text individually so one bad input
        // cannot sink the whole batch. Failed entries become null.
        stats.errors?.push(
          `Batch classification error, falling back to individual classification: ${error instanceof Error ? error.message : "unknown error"}`
        );
        classifications = [];
        for (let i = 0; i < textsToClassify.length; i++) {
          try {
            const result = await config.classifier.classify(textsToClassify[i]);
            classifications.push(result);
          } catch (individualError) {
            classifications.push(null);
            stats.errors?.push(
              `Classification error for record ${recordsToProcess[i].id}: ${individualError instanceof Error ? individualError.message : "unknown error"}`
            );
          }
        }
      }
      const updates = [];
      for (let i = 0; i < recordsToProcess.length; i++) {
        const record = recordsToProcess[i];
        const classification = classifications[i];
        try {
          if (!classification || typeof classification !== "object") {
            stats.recordsSkipped++;
            stats.errors?.push(
              `Invalid classification for record ${record.id}`
            );
            continue;
          }
          // Below-threshold results are silently skipped (no error entry).
          if (classification.confidence < confidenceThreshold) {
            stats.recordsSkipped++;
            continue;
          }
          const metadata = {
            __h_theme: classification.theme,
            __h_theme_confidence: classification.confidence
          };
          if (multiTheme && classification.allScores) {
            // Keep every theme at/above threshold, highest score first.
            const themes = Object.entries(classification.allScores).filter(([_, score]) => score >= confidenceThreshold).sort(([_, a], [__, b]) => b - a).map(([theme, _]) => theme);
            if (themes.length > 0) {
              metadata.__h_themes = themes;
            }
          }
          updates.push({
            id: record.id,
            metadata
          });
        } catch (error) {
          stats.recordsSkipped++;
          stats.errors?.push(
            `Error processing record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      if (updates.length > 0) {
        try {
          await this.adapter.updateMetadata(collection, updates);
          stats.recordsUpdated += updates.length;
        } catch (error) {
          stats.errors?.push(
            `Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      // Progress callback fires once per batch with cumulative stats.
      if (config.onProgress) {
        config.onProgress(stats);
      }
    }
  }
  /**
   * Enrich records with section structure.
   *
   * Extracts section metadata from documents using either existing field mappings
   * or automatic detection strategies (markdown, HTML, or pattern-based).
   *
   * Never throws: failures are accumulated in the returned stats' `errors`.
   *
   * @param collection - Name of the collection to enrich
   * @param config - Section enrichment configuration
   * @returns Statistics about the enrichment operation
   *
   * @example
   * ```typescript
   * // Use existing section field
   * await pipeline.enrichSections('docs', {
   *   existingField: 'section_path'
   * });
   *
   * // Auto-detect sections
   * await pipeline.enrichSections('docs', {
   *   autoDetect: true
   * });
   * ```
   */
  async enrichSections(collection, config) {
    const startTime = Date.now();
    const stats = {
      recordsProcessed: 0,
      recordsUpdated: 0,
      recordsSkipped: 0,
      timeMs: 0,
      errors: []
    };
    try {
      await this.enrichWithSectionDetection(collection, config, stats);
    } catch (error) {
      stats.errors?.push(
        `Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
      );
    }
    stats.timeMs = Date.now() - startTime;
    return stats;
  }
  /**
   * Enrich records with all enrichment types.
   *
   * Runs vertical, theme, and section enrichment sequentially with shared
   * configuration. Global filters and batch sizes apply to all operations.
   * Note: per-stage stats are summed, so `recordsProcessed` counts each
   * record once per stage that ran.
   *
   * @param collection - Name of the collection to enrich
   * @param config - Combined enrichment configuration
   * @returns Statistics about the enrichment operation
   *
   * @example
   * ```typescript
   * await pipeline.enrichAll('docs', {
   *   vertical: { mapping: { tech: 'technology' } },
   *   themes: { themes: ['innovation'], classifier },
   *   sections: { autoDetect: true },
   *   filter: { field: 'status', op: 'eq', value: 'pending' },
   *   batchSize: 50
   * });
   * ```
   */
  async enrichAll(collection, config) {
    const startTime = Date.now();
    const aggregateStats = {
      recordsProcessed: 0,
      recordsUpdated: 0,
      recordsSkipped: 0,
      timeMs: 0,
      errors: []
    };
    try {
      // Fixed stage order: vertical -> themes -> sections. onProgress is
      // invoked with the cumulative aggregate after each stage completes.
      if (config.vertical) {
        const verticalConfig = this.applyGlobalConfig(config.vertical, config);
        const stats = await this.enrichVertical(collection, verticalConfig);
        this.mergeStats(aggregateStats, stats);
        if (config.onProgress) {
          config.onProgress(aggregateStats);
        }
      }
      if (config.themes) {
        const themesConfig = this.applyGlobalConfig(config.themes, config);
        const stats = await this.enrichThemes(collection, themesConfig);
        this.mergeStats(aggregateStats, stats);
        if (config.onProgress) {
          config.onProgress(aggregateStats);
        }
      }
      if (config.sections) {
        const sectionsConfig = this.applyGlobalConfig(config.sections, config);
        const stats = await this.enrichSections(collection, sectionsConfig);
        this.mergeStats(aggregateStats, stats);
        if (config.onProgress) {
          config.onProgress(aggregateStats);
        }
      }
    } catch (error) {
      aggregateStats.errors?.push(
        `Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
      );
    }
    aggregateStats.timeMs = Date.now() - startTime;
    return aggregateStats;
  }
  /**
   * Apply global configuration to individual enrichment configs.
   *
   * Per-stage settings win; global `filter`/`batchSize` are used only
   * where the stage config did not set them.
   *
   * @param individualConfig - Configuration for a specific enrichment type
   * @param globalConfig - Global configuration
   * @returns Merged configuration
   */
  applyGlobalConfig(individualConfig, globalConfig) {
    const merged = { ...individualConfig };
    if (globalConfig.filter && !merged.filter) {
      merged.filter = globalConfig.filter;
    }
    if (globalConfig.batchSize && !merged.batchSize) {
      merged.batchSize = globalConfig.batchSize;
    }
    return merged;
  }
  /**
   * Merge stats from an enrichment operation into aggregate stats.
   * Counters are summed; error lists are concatenated. `timeMs` is
   * intentionally not merged — the caller tracks wall time itself.
   *
   * @param aggregate - Aggregate stats to update
   * @param stats - Stats from a single operation
   */
  mergeStats(aggregate, stats) {
    aggregate.recordsProcessed += stats.recordsProcessed;
    aggregate.recordsUpdated += stats.recordsUpdated;
    aggregate.recordsSkipped += stats.recordsSkipped;
    if (stats.errors && stats.errors.length > 0) {
      if (!aggregate.errors) {
        aggregate.errors = [];
      }
      aggregate.errors.push(...stats.errors);
    }
  }
  /**
   * Enrich records using section detection.
   *
   * `existingField` takes precedence over `autoDetect` when both are set.
   * Writes `__h_section_level` / `__h_section_title` (and `__h_section_path`
   * when available) to each matched record.
   *
   * @param collection - Collection name
   * @param config - Section enrichment configuration
   * @param stats - Statistics object to update
   */
  async enrichWithSectionDetection(collection, config, stats) {
    const batchSize = config.batchSize || 100;
    for await (const batch of this.adapter.iterate(collection, {
      batchSize,
      filter: config.filter
    })) {
      const updates = [];
      for (const record of batch) {
        stats.recordsProcessed++;
        try {
          let sectionMetadata = null;
          if (config.existingField) {
            sectionMetadata = this.extractSectionMetadata(
              record.metadata?.[config.existingField]
            );
          } else if (config.autoDetect) {
            // Prefer the record's own text; fall back to metadata.content.
            const text = record.text || record.metadata?.content || "";
            if (typeof text === "string") {
              sectionMetadata = this.detectSections(text);
            }
          }
          if (sectionMetadata) {
            const metadata = {
              __h_section_level: sectionMetadata.level,
              __h_section_title: sectionMetadata.title
            };
            if (sectionMetadata.path) {
              metadata.__h_section_path = sectionMetadata.path;
            }
            updates.push({
              id: record.id,
              metadata
            });
          } else {
            stats.recordsSkipped++;
          }
        } catch (error) {
          stats.recordsSkipped++;
          stats.errors?.push(
            `Error processing record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
      if (updates.length > 0) {
        try {
          await this.adapter.updateMetadata(collection, updates);
          stats.recordsUpdated += updates.length;
        } catch (error) {
          stats.errors?.push(
            `Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
          );
        }
      }
    }
  }
  /**
   * Extract section metadata from an existing field value.
   *
   * Level is the number of non-empty path segments; the title is the last
   * segment.
   *
   * @param sectionPath - Section path string (e.g., "introduction/overview")
   * @returns Section metadata or null
   */
  extractSectionMetadata(sectionPath) {
    if (!sectionPath || typeof sectionPath !== "string") {
      return null;
    }
    const parts = sectionPath.split("/").filter((p) => p.trim() !== "");
    if (parts.length === 0) {
      return null;
    }
    return {
      path: sectionPath,
      level: parts.length,
      title: parts[parts.length - 1]
    };
  }
  /**
   * Detect sections in text using heuristics.
   *
   * Tries markdown, then HTML, then the SECTION: pattern; always returns a
   * value — unmatched text gets a level-0 "unsectioned" placeholder.
   *
   * @param text - Text content to analyze
   * @returns Section metadata or null
   */
  detectSections(text) {
    const markdown = this.detectMarkdownSections(text);
    if (markdown) return markdown;
    const html = this.detectHtmlSections(text);
    if (html) return html;
    const pattern = this.detectPatternSections(text);
    if (pattern) return pattern;
    return { level: 0, title: "unsectioned" };
  }
  /**
   * Detect markdown headers (# Header).
   *
   * Matches only the first header found; level = number of leading '#'.
   *
   * @param text - Text content
   * @returns Section metadata or null
   */
  detectMarkdownSections(text) {
    const match = text.match(/^(#{1,6})\s+(.+)$/m);
    if (match) {
      const level = match[1].length;
      const title = match[2].trim();
      return { level, title };
    }
    return null;
  }
  /**
   * Detect HTML headers (<h1>Header</h1>).
   *
   * NOTE(review): the regex does not require the closing tag level to match
   * the opening one (e.g. `<h1>x</h3>` matches), and tags with attributes
   * are not matched — confirm acceptable.
   *
   * @param text - Text content
   * @returns Section metadata or null
   */
  detectHtmlSections(text) {
    const match = text.match(/<h([1-6])>(.+?)<\/h[1-6]>/i);
    if (match) {
      const level = parseInt(match[1], 10);
      const title = match[2].trim();
      return { level, title };
    }
    return null;
  }
  /**
   * Detect sections using common patterns (SECTION: Title).
   *
   * Any match is reported as level 1.
   *
   * @param text - Text content
   * @returns Section metadata or null
   */
  detectPatternSections(text) {
    const match = text.match(/^SECTION:\s+(.+)$/m);
    if (match) {
      const title = match[1].trim();
      return { level: 1, title };
    }
    return null;
  }
};
1696
+
1697
+ // src/ingestion/chunkers/text-chunker.ts
1698
// Default chunk size in estimated tokens (~2000 chars at 4 chars/token).
var DEFAULT_CHUNK_SIZE = 500;
// Default overlap between adjacent chunks in estimated tokens (~200 chars).
var DEFAULT_CHUNK_OVERLAP = 50;
1700
/**
 * Estimate the token count of a string using the common rough heuristic
 * of ~4 characters per token (rounded up).
 */
function estimateTokens(text) {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
1703
/**
 * Inverse of estimateTokens: convert a token budget into a character
 * budget at ~4 characters per token.
 */
function estimateChars(tokens) {
  const CHARS_PER_TOKEN = 4;
  return tokens * CHARS_PER_TOKEN;
}
1706
+
1707
+ // src/ingestion/chunkers/recursive-chunker.ts
1708
var RecursiveChunker = class {
  // Separators tried in order, coarsest to finest granularity.
  separators = [
    "\n\n",
    // Paragraphs (double newline)
    "\n",
    // Lines (single newline)
    ". ",
    // Sentences (period + space)
    " ",
    // Words (space)
    ""
    // Characters (last resort)
  ];
  /**
   * Split `text` into chunks of at most ~chunkSize estimated tokens
   * (4 chars/token), with ~chunkOverlap tokens of overlap between
   * consecutive chunks. Returns [] for empty/falsy input; text that fits
   * in a single chunk is returned unsplit.
   * Each chunk carries { text, index, metadata } where metadata.source is
   * left empty for the ingestion pipeline to fill.
   */
  chunk(text, config) {
    if (!text) return [];
    const chunkSize = config?.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    // All limits are applied in characters, converted from token budgets.
    const maxChars = estimateChars(chunkSize);
    const overlapChars = estimateChars(chunkOverlap);
    if (text.length <= maxChars) {
      // Fast path: whole text fits in one chunk.
      return [{
        text,
        index: 0,
        metadata: {
          source: "",
          chunkIndex: 0,
          totalChunks: 1,
          startChar: 0,
          endChar: text.length
        }
      }];
    }
    const splits = this.recursiveSplit(text, maxChars, 0);
    const chunks = this.addOverlap(splits, overlapChars);
    return chunks.map((chunk, index) => ({
      text: chunk.text,
      index,
      metadata: {
        source: "",
        // Will be set by pipeline
        chunkIndex: index,
        totalChunks: chunks.length,
        startChar: chunk.start,
        endChar: chunk.end
      }
    }));
  }
  /**
   * Recursively split `text` into pieces of at most maxChars characters,
   * trying separators[separatorIndex] first and falling through to finer
   * separators for oversized pieces. Returned {start, end} offsets are
   * relative to the `text` argument passed in.
   */
  recursiveSplit(text, maxChars, separatorIndex) {
    if (text.length <= maxChars) {
      return [{ text, start: 0, end: text.length }];
    }
    if (separatorIndex >= this.separators.length) {
      // No separators left: hard-split into fixed-size character windows.
      const result2 = [];
      for (let i = 0; i < text.length; i += maxChars) {
        result2.push({
          text: text.slice(i, i + maxChars),
          start: i,
          end: Math.min(i + maxChars, text.length)
        });
      }
      return result2;
    }
    const separator = this.separators[separatorIndex];
    // The "" separator would split into single characters; treat it as
    // "no split" so the fixed-size branch above handles the next level.
    const parts = separator ? text.split(separator) : [text];
    if (parts.length <= 1) {
      // Separator absent from text: try the next, finer separator.
      return this.recursiveSplit(text, maxChars, separatorIndex + 1);
    }
    const result = [];
    // Parts accumulated into the chunk currently being built.
    let currentParts = [];
    // Absolute start offset of the chunk being built.
    let currentStart = 0;
    // Absolute offset of the part about to be examined (separators included).
    let runningOffset = 0;
    for (let i = 0; i < parts.length; i++) {
      const part = parts[i];
      // What the chunk would look like if this part were appended.
      const combined = currentParts.length > 0 ? [...currentParts, part].join(separator) : part;
      if (combined.length <= maxChars) {
        if (currentParts.length === 0) {
          currentStart = runningOffset;
        }
        currentParts.push(part);
      } else {
        // Flush the accumulated chunk before handling the overflowing part.
        if (currentParts.length > 0) {
          const chunkText = currentParts.join(separator);
          result.push({
            text: chunkText,
            start: currentStart,
            end: currentStart + chunkText.length
          });
        }
        currentStart = runningOffset;
        if (part.length > maxChars) {
          // This single part is itself too large: split it with a finer
          // separator and rebase its offsets to the absolute position.
          const subSplits = this.recursiveSplit(part, maxChars, separatorIndex + 1);
          for (const sub of subSplits) {
            result.push({
              text: sub.text,
              start: currentStart + sub.start,
              end: currentStart + sub.end
            });
          }
          currentParts = [];
        } else {
          // Start a fresh chunk with this part.
          currentParts = [part];
        }
      }
      // Advance past this part; the separator is consumed between parts
      // (no separator after the final part).
      runningOffset += part.length + (i < parts.length - 1 ? separator.length : 0);
    }
    // Flush any trailing chunk.
    if (currentParts.length > 0) {
      const chunkText = currentParts.join(separator);
      result.push({
        text: chunkText,
        start: currentStart,
        end: currentStart + chunkText.length
      });
    }
    return result;
  }
  /**
   * Prepend the last `overlapChars` characters of each chunk onto the next
   * chunk. The first chunk is kept as-is.
   * NOTE(review): the reported `start` assumes the previous chunk is at
   * least overlapChars long; shorter chunks make the offset approximate —
   * confirm whether exact offsets are required downstream.
   */
  addOverlap(chunks, overlapChars) {
    if (overlapChars === 0 || chunks.length <= 1) {
      return chunks;
    }
    const result = [chunks[0]];
    for (let i = 1; i < chunks.length; i++) {
      const prevChunk = chunks[i - 1];
      const currChunk = chunks[i];
      const overlapText = prevChunk.text.slice(-overlapChars);
      result.push({
        text: overlapText + currChunk.text,
        start: Math.max(0, prevChunk.end - overlapChars),
        end: currChunk.end
      });
    }
    return result;
  }
};
1841
+
1842
+ // src/ingestion/ingestion-pipeline.ts
1843
+ import * as path from "path";
1844
var IngestionPipeline = class {
  /**
   * @param adapter - Vector database adapter used for upserts
   * @param embedder - Embedder used to embed chunk texts in batches
   * @param loaderRegistry - Registry that resolves a loader per file path
   * @param chunker - Optional chunker; defaults to RecursiveChunker
   */
  constructor(adapter, embedder, loaderRegistry, chunker) {
    this.adapter = adapter;
    this.embedder = embedder;
    this.loaderRegistry = loaderRegistry;
    this.defaultChunker = chunker || new RecursiveChunker();
  }
  // Chunker used when config.chunker is not supplied per-ingest call.
  defaultChunker;
  /**
   * Ingest documents into a vector database collection.
   *
   * Files are processed sequentially; a failure in one file is recorded in
   * stats.errors and does not abort the remaining files.
   *
   * @param sources - File paths
   * @param collection - Target collection name
   * @param config - Optional ingestion configuration
   * @returns Statistics about the ingestion operation
   */
  async ingest(sources, collection, config) {
    const startTime = Date.now();
    // Accept a single path or an array of paths.
    const sourceArray = Array.isArray(sources) ? sources : [sources];
    const stats = {
      documentsProcessed: 0,
      documentsSucceeded: 0,
      documentsFailed: 0,
      chunksCreated: 0,
      chunksUpserted: 0,
      timeMs: 0,
      errors: []
    };
    const totalDocuments = sourceArray.length;
    for (const source of sourceArray) {
      config?.onProgress?.({
        stage: "loading",
        documentsProcessed: stats.documentsProcessed,
        totalDocuments,
        chunksProcessed: stats.chunksUpserted,
        currentDocument: source
      });
      try {
        await this.ingestFile(source, collection, config, stats, totalDocuments);
        stats.documentsSucceeded++;
      } catch (error) {
        stats.documentsFailed++;
        // NOTE(review): stage is always reported as "load" even when the
        // failure occurred while chunking/embedding/upserting — confirm.
        stats.errors.push({
          source,
          stage: "load",
          error
        });
      }
      stats.documentsProcessed++;
    }
    stats.timeMs = Date.now() - startTime;
    return stats;
  }
  /**
   * Load, chunk, embed, and upsert a single file, emitting per-stage
   * progress callbacks and updating the shared stats object.
   * Throws on any stage failure; the caller records it.
   */
  async ingestFile(filePath, collection, config, stats, totalDocuments) {
    const doc = await this.loaderRegistry.load(filePath);
    config?.onDocumentLoaded?.(doc);
    config?.onProgress?.({
      stage: "chunking",
      documentsProcessed: stats.documentsProcessed,
      totalDocuments,
      chunksProcessed: stats.chunksUpserted,
      currentDocument: filePath
    });
    const chunker = config?.chunker || this.defaultChunker;
    const chunks = chunker.chunk(doc.text, {
      chunkSize: config?.chunkSize,
      chunkOverlap: config?.chunkOverlap
    });
    // The chunker leaves source empty; stamp it with the loaded document's.
    for (const chunk of chunks) {
      chunk.metadata.source = doc.source;
    }
    stats.chunksCreated += chunks.length;
    config?.onChunksCreated?.(chunks);
    config?.onProgress?.({
      stage: "embedding",
      documentsProcessed: stats.documentsProcessed,
      totalDocuments,
      chunksProcessed: stats.chunksUpserted,
      totalChunks: stats.chunksCreated,
      currentDocument: filePath
    });
    // Embed all chunk texts for this document in one batch call.
    const texts = chunks.map((c) => c.text);
    const embeddings = await this.embedder.embedBatch(texts);
    const records = chunks.map((chunk, i) => {
      const metadata = this.buildMetadata(doc, chunk, config);
      return {
        // NOTE(review): id is basename:index — files in different
        // directories with the same basename will collide. Confirm.
        id: `${path.basename(doc.source)}:${chunk.index}`,
        embedding: embeddings[i],
        text: chunk.text,
        metadata
      };
    });
    config?.onProgress?.({
      stage: "upserting",
      documentsProcessed: stats.documentsProcessed,
      totalDocuments,
      chunksProcessed: stats.chunksUpserted,
      totalChunks: stats.chunksCreated,
      currentDocument: filePath
    });
    // Upsert in batches to bound request size.
    const batchSize = config?.batchSize || 100;
    for (let i = 0; i < records.length; i += batchSize) {
      const batch = records.slice(i, i + batchSize);
      await this.adapter.upsert(collection, batch);
      stats.chunksUpserted += batch.length;
    }
  }
  /**
   * Build the metadata object for one chunk. Later spreads win on key
   * conflicts: auto vertical fields < extracted < user-supplied < chunk
   * position fields.
   */
  buildMetadata(doc, chunk, config) {
    const basename2 = path.basename(doc.source, path.extname(doc.source));
    const dirname2 = path.dirname(doc.source);
    // Vertical-axis identity fields derived from the file path.
    const autoMetadata = {
      [VerticalFields.SOURCE]: doc.source,
      [VerticalFields.DOC_TYPE]: doc.type,
      [VerticalFields.DOC_ID]: basename2,
      [VerticalFields.PARTITION]: dirname2
    };
    const extractedMetadata = config?.metadataExtractor?.(doc) || {};
    const userMetadata = config?.metadata || {};
    const chunkMetadata = {
      chunkIndex: chunk.metadata.chunkIndex,
      totalChunks: chunk.metadata.totalChunks,
      startChar: chunk.metadata.startChar,
      endChar: chunk.metadata.endChar
    };
    return {
      ...autoMetadata,
      ...extractedMetadata,
      ...userMetadata,
      ...chunkMetadata
    };
  }
};
1975
+
1976
+ // src/ingestion/loaders/text-loader.ts
1977
+ import * as fs from "fs/promises";
1978
+ import * as path2 from "path";
1979
var TextLoader = class {
  /**
   * Loader for plain-text documents (.txt, .md).
   */
  canHandle(filePath) {
    // Case-insensitive extension match; only the final extension counts.
    return /\.(txt|md)$/i.test(filePath);
  }
  /**
   * Read the file as UTF-8 and return a Document.
   * `type` is the lowercased extension without the leading dot
   * ("txt" or "md"); metadata carries the byte size and raw extension.
   */
  async load(filePath) {
    // Content read and stat are independent — run them in parallel
    // instead of awaiting them sequentially.
    const [text, stats] = await Promise.all([
      fs.readFile(filePath, "utf-8"),
      fs.stat(filePath)
    ]);
    // Derive the extension once and reuse it for both fields
    // (the original computed path2.extname twice).
    const extension = path2.extname(filePath);
    return {
      text,
      source: filePath,
      type: extension.slice(1).toLowerCase(),
      metadata: {
        size: stats.size,
        extension
      }
    };
  }
};
1999
+
2000
+ // src/ingestion/loaders/pdf-loader.ts
2001
+ import * as fs2 from "fs/promises";
2002
+ import pdfParse from "pdf-parse";
2003
var PDFLoader = class {
  /** Accepts only .pdf paths (case-insensitive). */
  canHandle(filePath) {
    return /\.pdf$/i.test(filePath);
  }
  /**
   * Parse the PDF with pdf-parse and return its extracted text.
   * Metadata carries the page count and the parser's info object.
   */
  async load(filePath) {
    const buffer = await fs2.readFile(filePath);
    const parsed = await pdfParse(buffer);
    return {
      text: parsed.text,
      source: filePath,
      type: "pdf",
      metadata: {
        pages: parsed.numpages,
        info: parsed.info
      }
    };
  }
};
2021
+
2022
+ // src/ingestion/loaders/docx-loader.ts
2023
+ import mammoth from "mammoth";
2024
var DOCXLoader = class {
  /** Accepts only .docx paths (case-insensitive). */
  canHandle(filePath) {
    return /\.docx$/i.test(filePath);
  }
  /**
   * Extract the raw text via mammoth.
   * mammoth's conversion warnings are preserved under metadata.warnings.
   */
  async load(filePath) {
    const { value, messages } = await mammoth.extractRawText({ path: filePath });
    return {
      text: value,
      source: filePath,
      type: "docx",
      metadata: {
        warnings: messages
      }
    };
  }
};
2041
+
2042
+ // src/ingestion/loaders/html-loader.ts
2043
+ import * as fs3 from "fs/promises";
2044
+ import * as cheerio from "cheerio";
2045
var HTMLLoader = class {
  /** Accepts .html and .htm paths (case-insensitive). */
  canHandle(filePath) {
    return /\.html?$/i.test(filePath);
  }
  /**
   * Strip non-content elements and collapse whitespace to produce a
   * plain-text body; the page title and meta description (when present)
   * are kept as metadata.
   */
  async load(filePath) {
    const markup = await fs3.readFile(filePath, "utf-8");
    const $ = cheerio.load(markup);
    // Drop elements that carry no readable document content.
    $("script, style, nav, footer").remove();
    const bodyText = $("body").text().replace(/\s+/g, " ").trim();
    const title = $("title").text() || void 0;
    const description = $('meta[name="description"]').attr("content") || void 0;
    return {
      text: bodyText,
      source: filePath,
      type: "html",
      metadata: {
        title,
        description
      }
    };
  }
};
2065
+
2066
+ // src/ingestion/loaders/loader-registry.ts
2067
var LoaderRegistry = class {
  loaders = [];
  constructor() {
    // Built-in loaders; additional ones can be added via register().
    const builtins = [
      new TextLoader(),
      new PDFLoader(),
      new DOCXLoader(),
      new HTMLLoader()
    ];
    for (const loader of builtins) {
      this.register(loader);
    }
  }
  /**
   * Register a custom document loader.
   * @param loader - Loader to register
   */
  register(loader) {
    this.loaders.push(loader);
  }
  /**
   * Check if any registered loader can handle this file.
   * @param filePath - Path to check
   * @returns true if a loader exists for this file type
   */
  canLoad(filePath) {
    return this.findLoader(filePath) !== void 0;
  }
  /**
   * Load a document using the first loader that handles the file.
   * @param filePath - Path to the file to load
   * @returns Promise resolving to Document
   * @throws Error if no loader found for file type
   */
  async load(filePath) {
    const loader = this.findLoader(filePath);
    if (!loader) {
      throw new Error(`No loader found for file: ${filePath}`);
    }
    return loader.load(filePath);
  }
  /** First registered loader claiming the path, or undefined. */
  findLoader(filePath) {
    return this.loaders.find((l) => l.canHandle(filePath));
  }
};
2104
+
2105
+ // src/ingestion/chunkers/fixed-chunker.ts
2106
var FixedChunker = class {
  /**
   * Split text into fixed-size character windows with overlap.
   * Sizes are given in tokens and converted to characters via
   * estimateChars. If overlap >= size the step is non-positive; the
   * loop emits a single window and stops rather than spinning forever.
   */
  chunk(text, config) {
    if (!text) return [];
    const maxChars = estimateChars(config?.chunkSize ?? DEFAULT_CHUNK_SIZE);
    const overlapChars = estimateChars(config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP);
    const step = maxChars - overlapChars;
    // Local factory so the chunk shape is defined in one place.
    // totalChunks is patched once the final count is known.
    const makeChunk = (slice, idx, start, end) => ({
      text: slice,
      index: idx,
      metadata: {
        source: "",
        chunkIndex: idx,
        totalChunks: 0,
        startChar: start,
        endChar: end
      }
    });
    // Fast path: everything fits in one window.
    if (text.length <= maxChars) {
      const only = makeChunk(text, 0, 0, text.length);
      only.metadata.totalChunks = 1;
      return [only];
    }
    const chunks = [];
    let cursor = 0;
    while (cursor < text.length) {
      const end = Math.min(text.length, cursor + maxChars);
      chunks.push(makeChunk(text.slice(cursor, end), chunks.length, cursor, end));
      cursor += step;
      if (step <= 0) break; // degenerate config guard: bail after one window
    }
    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }
};
2153
+
2154
+ // src/ingestion/chunkers/sentence-chunker.ts
2155
// Sentence-aware chunker: packs whole sentences into chunks of up to
// maxChars characters, optionally prepending the previous chunk's last
// sentence as overlap.
var SentenceChunker = class {
  /**
   * Split text into sentence-aligned chunks.
   * Sizes are given in tokens and converted to characters via estimateChars.
   * Returns [] for empty input; a single whole-text chunk when no
   * sentences are found.
   */
  chunk(text, config) {
    if (!text) return [];
    const chunkSize = config?.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    const maxChars = estimateChars(chunkSize);
    const overlapChars = estimateChars(chunkOverlap);
    const sentences = this.splitSentences(text);
    if (sentences.length === 0) {
      // No sentence boundaries detected: emit the whole text as one chunk.
      return [{
        text,
        index: 0,
        metadata: {
          source: "",
          chunkIndex: 0,
          totalChunks: 1,
          startChar: 0,
          endChar: text.length
        }
      }];
    }
    // Greedy packing: accumulate sentences until adding the next one
    // would exceed maxChars, then flush the current chunk.
    const rawChunks = [];
    let currentSentences = [];
    let currentStart = 0;
    for (const sentence of sentences) {
      const combined = currentSentences.length > 0 ? [...currentSentences, sentence].join(" ") : sentence;
      if (currentSentences.length === 0) {
        // First sentence overall: locate its offset in the source text.
        // NOTE(review): indexOf finds the FIRST occurrence, so startChar
        // can be wrong when the same sentence text repeats — confirm.
        currentSentences = [sentence];
        currentStart = text.indexOf(sentence);
      } else if (combined.length <= maxChars) {
        currentSentences.push(sentence);
      } else {
        // Flush the accumulated chunk and start a new one with this sentence.
        const chunkText = currentSentences.join(" ");
        rawChunks.push({
          text: chunkText,
          start: currentStart,
          end: currentStart + chunkText.length
        });
        currentSentences = [sentence];
        // Search forward from just past the previous chunk's start;
        // falls back to 0 if the sentence is not found verbatim
        // (e.g. whitespace was normalized by splitSentences).
        currentStart = text.indexOf(sentence, currentStart + 1);
        if (currentStart === -1) currentStart = 0;
      }
    }
    // Flush the trailing partial chunk, if any.
    if (currentSentences.length > 0) {
      const chunkText = currentSentences.join(" ");
      rawChunks.push({
        text: chunkText,
        start: currentStart,
        end: currentStart + chunkText.length
      });
    }
    const withOverlap = this.addSentenceOverlap(rawChunks, overlapChars);
    return withOverlap.map((chunk, index) => ({
      text: chunk.text,
      index,
      metadata: {
        source: "",
        chunkIndex: index,
        totalChunks: withOverlap.length,
        startChar: chunk.start,
        endChar: chunk.end
      }
    }));
  }
  /**
   * Split text into trimmed sentences on ./!/? terminators.
   * Text after the last terminator is kept as a final "sentence";
   * returns the whole text as one element when nothing matches.
   */
  splitSentences(text) {
    const parts = text.match(/[^.!?]*[.!?]+(?:\s|$)|[^.!?]+$/g);
    if (!parts) return [text];
    return parts.map((s) => s.trim()).filter((s) => s.length > 0);
  }
  /**
   * Prepend the previous chunk's final sentence to each chunk (except the
   * first) when that sentence fits within overlapChars; otherwise the
   * chunk is passed through unchanged. No-op when overlap is 0 or there
   * is at most one chunk.
   */
  addSentenceOverlap(chunks, overlapChars) {
    if (overlapChars === 0 || chunks.length <= 1) {
      return chunks;
    }
    const result = [chunks[0]];
    for (let i = 1; i < chunks.length; i++) {
      const prevChunk = chunks[i - 1];
      const currChunk = chunks[i];
      const prevSentences = this.splitSentences(prevChunk.text);
      const lastSentence = prevSentences[prevSentences.length - 1] || "";
      if (lastSentence && lastSentence.length <= overlapChars) {
        result.push({
          text: lastSentence + " " + currChunk.text,
          // NOTE(review): start is approximated from prevChunk.end minus the
          // overlap length; it may not align exactly with the source text.
          start: Math.max(0, prevChunk.end - lastSentence.length),
          end: currChunk.end
        });
      } else {
        result.push(currChunk);
      }
    }
    return result;
  }
};
2247
+
2248
// src/client/rag-client.ts
// Number of results retrieve() returns when neither options.topK nor
// config.defaultTopK is provided.
var DEFAULT_TOP_K = 10;
// System prompt used by RAGClient.query() when options.systemPrompt is not given.
var DEFAULT_RAG_SYSTEM_PROMPT = "You are a helpful assistant. Answer the question based on the provided context. If the context doesn't contain enough information, say so.";
2251
// High-level facade tying together the vector DB adapter, embedder,
// optional LLM, and the ingestion/enrichment pipelines.
var RAGClient = class {
  // Vector database adapter (storage backend).
  adapter;
  // Embedding provider; also supplies the default collection dimension.
  embedder;
  // Optional LLM client; required only by query().
  llm;
  // Collection used when a method is called without an explicit one.
  defaultCollection;
  // Fallback topK for retrieve(); DEFAULT_TOP_K when config omits it.
  defaultTopK;
  // Composes vector queries with vertical/horizontal filters.
  queryComposer;
  // Handles load -> chunk -> embed -> upsert.
  ingestionPipeline;
  // Adds enrichment metadata to already-stored chunks.
  enrichmentPipeline;
  /**
   * @param config - must provide adapter and embedder; llm,
   *   defaultCollection and defaultTopK are optional.
   */
  constructor(config) {
    this.adapter = config.adapter;
    this.embedder = config.embedder;
    this.llm = config.llm;
    this.defaultCollection = config.defaultCollection;
    this.defaultTopK = config.defaultTopK ?? DEFAULT_TOP_K;
    this.queryComposer = new RAGQueryComposer(this.adapter, this.embedder);
    this.ingestionPipeline = new IngestionPipeline(
      this.adapter,
      this.embedder,
      new LoaderRegistry()
    );
    this.enrichmentPipeline = new EnrichmentPipeline(this.adapter);
  }
  // ==========================================================================
  // COLLECTION MANAGEMENT
  // ==========================================================================
  /**
   * Create a new vector collection.
   * Dimension defaults to embedder.dimensions if not specified.
   */
  async createCollection(name, dimension, metric) {
    const dim = dimension ?? this.embedder.dimensions;
    await this.adapter.createCollection(name, dim, metric);
  }
  /**
   * Delete a collection.
   */
  async deleteCollection(name) {
    await this.adapter.deleteCollection(name);
  }
  /**
   * Check if a collection exists.
   */
  async collectionExists(name) {
    return this.adapter.collectionExists(name);
  }
  // ==========================================================================
  // INGESTION
  // ==========================================================================
  /**
   * Ingest documents into a collection.
   * Collection defaults to defaultCollection if not specified.
   * @throws Error when neither a collection nor defaultCollection is set.
   */
  async ingest(sources, collection, config) {
    const col = collection ?? this.defaultCollection;
    if (!col) {
      throw new Error(
        "No collection specified. Pass a collection name or set defaultCollection in config."
      );
    }
    return this.ingestionPipeline.ingest(sources, col, config);
  }
  // ==========================================================================
  // RETRIEVAL
  // ==========================================================================
  /**
   * Retrieve relevant chunks for a query.
   * Supports filter shorthands (partition, theme) and groupBy.
   * When groupBy is set, grouped results are flattened back into a single
   * records list in the returned result.
   * @throws Error when neither a collection nor defaultCollection is set.
   */
  async retrieve(query, options) {
    const collection = options?.collection ?? this.defaultCollection;
    if (!collection) {
      throw new Error(
        "No collection specified. Pass a collection name or set defaultCollection in config."
      );
    }
    const topK = options?.topK ?? this.defaultTopK;
    let verticalFilters;
    let horizontalFilters;
    const customFilters = options?.filter;
    // Shorthand: options.partition -> equality filter on the vertical
    // partition field.
    if (options?.partition) {
      verticalFilters = {
        field: VerticalFields.PARTITION,
        op: "eq",
        value: options.partition
      };
    }
    // Shorthand: options.theme -> equality filter on the horizontal
    // theme field.
    if (options?.theme) {
      horizontalFilters = {
        field: HorizontalFields.THEME,
        op: "eq",
        value: options.theme
      };
    }
    const params = {
      query,
      collection,
      topK,
      verticalFilters,
      horizontalFilters,
      customFilters
    };
    // Grouped modes delegate to the composer's axis-specific retrieval,
    // then flatten the Map of groups into one record list.
    if (options?.groupBy === "document") {
      const grouped = await this.queryComposer.retrieveVertical(params);
      const records = Array.from(grouped.values()).flat();
      return { records, query, filtersApplied: { vertical: verticalFilters, horizontal: horizontalFilters, custom: customFilters } };
    }
    if (options?.groupBy === "theme") {
      const grouped = await this.queryComposer.retrieveHorizontal(params);
      const records = Array.from(grouped.values()).flat();
      return { records, query, filtersApplied: { vertical: verticalFilters, horizontal: horizontalFilters, custom: customFilters } };
    }
    return this.queryComposer.retrieve(params);
  }
  // ==========================================================================
  // ENRICHMENT
  // ==========================================================================
  /**
   * Enrich a collection with vertical, theme, and/or section metadata.
   */
  async enrich(collection, config) {
    return this.enrichmentPipeline.enrichAll(collection, config);
  }
  // ==========================================================================
  // FULL RAG QUERY
  // ==========================================================================
  /**
   * Full RAG: retrieve relevant context and generate an answer using LLM.
   * Requires an LLM client to be provided in the constructor config.
   * @throws Error when no LLM client was configured.
   */
  async query(question, options) {
    if (!this.llm) {
      throw new Error(
        "RAGClient.query() requires an LLM client. Pass one in the constructor config."
      );
    }
    const retrievalResult = await this.retrieve(question, options);
    // Concatenate non-empty chunk texts into a single context section.
    const context = retrievalResult.records.map((r) => r.text).filter(Boolean).join("\n\n");
    const systemPrompt = options?.systemPrompt ?? DEFAULT_RAG_SYSTEM_PROMPT;
    const prompt = `${systemPrompt}

Context:
${context}

Question: ${question}`;
    const answer = await this.llm.generate(prompt, {
      temperature: options?.temperature,
      maxTokens: options?.maxTokens
    });
    return {
      answer,
      sources: retrievalResult.records,
      query: question,
      retrievalResult
    };
  }
};
2408
+ export {
2409
+ DEFAULT_CHUNK_OVERLAP,
2410
+ DEFAULT_CHUNK_SIZE,
2411
+ DOCXLoader,
2412
+ Embedder,
2413
+ EmbeddingThemeClassifier,
2414
+ EnrichmentPipeline,
2415
+ FilterBuilder,
2416
+ FilterTranslator,
2417
+ FixedChunker,
2418
+ HTMLLoader,
2419
+ HorizontalFields,
2420
+ IngestionPipeline,
2421
+ KeywordThemeClassifier,
2422
+ LLMClient,
2423
+ LLMThemeClassifier,
2424
+ LoaderRegistry,
2425
+ METADATA_PREFIXES,
2426
+ MetadataBuilder,
2427
+ MockLLM,
2428
+ PDFLoader,
2429
+ RAGClient,
2430
+ RAGQueryComposer,
2431
+ RecursiveChunker,
2432
+ SentenceChunker,
2433
+ StructuralFields,
2434
+ TextLoader,
2435
+ VectorDBAdapter,
2436
+ VerticalFields,
2437
+ ZeroShotThemeClassifier,
2438
+ estimateChars,
2439
+ estimateTokens
2440
+ };
2441
+ //# sourceMappingURL=index.mjs.map