@vectororm/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -0
- package/dist/index.d.mts +2493 -0
- package/dist/index.d.ts +2493 -0
- package/dist/index.js +2508 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +2441 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +65 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2508 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Module-interop runtime helpers. These look bundler-generated (esbuild-style);
// TODO confirm against the build configuration before hand-editing.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define every key of `all` on `target` as an enumerable getter.
 * `all[name]` is installed as the getter itself, so values are resolved
 * lazily each time the property is read.
 */
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

/**
 * Mirror the own properties of `from` onto `to` as live getters.
 * Keys already owned by `to`, and the key equal to `except`, are left alone.
 * The enumerability of each source property is carried over.
 * Returns `to` unchanged when `from` is neither an object nor a function.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = from && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, { get: () => from[key], enumerable: !desc || desc.enumerable });
  }
  return to;
};

/**
 * Wrap a module value in an ESM-shaped namespace object.
 * A `default` property pointing at `mod` is added when the importer is in
 * node compatibility mode, or when `mod` is missing or not flagged with
 * `__esModule`; afterwards the properties of `mod` are mirrored onto the
 * namespace.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", { value: mod, enumerable: true });
  }
  return __copyProps(target, mod);
};

/**
 * Build a CommonJS exports object: a fresh object flagged `__esModule: true`
 * that mirrors the properties of `mod`.
 */
var __toCommonJS = (mod) => {
  const flagged = __defProp({}, "__esModule", { value: true });
  return __copyProps(flagged, mod);
};
|
|
29
|
+
|
|
30
|
+
// src/index.ts
|
|
31
|
+
// Public API of the bundle. Each entry is a zero-argument getter, so the
// referenced binding is only read when a consumer accesses the export; this
// lets the list appear before the definitions further down the file.
var index_exports = {};
__export(index_exports, {
  DEFAULT_CHUNK_OVERLAP: () => DEFAULT_CHUNK_OVERLAP,
  DEFAULT_CHUNK_SIZE: () => DEFAULT_CHUNK_SIZE,
  DOCXLoader: () => DOCXLoader,
  Embedder: () => Embedder,
  EmbeddingThemeClassifier: () => EmbeddingThemeClassifier,
  EnrichmentPipeline: () => EnrichmentPipeline,
  FilterBuilder: () => FilterBuilder,
  FilterTranslator: () => FilterTranslator,
  FixedChunker: () => FixedChunker,
  HTMLLoader: () => HTMLLoader,
  HorizontalFields: () => HorizontalFields,
  IngestionPipeline: () => IngestionPipeline,
  KeywordThemeClassifier: () => KeywordThemeClassifier,
  LLMClient: () => LLMClient,
  LLMThemeClassifier: () => LLMThemeClassifier,
  LoaderRegistry: () => LoaderRegistry,
  METADATA_PREFIXES: () => METADATA_PREFIXES,
  MetadataBuilder: () => MetadataBuilder,
  MockLLM: () => MockLLM,
  PDFLoader: () => PDFLoader,
  RAGClient: () => RAGClient,
  RAGQueryComposer: () => RAGQueryComposer,
  RecursiveChunker: () => RecursiveChunker,
  SentenceChunker: () => SentenceChunker,
  StructuralFields: () => StructuralFields,
  TextLoader: () => TextLoader,
  VectorDBAdapter: () => VectorDBAdapter,
  VerticalFields: () => VerticalFields,
  ZeroShotThemeClassifier: () => ZeroShotThemeClassifier,
  estimateChars: () => estimateChars,
  estimateTokens: () => estimateTokens
});
// Hand the getter-backed namespace to CommonJS consumers.
module.exports = __toCommonJS(index_exports);
|
|
66
|
+
|
|
67
|
+
// src/metadata/constants.ts
|
|
68
|
+
/** Key prefixes that namespace the three metadata axes. */
var METADATA_PREFIXES = {
  VERTICAL: "__v_",
  HORIZONTAL: "__h_",
  STRUCTURAL: "__s_"
};

/** Vertical-axis keys; every key starts with `METADATA_PREFIXES.VERTICAL`. */
var VerticalFields = {
  /** `__v_doc_id` - unique document identifier. */
  DOC_ID: `${METADATA_PREFIXES.VERTICAL}doc_id`,
  /** `__v_source` - original source path or URL. */
  SOURCE: `${METADATA_PREFIXES.VERTICAL}source`,
  /** `__v_partition` - logical partition key, for filtering by document subsets. */
  PARTITION: `${METADATA_PREFIXES.VERTICAL}partition`,
  /** `__v_doc_type` - document type classification. */
  DOC_TYPE: `${METADATA_PREFIXES.VERTICAL}doc_type`,
  /** `__v_tags` - arbitrary vertical tags. */
  TAGS: `${METADATA_PREFIXES.VERTICAL}tags`
};

/** Horizontal-axis keys; every key starts with `METADATA_PREFIXES.HORIZONTAL`. */
var HorizontalFields = {
  /** `__h_theme` - primary theme classification. */
  THEME: `${METADATA_PREFIXES.HORIZONTAL}theme`,
  /** `__h_themes` - multiple themes, when applicable. */
  THEMES: `${METADATA_PREFIXES.HORIZONTAL}themes`,
  /** `__h_theme_confidence` - classification confidence score. */
  THEME_CONFIDENCE: `${METADATA_PREFIXES.HORIZONTAL}theme_confidence`,
  /** `__h_section_path` - hierarchical section path (e.g., "Chapter 3/Pricing/Rates"). */
  SECTION_PATH: `${METADATA_PREFIXES.HORIZONTAL}section_path`,
  /** `__h_section_level` - depth level in the hierarchy (0 = root). */
  SECTION_LEVEL: `${METADATA_PREFIXES.HORIZONTAL}section_level`,
  /** `__h_section_title` - section header text. */
  SECTION_TITLE: `${METADATA_PREFIXES.HORIZONTAL}section_title`
};

/** Structural-axis keys; every key starts with `METADATA_PREFIXES.STRUCTURAL`. */
var StructuralFields = {
  /** `__s_chunk_index` - position in the document (0-indexed). */
  CHUNK_INDEX: `${METADATA_PREFIXES.STRUCTURAL}chunk_index`,
  /** `__s_parent_id` - parent chunk ID, for hierarchical chunking. */
  PARENT_ID: `${METADATA_PREFIXES.STRUCTURAL}parent_id`,
  /** `__s_has_children` - whether this chunk has children. */
  HAS_CHILDREN: `${METADATA_PREFIXES.STRUCTURAL}has_children`,
  /** `__s_total_chunks` - total chunks in this document. */
  TOTAL_CHUNKS: `${METADATA_PREFIXES.STRUCTURAL}total_chunks`
};
|
|
109
|
+
|
|
110
|
+
// src/metadata/builder.ts
|
|
111
|
+
/**
 * Fluent builder for a flat metadata object.
 *
 * Each axis method stores the given fields under that axis's prefix from
 * `METADATA_PREFIXES`; `custom` stores them unprefixed. Entries whose value
 * is `undefined` are always skipped, and a later write to the same key
 * overwrites the earlier one.
 */
var MetadataBuilder = class {
  /** Accumulated metadata, keyed by the final (already prefixed) field name. */
  metadata = {};

  /**
   * Add vertical-axis (document identity) fields, prefixed with '__v_'.
   *
   * @param fields - Vertical metadata fields (doc_id, source, partition, etc.)
   * @returns This builder for chaining
   */
  vertical(fields) {
    for (const [name, fieldValue] of Object.entries(fields)) {
      if (fieldValue === void 0) {
        continue;
      }
      this.metadata[METADATA_PREFIXES.VERTICAL + name] = fieldValue;
    }
    return this;
  }

  /**
   * Add horizontal-axis (theme/section identity) fields, prefixed with '__h_'.
   *
   * @param fields - Horizontal metadata fields (theme, section_path, etc.)
   * @returns This builder for chaining
   */
  horizontal(fields) {
    for (const [name, fieldValue] of Object.entries(fields)) {
      if (fieldValue === void 0) {
        continue;
      }
      this.metadata[METADATA_PREFIXES.HORIZONTAL + name] = fieldValue;
    }
    return this;
  }

  /**
   * Add structural-axis (position/hierarchy) fields, prefixed with '__s_'.
   *
   * @param fields - Structural metadata fields (chunk_index, parent_id, etc.)
   * @returns This builder for chaining
   */
  structural(fields) {
    for (const [name, fieldValue] of Object.entries(fields)) {
      if (fieldValue === void 0) {
        continue;
      }
      this.metadata[METADATA_PREFIXES.STRUCTURAL + name] = fieldValue;
    }
    return this;
  }

  /**
   * Add user-defined fields exactly as named, with no prefix.
   *
   * @param fields - Custom metadata fields
   * @returns This builder for chaining
   */
  custom(fields) {
    for (const [name, fieldValue] of Object.entries(fields)) {
      if (fieldValue === void 0) {
        continue;
      }
      this.metadata[name] = fieldValue;
    }
    return this;
  }

  /**
   * Produce the metadata collected so far.
   *
   * @returns A shallow copy, so adding or removing keys on the result does
   *          not affect the builder (nested values are still shared)
   */
  build() {
    const snapshot = { ...this.metadata };
    return snapshot;
  }
};
|
|
183
|
+
|
|
184
|
+
// src/filters/translator.ts
|
|
185
|
+
/** Operators accepted in standard conditions and as `field__op` suffixes. */
var VALID_OPERATORS = [
  "eq",
  "neq",
  "in",
  "nin",
  "gt",
  "gte",
  "lt",
  "lte",
  "contains",
  "exists"
];

/**
 * Static helpers that convert filter input into the standard form
 * (`{ field, op, value }` conditions, optionally nested under `and` / `or`)
 * and validate it.
 */
var FilterTranslator = class {
  /**
   * Normalize any filter input to the standard format.
   *
   * Handles:
   * - Standard format (returned as-is, same object)
   * - Shorthand format (`{ field: value }` becomes an `eq` condition)
   * - Operator suffixes (`field__op` syntax)
   *
   * @param input - Standard or shorthand filter
   * @returns The filter in standard format
   * @throws {TypeError} If a non-standard input is not a plain object
   * @throws {Error} If the shorthand is empty or uses an unknown operator
   */
  static normalize(input) {
    return this.isStandardFormat(input) ? input : this.fromShorthand(input);
  }

  /**
   * Validate filter structure and operators, recursing into compound filters.
   *
   * @param filter - Filter in standard format
   * @throws {TypeError} If the filter (or a nested condition) is not an object
   * @throws {Error} If a compound filter is empty, or a condition has a bad
   *         field, an unknown operator, or no value
   */
  static validate(filter) {
    // Fail with a readable message rather than the engine's `in`-operator error.
    if (filter === null || typeof filter !== "object") {
      throw new TypeError("Filter must be a non-null object");
    }
    if (this.isCompound(filter)) {
      const conditions = "and" in filter ? filter.and : filter.or;
      if (!Array.isArray(conditions) || conditions.length === 0) {
        throw new Error("Compound filter must have at least one condition");
      }
      conditions.forEach((c) => this.validate(c));
      return;
    }
    if (!filter.field || typeof filter.field !== "string") {
      throw new Error("Filter field must be a non-empty string");
    }
    if (!VALID_OPERATORS.includes(filter.op)) {
      throw new Error(`Invalid filter operator: ${filter.op}`);
    }
    if (filter.value === void 0) {
      throw new Error("Filter value is required");
    }
  }

  /**
   * Check if filter is compound (AND/OR).
   *
   * @param filter - Candidate filter
   * @returns true when `filter` is an object with an `and` or `or` key;
   *          false for null and non-object input
   */
  static isCompound(filter) {
    if (filter === null || typeof filter !== "object") {
      return false;
    }
    return "and" in filter || "or" in filter;
  }

  /**
   * Check if input is already in standard format.
   *
   * @param input - Candidate filter
   * @returns true for an object with an `and`/`or` key, or with all of
   *          `field`, `op` and `value`
   */
  static isStandardFormat(input) {
    if (!input || typeof input !== "object") {
      return false;
    }
    if ("and" in input || "or" in input) {
      return true;
    }
    return "field" in input && "op" in input && "value" in input;
  }

  /**
   * Convert shorthand format to standard.
   *
   * Key parsing:
   * - `name`          -> field `name`, operator `eq`
   * - `name__op`      -> field `name`, operator `op`; an unknown `op` throws
   * - `__v_name`      -> field `__v_name`, operator `eq` (leading `__` is part
   *                      of the field name)
   * - `__v_name__op`  -> field `__v_name`, operator `op` when `op` is a known
   *                      operator; otherwise the whole key is the field name
   *
   * @param shorthand - Plain object mapping keys to comparison values
   * @returns A single condition, or `{ and: [...] }` for several keys
   * @throws {TypeError} If `shorthand` is null, an array, or not an object
   * @throws {Error} If `shorthand` has no keys or uses an unknown operator
   */
  static fromShorthand(shorthand) {
    // Strings and arrays would otherwise be expanded into index-keyed
    // conditions ("0", "1", ...) by Object.entries.
    if (shorthand === null || typeof shorthand !== "object" || Array.isArray(shorthand)) {
      throw new TypeError("Shorthand filter must be a plain object");
    }
    const entries = Object.entries(shorthand);
    if (entries.length === 0) {
      throw new Error("Cannot convert empty shorthand filter object");
    }
    const conditions = entries.map(([key, value]) => {
      const isPrefixed = key.startsWith("__");
      const separatorIndex = key.lastIndexOf("__");
      // A prefixed key needs "__" plus at least one character before the
      // separator; an unprefixed key only needs a non-empty field part.
      const hasSuffix = isPrefixed ? separatorIndex > 2 : separatorIndex > 0;
      let field = key;
      let op = "eq";
      if (hasSuffix) {
        const suffix = key.substring(separatorIndex + 2);
        if (VALID_OPERATORS.includes(suffix)) {
          field = key.substring(0, separatorIndex);
          op = suffix;
        } else if (!isPrefixed) {
          throw new Error(`Invalid filter operator in shorthand: ${suffix}`);
        }
        // Prefixed key with an unknown suffix: keep the whole key as the field.
      }
      return { field, op, value };
    });
    return conditions.length === 1 ? conditions[0] : { and: conditions };
  }
};
|
|
291
|
+
|
|
292
|
+
// src/adapters/vector-db-adapter.ts
|
|
293
|
+
/**
 * Base class for vector database adapters.
 *
 * The part visible here only provides capability flags. Each flag defaults
 * to `false`; a concrete adapter overrides the ones its backend supports.
 */
var VectorDBAdapter = class {
  // --------------------------------------------------------------------------
  // Capability flags (conservative defaults)
  // --------------------------------------------------------------------------

  /**
   * Can metadata be updated without re-uploading the vector?
   *
   * @returns false by default (the entire record must be re-uploaded);
   *          override to return true if the DB supports partial updates
   */
  supportsMetadataUpdate() {
    const supported = false;
    return supported;
  }

  /**
   * Can search results be filtered by metadata?
   *
   * @returns false by default (no filtering support); override to return
   *          true if the DB supports metadata filtering
   */
  supportsFiltering() {
    const supported = false;
    return supported;
  }

  /**
   * Are batch operations handled efficiently?
   *
   * @returns false by default (single operations only); override to return
   *          true if the DB supports batch upsert/delete
   */
  supportsBatchOperations() {
    const supported = false;
    return supported;
  }
};
|
|
325
|
+
|
|
326
|
+
// src/query/filter-builder.ts
|
|
327
|
+
/**
 * Collects up to three filters (vertical, horizontal, custom) and merges
 * them into one. Every filter is passed through `FilterTranslator.normalize`
 * when it is set; setting the same slot again replaces the previous value.
 */
var FilterBuilder = class {
  verticalFilter;
  horizontalFilter;
  customFilter;

  /**
   * Set the vertical (document-level) filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withVerticalFilter(filter) {
    const normalized = FilterTranslator.normalize(filter);
    this.verticalFilter = normalized;
    return this;
  }

  /**
   * Set the horizontal (theme-level) filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withHorizontalFilter(filter) {
    const normalized = FilterTranslator.normalize(filter);
    this.horizontalFilter = normalized;
    return this;
  }

  /**
   * Set the custom user-defined filter.
   *
   * @param filter - Standard or shorthand filter
   * @returns This builder for method chaining
   */
  withCustomFilter(filter) {
    const normalized = FilterTranslator.normalize(filter);
    this.customFilter = normalized;
    return this;
  }

  /**
   * Merge the filters that have been set, in the order vertical,
   * horizontal, custom.
   *
   * @returns `undefined` when nothing was set, the single filter when only
   *          one was set, otherwise `{ and: [...] }` over all of them
   */
  build() {
    const active = [this.verticalFilter, this.horizontalFilter, this.customFilter].filter(Boolean);
    switch (active.length) {
      case 0:
        return void 0;
      case 1:
        return active[0];
      default:
        return { and: active };
    }
  }
};
|
|
391
|
+
|
|
392
|
+
// src/query/rag-query-composer.ts
|
|
393
|
+
/**
 * Runs semantic retrieval: embeds the query text, combines the optional
 * vertical / horizontal / custom filters, and searches through the adapter.
 */
var RAGQueryComposer = class {
  /**
   * Create a new RAGQueryComposer.
   *
   * @param adapter - Vector database adapter; its `search(collection, vector, options)` is called
   * @param embedder - Embedder; its `embed(text)` is called with the query
   */
  constructor(adapter, embedder) {
    this.adapter = adapter;
    this.embedder = embedder;
  }

  /**
   * Main retrieval method.
   *
   * 1. Embeds `params.query` using the embedder
   * 2. Builds a combined filter using FilterBuilder
   * 3. Calls `adapter.search()` with the query vector and filter
   * 4. Returns the records together with the filters as supplied
   *
   * @param params - Retrieval parameters (`collection`, `query`, `topK`,
   *        optional `verticalFilters`, `horizontalFilters`, `customFilters`,
   *        `includeEmbeddings`)
   * @returns `{ records, query, filtersApplied }`; `filtersApplied` only has
   *          keys for the filters that were provided
   */
  async retrieve(params) {
    const queryVector = await this.embedder.embed(params.query);
    const filterBuilder = new FilterBuilder();
    if (params.verticalFilters) {
      filterBuilder.withVerticalFilter(params.verticalFilters);
    }
    if (params.horizontalFilters) {
      filterBuilder.withHorizontalFilter(params.horizontalFilters);
    }
    if (params.customFilters) {
      filterBuilder.withCustomFilter(params.customFilters);
    }
    const combinedFilter = filterBuilder.build();
    const searchResult = await this.adapter.search(params.collection, queryVector, {
      topK: params.topK,
      filter: combinedFilter,
      includeMetadata: true,
      includeValues: params.includeEmbeddings
    });
    return {
      records: searchResult.records,
      query: params.query,
      filtersApplied: {
        ...params.verticalFilters && { vertical: params.verticalFilters },
        ...params.horizontalFilters && { horizontal: params.horizontalFilters },
        ...params.customFilters && { custom: params.customFilters }
      }
    };
  }

  /**
   * Retrieve and group results by document ID.
   *
   * Calls retrieve() and organizes results into a Map keyed by __v_doc_id.
   * Records without a string doc_id (including records with no metadata at
   * all) are excluded.
   *
   * @param params - Retrieval parameters
   * @returns Map of document ID to array of records
   */
  async retrieveVertical(params) {
    const { records } = await this.retrieve(params);
    return this.#groupByMetadataField(records, VerticalFields.DOC_ID);
  }

  /**
   * Retrieve and group results by theme.
   *
   * Calls retrieve() and organizes results into a Map keyed by __h_theme.
   * Records without a string theme (including records with no metadata at
   * all) are excluded.
   *
   * @param params - Retrieval parameters
   * @returns Map of theme to array of records
   */
  async retrieveHorizontal(params) {
    const { records } = await this.retrieve(params);
    return this.#groupByMetadataField(records, HorizontalFields.THEME);
  }

  /**
   * Bucket records by the string value of one metadata field, keeping the
   * order in which records (and first-seen keys) appear.
   */
  #groupByMetadataField(records, fieldName) {
    const grouped = /* @__PURE__ */ new Map();
    for (const record of records) {
      // Optional chaining: a record with no metadata is skipped, not a crash.
      const groupKey = record?.metadata?.[fieldName];
      if (typeof groupKey !== "string") {
        continue;
      }
      const bucket = grouped.get(groupKey);
      if (bucket) {
        bucket.push(record);
      } else {
        grouped.set(groupKey, [record]);
      }
    }
    return grouped;
  }
};
|
|
496
|
+
|
|
497
|
+
// src/embedders/embedder.ts
|
|
498
|
+
/**
 * Abstract base for embedders. Only the instantiation guard is defined in
 * this class body.
 */
var Embedder = class _Embedder {
  /**
   * Rejects `new Embedder()` while still letting subclasses call `super()`:
   * `new.target` is the class actually being constructed, so it equals this
   * class only on direct instantiation.
   *
   * @throws {Error} When instantiated directly rather than via a subclass
   */
  constructor() {
    const isDirectInstantiation = new.target === _Embedder;
    if (isDirectInstantiation) {
      throw new Error("Cannot instantiate abstract class Embedder directly");
    }
  }
};
|
|
509
|
+
|
|
510
|
+
// src/llm/llm-client.ts
|
|
511
|
+
/**
 * Abstract base for LLM clients. Only the instantiation guard is defined in
 * this class body.
 */
var LLMClient = class _LLMClient {
  /**
   * Rejects `new LLMClient()` while still letting subclasses call `super()`:
   * `new.target` is the class actually being constructed, so it equals this
   * class only on direct instantiation.
   *
   * @throws {Error} When instantiated directly rather than via a subclass
   */
  constructor() {
    const isDirectInstantiation = new.target === _LLMClient;
    if (isDirectInstantiation) {
      throw new Error("Cannot instantiate abstract class LLMClient directly");
    }
  }
};
|
|
522
|
+
|
|
523
|
+
// src/llm/mock-llm.ts
|
|
524
|
+
/**
 * LLM client test double that answers every request with one canned
 * response. The prompt and options arguments are accepted and ignored.
 */
var MockLLM = class extends LLMClient {
  /** Canned response; the empty string until `setResponse` is called. */
  _response = "";

  // No explicit constructor: the implicit one forwards to super(), which is
  // all the previous pass-through constructor did.

  /** @returns The fixed model name "mock-llm-v1" */
  get modelName() {
    return "mock-llm-v1";
  }

  /** @returns The fixed provider name "mock" */
  get provider() {
    return "mock";
  }

  /**
   * Set the canned response that will be returned by generate methods.
   *
   * @param response - The response text to return
   */
  setResponse(response) {
    this._response = response;
  }

  /**
   * @param prompt - Ignored
   * @param options - Ignored
   * @returns The canned response
   */
  async generate(prompt, options) {
    return this._response;
  }

  /**
   * Parse the canned response as JSON.
   *
   * @param prompt - Ignored
   * @param options - Ignored
   * @returns The parsed value
   * @throws {Error} If the canned response is not valid JSON; the original
   *         parse error is attached as `cause`
   */
  async generateJSON(prompt, options) {
    try {
      return JSON.parse(this._response);
    } catch (error) {
      const reason = error instanceof Error ? error.message : "unknown error";
      // Keep the underlying parse error reachable for debugging.
      throw new Error(`Failed to parse mock response as JSON: ${reason}`, { cause: error });
    }
  }

  /**
   * @param prompts - Prompts; only their count is used
   * @param options - Ignored
   * @returns One copy of the canned response per prompt
   */
  async generateBatch(prompts, options) {
    return prompts.map(() => this._response);
  }
};
|
|
559
|
+
|
|
560
|
+
// src/enrichment/classifiers/keyword-classifier.ts
|
|
561
|
+
/**
 * Theme classifier that counts whole-keyword occurrences per theme and
 * picks the theme with the most matches.
 */
var KeywordThemeClassifier = class {
  /** Map of theme -> compiled keyword patterns. */
  patterns;
  /** Map of theme -> number of usable (non-empty) keywords. */
  keywordCounts;

  /**
   * Creates a new KeywordThemeClassifier
   *
   * @param themes - Array of theme names
   * @param keywords - Map of theme names to their keyword arrays; a theme
   *        with no entry gets no keywords
   * @param caseSensitive - Whether matching should be case sensitive (default: false)
   */
  constructor(themes, keywords, caseSensitive = false) {
    this.themes = themes;
    this.caseSensitive = caseSensitive;
    this.patterns = /* @__PURE__ */ new Map();
    this.keywordCounts = /* @__PURE__ */ new Map();
    const flags = caseSensitive ? "g" : "gi";
    for (const theme of themes) {
      // An empty keyword would match at every boundary and swamp the score.
      const usableKeywords = (keywords[theme] || []).filter((keyword) => keyword !== "");
      this.keywordCounts.set(theme, usableKeywords.length);
      const patterns = usableKeywords.map((keyword) => {
        const escapedKeyword = this.escapeRegex(keyword);
        // Lookarounds instead of \b: \b needs a word character on one side,
        // so keywords such as "C++" or ".NET" could never match. For
        // keywords made of word characters the two forms are equivalent.
        return new RegExp(`(?<!\\w)${escapedKeyword}(?!\\w)`, flags);
      });
      this.patterns.set(theme, patterns);
    }
  }

  /**
   * Classify a single text
   *
   * The winning theme is the one with the most keyword matches (the first
   * such theme in `themes` order on a tie). Confidence is
   * `matches / keywords configured for the winning theme`, capped at 1.
   *
   * @param text - Text to classify
   * @returns `{ theme, confidence, allScores }`; theme is "unknown" with
   *          confidence 0 for empty/whitespace text or when nothing matches
   */
  classify(text) {
    if (!text || text.trim().length === 0) {
      return {
        theme: "unknown",
        confidence: 0,
        allScores: {}
      };
    }
    const scores = {};
    let maxScore = 0;
    let winningTheme = "unknown";
    for (const theme of this.themes) {
      const patterns = this.patterns.get(theme) || [];
      let matchCount = 0;
      for (const pattern of patterns) {
        // String#match with a global regex returns every match, or null.
        matchCount += text.match(pattern)?.length ?? 0;
      }
      scores[theme] = matchCount;
      // Strict comparison keeps the earliest theme on ties.
      if (matchCount > maxScore) {
        maxScore = matchCount;
        winningTheme = theme;
      }
    }
    if (maxScore === 0) {
      return {
        theme: "unknown",
        confidence: 0,
        allScores: scores
      };
    }
    const totalKeywords = this.keywordCounts.get(winningTheme) || 1;
    return {
      theme: winningTheme,
      // Repeated matches can exceed the keyword count, so cap at 1.0.
      confidence: Math.min(maxScore / totalKeywords, 1),
      allScores: scores
    };
  }

  /**
   * Classify multiple texts in batch
   *
   * @param texts - Array of texts to classify
   * @returns Array of classification results, in input order
   */
  classifyBatch(texts) {
    return texts.map((text) => this.classify(text));
  }

  /**
   * Escape special regex characters in a string
   *
   * @param str - String to escape
   * @returns Escaped string safe for use in regex
   */
  escapeRegex(str) {
    return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }
};
|
|
650
|
+
|
|
651
|
+
// src/enrichment/classifiers/zero-shot-classifier.ts
|
|
652
|
+
/**
 * Theme classifier backed by a zero-shot-classification pipeline from
 * `@xenova/transformers`, loaded lazily on first use.
 */
var ZeroShotThemeClassifier = class {
  /** Loaded pipeline, or null until the first successful load. */
  model = null;
  /** In-flight load promise, shared by concurrent callers; null otherwise. */
  modelLoading = null;
  modelName;
  themes;

  /**
   * Creates a new ZeroShotThemeClassifier
   *
   * @param themes - Array of theme labels to classify into
   * @param modelName - Name of the Hugging Face model to use (default: 'Xenova/distilbert-base-uncased-mnli')
   *
   * @example
   * ```typescript
   * // Use default model
   * const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
   *
   * // Use custom model
   * const classifier = new ZeroShotThemeClassifier(
   *   ['positive', 'negative'],
   *   'Xenova/distilbert-base-uncased-mnli'
   * );
   * ```
   */
  constructor(themes, modelName = "Xenova/distilbert-base-uncased-mnli") {
    this.themes = themes;
    this.modelName = modelName;
  }

  /**
   * Lazy loads the zero-shot classification model.
   *
   * Calls that arrive while a load is in flight share that load instead of
   * starting another. A failed load is not cached, so a later call retries.
   *
   * @returns Promise that resolves to the loaded pipeline
   */
  async ensureModelLoaded() {
    if (this.model) {
      return this.model;
    }
    if (!this.modelLoading) {
      this.modelLoading = (async () => {
        const { pipeline } = await import("@xenova/transformers");
        return pipeline("zero-shot-classification", this.modelName);
      })();
    }
    const pending = this.modelLoading;
    try {
      this.model = await pending;
    } finally {
      // Clear only our own promise so a newer retry is not clobbered.
      if (this.modelLoading === pending) {
        this.modelLoading = null;
      }
    }
    return this.model;
  }

  /**
   * Classify a single text into one of the provided themes
   *
   * Empty or whitespace-only text is not sent to the model: the first theme
   * is returned with a uniform score of `1 / themes.length` for every theme.
   *
   * @param text - The text content to classify
   * @returns A promise that resolves to `{ theme, confidence, allScores }`,
   *          taken from the first label/score pair of the model output
   *
   * @example
   * ```typescript
   * const classifier = new ZeroShotThemeClassifier(['technology', 'sports']);
   * const result = await classifier.classify('Machine learning and AI');
   * console.log(result.theme); // 'technology'
   * console.log(result.confidence); // 0.92
   * console.log(result.allScores); // { technology: 0.92, sports: 0.08 }
   * ```
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      const uniformScore = 1 / this.themes.length;
      const uniformScores = {};
      for (const theme of this.themes) {
        uniformScores[theme] = uniformScore;
      }
      return {
        theme: this.themes[0],
        confidence: uniformScore,
        allScores: uniformScores
      };
    }
    const model = await this.ensureModelLoaded();
    const result = await model(text, this.themes);
    const allScores = {};
    result.labels.forEach((label, i) => {
      allScores[label] = result.scores[i];
    });
    return {
      // NOTE(review): relies on the pipeline listing the best label first - confirm.
      theme: result.labels[0],
      confidence: result.scores[0],
      allScores
    };
  }

  /**
   * Classify multiple texts
   *
   * The model is loaded up front, then texts are classified one at a time
   * in input order.
   *
   * @param texts - Array of text contents to classify
   * @returns A promise that resolves to an array of theme classifications
   *
   * @example
   * ```typescript
   * const classifier = new ZeroShotThemeClassifier(['technology', 'sports', 'finance']);
   * const results = await classifier.classifyBatch([
   *   'Machine learning is transforming AI',
   *   'The football team won the championship',
   *   'Stock market hits record high'
   * ]);
   * // results[0].theme === 'technology'
   * // results[1].theme === 'sports'
   * // results[2].theme === 'finance'
   * ```
   */
  async classifyBatch(texts) {
    await this.ensureModelLoaded();
    const results = [];
    for (const text of texts) {
      results.push(await this.classify(text));
    }
    return results;
  }
};
|
|
764
|
+
|
|
765
|
+
// src/enrichment/classifiers/embedding-classifier.ts
|
|
766
|
+
var EmbeddingThemeClassifier = class {
  /** Map of theme label -> embedding vector; null until precomputed or loaded. */
  themeEmbeddings = null;
  embedder;
  themes;
  /** In-flight theme-embedding load shared by concurrent callers; null when idle. */
  pendingThemeEmbeddings = null;
  /**
   * Creates a new EmbeddingThemeClassifier
   *
   * @param themes - Array of theme labels to classify into
   * @param embedder - Embedder exposing `embed(text)` and `embedBatch(texts)`
   * @param precomputedEmbeddings - Optional map of theme label -> embedding; when
   *   given, theme embeddings are never requested from the embedder
   *
   * @example
   * ```typescript
   * // Lazy initialization
   * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
   *
   * // With precomputed embeddings
   * const themeEmbeddings = {
   *   technology: await embedder.embed('technology'),
   *   sports: await embedder.embed('sports')
   * };
   * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder, themeEmbeddings);
   * ```
   */
  constructor(themes, embedder, precomputedEmbeddings) {
    this.themes = themes;
    this.embedder = embedder;
    this.themeEmbeddings = precomputedEmbeddings || null;
  }
  /**
   * Lazily loads the theme embeddings on first use.
   *
   * The cache is only assigned after the embedder call succeeds, so a failed
   * load can be retried by the next call instead of leaving an empty map behind.
   * Concurrent callers share a single in-flight load.
   *
   * @returns Promise that resolves to the theme embeddings map
   */
  async ensureThemeEmbeddings() {
    if (this.themeEmbeddings) {
      return this.themeEmbeddings;
    }
    if (!this.pendingThemeEmbeddings) {
      this.pendingThemeEmbeddings = this.loadThemeEmbeddings();
    }
    const pending = this.pendingThemeEmbeddings;
    try {
      return await pending;
    } finally {
      // Clear only our own load so a newer retry started meanwhile is kept.
      if (this.pendingThemeEmbeddings === pending) {
        this.pendingThemeEmbeddings = null;
      }
    }
  }
  /**
   * Embed every theme label in one batch call and publish the resulting map.
   *
   * @returns Promise that resolves to the freshly built theme embeddings map
   */
  async loadThemeEmbeddings() {
    const embeddings = await this.embedder.embedBatch(this.themes);
    const byTheme = {};
    for (let i = 0; i < this.themes.length; i++) {
      byTheme[this.themes[i]] = embeddings[i];
    }
    this.themeEmbeddings = byTheme;
    return byTheme;
  }
  /**
   * Compute cosine similarity between two vectors:
   * dotProduct / (normA * normB), in the range [-1, 1].
   *
   * @param a - First vector
   * @param b - Second vector
   * @returns Cosine similarity, or 0 when either vector has zero magnitude
   * @throws Error when the vectors differ in length
   */
  cosineSimilarity(a, b) {
    if (a.length !== b.length) {
      throw new Error("Vectors must have the same length for cosine similarity");
    }
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    normA = Math.sqrt(normA);
    normB = Math.sqrt(normB);
    // A zero vector has no direction; report "no similarity" instead of NaN.
    if (normA === 0 || normB === 0) {
      return 0;
    }
    return dotProduct / (normA * normB);
  }
  /**
   * Map a cosine similarity from [-1, 1] onto a confidence score in [0, 1]
   * using the linear transformation (similarity + 1) / 2.
   *
   * @param similarity - Cosine similarity value in range [-1, 1]
   * @returns Confidence score in range [0, 1]
   */
  normalizeToConfidence(similarity) {
    return (similarity + 1) / 2;
  }
  /**
   * Build the result used for blank input: every theme gets the same score
   * (1 / number of themes) and the first theme is reported as the winner.
   *
   * @returns Theme classification with uniform scores
   */
  uniformResult() {
    const uniformScore = 1 / this.themes.length;
    const allScores = {};
    for (const theme of this.themes) {
      allScores[theme] = uniformScore;
    }
    return {
      theme: this.themes[0],
      confidence: uniformScore,
      allScores
    };
  }
  /**
   * Score one text embedding against every theme embedding.
   *
   * @param textEmbedding - Embedding of the text being classified
   * @param themeEmbeddings - Map of theme label -> embedding
   * @returns Best theme, its confidence, and the confidence of every theme
   */
  scoreEmbedding(textEmbedding, themeEmbeddings) {
    let maxSimilarity = -Infinity;
    let winningTheme = this.themes[0];
    const allScores = {};
    for (const theme of this.themes) {
      const similarity = this.cosineSimilarity(textEmbedding, themeEmbeddings[theme]);
      allScores[theme] = this.normalizeToConfidence(similarity);
      // Strict comparison keeps the earliest theme on ties.
      if (similarity > maxSimilarity) {
        maxSimilarity = similarity;
        winningTheme = theme;
      }
    }
    return {
      theme: winningTheme,
      confidence: this.normalizeToConfidence(maxSimilarity),
      allScores
    };
  }
  /**
   * Classify a single text into one of the provided themes
   *
   * Blank text is not embedded; it yields the uniform result instead.
   *
   * @param text - The text content to classify
   * @returns A promise that resolves to the theme classification result
   *
   * @example
   * ```typescript
   * const classifier = new EmbeddingThemeClassifier(['technology', 'sports'], embedder);
   * const result = await classifier.classify('Machine learning and AI');
   * console.log(result.theme); // 'technology'
   * console.log(result.confidence); // 0.92
   * console.log(result.allScores); // { technology: 0.92, sports: 0.45 }
   * ```
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      return this.uniformResult();
    }
    const themeEmbeddings = await this.ensureThemeEmbeddings();
    const textEmbedding = await this.embedder.embed(text);
    return this.scoreEmbedding(textEmbedding, themeEmbeddings);
  }
  /**
   * Classify multiple texts efficiently
   *
   * Theme embeddings are loaded once, then all non-blank texts are embedded
   * with a single `embedBatch` call. Blank texts receive the uniform result
   * without being sent to the embedder. Results keep the input order.
   *
   * @param texts - Array of text contents to classify
   * @returns A promise that resolves to an array of theme classifications
   * @throws Error when the embedder returns a different number of embeddings
   *   than texts it was given
   *
   * @example
   * ```typescript
   * const classifier = new EmbeddingThemeClassifier(['technology', 'sports', 'finance'], embedder);
   * const results = await classifier.classifyBatch([
   *   'Machine learning is transforming AI',
   *   'The football team won the championship',
   *   'Stock market hits record high'
   * ]);
   * // results[0].theme === 'technology'
   * // results[1].theme === 'sports'
   * // results[2].theme === 'finance'
   * ```
   */
  async classifyBatch(texts) {
    const themeEmbeddings = await this.ensureThemeEmbeddings();
    const results = new Array(texts.length);
    const positions = [];
    const nonBlankTexts = [];
    texts.forEach((text, index) => {
      if (!text || text.trim().length === 0) {
        results[index] = this.uniformResult();
      } else {
        positions.push(index);
        nonBlankTexts.push(text);
      }
    });
    if (nonBlankTexts.length > 0) {
      const embeddings = await this.embedder.embedBatch(nonBlankTexts);
      if (!Array.isArray(embeddings) || embeddings.length !== nonBlankTexts.length) {
        throw new Error("Embedder returned an unexpected number of embeddings for the batch");
      }
      positions.forEach((index, k) => {
        results[index] = this.scoreEmbedding(embeddings[k], themeEmbeddings);
      });
    }
    return results;
  }
};
|
|
939
|
+
|
|
940
|
+
// src/enrichment/classifiers/llm-classifier.ts
|
|
941
|
+
var DEFAULT_PROMPT_TEMPLATE = `You are a theme classification system. Classify the following text into one of the provided themes.
|
|
942
|
+
|
|
943
|
+
Available themes: {themes}
|
|
944
|
+
|
|
945
|
+
Text to classify:
|
|
946
|
+
{text}
|
|
947
|
+
|
|
948
|
+
Return a JSON object with the following structure:
|
|
949
|
+
- theme: the most appropriate theme from the list (string)
|
|
950
|
+
- confidence: confidence score between 0 and 1 (number)
|
|
951
|
+
- allScores: an object mapping each theme to its confidence score (object)
|
|
952
|
+
|
|
953
|
+
Return only valid JSON, no additional text.`;
|
|
954
|
+
var LLMThemeClassifier = class {
  themes;
  llm;
  promptTemplate;
  /**
   * Creates a new LLMThemeClassifier
   *
   * @param themes - Array of theme labels to classify into
   * @param llm - LLM client exposing `generateJSON(prompt)`
   * @param promptTemplate - Optional custom prompt template with {themes} and {text} placeholders
   *
   * @example
   * ```typescript
   * const classifier = new LLMThemeClassifier(
   *   ['technology', 'sports', 'finance'],
   *   llm
   * );
   * ```
   *
   * @example With custom prompt
   * ```typescript
   * const customTemplate = `Classify: {text}\nThemes: {themes}\nReturn JSON.`;
   * const classifier = new LLMThemeClassifier(
   *   ['technology', 'sports'],
   *   llm,
   *   customTemplate
   * );
   * ```
   */
  constructor(themes, llm, promptTemplate = DEFAULT_PROMPT_TEMPLATE) {
    this.themes = themes;
    this.llm = llm;
    this.promptTemplate = promptTemplate;
  }
  /**
   * Build the classification prompt by filling in the template placeholders.
   *
   * Every {themes} and {text} placeholder in the template is replaced in a
   * single pass. A replacer function is used so the inserted text is taken
   * verbatim: sequences such as `$&` or `$$` in the text are not treated as
   * replacement patterns, and inserted values are never re-scanned for
   * placeholders.
   *
   * @param text - The text to classify
   * @returns The complete prompt with placeholders replaced
   */
  buildPrompt(text) {
    const themesStr = this.themes.join(", ");
    return this.promptTemplate.replace(
      /\{(themes|text)\}/g,
      (_placeholder, key) => key === "themes" ? themesStr : text
    );
  }
  /**
   * Classify a single text into one of the provided themes
   *
   * Blank text is not sent to the LLM: it yields the first theme with a
   * uniform score of 1 / (number of themes) for every theme. Otherwise the
   * value resolved by `llm.generateJSON` is returned as-is.
   *
   * @param text - The text content to classify
   * @returns A promise that resolves to the theme classification result
   * @throws Error with message "Failed to classify text with LLM: ..." when the
   *   LLM call fails; the original Error is attached as `cause`
   *
   * @example
   * ```typescript
   * const classifier = new LLMThemeClassifier(['technology', 'sports'], llm);
   * const result = await classifier.classify('Machine learning and AI');
   * console.log(result.theme); // 'technology'
   * console.log(result.confidence); // 0.95
   * console.log(result.allScores); // { technology: 0.95, sports: 0.05 }
   * ```
   */
  async classify(text) {
    if (!text || text.trim().length === 0) {
      const uniformScore = 1 / this.themes.length;
      const allScores = {};
      for (const theme of this.themes) {
        allScores[theme] = uniformScore;
      }
      return {
        theme: this.themes[0],
        // Return first theme
        confidence: uniformScore,
        allScores
      };
    }
    const prompt = this.buildPrompt(text);
    try {
      const result = await this.llm.generateJSON(prompt);
      return result;
    } catch (error) {
      const message = `Failed to classify text with LLM: ${error instanceof Error ? error.message : "unknown error"}`;
      const classificationError = new Error(message);
      if (error instanceof Error) {
        classificationError.cause = error;
      }
      throw classificationError;
    }
  }
  /**
   * Classify multiple texts sequentially
   *
   * Texts are processed one at a time, in order; the first failure rejects the
   * whole batch.
   *
   * @param texts - Array of text contents to classify
   * @returns A promise that resolves to an array of theme classifications
   *
   * @example
   * ```typescript
   * const classifier = new LLMThemeClassifier(['technology', 'sports', 'finance'], llm);
   * const results = await classifier.classifyBatch([
   *   'Machine learning is transforming AI',
   *   'The football team won the championship',
   *   'Stock market hits record high'
   * ]);
   * // results[0].theme === 'technology'
   * // results[1].theme === 'sports'
   * // results[2].theme === 'finance'
   * ```
   */
  async classifyBatch(texts) {
    const results = [];
    for (const text of texts) {
      const result = await this.classify(text);
      results.push(result);
    }
    return results;
  }
};
|
|
1071
|
+
|
|
1072
|
+
// src/enrichment/enrichment-pipeline.ts
|
|
1073
|
+
var EnrichmentPipeline = class {
|
|
1074
|
+
/**
|
|
1075
|
+
* Create a new enrichment pipeline.
|
|
1076
|
+
*
|
|
1077
|
+
* @param adapter - Vector database adapter for reading/writing records
|
|
1078
|
+
* @param embedder - Optional embedder for embedding-based enrichment
|
|
1079
|
+
* @param llm - Optional LLM client for automatic enrichment
|
|
1080
|
+
*/
|
|
1081
|
+
constructor(adapter, embedder, llm) {
|
|
1082
|
+
this.adapter = adapter;
|
|
1083
|
+
this.embedder = embedder;
|
|
1084
|
+
this.llm = llm;
|
|
1085
|
+
}
|
|
1086
|
+
/**
|
|
1087
|
+
* Enrich records with vertical classifications.
|
|
1088
|
+
*
|
|
1089
|
+
* Supports three strategies:
|
|
1090
|
+
* 1. Field mapping: Map existing field values to verticals
|
|
1091
|
+
* 2. Custom extractor: Use a custom function to extract verticals
|
|
1092
|
+
* 3. Automatic LLM: Use an LLM to classify documents
|
|
1093
|
+
*
|
|
1094
|
+
* @param collection - Name of the collection to enrich
|
|
1095
|
+
* @param config - Vertical enrichment configuration
|
|
1096
|
+
* @returns Statistics about the enrichment operation
|
|
1097
|
+
*
|
|
1098
|
+
* @example
|
|
1099
|
+
* ```typescript
|
|
1100
|
+
* // Field mapping
|
|
1101
|
+
* await pipeline.enrichVertical('docs', {
|
|
1102
|
+
* mapping: { 'tech': 'technology' }
|
|
1103
|
+
* });
|
|
1104
|
+
*
|
|
1105
|
+
* // Custom extractor
|
|
1106
|
+
* await pipeline.enrichVertical('docs', {
|
|
1107
|
+
* extractor: async (doc) => 'technology'
|
|
1108
|
+
* });
|
|
1109
|
+
*
|
|
1110
|
+
* // Automatic LLM
|
|
1111
|
+
* await pipeline.enrichVertical('docs', {
|
|
1112
|
+
* automatic: {
|
|
1113
|
+
* llm: myLLMClient,
|
|
1114
|
+
* fields: ['technology', 'finance']
|
|
1115
|
+
* }
|
|
1116
|
+
* });
|
|
1117
|
+
* ```
|
|
1118
|
+
*/
|
|
1119
|
+
async enrichVertical(collection, config) {
|
|
1120
|
+
const startTime = Date.now();
|
|
1121
|
+
const stats = {
|
|
1122
|
+
recordsProcessed: 0,
|
|
1123
|
+
recordsUpdated: 0,
|
|
1124
|
+
recordsSkipped: 0,
|
|
1125
|
+
timeMs: 0,
|
|
1126
|
+
errors: []
|
|
1127
|
+
};
|
|
1128
|
+
try {
|
|
1129
|
+
if ("mapping" in config) {
|
|
1130
|
+
await this.enrichWithFieldMapping(collection, config, stats);
|
|
1131
|
+
} else if ("extractor" in config) {
|
|
1132
|
+
await this.enrichWithExtractor(collection, config, stats);
|
|
1133
|
+
} else if ("automatic" in config) {
|
|
1134
|
+
await this.enrichWithLLM(collection, config, stats);
|
|
1135
|
+
}
|
|
1136
|
+
} catch (error) {
|
|
1137
|
+
stats.errors?.push(
|
|
1138
|
+
`Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1139
|
+
);
|
|
1140
|
+
}
|
|
1141
|
+
stats.timeMs = Date.now() - startTime;
|
|
1142
|
+
return stats;
|
|
1143
|
+
}
|
|
1144
|
+
/**
|
|
1145
|
+
* Enrich records using field mapping strategy.
|
|
1146
|
+
*
|
|
1147
|
+
* Maps values from an existing field to vertical classifications.
|
|
1148
|
+
*
|
|
1149
|
+
* @param collection - Collection name
|
|
1150
|
+
* @param config - Field mapping configuration
|
|
1151
|
+
* @param stats - Statistics object to update
|
|
1152
|
+
*/
|
|
1153
|
+
async enrichWithFieldMapping(collection, config, stats) {
|
|
1154
|
+
const batchSize = config.batchSize || 100;
|
|
1155
|
+
for await (const batch of this.adapter.iterate(collection, {
|
|
1156
|
+
batchSize,
|
|
1157
|
+
filter: config.filter
|
|
1158
|
+
})) {
|
|
1159
|
+
const updates = [];
|
|
1160
|
+
for (const record of batch) {
|
|
1161
|
+
stats.recordsProcessed++;
|
|
1162
|
+
try {
|
|
1163
|
+
const vertical = this.applyFieldMapping(record, config.mapping);
|
|
1164
|
+
if (vertical) {
|
|
1165
|
+
updates.push({
|
|
1166
|
+
id: record.id,
|
|
1167
|
+
metadata: { vertical }
|
|
1168
|
+
});
|
|
1169
|
+
} else {
|
|
1170
|
+
stats.recordsSkipped++;
|
|
1171
|
+
}
|
|
1172
|
+
} catch (error) {
|
|
1173
|
+
stats.recordsSkipped++;
|
|
1174
|
+
stats.errors?.push(
|
|
1175
|
+
`Error mapping record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1176
|
+
);
|
|
1177
|
+
}
|
|
1178
|
+
}
|
|
1179
|
+
if (updates.length > 0) {
|
|
1180
|
+
try {
|
|
1181
|
+
await this.adapter.updateMetadata(collection, updates);
|
|
1182
|
+
stats.recordsUpdated += updates.length;
|
|
1183
|
+
} catch (error) {
|
|
1184
|
+
stats.errors?.push(
|
|
1185
|
+
`Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1186
|
+
);
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
}
|
|
1191
|
+
/**
|
|
1192
|
+
* Apply field mapping to extract vertical from a record.
|
|
1193
|
+
*
|
|
1194
|
+
* @param record - Vector record
|
|
1195
|
+
* @param mapping - Field mapping configuration
|
|
1196
|
+
* @returns Vertical label or null if no match
|
|
1197
|
+
*/
|
|
1198
|
+
applyFieldMapping(record, mapping) {
|
|
1199
|
+
const category = record.metadata?.category;
|
|
1200
|
+
if (category && typeof category === "string" && category in mapping) {
|
|
1201
|
+
return mapping[category];
|
|
1202
|
+
}
|
|
1203
|
+
return null;
|
|
1204
|
+
}
|
|
1205
|
+
/**
|
|
1206
|
+
* Enrich records using custom extractor strategy.
|
|
1207
|
+
*
|
|
1208
|
+
* Calls the provided extractor function for each record.
|
|
1209
|
+
*
|
|
1210
|
+
* @param collection - Collection name
|
|
1211
|
+
* @param config - Extractor configuration
|
|
1212
|
+
* @param stats - Statistics object to update
|
|
1213
|
+
*/
|
|
1214
|
+
async enrichWithExtractor(collection, config, stats) {
|
|
1215
|
+
const batchSize = config.batchSize || 100;
|
|
1216
|
+
for await (const batch of this.adapter.iterate(collection, {
|
|
1217
|
+
batchSize,
|
|
1218
|
+
filter: config.filter
|
|
1219
|
+
})) {
|
|
1220
|
+
const updates = [];
|
|
1221
|
+
for (const record of batch) {
|
|
1222
|
+
stats.recordsProcessed++;
|
|
1223
|
+
try {
|
|
1224
|
+
const vertical = await config.extractor(record);
|
|
1225
|
+
if (vertical) {
|
|
1226
|
+
updates.push({
|
|
1227
|
+
id: record.id,
|
|
1228
|
+
metadata: { vertical }
|
|
1229
|
+
});
|
|
1230
|
+
} else {
|
|
1231
|
+
stats.recordsSkipped++;
|
|
1232
|
+
}
|
|
1233
|
+
} catch (error) {
|
|
1234
|
+
stats.recordsSkipped++;
|
|
1235
|
+
stats.errors?.push(
|
|
1236
|
+
`Extractor error for record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1237
|
+
);
|
|
1238
|
+
}
|
|
1239
|
+
}
|
|
1240
|
+
if (updates.length > 0) {
|
|
1241
|
+
try {
|
|
1242
|
+
await this.adapter.updateMetadata(collection, updates);
|
|
1243
|
+
stats.recordsUpdated += updates.length;
|
|
1244
|
+
} catch (error) {
|
|
1245
|
+
stats.errors?.push(
|
|
1246
|
+
`Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1247
|
+
);
|
|
1248
|
+
}
|
|
1249
|
+
}
|
|
1250
|
+
}
|
|
1251
|
+
}
|
|
1252
|
+
/**
|
|
1253
|
+
* Enrich records using automatic LLM strategy.
|
|
1254
|
+
*
|
|
1255
|
+
* Uses a language model to classify documents into verticals.
|
|
1256
|
+
*
|
|
1257
|
+
* @param collection - Collection name
|
|
1258
|
+
* @param config - Automatic extraction configuration
|
|
1259
|
+
* @param stats - Statistics object to update
|
|
1260
|
+
*/
|
|
1261
|
+
async enrichWithLLM(collection, config, stats) {
|
|
1262
|
+
const batchSize = config.batchSize || 10;
|
|
1263
|
+
const { llm, fields, promptTemplate, textField } = config.automatic;
|
|
1264
|
+
const fieldName = textField || "content";
|
|
1265
|
+
for await (const batch of this.adapter.iterate(collection, {
|
|
1266
|
+
batchSize,
|
|
1267
|
+
filter: config.filter
|
|
1268
|
+
})) {
|
|
1269
|
+
const updates = [];
|
|
1270
|
+
for (const record of batch) {
|
|
1271
|
+
stats.recordsProcessed++;
|
|
1272
|
+
try {
|
|
1273
|
+
const vertical = await this.extractWithLLM(
|
|
1274
|
+
record,
|
|
1275
|
+
llm,
|
|
1276
|
+
fields,
|
|
1277
|
+
fieldName,
|
|
1278
|
+
promptTemplate
|
|
1279
|
+
);
|
|
1280
|
+
if (vertical) {
|
|
1281
|
+
updates.push({
|
|
1282
|
+
id: record.id,
|
|
1283
|
+
metadata: { vertical }
|
|
1284
|
+
});
|
|
1285
|
+
} else {
|
|
1286
|
+
stats.recordsSkipped++;
|
|
1287
|
+
}
|
|
1288
|
+
} catch (error) {
|
|
1289
|
+
stats.recordsSkipped++;
|
|
1290
|
+
stats.errors?.push(
|
|
1291
|
+
`LLM extraction error for record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1292
|
+
);
|
|
1293
|
+
}
|
|
1294
|
+
}
|
|
1295
|
+
if (updates.length > 0) {
|
|
1296
|
+
try {
|
|
1297
|
+
await this.adapter.updateMetadata(collection, updates);
|
|
1298
|
+
stats.recordsUpdated += updates.length;
|
|
1299
|
+
} catch (error) {
|
|
1300
|
+
stats.errors?.push(
|
|
1301
|
+
`Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1302
|
+
);
|
|
1303
|
+
}
|
|
1304
|
+
}
|
|
1305
|
+
}
|
|
1306
|
+
}
|
|
1307
|
+
/**
|
|
1308
|
+
* Extract vertical classification using LLM.
|
|
1309
|
+
*
|
|
1310
|
+
* @param record - Vector record
|
|
1311
|
+
* @param llm - LLM client
|
|
1312
|
+
* @param fields - Available vertical fields
|
|
1313
|
+
* @param textField - Field name containing text to classify
|
|
1314
|
+
* @param promptTemplate - Optional custom prompt template
|
|
1315
|
+
* @returns Vertical label
|
|
1316
|
+
*/
|
|
1317
|
+
async extractWithLLM(record, llm, fields, textField, promptTemplate) {
|
|
1318
|
+
const text = record.metadata?.[textField];
|
|
1319
|
+
if (!text || typeof text !== "string") {
|
|
1320
|
+
throw new Error(`No text found in field '${textField}'`);
|
|
1321
|
+
}
|
|
1322
|
+
const prompt = promptTemplate ? promptTemplate.replace("{fields}", fields.join(", ")).replace("{text}", text) : `Classify the following text into one of these categories: ${fields.join(", ")}
|
|
1323
|
+
|
|
1324
|
+
Text: ${text}
|
|
1325
|
+
|
|
1326
|
+
Category:`;
|
|
1327
|
+
const result = await llm.generate(prompt);
|
|
1328
|
+
return result.trim();
|
|
1329
|
+
}
|
|
1330
|
+
/**
|
|
1331
|
+
* Enrich records with theme classifications.
|
|
1332
|
+
*
|
|
1333
|
+
* Uses a theme classifier to identify themes in text content and updates
|
|
1334
|
+
* record metadata with theme information. Supports single and multi-theme
|
|
1335
|
+
* classification with configurable confidence thresholds.
|
|
1336
|
+
*
|
|
1337
|
+
* @param collection - Name of the collection to enrich
|
|
1338
|
+
* @param config - Theme enrichment configuration
|
|
1339
|
+
* @returns Statistics about the enrichment operation
|
|
1340
|
+
*
|
|
1341
|
+
* @example
|
|
1342
|
+
* ```typescript
|
|
1343
|
+
* // Single theme classification
|
|
1344
|
+
* await pipeline.enrichThemes('docs', {
|
|
1345
|
+
* themes: ['technology', 'business', 'science'],
|
|
1346
|
+
* classifier: new KeywordThemeClassifier(),
|
|
1347
|
+
* confidenceThreshold: 0.7
|
|
1348
|
+
* });
|
|
1349
|
+
*
|
|
1350
|
+
* // Multi-theme classification
|
|
1351
|
+
* await pipeline.enrichThemes('docs', {
|
|
1352
|
+
* themes: ['technology', 'business', 'science'],
|
|
1353
|
+
* classifier: new LLMThemeClassifier(),
|
|
1354
|
+
* multiTheme: true,
|
|
1355
|
+
* confidenceThreshold: 0.5
|
|
1356
|
+
* });
|
|
1357
|
+
* ```
|
|
1358
|
+
*/
|
|
1359
|
+
async enrichThemes(collection, config) {
|
|
1360
|
+
const startTime = Date.now();
|
|
1361
|
+
const stats = {
|
|
1362
|
+
recordsProcessed: 0,
|
|
1363
|
+
recordsUpdated: 0,
|
|
1364
|
+
recordsSkipped: 0,
|
|
1365
|
+
timeMs: 0,
|
|
1366
|
+
errors: []
|
|
1367
|
+
};
|
|
1368
|
+
try {
|
|
1369
|
+
await this.enrichWithThemeClassifier(collection, config, stats);
|
|
1370
|
+
} catch (error) {
|
|
1371
|
+
stats.errors?.push(
|
|
1372
|
+
`Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1373
|
+
);
|
|
1374
|
+
}
|
|
1375
|
+
stats.timeMs = Date.now() - startTime;
|
|
1376
|
+
return stats;
|
|
1377
|
+
}
|
|
1378
|
+
/**
|
|
1379
|
+
* Enrich records using theme classifier.
|
|
1380
|
+
*
|
|
1381
|
+
* @param collection - Collection name
|
|
1382
|
+
* @param config - Theme enrichment configuration
|
|
1383
|
+
* @param stats - Statistics object to update
|
|
1384
|
+
*/
|
|
1385
|
+
async enrichWithThemeClassifier(collection, config, stats) {
|
|
1386
|
+
const batchSize = config.batchSize || 100;
|
|
1387
|
+
const textField = config.textField || "content";
|
|
1388
|
+
const confidenceThreshold = config.confidenceThreshold ?? 0.5;
|
|
1389
|
+
const multiTheme = config.multiTheme || false;
|
|
1390
|
+
for await (const batch of this.adapter.iterate(collection, {
|
|
1391
|
+
batchSize,
|
|
1392
|
+
filter: config.filter
|
|
1393
|
+
})) {
|
|
1394
|
+
const textsToClassify = [];
|
|
1395
|
+
const recordsToProcess = [];
|
|
1396
|
+
for (const record of batch) {
|
|
1397
|
+
stats.recordsProcessed++;
|
|
1398
|
+
const text = record.text || record.metadata?.[textField];
|
|
1399
|
+
if (!text || typeof text !== "string" || text.trim() === "") {
|
|
1400
|
+
stats.recordsSkipped++;
|
|
1401
|
+
continue;
|
|
1402
|
+
}
|
|
1403
|
+
textsToClassify.push(text);
|
|
1404
|
+
recordsToProcess.push(record);
|
|
1405
|
+
}
|
|
1406
|
+
if (textsToClassify.length === 0) {
|
|
1407
|
+
continue;
|
|
1408
|
+
}
|
|
1409
|
+
let classifications;
|
|
1410
|
+
try {
|
|
1411
|
+
classifications = await config.classifier.classifyBatch(textsToClassify);
|
|
1412
|
+
} catch (error) {
|
|
1413
|
+
stats.errors?.push(
|
|
1414
|
+
`Batch classification error, falling back to individual classification: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1415
|
+
);
|
|
1416
|
+
classifications = [];
|
|
1417
|
+
for (let i = 0; i < textsToClassify.length; i++) {
|
|
1418
|
+
try {
|
|
1419
|
+
const result = await config.classifier.classify(textsToClassify[i]);
|
|
1420
|
+
classifications.push(result);
|
|
1421
|
+
} catch (individualError) {
|
|
1422
|
+
classifications.push(null);
|
|
1423
|
+
stats.errors?.push(
|
|
1424
|
+
`Classification error for record ${recordsToProcess[i].id}: ${individualError instanceof Error ? individualError.message : "unknown error"}`
|
|
1425
|
+
);
|
|
1426
|
+
}
|
|
1427
|
+
}
|
|
1428
|
+
}
|
|
1429
|
+
const updates = [];
|
|
1430
|
+
for (let i = 0; i < recordsToProcess.length; i++) {
|
|
1431
|
+
const record = recordsToProcess[i];
|
|
1432
|
+
const classification = classifications[i];
|
|
1433
|
+
try {
|
|
1434
|
+
if (!classification || typeof classification !== "object") {
|
|
1435
|
+
stats.recordsSkipped++;
|
|
1436
|
+
stats.errors?.push(
|
|
1437
|
+
`Invalid classification for record ${record.id}`
|
|
1438
|
+
);
|
|
1439
|
+
continue;
|
|
1440
|
+
}
|
|
1441
|
+
if (classification.confidence < confidenceThreshold) {
|
|
1442
|
+
stats.recordsSkipped++;
|
|
1443
|
+
continue;
|
|
1444
|
+
}
|
|
1445
|
+
const metadata = {
|
|
1446
|
+
__h_theme: classification.theme,
|
|
1447
|
+
__h_theme_confidence: classification.confidence
|
|
1448
|
+
};
|
|
1449
|
+
if (multiTheme && classification.allScores) {
|
|
1450
|
+
const themes = Object.entries(classification.allScores).filter(([_, score]) => score >= confidenceThreshold).sort(([_, a], [__, b]) => b - a).map(([theme, _]) => theme);
|
|
1451
|
+
if (themes.length > 0) {
|
|
1452
|
+
metadata.__h_themes = themes;
|
|
1453
|
+
}
|
|
1454
|
+
}
|
|
1455
|
+
updates.push({
|
|
1456
|
+
id: record.id,
|
|
1457
|
+
metadata
|
|
1458
|
+
});
|
|
1459
|
+
} catch (error) {
|
|
1460
|
+
stats.recordsSkipped++;
|
|
1461
|
+
stats.errors?.push(
|
|
1462
|
+
`Error processing record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1463
|
+
);
|
|
1464
|
+
}
|
|
1465
|
+
}
|
|
1466
|
+
if (updates.length > 0) {
|
|
1467
|
+
try {
|
|
1468
|
+
await this.adapter.updateMetadata(collection, updates);
|
|
1469
|
+
stats.recordsUpdated += updates.length;
|
|
1470
|
+
} catch (error) {
|
|
1471
|
+
stats.errors?.push(
|
|
1472
|
+
`Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1473
|
+
);
|
|
1474
|
+
}
|
|
1475
|
+
}
|
|
1476
|
+
if (config.onProgress) {
|
|
1477
|
+
config.onProgress(stats);
|
|
1478
|
+
}
|
|
1479
|
+
}
|
|
1480
|
+
}
|
|
1481
|
+
/**
|
|
1482
|
+
* Enrich records with section structure.
|
|
1483
|
+
*
|
|
1484
|
+
* Extracts section metadata from documents using either existing field mappings
|
|
1485
|
+
* or automatic detection strategies (markdown, HTML, or pattern-based).
|
|
1486
|
+
*
|
|
1487
|
+
* @param collection - Name of the collection to enrich
|
|
1488
|
+
* @param config - Section enrichment configuration
|
|
1489
|
+
* @returns Statistics about the enrichment operation
|
|
1490
|
+
*
|
|
1491
|
+
* @example
|
|
1492
|
+
* ```typescript
|
|
1493
|
+
* // Use existing section field
|
|
1494
|
+
* await pipeline.enrichSections('docs', {
|
|
1495
|
+
* existingField: 'section_path'
|
|
1496
|
+
* });
|
|
1497
|
+
*
|
|
1498
|
+
* // Auto-detect sections
|
|
1499
|
+
* await pipeline.enrichSections('docs', {
|
|
1500
|
+
* autoDetect: true
|
|
1501
|
+
* });
|
|
1502
|
+
* ```
|
|
1503
|
+
*/
|
|
1504
|
+
async enrichSections(collection, config) {
|
|
1505
|
+
const startTime = Date.now();
|
|
1506
|
+
const stats = {
|
|
1507
|
+
recordsProcessed: 0,
|
|
1508
|
+
recordsUpdated: 0,
|
|
1509
|
+
recordsSkipped: 0,
|
|
1510
|
+
timeMs: 0,
|
|
1511
|
+
errors: []
|
|
1512
|
+
};
|
|
1513
|
+
try {
|
|
1514
|
+
await this.enrichWithSectionDetection(collection, config, stats);
|
|
1515
|
+
} catch (error) {
|
|
1516
|
+
stats.errors?.push(
|
|
1517
|
+
`Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1518
|
+
);
|
|
1519
|
+
}
|
|
1520
|
+
stats.timeMs = Date.now() - startTime;
|
|
1521
|
+
return stats;
|
|
1522
|
+
}
|
|
1523
|
+
/**
|
|
1524
|
+
* Enrich records with all enrichment types.
|
|
1525
|
+
*
|
|
1526
|
+
* Runs vertical, theme, and section enrichment sequentially with shared
|
|
1527
|
+
* configuration. Global filters and batch sizes apply to all operations.
|
|
1528
|
+
*
|
|
1529
|
+
* @param collection - Name of the collection to enrich
|
|
1530
|
+
* @param config - Combined enrichment configuration
|
|
1531
|
+
* @returns Statistics about the enrichment operation
|
|
1532
|
+
*
|
|
1533
|
+
* @example
|
|
1534
|
+
* ```typescript
|
|
1535
|
+
* await pipeline.enrichAll('docs', {
|
|
1536
|
+
* vertical: { mapping: { tech: 'technology' } },
|
|
1537
|
+
* themes: { themes: ['innovation'], classifier },
|
|
1538
|
+
* sections: { autoDetect: true },
|
|
1539
|
+
* filter: { field: 'status', op: 'eq', value: 'pending' },
|
|
1540
|
+
* batchSize: 50
|
|
1541
|
+
* });
|
|
1542
|
+
* ```
|
|
1543
|
+
*/
|
|
1544
|
+
async enrichAll(collection, config) {
|
|
1545
|
+
const startTime = Date.now();
|
|
1546
|
+
const aggregateStats = {
|
|
1547
|
+
recordsProcessed: 0,
|
|
1548
|
+
recordsUpdated: 0,
|
|
1549
|
+
recordsSkipped: 0,
|
|
1550
|
+
timeMs: 0,
|
|
1551
|
+
errors: []
|
|
1552
|
+
};
|
|
1553
|
+
try {
|
|
1554
|
+
if (config.vertical) {
|
|
1555
|
+
const verticalConfig = this.applyGlobalConfig(config.vertical, config);
|
|
1556
|
+
const stats = await this.enrichVertical(collection, verticalConfig);
|
|
1557
|
+
this.mergeStats(aggregateStats, stats);
|
|
1558
|
+
if (config.onProgress) {
|
|
1559
|
+
config.onProgress(aggregateStats);
|
|
1560
|
+
}
|
|
1561
|
+
}
|
|
1562
|
+
if (config.themes) {
|
|
1563
|
+
const themesConfig = this.applyGlobalConfig(config.themes, config);
|
|
1564
|
+
const stats = await this.enrichThemes(collection, themesConfig);
|
|
1565
|
+
this.mergeStats(aggregateStats, stats);
|
|
1566
|
+
if (config.onProgress) {
|
|
1567
|
+
config.onProgress(aggregateStats);
|
|
1568
|
+
}
|
|
1569
|
+
}
|
|
1570
|
+
if (config.sections) {
|
|
1571
|
+
const sectionsConfig = this.applyGlobalConfig(config.sections, config);
|
|
1572
|
+
const stats = await this.enrichSections(collection, sectionsConfig);
|
|
1573
|
+
this.mergeStats(aggregateStats, stats);
|
|
1574
|
+
if (config.onProgress) {
|
|
1575
|
+
config.onProgress(aggregateStats);
|
|
1576
|
+
}
|
|
1577
|
+
}
|
|
1578
|
+
} catch (error) {
|
|
1579
|
+
aggregateStats.errors?.push(
|
|
1580
|
+
`Pipeline error: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1581
|
+
);
|
|
1582
|
+
}
|
|
1583
|
+
aggregateStats.timeMs = Date.now() - startTime;
|
|
1584
|
+
return aggregateStats;
|
|
1585
|
+
}
|
|
1586
|
+
/**
|
|
1587
|
+
* Apply global configuration to individual enrichment configs.
|
|
1588
|
+
*
|
|
1589
|
+
* @param individualConfig - Configuration for a specific enrichment type
|
|
1590
|
+
* @param globalConfig - Global configuration
|
|
1591
|
+
* @returns Merged configuration
|
|
1592
|
+
*/
|
|
1593
|
+
applyGlobalConfig(individualConfig, globalConfig) {
|
|
1594
|
+
const merged = { ...individualConfig };
|
|
1595
|
+
if (globalConfig.filter && !merged.filter) {
|
|
1596
|
+
merged.filter = globalConfig.filter;
|
|
1597
|
+
}
|
|
1598
|
+
if (globalConfig.batchSize && !merged.batchSize) {
|
|
1599
|
+
merged.batchSize = globalConfig.batchSize;
|
|
1600
|
+
}
|
|
1601
|
+
return merged;
|
|
1602
|
+
}
|
|
1603
|
+
/**
|
|
1604
|
+
* Merge stats from an enrichment operation into aggregate stats.
|
|
1605
|
+
*
|
|
1606
|
+
* @param aggregate - Aggregate stats to update
|
|
1607
|
+
* @param stats - Stats from a single operation
|
|
1608
|
+
*/
|
|
1609
|
+
mergeStats(aggregate, stats) {
|
|
1610
|
+
aggregate.recordsProcessed += stats.recordsProcessed;
|
|
1611
|
+
aggregate.recordsUpdated += stats.recordsUpdated;
|
|
1612
|
+
aggregate.recordsSkipped += stats.recordsSkipped;
|
|
1613
|
+
if (stats.errors && stats.errors.length > 0) {
|
|
1614
|
+
if (!aggregate.errors) {
|
|
1615
|
+
aggregate.errors = [];
|
|
1616
|
+
}
|
|
1617
|
+
aggregate.errors.push(...stats.errors);
|
|
1618
|
+
}
|
|
1619
|
+
}
|
|
1620
|
+
/**
|
|
1621
|
+
* Enrich records using section detection.
|
|
1622
|
+
*
|
|
1623
|
+
* @param collection - Collection name
|
|
1624
|
+
* @param config - Section enrichment configuration
|
|
1625
|
+
* @param stats - Statistics object to update
|
|
1626
|
+
*/
|
|
1627
|
+
async enrichWithSectionDetection(collection, config, stats) {
|
|
1628
|
+
const batchSize = config.batchSize || 100;
|
|
1629
|
+
for await (const batch of this.adapter.iterate(collection, {
|
|
1630
|
+
batchSize,
|
|
1631
|
+
filter: config.filter
|
|
1632
|
+
})) {
|
|
1633
|
+
const updates = [];
|
|
1634
|
+
for (const record of batch) {
|
|
1635
|
+
stats.recordsProcessed++;
|
|
1636
|
+
try {
|
|
1637
|
+
let sectionMetadata = null;
|
|
1638
|
+
if (config.existingField) {
|
|
1639
|
+
sectionMetadata = this.extractSectionMetadata(
|
|
1640
|
+
record.metadata?.[config.existingField]
|
|
1641
|
+
);
|
|
1642
|
+
} else if (config.autoDetect) {
|
|
1643
|
+
const text = record.text || record.metadata?.content || "";
|
|
1644
|
+
if (typeof text === "string") {
|
|
1645
|
+
sectionMetadata = this.detectSections(text);
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1648
|
+
if (sectionMetadata) {
|
|
1649
|
+
const metadata = {
|
|
1650
|
+
__h_section_level: sectionMetadata.level,
|
|
1651
|
+
__h_section_title: sectionMetadata.title
|
|
1652
|
+
};
|
|
1653
|
+
if (sectionMetadata.path) {
|
|
1654
|
+
metadata.__h_section_path = sectionMetadata.path;
|
|
1655
|
+
}
|
|
1656
|
+
updates.push({
|
|
1657
|
+
id: record.id,
|
|
1658
|
+
metadata
|
|
1659
|
+
});
|
|
1660
|
+
} else {
|
|
1661
|
+
stats.recordsSkipped++;
|
|
1662
|
+
}
|
|
1663
|
+
} catch (error) {
|
|
1664
|
+
stats.recordsSkipped++;
|
|
1665
|
+
stats.errors?.push(
|
|
1666
|
+
`Error processing record ${record.id}: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1667
|
+
);
|
|
1668
|
+
}
|
|
1669
|
+
}
|
|
1670
|
+
if (updates.length > 0) {
|
|
1671
|
+
try {
|
|
1672
|
+
await this.adapter.updateMetadata(collection, updates);
|
|
1673
|
+
stats.recordsUpdated += updates.length;
|
|
1674
|
+
} catch (error) {
|
|
1675
|
+
stats.errors?.push(
|
|
1676
|
+
`Error updating batch: ${error instanceof Error ? error.message : "unknown error"}`
|
|
1677
|
+
);
|
|
1678
|
+
}
|
|
1679
|
+
}
|
|
1680
|
+
}
|
|
1681
|
+
}
|
|
1682
|
+
/**
|
|
1683
|
+
* Extract section metadata from an existing field value.
|
|
1684
|
+
*
|
|
1685
|
+
* @param sectionPath - Section path string (e.g., "introduction/overview")
|
|
1686
|
+
* @returns Section metadata or null
|
|
1687
|
+
*/
|
|
1688
|
+
extractSectionMetadata(sectionPath) {
|
|
1689
|
+
if (!sectionPath || typeof sectionPath !== "string") {
|
|
1690
|
+
return null;
|
|
1691
|
+
}
|
|
1692
|
+
const parts = sectionPath.split("/").filter((p) => p.trim() !== "");
|
|
1693
|
+
if (parts.length === 0) {
|
|
1694
|
+
return null;
|
|
1695
|
+
}
|
|
1696
|
+
return {
|
|
1697
|
+
path: sectionPath,
|
|
1698
|
+
level: parts.length,
|
|
1699
|
+
title: parts[parts.length - 1]
|
|
1700
|
+
};
|
|
1701
|
+
}
|
|
1702
|
+
/**
|
|
1703
|
+
* Detect sections in text using heuristics.
|
|
1704
|
+
*
|
|
1705
|
+
* @param text - Text content to analyze
|
|
1706
|
+
* @returns Section metadata; a level-0 "unsectioned" placeholder when no heading is detected (never null)
|
|
1707
|
+
*/
|
|
1708
|
+
detectSections(text) {
|
|
1709
|
+
const markdown = this.detectMarkdownSections(text);
|
|
1710
|
+
if (markdown) return markdown;
|
|
1711
|
+
const html = this.detectHtmlSections(text);
|
|
1712
|
+
if (html) return html;
|
|
1713
|
+
const pattern = this.detectPatternSections(text);
|
|
1714
|
+
if (pattern) return pattern;
|
|
1715
|
+
return { level: 0, title: "unsectioned" };
|
|
1716
|
+
}
|
|
1717
|
+
/**
|
|
1718
|
+
* Detect markdown headers (# Header).
|
|
1719
|
+
*
|
|
1720
|
+
* @param text - Text content
|
|
1721
|
+
* @returns Section metadata or null
|
|
1722
|
+
*/
|
|
1723
|
+
detectMarkdownSections(text) {
|
|
1724
|
+
const match = text.match(/^(#{1,6})\s+(.+)$/m);
|
|
1725
|
+
if (match) {
|
|
1726
|
+
const level = match[1].length;
|
|
1727
|
+
const title = match[2].trim();
|
|
1728
|
+
return { level, title };
|
|
1729
|
+
}
|
|
1730
|
+
return null;
|
|
1731
|
+
}
|
|
1732
|
+
/**
|
|
1733
|
+
* Detect HTML headers (<h1>Header</h1>).
|
|
1734
|
+
*
|
|
1735
|
+
* @param text - Text content
|
|
1736
|
+
* @returns Section metadata or null
|
|
1737
|
+
*/
|
|
1738
|
+
detectHtmlSections(text) {
|
|
1739
|
+
const match = text.match(/<h([1-6])>(.+?)<\/h[1-6]>/i);
|
|
1740
|
+
if (match) {
|
|
1741
|
+
const level = parseInt(match[1], 10);
|
|
1742
|
+
const title = match[2].trim();
|
|
1743
|
+
return { level, title };
|
|
1744
|
+
}
|
|
1745
|
+
return null;
|
|
1746
|
+
}
|
|
1747
|
+
/**
|
|
1748
|
+
* Detect sections using common patterns (SECTION: Title).
|
|
1749
|
+
*
|
|
1750
|
+
* @param text - Text content
|
|
1751
|
+
* @returns Section metadata or null
|
|
1752
|
+
*/
|
|
1753
|
+
detectPatternSections(text) {
|
|
1754
|
+
const match = text.match(/^SECTION:\s+(.+)$/m);
|
|
1755
|
+
if (match) {
|
|
1756
|
+
const title = match[1].trim();
|
|
1757
|
+
return { level: 1, title };
|
|
1758
|
+
}
|
|
1759
|
+
return null;
|
|
1760
|
+
}
|
|
1761
|
+
};
|
|
1762
|
+
|
|
1763
|
+
// src/ingestion/chunkers/text-chunker.ts
|
|
1764
|
+
var DEFAULT_CHUNK_SIZE = 500;
|
|
1765
|
+
var DEFAULT_CHUNK_OVERLAP = 50;
|
|
1766
|
+
/**
 * Rough token estimate for a string, assuming about four characters per token.
 *
 * @param text - Text to measure
 * @returns Estimated token count, rounded up
 */
function estimateTokens(text) {
  const charsPerToken = 4;
  const charCount = text.length;
  return Math.ceil(charCount / charsPerToken);
}
|
|
1769
|
+
/**
 * Converts a token budget into a character budget using the same
 * four-characters-per-token ratio as the token estimate.
 *
 * @param tokens - Number of tokens
 * @returns Corresponding number of characters
 */
function estimateChars(tokens) {
  const charsPerToken = 4;
  return charsPerToken * tokens;
}
|
|
1772
|
+
|
|
1773
|
+
// src/ingestion/chunkers/recursive-chunker.ts
|
|
1774
|
+
/**
 * Chunker that splits text on progressively finer separators (paragraphs,
 * lines, sentences, words) and only falls back to a hard character split when
 * no separator produces pieces that fit the size budget.
 */
var RecursiveChunker = class {
  // Tried in order, coarsest first. The empty string is a sentinel: it never
  // splits anything and simply routes to the character-level fallback.
  separators = [
    "\n\n",
    // Paragraphs (double newline)
    "\n",
    // Lines (single newline)
    ". ",
    // Sentences (period + space)
    " ",
    // Words (space)
    ""
    // Characters (last resort)
  ];
  /**
   * Split text into chunks of at most `chunkSize` estimated tokens.
   *
   * @param text - Text to split; empty/falsy input yields no chunks
   * @param config - Optional `{ chunkSize, chunkOverlap }`, both in tokens
   * @returns Chunks with index and position metadata (`source` is left empty
   *   for the caller to fill in)
   * @throws RangeError if chunkSize does not allow at least one character per chunk
   */
  chunk(text, config) {
    if (!text) return [];
    const chunkSize = config?.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    const maxChars = Math.floor(estimateChars(chunkSize));
    // A zero/negative/NaN budget would make the character fallback loop forever.
    if (!(maxChars >= 1)) {
      throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
    }
    // Negative or NaN overlaps are treated as "no overlap".
    const requestedOverlap = Math.floor(estimateChars(chunkOverlap));
    const overlapChars = requestedOverlap > 0 ? requestedOverlap : 0;
    if (text.length <= maxChars) {
      return [{
        text,
        index: 0,
        metadata: {
          source: "",
          chunkIndex: 0,
          totalChunks: 1,
          startChar: 0,
          endChar: text.length
        }
      }];
    }
    const splits = this.recursiveSplit(text, maxChars, 0);
    const chunks = this.addOverlap(splits, overlapChars);
    return chunks.map((chunk, index) => ({
      text: chunk.text,
      index,
      metadata: {
        source: "",
        // Will be set by pipeline
        chunkIndex: index,
        totalChunks: chunks.length,
        startChar: chunk.start,
        endChar: chunk.end
      }
    }));
  }
  /**
   * Split `text` into pieces no longer than `maxChars`, starting with the
   * separator at `separatorIndex` and recursing to finer separators for
   * pieces that are still too long.
   *
   * @param text - Text to split
   * @param maxChars - Maximum characters per piece (must be >= 1)
   * @param separatorIndex - Index into `this.separators` to start from
   * @returns Pieces as `{ text, start, end }` with offsets relative to `text`
   * @throws RangeError if maxChars is not at least 1
   */
  recursiveSplit(text, maxChars, separatorIndex) {
    if (!(maxChars >= 1)) {
      throw new RangeError(`maxChars must be at least 1, got ${maxChars}`);
    }
    if (text.length <= maxChars) {
      return [{ text, start: 0, end: text.length }];
    }
    if (separatorIndex >= this.separators.length) {
      // No separator left: cut at fixed character boundaries.
      const pieces = [];
      for (let i = 0; i < text.length; i += maxChars) {
        pieces.push({
          text: text.slice(i, i + maxChars),
          start: i,
          end: Math.min(i + maxChars, text.length)
        });
      }
      return pieces;
    }
    const separator = this.separators[separatorIndex];
    const parts = separator ? text.split(separator) : [text];
    if (parts.length <= 1) {
      // This separator does not occur; try the next finer one.
      return this.recursiveSplit(text, maxChars, separatorIndex + 1);
    }
    const result = [];
    let currentParts = [];
    let currentStart = 0;
    // Offset of the current part within `text`, separators included.
    let runningOffset = 0;
    const flush = () => {
      if (currentParts.length === 0) return;
      const chunkText = currentParts.join(separator);
      result.push({
        text: chunkText,
        start: currentStart,
        end: currentStart + chunkText.length
      });
    };
    for (let i = 0; i < parts.length; i++) {
      const part = parts[i];
      const combined = currentParts.length > 0 ? [...currentParts, part].join(separator) : part;
      if (combined.length <= maxChars) {
        // Greedily pack parts while the joined text still fits.
        if (currentParts.length === 0) {
          currentStart = runningOffset;
        }
        currentParts.push(part);
      } else {
        flush();
        currentStart = runningOffset;
        if (part.length > maxChars) {
          // A single part is too long on its own: split it with finer
          // separators and shift its offsets into this text's coordinates.
          const subSplits = this.recursiveSplit(part, maxChars, separatorIndex + 1);
          for (const sub of subSplits) {
            result.push({
              text: sub.text,
              start: currentStart + sub.start,
              end: currentStart + sub.end
            });
          }
          currentParts = [];
        } else {
          currentParts = [part];
        }
      }
      runningOffset += part.length + (i < parts.length - 1 ? separator.length : 0);
    }
    flush();
    return result;
  }
  /**
   * Prefix every chunk except the first with the tail of the preceding chunk.
   *
   * @param chunks - Pieces as `{ text, start, end }`
   * @param overlapChars - Maximum number of characters to repeat
   * @returns New piece list; the input is returned as-is when nothing is added
   */
  addOverlap(chunks, overlapChars) {
    if (!(overlapChars > 0) || chunks.length <= 1) {
      return chunks;
    }
    const result = [chunks[0]];
    for (let i = 1; i < chunks.length; i++) {
      const prevChunk = chunks[i - 1];
      const currChunk = chunks[i];
      const overlapText = prevChunk.text.slice(-overlapChars);
      result.push({
        text: overlapText + currChunk.text,
        // Use the overlap actually taken: the previous chunk may be shorter
        // than the requested overlap, in which case only its own text repeats.
        start: Math.max(0, prevChunk.end - overlapText.length),
        end: currChunk.end
      });
    }
    return result;
  }
};
|
|
1907
|
+
|
|
1908
|
+
// src/ingestion/ingestion-pipeline.ts
|
|
1909
|
+
var path = __toESM(require("path"));
|
|
1910
|
+
/**
 * Runs documents through load -> chunk -> embed -> upsert and collects
 * statistics about the run.
 */
var IngestionPipeline = class {
  /**
   * @param adapter - Vector store adapter; must provide `upsert(collection, records)`
   * @param embedder - Embedder; must provide `embedBatch(texts)`
   * @param loaderRegistry - Registry; must provide `load(filePath)`
   * @param chunker - Optional default chunker; a RecursiveChunker is used when omitted
   */
  constructor(adapter, embedder, loaderRegistry, chunker) {
    this.adapter = adapter;
    this.embedder = embedder;
    this.loaderRegistry = loaderRegistry;
    this.defaultChunker = chunker || new RecursiveChunker();
  }
  defaultChunker;
  /**
   * Ingest documents into a vector database collection.
   * A failure in one document is recorded in `stats.errors` and does not stop
   * the remaining documents.
   *
   * @param sources - A file path or an array of file paths
   * @param collection - Target collection name
   * @param config - Optional ingestion configuration
   * @returns Statistics about the ingestion operation
   */
  async ingest(sources, collection, config) {
    const startTime = Date.now();
    const sourceArray = Array.isArray(sources) ? sources : [sources];
    const stats = {
      documentsProcessed: 0,
      documentsSucceeded: 0,
      documentsFailed: 0,
      chunksCreated: 0,
      chunksUpserted: 0,
      timeMs: 0,
      errors: []
    };
    const totalDocuments = sourceArray.length;
    for (const source of sourceArray) {
      config?.onProgress?.({
        stage: "loading",
        documentsProcessed: stats.documentsProcessed,
        totalDocuments,
        chunksProcessed: stats.chunksUpserted,
        currentDocument: source
      });
      try {
        await this.ingestFile(source, collection, config, stats, totalDocuments);
        stats.documentsSucceeded++;
      } catch (error) {
        stats.documentsFailed++;
        // NOTE(review): the stage is always recorded as "load", even when the
        // failure came from chunking, embedding or upserting.
        stats.errors.push({
          source,
          stage: "load",
          error
        });
      }
      stats.documentsProcessed++;
    }
    stats.timeMs = Date.now() - startTime;
    return stats;
  }
  /**
   * Ingest a single file: load it, chunk it, embed the chunks and upsert the
   * resulting records in batches. Updates `stats` in place.
   *
   * @param filePath - Path of the file to ingest
   * @param collection - Target collection name
   * @param config - Optional ingestion configuration
   * @param stats - Running statistics object (mutated)
   * @param totalDocuments - Total number of documents in this run (for progress events)
   * @throws Error if the embedder returns a different number of embeddings than chunks
   */
  async ingestFile(filePath, collection, config, stats, totalDocuments) {
    // Emits one progress event; `withTotals` adds the running chunk total.
    const report = (stage, withTotals) => config?.onProgress?.({
      stage,
      documentsProcessed: stats.documentsProcessed,
      totalDocuments,
      chunksProcessed: stats.chunksUpserted,
      ...(withTotals ? { totalChunks: stats.chunksCreated } : {}),
      currentDocument: filePath
    });
    const doc = await this.loaderRegistry.load(filePath);
    config?.onDocumentLoaded?.(doc);
    report("chunking", false);
    const chunker = config?.chunker || this.defaultChunker;
    const chunks = chunker.chunk(doc.text, {
      chunkSize: config?.chunkSize,
      chunkOverlap: config?.chunkOverlap
    });
    for (const chunk of chunks) {
      chunk.metadata.source = doc.source;
    }
    stats.chunksCreated += chunks.length;
    config?.onChunksCreated?.(chunks);
    report("embedding", true);
    const texts = chunks.map((c) => c.text);
    // Nothing to embed for an empty document; avoid calling the embedder with [].
    const embeddings = texts.length > 0 ? await this.embedder.embedBatch(texts) : [];
    // Records are paired with embeddings by index, so a count mismatch would
    // silently store records without (or with the wrong) vector.
    if (embeddings?.length !== chunks.length) {
      throw new Error(
        `Embedder returned ${embeddings?.length ?? 0} embeddings for ${chunks.length} chunks of ${filePath}`
      );
    }
    const records = chunks.map((chunk, i) => {
      const metadata = this.buildMetadata(doc, chunk, config);
      return {
        // NOTE(review): ids are built from the file's base name only, so two
        // files with the same name in different directories share ids.
        id: `${path.basename(doc.source)}:${chunk.index}`,
        embedding: embeddings[i],
        text: chunk.text,
        metadata
      };
    });
    report("upserting", true);
    // Fall back to 100 for missing or unusable values; a zero or negative
    // batch size would keep the loop below from ever advancing.
    const requestedBatch = config?.batchSize;
    const batchSize = typeof requestedBatch === "number" && requestedBatch >= 1 ? Math.floor(requestedBatch) : 100;
    for (let i = 0; i < records.length; i += batchSize) {
      const batch = records.slice(i, i + batchSize);
      await this.adapter.upsert(collection, batch);
      stats.chunksUpserted += batch.length;
    }
  }
  /**
   * Build the metadata stored with one chunk.
   * Later sources override earlier ones: automatic fields, then the
   * `metadataExtractor` result, then `config.metadata`, then chunk position fields.
   *
   * @param doc - Loaded document
   * @param chunk - Chunk produced from the document
   * @param config - Optional ingestion configuration
   * @returns Merged metadata object
   */
  buildMetadata(doc, chunk, config) {
    const basename2 = path.basename(doc.source, path.extname(doc.source));
    const dirname2 = path.dirname(doc.source);
    const autoMetadata = {
      [VerticalFields.SOURCE]: doc.source,
      [VerticalFields.DOC_TYPE]: doc.type,
      [VerticalFields.DOC_ID]: basename2,
      [VerticalFields.PARTITION]: dirname2
    };
    const extractedMetadata = config?.metadataExtractor?.(doc) || {};
    const userMetadata = config?.metadata || {};
    const chunkMetadata = {
      chunkIndex: chunk.metadata.chunkIndex,
      totalChunks: chunk.metadata.totalChunks,
      startChar: chunk.metadata.startChar,
      endChar: chunk.metadata.endChar
    };
    return {
      ...autoMetadata,
      ...extractedMetadata,
      ...userMetadata,
      ...chunkMetadata
    };
  }
};
|
|
2041
|
+
|
|
2042
|
+
// src/ingestion/loaders/text-loader.ts
|
|
2043
|
+
var fs = __toESM(require("fs/promises"));
|
|
2044
|
+
var path2 = __toESM(require("path"));
|
|
2045
|
+
/**
 * Loader for plain-text and markdown files (.txt, .md).
 */
var TextLoader = class {
  /**
   * @param filePath - Path to check
   * @returns true when the path ends in .txt or .md (case-insensitive)
   */
  canHandle(filePath) {
    const textExtension = /\.(txt|md)$/i;
    return textExtension.test(filePath);
  }
  /**
   * Read the file as UTF-8 text.
   *
   * @param filePath - Path to the file
   * @returns Document with the text, the lower-cased extension (without dot)
   *   as `type`, and file size plus the original extension as metadata
   */
  async load(filePath) {
    const text = await fs.readFile(filePath, "utf-8");
    // `extension` keeps the dot and original casing; `type` is the normalised form.
    const extension = path2.extname(filePath);
    const { size } = await fs.stat(filePath);
    return {
      text,
      source: filePath,
      type: extension.slice(1).toLowerCase(),
      metadata: { size, extension }
    };
  }
};
|
|
2065
|
+
|
|
2066
|
+
// src/ingestion/loaders/pdf-loader.ts
|
|
2067
|
+
var fs2 = __toESM(require("fs/promises"));
|
|
2068
|
+
var import_pdf_parse = __toESM(require("pdf-parse"));
|
|
2069
|
+
/**
 * Loader for PDF files; text extraction is delegated to pdf-parse.
 */
var PDFLoader = class {
  /**
   * @param filePath - Path to check
   * @returns true when the path ends in .pdf (case-insensitive)
   */
  canHandle(filePath) {
    const pdfExtension = /\.pdf$/i;
    return pdfExtension.test(filePath);
  }
  /**
   * Read the file into a buffer and parse it.
   *
   * @param filePath - Path to the PDF file
   * @returns Document with the extracted text; metadata carries the parser's
   *   `numpages` (as `pages`) and `info` values
   */
  async load(filePath) {
    const dataBuffer = await fs2.readFile(filePath);
    const parsePdf = import_pdf_parse.default;
    const { text, numpages, info } = await parsePdf(dataBuffer);
    return {
      text,
      source: filePath,
      type: "pdf",
      metadata: { pages: numpages, info }
    };
  }
};
|
|
2087
|
+
|
|
2088
|
+
// src/ingestion/loaders/docx-loader.ts
|
|
2089
|
+
var import_mammoth = __toESM(require("mammoth"));
|
|
2090
|
+
/**
 * Loader for .docx files; raw text extraction is delegated to mammoth.
 */
var DOCXLoader = class {
  /**
   * @param filePath - Path to check
   * @returns true when the path ends in .docx (case-insensitive)
   */
  canHandle(filePath) {
    const docxExtension = /\.docx$/i;
    return docxExtension.test(filePath);
  }
  /**
   * Extract the raw text of the document.
   *
   * @param filePath - Path to the .docx file
   * @returns Document with the extracted text; mammoth's conversion messages
   *   are exposed as `metadata.warnings`
   */
  async load(filePath) {
    const { value, messages } = await import_mammoth.default.extractRawText({ path: filePath });
    return {
      text: value,
      source: filePath,
      type: "docx",
      metadata: { warnings: messages }
    };
  }
};
|
|
2107
|
+
|
|
2108
|
+
// src/ingestion/loaders/html-loader.ts
|
|
2109
|
+
var fs3 = __toESM(require("fs/promises"));
|
|
2110
|
+
var cheerio = __toESM(require("cheerio"));
|
|
2111
|
+
/**
 * Loader for HTML files (.html, .htm); parsing is delegated to cheerio.
 */
var HTMLLoader = class {
  /**
   * @param filePath - Path to check
   * @returns true when the path ends in .html or .htm (case-insensitive)
   */
  canHandle(filePath) {
    const htmlExtension = /\.html?$/i;
    return htmlExtension.test(filePath);
  }
  /**
   * Read the file and reduce it to its visible body text.
   *
   * @param filePath - Path to the HTML file
   * @returns Document whose text is the body text with whitespace runs
   *   collapsed to single spaces; metadata holds the <title> text and the
   *   description meta tag when they are non-empty
   */
  async load(filePath) {
    const markup = await fs3.readFile(filePath, "utf-8");
    const $ = cheerio.load(markup);
    // Drop script, style, nav and footer elements before extracting text.
    $("script, style, nav, footer").remove();
    const text = $("body").text().replace(/\s+/g, " ").trim();
    // Empty strings are normalised to undefined.
    const title = $("title").text() || void 0;
    const description = $('meta[name="description"]').attr("content") || void 0;
    return {
      text,
      source: filePath,
      type: "html",
      metadata: { title, description }
    };
  }
};
|
|
2131
|
+
|
|
2132
|
+
// src/ingestion/loaders/loader-registry.ts
|
|
2133
|
+
/**
 * Ordered collection of document loaders. The built-in text, PDF, DOCX and
 * HTML loaders are registered on construction; loaders are consulted in
 * registration order and the first one that accepts a path is used.
 */
var LoaderRegistry = class {
  loaders = [];
  constructor() {
    for (const BuiltInLoader of [TextLoader, PDFLoader, DOCXLoader, HTMLLoader]) {
      this.register(new BuiltInLoader());
    }
  }
  /**
   * Register a custom document loader.
   * It is appended, so earlier loaders keep priority for paths they accept.
   * @param loader - Loader to register
   */
  register(loader) {
    this.loaders.push(loader);
  }
  /**
   * Check if any loader can handle this file.
   * @param filePath - Path to check
   * @returns true if a loader exists for this file type
   */
  canLoad(filePath) {
    for (const candidate of this.loaders) {
      if (candidate.canHandle(filePath)) {
        return true;
      }
    }
    return false;
  }
  /**
   * Load a document using the first loader that accepts the path.
   * @param filePath - Path to the file to load
   * @returns Promise resolving to Document
   * @throws Error if no loader found for file type
   */
  async load(filePath) {
    const match = this.loaders.find((candidate) => candidate.canHandle(filePath));
    if (match) {
      return match.load(filePath);
    }
    throw new Error(`No loader found for file: ${filePath}`);
  }
};
|
|
2170
|
+
|
|
2171
|
+
// src/ingestion/chunkers/fixed-chunker.ts
|
|
2172
|
+
/**
 * Chunker that cuts text into fixed-size windows, optionally overlapping.
 */
var FixedChunker = class {
  /**
   * Split text into windows of `chunkSize` estimated tokens that advance by
   * `chunkSize - chunkOverlap` tokens.
   *
   * The whole text is always covered: an overlap that is negative, not a
   * number, or not smaller than the chunk size is ignored (treated as 0)
   * instead of truncating the output, and the loop stops as soon as a window
   * reaches the end of the text so no window lies entirely inside the previous one.
   *
   * @param text - Text to split; empty/falsy input yields no chunks
   * @param config - Optional `{ chunkSize, chunkOverlap }`, both in tokens
   * @returns Chunks with index and position metadata (`source` is left empty
   *   for the caller to fill in)
   * @throws RangeError if chunkSize does not allow at least one character per chunk
   */
  chunk(text, config) {
    if (!text) return [];
    const chunkSize = config?.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    const maxChars = Math.floor(estimateChars(chunkSize));
    if (!(maxChars >= 1)) {
      throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
    }
    const requestedOverlap = Math.floor(estimateChars(chunkOverlap));
    // Only an overlap strictly between 0 and the window size keeps the window
    // moving forward without leaving gaps.
    const overlapChars = requestedOverlap > 0 && requestedOverlap < maxChars ? requestedOverlap : 0;
    const step = maxChars - overlapChars;
    const chunks = [];
    for (let position = 0; position < text.length; position += step) {
      const end = Math.min(text.length, position + maxChars);
      chunks.push({
        text: text.slice(position, end),
        index: chunks.length,
        metadata: {
          source: "",
          chunkIndex: chunks.length,
          totalChunks: 0,
          // Updated after loop
          startChar: position,
          endChar: end
        }
      });
      // The text is fully covered; a further window would only repeat overlap.
      if (end === text.length) break;
    }
    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }
};
|
|
2219
|
+
|
|
2220
|
+
// src/ingestion/chunkers/sentence-chunker.ts
|
|
2221
|
+
/**
 * Chunker that groups whole sentences into chunks, never cutting inside a
 * sentence. A single sentence longer than the size budget becomes its own chunk.
 */
var SentenceChunker = class {
  /**
   * Split text into chunks made of whole sentences.
   *
   * @param text - Text to split; empty/falsy input yields no chunks
   * @param config - Optional `{ chunkSize, chunkOverlap }`, both in tokens
   * @returns Chunks with index and position metadata (`source` is left empty
   *   for the caller to fill in). Sentences inside a chunk are joined with a
   *   single space; `startChar`/`endChar` refer to positions in the input text.
   */
  chunk(text, config) {
    if (!text) return [];
    const chunkSize = config?.chunkSize ?? DEFAULT_CHUNK_SIZE;
    const chunkOverlap = config?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP;
    const maxChars = estimateChars(chunkSize);
    const overlapChars = estimateChars(chunkOverlap);
    const sentences = this.#locateSentences(text);
    if (sentences.length === 0) {
      // Whitespace-only input: return it unchanged as a single chunk.
      return [{
        text,
        index: 0,
        metadata: {
          source: "",
          chunkIndex: 0,
          totalChunks: 1,
          startChar: 0,
          endChar: text.length
        }
      }];
    }
    const rawChunks = [];
    let group = [];
    // Length of the group's sentences once joined with single spaces.
    let groupLength = 0;
    const flush = () => {
      rawChunks.push({
        text: group.map((s) => s.text).join(" "),
        // Offsets come from the sentences' real positions, so repeated
        // sentences and irregular whitespace cannot skew them.
        start: group[0].start,
        end: group[group.length - 1].end
      });
    };
    for (const sentence of sentences) {
      const fits = groupLength + 1 + sentence.text.length <= maxChars;
      if (group.length > 0 && !fits) {
        flush();
        group = [];
        groupLength = 0;
      }
      groupLength += (group.length > 0 ? 1 : 0) + sentence.text.length;
      group.push(sentence);
    }
    flush();
    const withOverlap = this.addSentenceOverlap(rawChunks, overlapChars);
    return withOverlap.map((chunk, index) => ({
      text: chunk.text,
      index,
      metadata: {
        source: "",
        chunkIndex: index,
        totalChunks: withOverlap.length,
        startChar: chunk.start,
        endChar: chunk.end
      }
    }));
  }
  /**
   * Split text into trimmed, non-empty sentences.
   * A sentence ends at a run of `.`, `!` or `?` that is followed by
   * whitespace or the end of the text, so periods inside tokens such as
   * "example.com" or "3.14" do not end a sentence and no text is dropped.
   *
   * @param text - Text to split
   * @returns Sentences in order; `[text]` for an empty string
   */
  splitSentences(text) {
    if (text === "") return [text];
    return this.#locateSentences(text).map((s) => s.text);
  }
  /**
   * Find every sentence together with its position in `text`.
   *
   * @param text - Text to scan
   * @returns `{ text, start, end }` per sentence, trimmed, offsets into `text`
   */
  #locateSentences(text) {
    const located = [];
    const pushSpan = (from, to) => {
      const raw = text.slice(from, to);
      const trimmed = raw.trim();
      if (trimmed.length === 0) return;
      const start = from + (raw.length - raw.trimStart().length);
      located.push({ text: trimmed, start, end: start + trimmed.length });
    };
    // Fresh literal per call: a /g regex keeps state in lastIndex.
    const boundary = /[.!?]+(?=\s|$)/g;
    let cursor = 0;
    let match;
    while ((match = boundary.exec(text)) !== null) {
      const end = match.index + match[0].length;
      pushSpan(cursor, end);
      cursor = end;
    }
    // Trailing text without a terminator is the final sentence.
    pushSpan(cursor, text.length);
    return located;
  }
  /**
   * Prefix each chunk (except the first) with the last sentence of the
   * preceding chunk, provided that sentence fits within `overlapChars`.
   *
   * @param chunks - Pieces as `{ text, start, end }`
   * @param overlapChars - Maximum length of the repeated sentence
   * @returns New piece list; the input is returned as-is when no overlap applies
   */
  addSentenceOverlap(chunks, overlapChars) {
    if (overlapChars === 0 || chunks.length <= 1) {
      return chunks;
    }
    const result = [chunks[0]];
    for (let i = 1; i < chunks.length; i++) {
      const prevChunk = chunks[i - 1];
      const currChunk = chunks[i];
      const prevSentences = this.splitSentences(prevChunk.text);
      const lastSentence = prevSentences[prevSentences.length - 1] || "";
      if (lastSentence && lastSentence.length <= overlapChars) {
        result.push({
          text: lastSentence + " " + currChunk.text,
          start: Math.max(0, prevChunk.end - lastSentence.length),
          end: currChunk.end
        });
      } else {
        result.push(currChunk);
      }
    }
    return result;
  }
};
|
|
2313
|
+
|
|
2314
|
+
// src/client/rag-client.ts
|
|
2315
|
+
var DEFAULT_TOP_K = 10;
|
|
2316
|
+
var DEFAULT_RAG_SYSTEM_PROMPT = "You are a helpful assistant. Answer the question based on the provided context. If the context doesn't contain enough information, say so.";
|
|
2317
|
+
var RAGClient = class {
|
|
2318
|
+
adapter;
|
|
2319
|
+
embedder;
|
|
2320
|
+
llm;
|
|
2321
|
+
defaultCollection;
|
|
2322
|
+
defaultTopK;
|
|
2323
|
+
queryComposer;
|
|
2324
|
+
ingestionPipeline;
|
|
2325
|
+
enrichmentPipeline;
|
|
2326
|
+
constructor(config) {
|
|
2327
|
+
this.adapter = config.adapter;
|
|
2328
|
+
this.embedder = config.embedder;
|
|
2329
|
+
this.llm = config.llm;
|
|
2330
|
+
this.defaultCollection = config.defaultCollection;
|
|
2331
|
+
this.defaultTopK = config.defaultTopK ?? DEFAULT_TOP_K;
|
|
2332
|
+
this.queryComposer = new RAGQueryComposer(this.adapter, this.embedder);
|
|
2333
|
+
this.ingestionPipeline = new IngestionPipeline(
|
|
2334
|
+
this.adapter,
|
|
2335
|
+
this.embedder,
|
|
2336
|
+
new LoaderRegistry()
|
|
2337
|
+
);
|
|
2338
|
+
this.enrichmentPipeline = new EnrichmentPipeline(this.adapter);
|
|
2339
|
+
}
|
|
2340
|
+
// ==========================================================================
|
|
2341
|
+
// COLLECTION MANAGEMENT
|
|
2342
|
+
// ==========================================================================
|
|
2343
|
+
/**
|
|
2344
|
+
* Create a new vector collection.
|
|
2345
|
+
* Dimension defaults to embedder.dimensions if not specified.
|
|
2346
|
+
*/
|
|
2347
|
+
async createCollection(name, dimension, metric) {
|
|
2348
|
+
const dim = dimension ?? this.embedder.dimensions;
|
|
2349
|
+
await this.adapter.createCollection(name, dim, metric);
|
|
2350
|
+
}
|
|
2351
|
+
/**
|
|
2352
|
+
* Delete a collection.
|
|
2353
|
+
*/
|
|
2354
|
+
async deleteCollection(name) {
|
|
2355
|
+
await this.adapter.deleteCollection(name);
|
|
2356
|
+
}
|
|
2357
|
+
/**
|
|
2358
|
+
* Check if a collection exists.
|
|
2359
|
+
*/
|
|
2360
|
+
async collectionExists(name) {
|
|
2361
|
+
return this.adapter.collectionExists(name);
|
|
2362
|
+
}
|
|
2363
|
+
// ==========================================================================
|
|
2364
|
+
// INGESTION
|
|
2365
|
+
// ==========================================================================
|
|
2366
|
+
/**
|
|
2367
|
+
* Ingest documents into a collection.
|
|
2368
|
+
* Collection defaults to defaultCollection if not specified.
|
|
2369
|
+
*/
|
|
2370
|
+
async ingest(sources, collection, config) {
|
|
2371
|
+
const col = collection ?? this.defaultCollection;
|
|
2372
|
+
if (!col) {
|
|
2373
|
+
throw new Error(
|
|
2374
|
+
"No collection specified. Pass a collection name or set defaultCollection in config."
|
|
2375
|
+
);
|
|
2376
|
+
}
|
|
2377
|
+
return this.ingestionPipeline.ingest(sources, col, config);
|
|
2378
|
+
}
|
|
2379
|
+
// ==========================================================================
|
|
2380
|
+
// RETRIEVAL
|
|
2381
|
+
// ==========================================================================
|
|
2382
|
+
/**
|
|
2383
|
+
* Retrieve relevant chunks for a query.
|
|
2384
|
+
* Supports filter shorthands (partition, theme) and groupBy.
|
|
2385
|
+
*/
|
|
2386
|
+
async retrieve(query, options) {
|
|
2387
|
+
const collection = options?.collection ?? this.defaultCollection;
|
|
2388
|
+
if (!collection) {
|
|
2389
|
+
throw new Error(
|
|
2390
|
+
"No collection specified. Pass a collection name or set defaultCollection in config."
|
|
2391
|
+
);
|
|
2392
|
+
}
|
|
2393
|
+
const topK = options?.topK ?? this.defaultTopK;
|
|
2394
|
+
let verticalFilters;
|
|
2395
|
+
let horizontalFilters;
|
|
2396
|
+
const customFilters = options?.filter;
|
|
2397
|
+
if (options?.partition) {
|
|
2398
|
+
verticalFilters = {
|
|
2399
|
+
field: VerticalFields.PARTITION,
|
|
2400
|
+
op: "eq",
|
|
2401
|
+
value: options.partition
|
|
2402
|
+
};
|
|
2403
|
+
}
|
|
2404
|
+
if (options?.theme) {
|
|
2405
|
+
horizontalFilters = {
|
|
2406
|
+
field: HorizontalFields.THEME,
|
|
2407
|
+
op: "eq",
|
|
2408
|
+
value: options.theme
|
|
2409
|
+
};
|
|
2410
|
+
}
|
|
2411
|
+
const params = {
|
|
2412
|
+
query,
|
|
2413
|
+
collection,
|
|
2414
|
+
topK,
|
|
2415
|
+
verticalFilters,
|
|
2416
|
+
horizontalFilters,
|
|
2417
|
+
customFilters
|
|
2418
|
+
};
|
|
2419
|
+
if (options?.groupBy === "document") {
|
|
2420
|
+
const grouped = await this.queryComposer.retrieveVertical(params);
|
|
2421
|
+
const records = Array.from(grouped.values()).flat();
|
|
2422
|
+
return { records, query, filtersApplied: { vertical: verticalFilters, horizontal: horizontalFilters, custom: customFilters } };
|
|
2423
|
+
}
|
|
2424
|
+
if (options?.groupBy === "theme") {
|
|
2425
|
+
const grouped = await this.queryComposer.retrieveHorizontal(params);
|
|
2426
|
+
const records = Array.from(grouped.values()).flat();
|
|
2427
|
+
return { records, query, filtersApplied: { vertical: verticalFilters, horizontal: horizontalFilters, custom: customFilters } };
|
|
2428
|
+
}
|
|
2429
|
+
return this.queryComposer.retrieve(params);
|
|
2430
|
+
}
|
|
2431
|
+
// ==========================================================================
|
|
2432
|
+
// ENRICHMENT
|
|
2433
|
+
// ==========================================================================
|
|
2434
|
+
/**
|
|
2435
|
+
* Enrich a collection with vertical, theme, and/or section metadata.
|
|
2436
|
+
*/
|
|
2437
|
+
async enrich(collection, config) {
|
|
2438
|
+
return this.enrichmentPipeline.enrichAll(collection, config);
|
|
2439
|
+
}
|
|
2440
|
+
// ==========================================================================
|
|
2441
|
+
// FULL RAG QUERY
|
|
2442
|
+
// ==========================================================================
|
|
2443
|
+
/**
|
|
2444
|
+
* Full RAG: retrieve relevant context and generate an answer using LLM.
|
|
2445
|
+
* Requires an LLM client to be provided in the constructor config.
|
|
2446
|
+
*/
|
|
2447
|
+
async query(question, options) {
|
|
2448
|
+
if (!this.llm) {
|
|
2449
|
+
throw new Error(
|
|
2450
|
+
"RAGClient.query() requires an LLM client. Pass one in the constructor config."
|
|
2451
|
+
);
|
|
2452
|
+
}
|
|
2453
|
+
const retrievalResult = await this.retrieve(question, options);
|
|
2454
|
+
const context = retrievalResult.records.map((r) => r.text).filter(Boolean).join("\n\n");
|
|
2455
|
+
const systemPrompt = options?.systemPrompt ?? DEFAULT_RAG_SYSTEM_PROMPT;
|
|
2456
|
+
const prompt = `${systemPrompt}
|
|
2457
|
+
|
|
2458
|
+
Context:
|
|
2459
|
+
${context}
|
|
2460
|
+
|
|
2461
|
+
Question: ${question}`;
|
|
2462
|
+
const answer = await this.llm.generate(prompt, {
|
|
2463
|
+
temperature: options?.temperature,
|
|
2464
|
+
maxTokens: options?.maxTokens
|
|
2465
|
+
});
|
|
2466
|
+
return {
|
|
2467
|
+
answer,
|
|
2468
|
+
sources: retrievalResult.records,
|
|
2469
|
+
query: question,
|
|
2470
|
+
retrievalResult
|
|
2471
|
+
};
|
|
2472
|
+
}
|
|
2473
|
+
};
|
|
2474
|
+
// Annotate the CommonJS export names for ESM import in node.
// The leading `0 &&` short-circuits, so this assignment never executes at
// runtime; the object literal exists only so that the export names appear
// literally in the source text.
// NOTE(review): presumably read by Node's static CommonJS export detection
// when this file is imported from an ES module; keep the shorthand
// `{ Name, ... }` form and do not hand-edit the list. Confirm against the
// bundler output.
|
|
2475
|
+
0 && (module.exports = {
|
|
2476
|
+
DEFAULT_CHUNK_OVERLAP,
|
|
2477
|
+
DEFAULT_CHUNK_SIZE,
|
|
2478
|
+
DOCXLoader,
|
|
2479
|
+
Embedder,
|
|
2480
|
+
EmbeddingThemeClassifier,
|
|
2481
|
+
EnrichmentPipeline,
|
|
2482
|
+
FilterBuilder,
|
|
2483
|
+
FilterTranslator,
|
|
2484
|
+
FixedChunker,
|
|
2485
|
+
HTMLLoader,
|
|
2486
|
+
HorizontalFields,
|
|
2487
|
+
IngestionPipeline,
|
|
2488
|
+
KeywordThemeClassifier,
|
|
2489
|
+
LLMClient,
|
|
2490
|
+
LLMThemeClassifier,
|
|
2491
|
+
LoaderRegistry,
|
|
2492
|
+
METADATA_PREFIXES,
|
|
2493
|
+
MetadataBuilder,
|
|
2494
|
+
MockLLM,
|
|
2495
|
+
PDFLoader,
|
|
2496
|
+
RAGClient,
|
|
2497
|
+
RAGQueryComposer,
|
|
2498
|
+
RecursiveChunker,
|
|
2499
|
+
SentenceChunker,
|
|
2500
|
+
StructuralFields,
|
|
2501
|
+
TextLoader,
|
|
2502
|
+
VectorDBAdapter,
|
|
2503
|
+
VerticalFields,
|
|
2504
|
+
ZeroShotThemeClassifier,
|
|
2505
|
+
estimateChars,
|
|
2506
|
+
estimateTokens
|
|
2507
|
+
});
|
|
2508
|
+
//# sourceMappingURL=index.js.map
|