@absolutejs/absolute 0.19.0-beta.493 → 0.19.0-beta.495
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/index.js +268 -76
- package/dist/ai/index.js.map +6 -6
- package/dist/angular/index.js +2 -2
- package/dist/angular/index.js.map +1 -1
- package/dist/angular/server.js +2 -2
- package/dist/angular/server.js.map +1 -1
- package/dist/build.js +2 -2
- package/dist/build.js.map +1 -1
- package/dist/index.js +2 -2
- package/dist/index.js.map +1 -1
- package/dist/src/ai/rag/ingestion.d.ts +22 -2
- package/dist/src/ai/rag/lexical.d.ts +12 -0
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -2149,6 +2149,8 @@ var STOP_WORDS = new Set([
|
|
|
2149
2149
|
"why"
|
|
2150
2150
|
]);
|
|
2151
2151
|
/**
 * Splits text into normalized lexical tokens.
 *
 * The value is lower-cased, split on every run of characters outside
 * `[a-z0-9]`, stripped of stop words, reduced with a small suffix stemmer and
 * finally filtered down to tokens longer than one character.
 *
 * @param {string | null | undefined} value - Text to tokenize. Nullish input
 *   is treated as the empty string and yields `[]` instead of throwing.
 * @returns {string[]} Stemmed tokens in input order (duplicates are kept).
 */
var tokenize = (value) => {
  // Suffix rules are tried in this order; only the first matching rule applies.
  const stem = (token) => {
    if (token.endsWith("ies") && token.length > 3) {
      return `${token.slice(0, -3)}y`;
    }
    if (token.endsWith("ing") && token.length > 5) {
      return token.slice(0, -3);
    }
    if (token.endsWith("ed") && token.length > 4) {
      return token.slice(0, -2);
    }
    if (token.endsWith("es") && token.length > 4) {
      return token.slice(0, -2);
    }
    if (token.endsWith("s") && token.length > 3) {
      return token.slice(0, -1);
    }
    return token;
  };
  // No trim step is needed: the split leaves only [a-z0-9] characters in each
  // piece, so there is never surrounding whitespace to remove.
  return (value ?? "").toLowerCase().split(/[^a-z0-9]+/i).filter((token) => !STOP_WORDS.has(token)).map((token) => stem(token)).filter((token) => token.length > 1);
};
|
|
2152
|
+
// BM25 k1 parameter. rankRAGLexicalMatches uses it both in the numerator
// (k1 + 1) and in the length-normalised denominator of each per-term score.
var BM25_K1 = 1.2;
|
|
2153
|
+
// BM25 b parameter. In rankRAGLexicalMatches it weights how much a candidate's
// weighted length, relative to the average length, contributes to the denominator.
var BM25_B = 0.75;
|
|
2152
2154
|
var collectMetadataStrings = (value) => {
|
|
2153
2155
|
if (typeof value === "string" || typeof value === "number") {
|
|
2154
2156
|
return [String(value)];
|
|
@@ -2164,7 +2166,7 @@ var collectMetadataStrings = (value) => {
|
|
|
2164
2166
|
/**
 * Rewrites a source path/identifier into space-separated words for lexical
 * matching.
 *
 * Runs of `#`, `/`, `_`, `.` and `-` become a single space, then a handful of
 * lower-case file-extension words are expanded into descriptive terms.
 *
 * @param {string} source - Source string (typically a path or URL fragment).
 * @returns {string} The rewritten string.
 */
var normalizeSourceForLexical = (source) => {
  // Applied strictly in order; each rule sees the output of the previous one.
  const rewriteRules = [
    [/[#/_.-]+/g, " "],
    [/\bmd\b/g, "markdown"],
    [/\bpptx\b/g, "presentation"],
    [/\bxlsx\b/g, "spreadsheet workbook sheet"],
    [/\bmp3\b/g, "audio transcript media"],
    [/\bmp4\b/g, "video transcript media"],
    [/\bzip\b/g, "archive bundle"]
  ];
  let rewritten = source;
  for (const [pattern, replacement] of rewriteRules) {
    rewritten = rewritten.replace(pattern, replacement);
  }
  return rewritten;
};
|
|
2165
2167
|
/**
 * Flattens a metadata value into one space-separated string.
 *
 * @param {unknown} value - Value handed to `collectMetadataStrings`.
 * @returns {string} The truthy strings it yields, joined with single spaces.
 */
var toFieldText = (value) => {
  const kept = [];
  for (const piece of collectMetadataStrings(value)) {
    // Falsy entries (e.g. empty strings) are dropped before joining.
    if (piece) {
      kept.push(piece);
    }
  }
  return kept.join(" ");
};
|
|
2166
2168
|
var scoreTokenCoverage = (queryTokens, text) => {
|
|
2167
|
-
const normalizedText = text.toLowerCase();
|
|
2169
|
+
const normalizedText = (text ?? "").toLowerCase();
|
|
2168
2170
|
if (normalizedText.length === 0) {
|
|
2169
2171
|
return 0;
|
|
2170
2172
|
}
|
|
@@ -2178,7 +2180,7 @@ var scoreTokenCoverage = (queryTokens, text) => {
|
|
|
2178
2180
|
};
|
|
2179
2181
|
var scorePhraseMatch = (query, text) => {
|
|
2180
2182
|
const normalizedQuery = tokenize(query).join(" ");
|
|
2181
|
-
const normalizedText = tokenize(text).join(" ");
|
|
2183
|
+
const normalizedText = tokenize(text ?? "").join(" ");
|
|
2182
2184
|
if (normalizedQuery.length === 0 || normalizedText.length === 0) {
|
|
2183
2185
|
return 0;
|
|
2184
2186
|
}
|
|
@@ -2190,7 +2192,7 @@ var scoreWeightedField = ({
|
|
|
2190
2192
|
query,
|
|
2191
2193
|
queryTokens,
|
|
2192
2194
|
text
|
|
2193
|
-
}) => scoreTokenCoverage(queryTokens, text) * coverageWeight + scorePhraseMatch(query, text) * phraseWeight;
|
|
2195
|
+
}) => scoreTokenCoverage(queryTokens, text ?? "") * coverageWeight + scorePhraseMatch(query, text ?? "") * phraseWeight;
|
|
2194
2196
|
var extractWeightedLexicalFields = (result) => {
|
|
2195
2197
|
const metadata = result.metadata ?? {};
|
|
2196
2198
|
const source = result.source ?? "";
|
|
@@ -2209,7 +2211,7 @@ var extractWeightedLexicalFields = (result) => {
|
|
|
2209
2211
|
].flatMap((value) => collectMetadataStrings(value)).join(" ");
|
|
2210
2212
|
return {
|
|
2211
2213
|
archivePath,
|
|
2212
|
-
chunkText: result.
|
|
2214
|
+
chunkText: result.text,
|
|
2213
2215
|
mediaSegments,
|
|
2214
2216
|
metadataFocus,
|
|
2215
2217
|
metadataText: toFieldText(metadata),
|
|
@@ -2217,6 +2219,34 @@ var extractWeightedLexicalFields = (result) => {
|
|
|
2217
2219
|
title: result.title ?? ""
|
|
2218
2220
|
};
|
|
2219
2221
|
};
|
|
2222
|
+
/**
 * Per-field multipliers applied to token counts by
 * `countWeightedTermFrequency` and `computeWeightedDocumentLength`.
 *
 * Frozen so an accidental write cannot silently skew every later ranking.
 */
var FIELD_WEIGHTS = Object.freeze({
  archivePath: 4.2,
  chunkText: 1,
  mediaSegments: 3.8,
  metadataFocus: 3.2,
  metadataText: 1.4,
  source: 3.4,
  title: 2
});
|
|
2231
|
+
/**
 * Tokenizes every weighted lexical field of a result.
 *
 * The field list is derived from `FIELD_WEIGHTS` rather than repeated here, so
 * the returned object always has a token array for every field that
 * `countWeightedTermFrequency` and `computeWeightedDocumentLength` index into.
 *
 * @param {{ metadata?: unknown, source?: unknown, text?: unknown, title?: unknown }} result
 *   Result whose `metadata`, `source`, `text` and `title` are forwarded to
 *   `extractWeightedLexicalFields`.
 * @returns {Record<string, string[]>} Field name to token list; a missing
 *   field is tokenized as the empty string.
 */
var getWeightedFieldTokens = (result) => {
  const fields = extractWeightedLexicalFields({
    metadata: result.metadata,
    source: result.source,
    text: result.text,
    title: result.title
  });
  return Object.fromEntries(Object.keys(FIELD_WEIGHTS).map((fieldName) => [fieldName, tokenize(fields[fieldName] ?? "")]));
};
|
|
2248
|
+
/**
 * Sums, over every field in `FIELD_WEIGHTS`, the number of occurrences of
 * `token` in that field multiplied by the field's weight.
 *
 * @param {Record<string, string[]>} fieldTokens - Token list per field name.
 * @param {string} token - Token to count (strict equality).
 * @returns {number} Weighted occurrence count; 0 when the token never appears.
 */
var countWeightedTermFrequency = (fieldTokens, token) => {
  let weightedTotal = 0;
  for (const fieldName of Object.keys(FIELD_WEIGHTS)) {
    const occurrences = fieldTokens[fieldName].filter((value) => value === token).length;
    weightedTotal = weightedTotal + occurrences * FIELD_WEIGHTS[fieldName];
  }
  return weightedTotal;
};
|
|
2249
|
+
/**
 * Sums, over every field in `FIELD_WEIGHTS`, the field's token count
 * multiplied by the field's weight.
 *
 * @param {Record<string, string[]>} fieldTokens - Token list per field name.
 * @returns {number} Weighted token count of the document.
 */
var computeWeightedDocumentLength = (fieldTokens) => {
  let weightedLength = 0;
  for (const fieldName of Object.keys(FIELD_WEIGHTS)) {
    weightedLength = weightedLength + fieldTokens[fieldName].length * FIELD_WEIGHTS[fieldName];
  }
  return weightedLength;
};
|
|
2220
2250
|
var buildRAGLexicalHaystack = (result) => [
|
|
2221
2251
|
result.title,
|
|
2222
2252
|
result.source,
|
|
@@ -2229,7 +2259,12 @@ var scoreRAGLexicalMatch = (query, result) => {
|
|
|
2229
2259
|
if (queryTokens.length === 0) {
|
|
2230
2260
|
return 0;
|
|
2231
2261
|
}
|
|
2232
|
-
const fields = extractWeightedLexicalFields(
|
|
2262
|
+
const fields = extractWeightedLexicalFields({
|
|
2263
|
+
metadata: result.metadata,
|
|
2264
|
+
source: result.source,
|
|
2265
|
+
text: result.chunkText,
|
|
2266
|
+
title: result.title
|
|
2267
|
+
});
|
|
2233
2268
|
const haystack = buildRAGLexicalHaystack(result).toLowerCase();
|
|
2234
2269
|
const overallCoverage = scoreTokenCoverage(queryTokens, haystack);
|
|
2235
2270
|
if (overallCoverage === 0) {
|
|
@@ -2291,6 +2326,65 @@ var scoreRAGLexicalMatch = (query, result) => {
|
|
|
2291
2326
|
const archiveBoost = resolveArchiveBoost(queryTokens, result);
|
|
2292
2327
|
return titleScore + sourceScore + metadataFocusScore + archivePathScore + mediaSegmentScore + metadataScore + chunkScore + coverageBoost + exactPhraseBoost + fileKindBoost + transcriptBoost + archiveBoost;
|
|
2293
2328
|
};
|
|
2329
|
+
/**
 * Ranks results against a query with a field-weighted BM25 score blended with
 * the heuristic `scoreRAGLexicalMatch` score (scaled by 0.35).
 *
 * The weighted term frequency of each unique query token is computed once per
 * candidate and reused for both the document-frequency count and the scoring
 * pass.
 *
 * @param {string} query - Raw query text; tokenized with `tokenize`.
 * @param {Array<{ text?: unknown, metadata?: unknown, source?: unknown, title?: unknown }>} results
 *   Candidates to rank.
 * @returns {Array<{ result: object, score: number }>} Entries with a score
 *   above 0, ordered by descending score; ties keep their input order.
 *   Returns `[]` when the query has no tokens or there are no results.
 */
var rankRAGLexicalMatches = (query, results) => {
  const queryTokens = tokenize(query);
  if (queryTokens.length === 0 || results.length === 0) {
    return [];
  }
  const uniqueQueryTokens = [...new Set(queryTokens)];
  const candidates = results.map((result) => {
    const fieldTokens = getWeightedFieldTokens(result);
    // Only tokens that actually occur are stored. Insertion order follows the
    // query token order, which keeps the score summation order stable.
    const termFrequencies = new Map();
    for (const token of uniqueQueryTokens) {
      const termFrequency = countWeightedTermFrequency(fieldTokens, token);
      if (termFrequency > 0) {
        termFrequencies.set(token, termFrequency);
      }
    }
    return {
      length: computeWeightedDocumentLength(fieldTokens),
      result,
      termFrequencies
    };
  });
  // candidates is non-empty here because empty results returned early above.
  const averageDocumentLength = candidates.reduce((total, candidate) => total + candidate.length, 0) / candidates.length;
  const documentFrequency = new Map();
  for (const token of uniqueQueryTokens) {
    let seen = 0;
    for (const candidate of candidates) {
      if (candidate.termFrequencies.has(token)) {
        seen += 1;
      }
    }
    documentFrequency.set(token, seen);
  }
  return candidates.map((candidate, index) => {
    // The length normalisation depends only on the candidate, not on the token.
    const lengthNorm = BM25_K1 * (1 - BM25_B + BM25_B * (candidate.length / Math.max(1, averageDocumentLength)));
    let bm25Score = 0;
    for (const [token, termFrequency] of candidate.termFrequencies) {
      const df = documentFrequency.get(token) ?? 0;
      const idf = Math.log(1 + (candidates.length - df + 0.5) / (df + 0.5));
      const denominator = termFrequency + lengthNorm;
      bm25Score += idf * (termFrequency * (BM25_K1 + 1) / Math.max(0.000000001, denominator));
    }
    const heuristicScore = scoreRAGLexicalMatch(query, {
      chunkText: candidate.result.text,
      metadata: candidate.result.metadata,
      source: candidate.result.source,
      title: candidate.result.title
    });
    return {
      index,
      result: candidate.result,
      score: bm25Score + heuristicScore * 0.35
    };
  }).filter((entry) => entry.score > 0).sort((left, right) => {
    if (right.score !== left.score) {
      return right.score - left.score;
    }
    return left.index - right.index;
  }).map(({ result, score }) => ({
    result,
    score
  }));
};
|
|
2294
2388
|
/**
 * Reports whether at least one of `values` is contained in `tokens`.
 *
 * @param {string[]} tokens - Collection searched with `includes`.
 * @param {string[]} values - Candidates to look for.
 * @returns {boolean} `true` on the first candidate found, otherwise `false`.
 */
var hasAnyToken = (tokens, values) => {
  for (const candidate of values) {
    if (tokens.includes(candidate)) {
      return true;
    }
  }
  return false;
};
|
|
2295
2389
|
var resolveFileKindBoost = (queryTokens, metadata) => {
|
|
2296
2390
|
const fileKind = typeof metadata?.fileKind === "string" ? metadata.fileKind : "";
|
|
@@ -3138,6 +3232,30 @@ var spreadsheetText = (entries) => {
|
|
|
3138
3232
|
return normalizeWhitespace(sheetValues.join(`
|
|
3139
3233
|
`));
|
|
3140
3234
|
};
|
|
3235
|
+
/**
 * Extracts the text of each worksheet in an unzipped spreadsheet archive.
 *
 * Differences from a plain "every <v> is a shared-string index" reading:
 * - Shared strings are indexed per `<si>` item (rich-text items hold several
 *   `<t>` runs that belong to one string).
 * - Only cells typed `t="s"` are resolved through the shared-string table, so
 *   numeric cell values are kept as written.
 * - Worksheet paths are ordered with numeric collation so `sheet10.xml` sorts
 *   after `sheet2.xml`.
 *
 * @param {Array<{ path: string, data: unknown }>} entries - Archive entries;
 *   `data` is decoded with `decodeUtf8`.
 * @returns {Array<{ name: string, text: string }>} One item per worksheet that
 *   has text. `name` is taken from `spreadsheetSheetNames` at the worksheet's
 *   sorted position, falling back to `Sheet N`.
 */
var spreadsheetSheetTexts = (entries) => {
  const sharedStrings = entries.filter((entry) => entry.path === "xl/sharedStrings.xml").flatMap((entry) => [
    ...decodeUtf8(entry.data).matchAll(/<si\b[^>]*?(?:\/>|>([\s\S]*?)<\/si>)/g)
  ].map((item) => [
    ...(item[1] ?? "").matchAll(/<t[^>]*>([\s\S]*?)<\/t>/g)
  ].map((run) => decodeHtmlEntities(run[1] ?? "")).join("")));
  const sheetNames = spreadsheetSheetNames(entries);
  // NOTE(review): pairing names by position assumes workbook sheet order
  // matches the numeric order of the worksheet file names — confirm.
  const sheetEntries = entries.filter((entry) => entry.path.startsWith("xl/worksheets/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true }));
  return sheetEntries.map((entry, index) => {
    const values = [];
    // Self-closing <c .../> cells carry no value and are matched separately so
    // they cannot swallow the following cell.
    for (const cell of decodeUtf8(entry.data).matchAll(/<c\b([^>]*?)(?:\/>|>([\s\S]*?)<\/c>)/g)) {
      const valueMatch = /<v>([\s\S]*?)<\/v>/.exec(cell[2] ?? "");
      if (!valueMatch) {
        continue;
      }
      const rawValue = valueMatch[1] ?? "";
      const isSharedString = /\bt=["']s["']/.test(cell[1] ?? "");
      // Number("") is 0, so an empty value must not be read as index 0.
      const sharedStringIndex = rawValue.trim() === "" ? Number.NaN : Number(rawValue);
      const resolvable = isSharedString && Number.isInteger(sharedStringIndex) && sharedStringIndex >= 0 && sharedStringIndex < sharedStrings.length;
      values.push(resolvable ? sharedStrings[sharedStringIndex] : rawValue);
    }
    const text = normalizeWhitespace(values.join(`
`));
    if (!text) {
      return null;
    }
    return {
      name: sheetNames[index] ?? `Sheet ${index + 1}`,
      text
    };
  }).filter((entry) => Boolean(entry));
};
|
|
3141
3259
|
/**
 * Lists the sheet names declared in `xl/workbook.xml`.
 *
 * Names are read from the `name="..."` attribute of `<sheet ...>` tags and
 * entity-decoded, so a sheet called `R&D` is not reported as `R&amp;D`.
 *
 * @param {Array<{ path: string, data: unknown }>} entries - Archive entries;
 *   `data` is decoded with `decodeUtf8`.
 * @returns {string[]} Non-empty sheet names in document order.
 */
var spreadsheetSheetNames = (entries) => entries.filter((entry) => entry.path === "xl/workbook.xml").flatMap((entry) => [
  ...decodeUtf8(entry.data).matchAll(/<sheet[^>]*name="([^"]+)"/g)
].map((match) => decodeHtmlEntities(match[1] ?? ""))).filter(Boolean);
|
|
@@ -3147,6 +3265,10 @@ var presentationText = (entries) => {
|
|
|
3147
3265
|
|
|
3148
3266
|
`));
|
|
3149
3267
|
};
|
|
3268
|
+
/**
 * Extracts the text of each slide in an unzipped presentation archive.
 *
 * Slide paths are ordered with numeric collation so `slide10.xml` comes after
 * `slide2.xml`; a plain lexicographic sort would number slides incorrectly in
 * any deck with ten or more slides.
 *
 * @param {Array<{ path: string, data: unknown }>} entries - Archive entries;
 *   `data` is decoded with `decodeUtf8`.
 * @returns {Array<{ index: number, text: string }>} Slides that have text.
 *   `index` is the zero-based position among all slide files, assigned before
 *   empty slides are dropped, so gaps are possible.
 */
var presentationSlides = (entries) => entries.filter((entry) => entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml")).sort((left, right) => left.path.localeCompare(right.path, undefined, { numeric: true })).map((entry, index) => ({
  index,
  text: normalizeWhitespace(extractXmlText(decodeUtf8(entry.data)))
})).filter((slide) => Boolean(slide.text));
|
|
3150
3272
|
/**
 * Counts the slide XML files in an unzipped presentation archive.
 *
 * @param {Array<{ path: string }>} entries - Archive entries.
 * @returns {number} Number of entries whose path starts with `ppt/slides/`
 *   and ends with `.xml`.
 */
var presentationSlideCount = (entries) => entries.reduce((count, entry) => {
  const isSlideXml = entry.path.startsWith("ppt/slides/") && entry.path.endsWith(".xml");
  return isSlideXml ? count + 1 : count;
}, 0);
|
|
3151
3273
|
var epubText = (entries) => {
|
|
3152
3274
|
const htmlEntries = entries.filter((entry) => /\.(xhtml|html|htm)$/i.test(entry.path));
|
|
@@ -3364,6 +3486,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3364
3486
|
const entries = unzipEntries(input.data);
|
|
3365
3487
|
let text = "";
|
|
3366
3488
|
let officeMetadata = {};
|
|
3489
|
+
let structuredDocuments = [];
|
|
3367
3490
|
if (extension === ".docx" || extension === ".odt") {
|
|
3368
3491
|
text = officeDocumentText(entries);
|
|
3369
3492
|
officeMetadata = {
|
|
@@ -3371,19 +3494,53 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3371
3494
|
};
|
|
3372
3495
|
} else if (extension === ".xlsx" || extension === ".ods") {
|
|
3373
3496
|
text = spreadsheetText(entries);
|
|
3497
|
+
const sheets = spreadsheetSheetTexts(entries);
|
|
3374
3498
|
officeMetadata = {
|
|
3375
3499
|
sheetNames: spreadsheetSheetNames(entries)
|
|
3376
3500
|
};
|
|
3501
|
+
structuredDocuments = sheets.map((sheet, index) => ({
|
|
3502
|
+
chunking: input.chunking,
|
|
3503
|
+
contentType: input.contentType,
|
|
3504
|
+
format: "text",
|
|
3505
|
+
metadata: {
|
|
3506
|
+
...input.metadata ?? {},
|
|
3507
|
+
fileKind: "office",
|
|
3508
|
+
...officeMetadata,
|
|
3509
|
+
sheetIndex: index,
|
|
3510
|
+
sheetName: sheet.name
|
|
3511
|
+
},
|
|
3512
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3513
|
+
text: `Sheet ${sheet.name}
|
|
3514
|
+
${sheet.text}`,
|
|
3515
|
+
title: input.title ? `${input.title} \xB7 ${sheet.name}` : sheet.name
|
|
3516
|
+
}));
|
|
3377
3517
|
} else if (extension === ".pptx" || extension === ".odp") {
|
|
3378
3518
|
text = presentationText(entries);
|
|
3519
|
+
const slides = presentationSlides(entries);
|
|
3379
3520
|
officeMetadata = {
|
|
3380
3521
|
slideCount: presentationSlideCount(entries)
|
|
3381
3522
|
};
|
|
3523
|
+
structuredDocuments = slides.map((slide) => ({
|
|
3524
|
+
chunking: input.chunking,
|
|
3525
|
+
contentType: input.contentType,
|
|
3526
|
+
format: "text",
|
|
3527
|
+
metadata: {
|
|
3528
|
+
...input.metadata ?? {},
|
|
3529
|
+
fileKind: "office",
|
|
3530
|
+
...officeMetadata,
|
|
3531
|
+
slideIndex: slide.index,
|
|
3532
|
+
slideNumber: slide.index + 1
|
|
3533
|
+
},
|
|
3534
|
+
source: input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}${extension || ".office"}`,
|
|
3535
|
+
text: `Slide ${slide.index + 1}
|
|
3536
|
+
${slide.text}`,
|
|
3537
|
+
title: input.title ? `${input.title} \xB7 Slide ${slide.index + 1}` : `Slide ${slide.index + 1}`
|
|
3538
|
+
}));
|
|
3382
3539
|
}
|
|
3383
3540
|
if (!text) {
|
|
3384
3541
|
throw new Error(`AbsoluteJS could not extract readable text from ${inferNameFromInput(input)}`);
|
|
3385
3542
|
}
|
|
3386
|
-
|
|
3543
|
+
const summaryDocument = {
|
|
3387
3544
|
chunking: input.chunking,
|
|
3388
3545
|
contentType: input.contentType,
|
|
3389
3546
|
format: "text",
|
|
@@ -3396,6 +3553,7 @@ var createOfficeDocumentExtractor = () => ({
|
|
|
3396
3553
|
text,
|
|
3397
3554
|
title: input.title
|
|
3398
3555
|
};
|
|
3556
|
+
return [summaryDocument, ...structuredDocuments];
|
|
3399
3557
|
}
|
|
3400
3558
|
});
|
|
3401
3559
|
/**
 * Identity helper: returns the archive expander it is given, unchanged.
 *
 * @param {object} expander - Archive expander definition.
 * @returns {object} The same `expander` reference.
 */
var createRAGArchiveExpander = (expander) => {
  return expander;
};
|
|
@@ -3425,7 +3583,36 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3425
3583
|
supports: mediaExtractorSupports,
|
|
3426
3584
|
extract: async (input) => {
|
|
3427
3585
|
const result = await transcriber.transcribe(input);
|
|
3428
|
-
|
|
3586
|
+
const source = input.source ?? input.path ?? input.name ?? `${slugify(input.title ?? DEFAULT_BINARY_NAME)}.media.txt`;
|
|
3587
|
+
const segmentDocuments = [];
|
|
3588
|
+
for (const [index, segment] of (result.segments ?? []).entries()) {
|
|
3589
|
+
const text = normalizeWhitespace(segment.text ?? "");
|
|
3590
|
+
if (!text) {
|
|
3591
|
+
continue;
|
|
3592
|
+
}
|
|
3593
|
+
const startMs = typeof segment.startMs === "number" ? segment.startMs : undefined;
|
|
3594
|
+
const endMs = typeof segment.endMs === "number" ? segment.endMs : undefined;
|
|
3595
|
+
segmentDocuments.push({
|
|
3596
|
+
chunking: input.chunking,
|
|
3597
|
+
contentType: input.contentType,
|
|
3598
|
+
format: "text",
|
|
3599
|
+
metadata: {
|
|
3600
|
+
...input.metadata ?? {},
|
|
3601
|
+
...result.metadata ?? {},
|
|
3602
|
+
fileKind: "media",
|
|
3603
|
+
mediaSegmentIndex: index,
|
|
3604
|
+
mediaSegmentStartMs: startMs,
|
|
3605
|
+
mediaSegmentEndMs: endMs,
|
|
3606
|
+
mediaSegments: [segment],
|
|
3607
|
+
speaker: typeof segment.speaker === "string" ? segment.speaker : undefined
|
|
3608
|
+
},
|
|
3609
|
+
source,
|
|
3610
|
+
text: `Transcript segment${typeof startMs === "number" ? ` ${startMs}-${endMs ?? startMs}ms` : ""}
|
|
3611
|
+
${text}`,
|
|
3612
|
+
title: input.title ? `${input.title} \xB7 Segment ${index + 1}` : `Segment ${index + 1}`
|
|
3613
|
+
});
|
|
3614
|
+
}
|
|
3615
|
+
const summaryDocument = {
|
|
3429
3616
|
chunking: input.chunking,
|
|
3430
3617
|
contentType: input.contentType,
|
|
3431
3618
|
format: "text",
|
|
@@ -3435,10 +3622,11 @@ var createRAGMediaFileExtractor = (transcriber) => ({
|
|
|
3435
3622
|
fileKind: "media",
|
|
3436
3623
|
mediaSegments: result.segments
|
|
3437
3624
|
},
|
|
3438
|
-
source
|
|
3625
|
+
source,
|
|
3439
3626
|
text: result.text,
|
|
3440
3627
|
title: result.title ?? input.title
|
|
3441
3628
|
};
|
|
3629
|
+
return [summaryDocument, ...segmentDocuments];
|
|
3442
3630
|
}
|
|
3443
3631
|
});
|
|
3444
3632
|
/**
 * Identity helper: returns the media transcriber it is given, unchanged.
 *
 * @param {object} transcriber - Media transcriber definition.
 * @returns {object} The same `transcriber` reference.
 */
var createRAGMediaTranscriber = (transcriber) => {
  return transcriber;
};
|
|
@@ -3470,7 +3658,7 @@ var expandArchiveEntry = async (entry, archiveInput, extractors) => {
|
|
|
3470
3658
|
},
|
|
3471
3659
|
name: basename(entry.path),
|
|
3472
3660
|
source: archiveInput.source && !archiveInput.source.startsWith("http") ? `${archiveInput.source}#${entry.path}` : entry.path,
|
|
3473
|
-
title:
|
|
3661
|
+
title: basename(entry.path)
|
|
3474
3662
|
}, extractors);
|
|
3475
3663
|
return documents;
|
|
3476
3664
|
};
|
|
@@ -3599,6 +3787,7 @@ var getFirstExtractedDocument = (documents, label) => {
|
|
|
3599
3787
|
}
|
|
3600
3788
|
return document;
|
|
3601
3789
|
};
|
|
3790
|
+
/**
 * Async wrapper that delegates to `extractRAGFileDocuments`.
 *
 * @param {object} input - File input forwarded unchanged.
 * @param {unknown} extractors - Extractors forwarded unchanged.
 * @returns {Promise<unknown>} Resolves with whatever `extractRAGFileDocuments`
 *   produces for the same arguments.
 */
var loadExtractedDocuments = async (input, extractors) => {
  return extractRAGFileDocuments(input, extractors);
};
|
|
3602
3791
|
var sentenceUnits = (text) => {
|
|
3603
3792
|
const matches = text.match(/[^.!?\n]+(?:[.!?]+|$)/g);
|
|
3604
3793
|
if (!matches) {
|
|
@@ -3821,32 +4010,55 @@ var loadRAGDocumentFromURL = async (input) => {
|
|
|
3821
4010
|
};
|
|
3822
4011
|
var loadRAGDocumentsFromUploads = async (input) => {
|
|
3823
4012
|
const documents = await Promise.all(input.uploads.map(async (upload) => {
|
|
3824
|
-
const loaded = await
|
|
3825
|
-
|
|
3826
|
-
|
|
3827
|
-
|
|
3828
|
-
|
|
3829
|
-
|
|
3830
|
-
|
|
3831
|
-
|
|
4013
|
+
const loaded = await loadExtractedDocuments({
|
|
4014
|
+
chunking: upload.chunking,
|
|
4015
|
+
contentType: upload.contentType,
|
|
4016
|
+
data: decodeUploadContent(upload),
|
|
4017
|
+
format: upload.format,
|
|
4018
|
+
metadata: upload.metadata,
|
|
4019
|
+
name: upload.name,
|
|
4020
|
+
source: upload.source ?? upload.name,
|
|
4021
|
+
title: upload.title
|
|
4022
|
+
}, input.extractors);
|
|
4023
|
+
return loaded.map((document) => ({
|
|
4024
|
+
...document,
|
|
4025
|
+
metadata: mergeMetadata(document.metadata, { uploadFile: upload.name }, input.baseMetadata)
|
|
4026
|
+
}));
|
|
3832
4027
|
}));
|
|
3833
4028
|
return {
|
|
3834
4029
|
defaultChunking: input.defaultChunking,
|
|
3835
|
-
documents
|
|
4030
|
+
documents: documents.flat()
|
|
3836
4031
|
};
|
|
3837
4032
|
};
|
|
3838
4033
|
var loadRAGDocumentsFromURLs = async (input) => {
|
|
3839
|
-
const documents = await Promise.all(input.urls.map(async (urlInput) =>
|
|
3840
|
-
|
|
3841
|
-
|
|
3842
|
-
|
|
3843
|
-
}
|
|
3844
|
-
|
|
3845
|
-
|
|
3846
|
-
|
|
4034
|
+
const documents = await Promise.all(input.urls.map(async (urlInput) => {
|
|
4035
|
+
const url = urlInput.url.trim();
|
|
4036
|
+
if (!url) {
|
|
4037
|
+
throw new Error("RAG URL is required");
|
|
4038
|
+
}
|
|
4039
|
+
const response = await fetch(url);
|
|
4040
|
+
if (!response.ok) {
|
|
4041
|
+
throw new Error(`Failed to fetch RAG URL ${url}: ${response.status} ${response.statusText}`);
|
|
4042
|
+
}
|
|
4043
|
+
const data = new Uint8Array(await response.arrayBuffer());
|
|
4044
|
+
const loaded = await loadExtractedDocuments({
|
|
4045
|
+
chunking: urlInput.chunking,
|
|
4046
|
+
contentType: urlInput.contentType ?? response.headers.get("content-type") ?? undefined,
|
|
4047
|
+
data,
|
|
4048
|
+
format: urlInput.format ?? inferFormatFromUrl(url),
|
|
4049
|
+
metadata: urlInput.metadata,
|
|
4050
|
+
name: basename(new URL(url).pathname),
|
|
4051
|
+
source: urlInput.source ?? url,
|
|
4052
|
+
title: urlInput.title
|
|
4053
|
+
}, urlInput.extractors ?? input.extractors);
|
|
4054
|
+
return loaded.map((document) => ({
|
|
4055
|
+
...document,
|
|
4056
|
+
metadata: mergeMetadata(document.metadata, { sourceUrl: urlInput.url }, input.baseMetadata)
|
|
4057
|
+
}));
|
|
4058
|
+
}));
|
|
3847
4059
|
return {
|
|
3848
4060
|
defaultChunking: input.defaultChunking,
|
|
3849
|
-
documents
|
|
4061
|
+
documents: documents.flat()
|
|
3850
4062
|
};
|
|
3851
4063
|
};
|
|
3852
4064
|
var loadRAGDocumentUpload = async (input) => {
|
|
@@ -3926,21 +4138,25 @@ var loadRAGDocumentsFromDirectory = async (input) => {
|
|
|
3926
4138
|
const files = await collectDirectoryFiles(root, input.recursive !== false, includeExtensions);
|
|
3927
4139
|
const documents = await Promise.all(files.map(async (path) => {
|
|
3928
4140
|
const source = relative(root, path).replace(/\\/g, "/");
|
|
3929
|
-
const
|
|
4141
|
+
const data = await readFile(path);
|
|
4142
|
+
const loaded = await loadExtractedDocuments({
|
|
4143
|
+
chunking: input.defaultChunking,
|
|
4144
|
+
data,
|
|
3930
4145
|
metadata: {
|
|
3931
|
-
...input.baseMetadata ?? {},
|
|
3932
4146
|
fileName: basename(path),
|
|
3933
4147
|
relativePath: source
|
|
3934
4148
|
},
|
|
3935
4149
|
path,
|
|
3936
|
-
source
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
|
|
4150
|
+
source
|
|
4151
|
+
}, input.extractors);
|
|
4152
|
+
return loaded.map((document) => ({
|
|
4153
|
+
...document,
|
|
4154
|
+
metadata: mergeMetadata(document.metadata, undefined, input.baseMetadata)
|
|
4155
|
+
}));
|
|
3940
4156
|
}));
|
|
3941
4157
|
return {
|
|
3942
4158
|
defaultChunking: input.defaultChunking,
|
|
3943
|
-
documents
|
|
4159
|
+
documents: documents.flat()
|
|
3944
4160
|
};
|
|
3945
4161
|
};
|
|
3946
4162
|
/**
 * Loads documents from a directory and passes the loaded result to
 * `prepareRAGDocuments`.
 *
 * @param {object} input - Options forwarded to `loadRAGDocumentsFromDirectory`.
 * @returns {Promise<unknown>} Resolves with the value returned by
 *   `prepareRAGDocuments` for the loaded result.
 */
var prepareRAGDirectoryDocuments = async (input) => {
  const loadedFromDirectory = await loadRAGDocumentsFromDirectory(input);
  return prepareRAGDocuments(loadedFromDirectory);
};
|
|
@@ -6860,27 +7076,15 @@ var createInMemoryRAGStore = (options = {}) => {
|
|
|
6860
7076
|
}));
|
|
6861
7077
|
};
|
|
6862
7078
|
const queryLexical = async (input) => {
|
|
6863
|
-
const
|
|
6864
|
-
|
|
6865
|
-
|
|
6866
|
-
|
|
6867
|
-
|
|
6868
|
-
|
|
6869
|
-
|
|
6870
|
-
|
|
6871
|
-
|
|
6872
|
-
if (right.score !== left.score) {
|
|
6873
|
-
return right.score - left.score;
|
|
6874
|
-
}
|
|
6875
|
-
return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
|
|
6876
|
-
});
|
|
6877
|
-
return results.slice(0, input.topK).map((entry) => ({
|
|
6878
|
-
chunkId: entry.chunk.chunkId,
|
|
6879
|
-
chunkText: entry.chunk.text,
|
|
6880
|
-
metadata: entry.chunk.metadata,
|
|
6881
|
-
score: entry.score,
|
|
6882
|
-
source: entry.chunk.source,
|
|
6883
|
-
title: entry.chunk.title
|
|
7079
|
+
const filtered = chunks.filter((chunk) => matchesFilter(chunk, input.filter));
|
|
7080
|
+
const ranked = rankRAGLexicalMatches(input.query, filtered);
|
|
7081
|
+
return ranked.slice(0, input.topK).map(({ result, score }) => ({
|
|
7082
|
+
chunkId: result.chunkId,
|
|
7083
|
+
chunkText: result.text,
|
|
7084
|
+
metadata: result.metadata,
|
|
7085
|
+
score,
|
|
7086
|
+
source: result.source,
|
|
7087
|
+
title: result.title
|
|
6884
7088
|
}));
|
|
6885
7089
|
};
|
|
6886
7090
|
const upsert = async (input) => {
|
|
@@ -7511,27 +7715,15 @@ var createSQLiteRAGStore = (options = {}) => {
|
|
|
7511
7715
|
};
|
|
7512
7716
|
const queryLexical = async (input) => {
|
|
7513
7717
|
const rawRows = toStoredRows(jsonStatements.query.all());
|
|
7514
|
-
const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter))
|
|
7515
|
-
|
|
7516
|
-
|
|
7517
|
-
|
|
7518
|
-
|
|
7519
|
-
|
|
7520
|
-
title: chunk.title
|
|
7521
|
-
})
|
|
7522
|
-
})).filter(({ score }) => score > 0).sort((left, right) => {
|
|
7523
|
-
if (right.score !== left.score) {
|
|
7524
|
-
return right.score - left.score;
|
|
7525
|
-
}
|
|
7526
|
-
return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
|
|
7527
|
-
});
|
|
7528
|
-
return chunks.slice(0, input.topK).map(({ chunk, score }) => ({
|
|
7529
|
-
chunkId: chunk.chunkId,
|
|
7530
|
-
chunkText: chunk.text,
|
|
7531
|
-
metadata: chunk.metadata,
|
|
7718
|
+
const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter));
|
|
7719
|
+
const ranked = rankRAGLexicalMatches(input.query, chunks);
|
|
7720
|
+
return ranked.slice(0, input.topK).map(({ result, score }) => ({
|
|
7721
|
+
chunkId: result.chunkId,
|
|
7722
|
+
chunkText: result.text,
|
|
7723
|
+
metadata: result.metadata,
|
|
7532
7724
|
score,
|
|
7533
|
-
source:
|
|
7534
|
-
title:
|
|
7725
|
+
source: result.source,
|
|
7726
|
+
title: result.title
|
|
7535
7727
|
}));
|
|
7536
7728
|
};
|
|
7537
7729
|
const upsert = async (input) => {
|
|
@@ -8703,5 +8895,5 @@ export {
|
|
|
8703
8895
|
aiChat
|
|
8704
8896
|
};
|
|
8705
8897
|
|
|
8706
|
-
//# debugId=
|
|
8898
|
+
//# debugId=A1829EEFE0D80F9264756E2164756E21
|
|
8707
8899
|
//# sourceMappingURL=index.js.map
|