@absolutejs/absolute 0.19.0-beta.492 → 0.19.0-beta.494
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ai/index.js +224 -51
- package/dist/ai/index.js.map +5 -5
- package/dist/src/ai/rag/lexical.d.ts +12 -0
- package/package.json +1 -1
package/dist/ai/index.js
CHANGED
|
@@ -2149,6 +2149,8 @@ var STOP_WORDS = new Set([
|
|
|
2149
2149
|
"why"
|
|
2150
2150
|
]);
|
|
2151
2151
|
/**
 * Splits text into normalized search tokens.
 *
 * The input is lowercased and split on runs of characters outside `a-z0-9`.
 * Stop words are dropped before stemming, at most one suffix rule is applied
 * per token, and tokens shorter than two characters are discarded.
 *
 * @param {string} value - Raw text to tokenize.
 * @returns {string[]} Stemmed tokens in input order (duplicates are kept).
 */
var tokenize = (value) => {
  // Suffix rules are tried in this order; only the first match is applied.
  const stripSuffix = (token) => {
    if (token.endsWith("ies") && token.length > 3) {
      return `${token.slice(0, -3)}y`;
    }
    if (token.endsWith("ing") && token.length > 5) {
      return token.slice(0, -3);
    }
    if (token.endsWith("ed") && token.length > 4) {
      return token.slice(0, -2);
    }
    if (token.endsWith("es") && token.length > 4) {
      return token.slice(0, -2);
    }
    if (token.endsWith("s") && token.length > 3) {
      return token.slice(0, -1);
    }
    return token;
  };
  const stemmedTokens = [];
  for (const piece of value.toLowerCase().split(/[^a-z0-9]+/i)) {
    const candidate = piece.trim();
    if (STOP_WORDS.has(candidate)) {
      continue;
    }
    const stemmed = stripSuffix(candidate);
    if (stemmed.length > 1) {
      stemmedTokens.push(stemmed);
    }
  }
  return stemmedTokens;
};
|
|
2152
|
+
// BM25 k1 parameter: scales the length-normalisation term in the denominator and
// supplies the (k1 + 1) numerator factor in rankRAGLexicalMatches.
var BM25_K1 = 1.2;
|
|
2153
|
+
// BM25 b parameter: blends a constant (1 - b) with the document-length ratio
// (candidate length / average length) in rankRAGLexicalMatches.
var BM25_B = 0.75;
|
|
2152
2154
|
var collectMetadataStrings = (value) => {
|
|
2153
2155
|
if (typeof value === "string" || typeof value === "number") {
|
|
2154
2156
|
return [String(value)];
|
|
@@ -2161,10 +2163,94 @@ var collectMetadataStrings = (value) => {
|
|
|
2161
2163
|
}
|
|
2162
2164
|
return [];
|
|
2163
2165
|
};
|
|
2166
|
+
/**
 * Turns a source path/identifier into space-separated words for lexical matching.
 *
 * Runs of `#`, `/`, `_`, `.` and `-` become a single space, then stand-alone
 * extension words are replaced with descriptive terms (for example `md` becomes
 * `markdown`). Extension matching is case-insensitive so `README.MD` is expanded
 * the same way as `readme.md`; all other characters keep their original case.
 *
 * @param {string} source - Source path or identifier.
 * @returns {string} The normalized, alias-expanded source text.
 */
var normalizeSourceForLexical = (source) => {
  // Applied in order; each pattern only matches the extension as a whole word.
  const extensionAliases = [
    [/\bmd\b/gi, "markdown"],
    [/\bpptx\b/gi, "presentation"],
    [/\bxlsx\b/gi, "spreadsheet workbook sheet"],
    [/\bmp3\b/gi, "audio transcript media"],
    [/\bmp4\b/gi, "video transcript media"],
    [/\bzip\b/gi, "archive bundle"]
  ];
  return extensionAliases.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    source.replace(/[#/_.-]+/g, " ")
  );
};
|
|
2167
|
+
/**
 * Flattens a value into one space-separated string.
 *
 * Strings are gathered with `collectMetadataStrings`; falsy entries are dropped
 * before joining.
 *
 * @param {*} value - Value handed to `collectMetadataStrings`.
 * @returns {string} The remaining strings joined by single spaces.
 */
var toFieldText = (value) => {
  const keptStrings = [];
  for (const entry of collectMetadataStrings(value)) {
    if (entry) {
      keptStrings.push(entry);
    }
  }
  return keptStrings.join(" ");
};
|
|
2168
|
+
/**
 * Computes the fraction of query tokens that occur among the tokens of `text`.
 *
 * @param {string[]} queryTokens - Already-tokenized query terms.
 * @param {string | null | undefined} text - Field text; nullish is treated as empty.
 * @returns {number} Matched query tokens divided by the query token count, or 0
 *   when the text is empty or yields no tokens.
 */
var scoreTokenCoverage = (queryTokens, text) => {
  const loweredText = (text ?? "").toLowerCase();
  if (loweredText.length === 0) {
    return 0;
  }
  const textTokens = tokenize(loweredText);
  if (textTokens.length === 0) {
    return 0;
  }
  const presentTokens = new Set(textTokens);
  let matched = 0;
  for (const queryToken of queryTokens) {
    if (presentTokens.has(queryToken)) {
      matched += 1;
    }
  }
  // Math.max keeps the divisor at 1 for an empty query token list.
  return matched / Math.max(1, queryTokens.length);
};
|
|
2181
|
+
/**
 * Reports whether the tokenized query occurs as a contiguous token sequence in
 * the tokenized text.
 *
 * Both inputs are tokenized and re-joined with single spaces. The comparison is
 * anchored on token boundaries, so the query token `cat` does not match inside
 * the text token `concatenate`.
 *
 * @param {string} query - Raw query string.
 * @param {string | null | undefined} text - Field text; nullish is treated as empty.
 * @returns {number} 1 when the phrase is present, otherwise 0 (also 0 when
 *   either side yields no tokens).
 */
var scorePhraseMatch = (query, text) => {
  const queryPhrase = tokenize(query).join(" ");
  const textPhrase = tokenize(text ?? "").join(" ");
  if (queryPhrase.length === 0 || textPhrase.length === 0) {
    return 0;
  }
  // Pad both sides with a space so a match must start and end on a whole token.
  return ` ${textPhrase} `.includes(` ${queryPhrase} `) ? 1 : 0;
};
|
|
2189
|
+
/**
 * Scores one field as a weighted sum of token coverage and phrase match.
 *
 * @param {object} options
 * @param {number} options.coverageWeight - Multiplier for the coverage score.
 * @param {number} options.phraseWeight - Multiplier for the phrase-match score.
 * @param {string} options.query - Raw query string (used for the phrase match).
 * @param {string[]} options.queryTokens - Tokenized query (used for coverage).
 * @param {string | null | undefined} options.text - Field text; nullish is treated as empty.
 * @returns {number} `coverage * coverageWeight + phrase * phraseWeight`.
 */
var scoreWeightedField = ({ coverageWeight, phraseWeight, query, queryTokens, text }) => {
  const coveragePart = scoreTokenCoverage(queryTokens, text ?? "") * coverageWeight;
  const phrasePart = scorePhraseMatch(query, text ?? "") * phraseWeight;
  return coveragePart + phrasePart;
};
|
|
2196
|
+
/**
 * Derives the text fields used for weighted lexical scoring from one result.
 *
 * @param {object} result
 * @param {object} [result.metadata] - Chunk metadata; nullish is treated as `{}`.
 * @param {string} [result.source] - Source identifier. Non-string values are
 *   treated as an empty source instead of throwing, matching the `typeof` guard
 *   in `buildRAGLexicalHaystack`.
 * @param {string} [result.text] - Chunk text, returned unchanged as `chunkText`.
 * @param {string} [result.title] - Title; nullish becomes "".
 * @returns {{archivePath: string, chunkText: *, mediaSegments: string,
 *   metadataFocus: string, metadataText: string, source: string, title: *}}
 */
var extractWeightedLexicalFields = (result) => {
  const metadata = result.metadata ?? {};
  const source = typeof result.source === "string" ? result.source : "";

  // Prefer an explicit metadata.archivePath; otherwise use the part of the
  // source between the first and second "#", when the source contains one.
  let archivePath = "";
  if (typeof metadata.archivePath === "string") {
    archivePath = metadata.archivePath;
  } else if (source.includes("#")) {
    archivePath = source.split("#")[1] ?? "";
  }

  // Only object segments contribute text; other entries are ignored.
  const mediaSegments = Array.isArray(metadata.mediaSegments)
    ? metadata.mediaSegments
        .map((segment) => (segment && typeof segment === "object" ? toFieldText(segment) : ""))
        .filter(Boolean)
        .join(" ")
    : "";

  const metadataFocus = [
    metadata.sheetName,
    metadata.sheetNames,
    metadata.slideTitle,
    metadata.slideTitles,
    metadata.threadTopic,
    metadata.speaker,
    metadata.fileKind,
    metadata.transcriptSource,
    metadata.archiveType
  ].flatMap((value) => collectMetadataStrings(value)).join(" ");

  return {
    archivePath,
    chunkText: result.text,
    mediaSegments,
    metadataFocus,
    metadataText: toFieldText(metadata),
    source: source ? normalizeSourceForLexical(source) : "",
    title: result.title ?? ""
  };
};
|
|
2222
|
+
// Per-field multipliers read by countWeightedTermFrequency and
// computeWeightedDocumentLength: each field's token count is scaled by its
// weight, so higher-weight fields contribute more to both the weighted term
// frequency and the weighted document length. Keys match the field names
// returned by getWeightedFieldTokens.
var FIELD_WEIGHTS = {
|
|
2223
|
+
archivePath: 4.2,
|
|
2224
|
+
chunkText: 1,
|
|
2225
|
+
mediaSegments: 3.8,
|
|
2226
|
+
metadataFocus: 3.2,
|
|
2227
|
+
metadataText: 1.4,
|
|
2228
|
+
source: 3.4,
|
|
2229
|
+
title: 2
|
|
2230
|
+
};
|
|
2231
|
+
/**
 * Tokenizes every weighted lexical field of a result.
 *
 * @param {object} result - Object with `metadata`, `source`, `text` and `title`.
 * @returns {object} One token array per field name (`archivePath`, `chunkText`,
 *   `mediaSegments`, `metadataFocus`, `metadataText`, `source`, `title`).
 */
var getWeightedFieldTokens = (result) => {
  const { metadata, source, text, title } = result;
  const lexicalFields = extractWeightedLexicalFields({ metadata, source, text, title });
  // Nullish field values are tokenized as the empty string.
  const tokensOf = (fieldValue) => tokenize(fieldValue ?? "");
  return {
    archivePath: tokensOf(lexicalFields.archivePath),
    chunkText: tokensOf(lexicalFields.chunkText),
    mediaSegments: tokensOf(lexicalFields.mediaSegments),
    metadataFocus: tokensOf(lexicalFields.metadataFocus),
    metadataText: tokensOf(lexicalFields.metadataText),
    source: tokensOf(lexicalFields.source),
    title: tokensOf(lexicalFields.title)
  };
};
|
|
2248
|
+
/**
 * Sums the occurrences of `token` across all weighted fields, multiplying each
 * field's occurrence count by that field's entry in `FIELD_WEIGHTS`.
 *
 * @param {object} fieldTokens - Token arrays keyed by field name; every key of
 *   `FIELD_WEIGHTS` must be present.
 * @param {string} token - Token to count.
 * @returns {number} Weighted occurrence count.
 */
var countWeightedTermFrequency = (fieldTokens, token) => {
  let weightedTotal = 0;
  for (const [fieldName, weight] of Object.entries(FIELD_WEIGHTS)) {
    let occurrences = 0;
    for (const candidate of fieldTokens[fieldName]) {
      if (candidate === token) {
        occurrences += 1;
      }
    }
    weightedTotal += occurrences * weight;
  }
  return weightedTotal;
};
|
|
2249
|
+
/**
 * Computes a document length in which each field's token count is multiplied by
 * that field's entry in `FIELD_WEIGHTS`.
 *
 * @param {object} fieldTokens - Token arrays keyed by field name; every key of
 *   `FIELD_WEIGHTS` must be present.
 * @returns {number} Sum of `tokens.length * weight` over all weighted fields.
 */
var computeWeightedDocumentLength = (fieldTokens) => {
  let weightedLength = 0;
  for (const fieldName of Object.keys(FIELD_WEIGHTS)) {
    weightedLength += fieldTokens[fieldName].length * FIELD_WEIGHTS[fieldName];
  }
  return weightedLength;
};
|
|
2164
2250
|
/**
 * Builds one searchable string for a result.
 *
 * Concatenates the title, the raw source, the normalized source (only when the
 * source is a string), the chunk text and every string collected from the
 * metadata, skipping falsy pieces.
 *
 * @param {object} result - Object with `title`, `source`, `chunkText`, `metadata`.
 * @returns {string} The truthy pieces joined by single spaces.
 */
var buildRAGLexicalHaystack = (result) => {
  const pieces = [
    result.title,
    result.source,
    typeof result.source === "string" ? normalizeSourceForLexical(result.source) : undefined,
    result.chunkText,
    ...collectMetadataStrings(result.metadata)
  ];
  const presentPieces = [];
  for (const piece of pieces) {
    if (Boolean(piece)) {
      presentPieces.push(piece);
    }
  }
  return presentPieces.join(" ");
};
|
|
@@ -2173,20 +2259,131 @@ var scoreRAGLexicalMatch = (query, result) => {
|
|
|
2173
2259
|
if (queryTokens.length === 0) {
|
|
2174
2260
|
return 0;
|
|
2175
2261
|
}
|
|
2262
|
+
const fields = extractWeightedLexicalFields({
|
|
2263
|
+
metadata: result.metadata,
|
|
2264
|
+
source: result.source,
|
|
2265
|
+
text: result.chunkText,
|
|
2266
|
+
title: result.title
|
|
2267
|
+
});
|
|
2176
2268
|
const haystack = buildRAGLexicalHaystack(result).toLowerCase();
|
|
2177
|
-
const
|
|
2178
|
-
|
|
2179
|
-
const overlap = queryTokens.filter((token) => haystackSet.has(token)).length;
|
|
2180
|
-
if (overlap === 0) {
|
|
2269
|
+
const overallCoverage = scoreTokenCoverage(queryTokens, haystack);
|
|
2270
|
+
if (overallCoverage === 0) {
|
|
2181
2271
|
return 0;
|
|
2182
2272
|
}
|
|
2183
|
-
const
|
|
2184
|
-
|
|
2185
|
-
|
|
2273
|
+
const titleScore = scoreWeightedField({
|
|
2274
|
+
coverageWeight: 1.8,
|
|
2275
|
+
phraseWeight: 1.2,
|
|
2276
|
+
query,
|
|
2277
|
+
queryTokens,
|
|
2278
|
+
text: fields.title
|
|
2279
|
+
});
|
|
2280
|
+
const sourceScore = scoreWeightedField({
|
|
2281
|
+
coverageWeight: 2.6,
|
|
2282
|
+
phraseWeight: 1.4,
|
|
2283
|
+
query,
|
|
2284
|
+
queryTokens,
|
|
2285
|
+
text: fields.source
|
|
2286
|
+
});
|
|
2287
|
+
const metadataFocusScore = scoreWeightedField({
|
|
2288
|
+
coverageWeight: 2.8,
|
|
2289
|
+
phraseWeight: 1.6,
|
|
2290
|
+
query,
|
|
2291
|
+
queryTokens,
|
|
2292
|
+
text: fields.metadataFocus
|
|
2293
|
+
});
|
|
2294
|
+
const archivePathScore = scoreWeightedField({
|
|
2295
|
+
coverageWeight: 3.2,
|
|
2296
|
+
phraseWeight: 2.2,
|
|
2297
|
+
query,
|
|
2298
|
+
queryTokens,
|
|
2299
|
+
text: fields.archivePath
|
|
2300
|
+
});
|
|
2301
|
+
const mediaSegmentScore = scoreWeightedField({
|
|
2302
|
+
coverageWeight: 3,
|
|
2303
|
+
phraseWeight: 1.8,
|
|
2304
|
+
query,
|
|
2305
|
+
queryTokens,
|
|
2306
|
+
text: fields.mediaSegments
|
|
2307
|
+
});
|
|
2308
|
+
const metadataScore = scoreWeightedField({
|
|
2309
|
+
coverageWeight: 1.2,
|
|
2310
|
+
phraseWeight: 0.8,
|
|
2311
|
+
query,
|
|
2312
|
+
queryTokens,
|
|
2313
|
+
text: fields.metadataText
|
|
2314
|
+
});
|
|
2315
|
+
const chunkScore = scoreWeightedField({
|
|
2316
|
+
coverageWeight: 0.9,
|
|
2317
|
+
phraseWeight: 0.6,
|
|
2318
|
+
query,
|
|
2319
|
+
queryTokens,
|
|
2320
|
+
text: fields.chunkText
|
|
2321
|
+
});
|
|
2322
|
+
const exactPhraseBoost = scorePhraseMatch(query, haystack);
|
|
2323
|
+
const coverageBoost = overallCoverage;
|
|
2186
2324
|
const fileKindBoost = resolveFileKindBoost(queryTokens, result.metadata);
|
|
2187
2325
|
const transcriptBoost = resolveTranscriptBoost(queryTokens, result.metadata);
|
|
2188
2326
|
const archiveBoost = resolveArchiveBoost(queryTokens, result);
|
|
2189
|
-
return
|
|
2327
|
+
return titleScore + sourceScore + metadataFocusScore + archivePathScore + mediaSegmentScore + metadataScore + chunkScore + coverageBoost + exactPhraseBoost + fileKindBoost + transcriptBoost + archiveBoost;
|
|
2328
|
+
};
|
|
2329
|
+
/**
 * Ranks results against a query by combining a BM25 score over weighted field
 * tokens with the heuristic score from `scoreRAGLexicalMatch`.
 *
 * The weighted term frequency of each unique query token is computed once per
 * candidate and reused for both the document-frequency count and the BM25 sum.
 *
 * @param {string} query - Raw query string.
 * @param {object[]} results - Candidates exposing `text`, `metadata`, `source`
 *   and `title`.
 * @returns {{result: object, score: number}[]} Candidates with a positive
 *   score, highest score first; ties keep their input order. Empty when the
 *   query yields no tokens or there are no results.
 */
var rankRAGLexicalMatches = (query, results) => {
  const queryTokens = tokenize(query);
  if (queryTokens.length === 0 || results.length === 0) {
    return [];
  }
  const uniqueQueryTokens = [...new Set(queryTokens)];
  // Number of candidates in which each query token occurs at least once.
  const documentFrequency = new Map(uniqueQueryTokens.map((token) => [token, 0]));
  const candidates = results.map((result) => {
    const fieldTokens = getWeightedFieldTokens(result);
    // Only tokens that actually occur are stored, in query-token order.
    const termFrequencies = new Map();
    for (const token of uniqueQueryTokens) {
      const termFrequency = countWeightedTermFrequency(fieldTokens, token);
      if (termFrequency > 0) {
        termFrequencies.set(token, termFrequency);
        documentFrequency.set(token, documentFrequency.get(token) + 1);
      }
    }
    return {
      length: computeWeightedDocumentLength(fieldTokens),
      result,
      termFrequencies
    };
  });
  const totalLength = candidates.reduce((total, candidate) => total + candidate.length, 0);
  const averageDocumentLength = totalLength / Math.max(1, candidates.length);
  // Share of the heuristic score that is added to the BM25 score.
  const heuristicWeight = 0.35;
  return candidates.map((candidate, index) => {
    // Length-normalisation part of the BM25 denominator; identical for every token.
    const lengthNormalisation = BM25_K1 * (1 - BM25_B + BM25_B * (candidate.length / Math.max(1, averageDocumentLength)));
    let bm25Score = 0;
    for (const [token, termFrequency] of candidate.termFrequencies) {
      const df = documentFrequency.get(token) ?? 0;
      const idf = Math.log(1 + (candidates.length - df + 0.5) / (df + 0.5));
      const denominator = termFrequency + lengthNormalisation;
      bm25Score += idf * (termFrequency * (BM25_K1 + 1) / Math.max(0.000000001, denominator));
    }
    const heuristicScore = scoreRAGLexicalMatch(query, {
      chunkText: candidate.result.text,
      metadata: candidate.result.metadata,
      source: candidate.result.source,
      title: candidate.result.title
    });
    return {
      index,
      result: candidate.result,
      score: bm25Score + heuristicScore * heuristicWeight
    };
  }).filter((entry) => entry.score > 0).sort((left, right) => {
    if (right.score !== left.score) {
      return right.score - left.score;
    }
    // Equal scores fall back to the original input position.
    return left.index - right.index;
  }).map(({ result, score }) => ({
    result,
    score
  }));
};
|
|
2191
2388
|
/**
 * Tells whether at least one of `values` is present in `tokens`.
 *
 * @param {string[]} tokens - Tokens to search in.
 * @param {string[]} values - Candidate values to look for.
 * @returns {boolean} True as soon as one value is found, otherwise false.
 */
var hasAnyToken = (tokens, values) => {
  for (const value of values) {
    if (tokens.includes(value)) {
      return true;
    }
  }
  return false;
};
|
|
2192
2389
|
var resolveFileKindBoost = (queryTokens, metadata) => {
|
|
@@ -6757,27 +6954,15 @@ var createInMemoryRAGStore = (options = {}) => {
|
|
|
6757
6954
|
}));
|
|
6758
6955
|
};
|
|
6759
6956
|
// Lexical query: keeps the entries of the enclosing scope's `chunks` that pass
// `matchesFilter` for `input.filter`, ranks them with rankRAGLexicalMatches
// and returns at most `input.topK` hits in ranked order.
const queryLexical = async (input) => {
  const passesFilter = (chunk) => matchesFilter(chunk, input.filter);
  const rankedMatches = rankRAGLexicalMatches(input.query, chunks.filter(passesFilter));
  const topMatches = rankedMatches.slice(0, input.topK);
  return topMatches.map((match) => {
    const { result: chunk, score } = match;
    return {
      chunkId: chunk.chunkId,
      chunkText: chunk.text,
      metadata: chunk.metadata,
      score,
      source: chunk.source,
      title: chunk.title
    };
  });
};
|
|
6783
6968
|
const upsert = async (input) => {
|
|
@@ -7408,27 +7593,15 @@ var createSQLiteRAGStore = (options = {}) => {
|
|
|
7408
7593
|
};
|
|
7409
7594
|
// Lexical query: loads rows through `jsonStatements.query.all()`, converts them
// with `toStoredRows` and `mapFilterToRows`, keeps those passing `matchesFilter`
// for `input.filter`, ranks them with rankRAGLexicalMatches and returns at most
// `input.topK` hits in ranked order.
const queryLexical = async (input) => {
  const storedRows = toStoredRows(jsonStatements.query.all());
  const passesFilter = (chunk) => matchesFilter(chunk, input.filter);
  const candidateChunks = mapFilterToRows(storedRows).filter(passesFilter);
  const rankedMatches = rankRAGLexicalMatches(input.query, candidateChunks);
  const topMatches = rankedMatches.slice(0, input.topK);
  return topMatches.map((match) => {
    const { result: chunk, score } = match;
    return {
      chunkId: chunk.chunkId,
      chunkText: chunk.text,
      metadata: chunk.metadata,
      score,
      source: chunk.source,
      title: chunk.title
    };
  });
};
|
|
7434
7607
|
const upsert = async (input) => {
|
|
@@ -8600,5 +8773,5 @@ export {
|
|
|
8600
8773
|
aiChat
|
|
8601
8774
|
};
|
|
8602
8775
|
|
|
8603
|
-
//# debugId=
|
|
8776
|
+
//# debugId=F37A373F20F3691864756E2164756E21
|
|
8604
8777
|
//# sourceMappingURL=index.js.map
|