@absolutejs/absolute 0.19.0-beta.492 → 0.19.0-beta.494

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -2149,6 +2149,8 @@ var STOP_WORDS = new Set([
2149
2149
  "why"
2150
2150
  ]);
2151
2151
  var tokenize = (value) => value.toLowerCase().split(/[^a-z0-9]+/i).map((token) => token.trim()).filter((token) => !STOP_WORDS.has(token)).map((token) => token.endsWith("ies") && token.length > 3 ? `${token.slice(0, -3)}y` : token.endsWith("ing") && token.length > 5 ? token.slice(0, -3) : token.endsWith("ed") && token.length > 4 ? token.slice(0, -2) : token.endsWith("es") && token.length > 4 ? token.slice(0, -2) : token.endsWith("s") && token.length > 3 ? token.slice(0, -1) : token).filter((token) => token.length > 1);
2152
+ var BM25_K1 = 1.2;
2153
+ var BM25_B = 0.75;
2152
2154
  var collectMetadataStrings = (value) => {
2153
2155
  if (typeof value === "string" || typeof value === "number") {
2154
2156
  return [String(value)];
@@ -2161,10 +2163,94 @@ var collectMetadataStrings = (value) => {
2161
2163
  }
2162
2164
  return [];
2163
2165
  };
2166
+ var normalizeSourceForLexical = (source) => source.replace(/[#/_.-]+/g, " ").replace(/\bmd\b/g, "markdown").replace(/\bpptx\b/g, "presentation").replace(/\bxlsx\b/g, "spreadsheet workbook sheet").replace(/\bmp3\b/g, "audio transcript media").replace(/\bmp4\b/g, "video transcript media").replace(/\bzip\b/g, "archive bundle");
2167
+ var toFieldText = (value) => collectMetadataStrings(value).filter(Boolean).join(" ");
2168
+ var scoreTokenCoverage = (queryTokens, text) => {
2169
+ const normalizedText = (text ?? "").toLowerCase();
2170
+ if (normalizedText.length === 0) {
2171
+ return 0;
2172
+ }
2173
+ const tokens = tokenize(normalizedText);
2174
+ if (tokens.length === 0) {
2175
+ return 0;
2176
+ }
2177
+ const tokenSet = new Set(tokens);
2178
+ const overlap = queryTokens.filter((token) => tokenSet.has(token)).length;
2179
+ return overlap / Math.max(1, queryTokens.length);
2180
+ };
2181
+ var scorePhraseMatch = (query, text) => {
2182
+ const normalizedQuery = tokenize(query).join(" ");
2183
+ const normalizedText = tokenize(text ?? "").join(" ");
2184
+ if (normalizedQuery.length === 0 || normalizedText.length === 0) {
2185
+ return 0;
2186
+ }
2187
+ return normalizedText.includes(normalizedQuery) ? 1 : 0;
2188
+ };
2189
+ var scoreWeightedField = ({
2190
+ coverageWeight,
2191
+ phraseWeight,
2192
+ query,
2193
+ queryTokens,
2194
+ text
2195
+ }) => scoreTokenCoverage(queryTokens, text ?? "") * coverageWeight + scorePhraseMatch(query, text ?? "") * phraseWeight;
2196
+ var extractWeightedLexicalFields = (result) => {
2197
+ const metadata = result.metadata ?? {};
2198
+ const source = result.source ?? "";
2199
+ const archivePath = typeof metadata.archivePath === "string" ? metadata.archivePath : source.includes("#") ? source.split("#")[1] ?? "" : "";
2200
+ const mediaSegments = Array.isArray(metadata.mediaSegments) ? metadata.mediaSegments.map((segment) => segment && typeof segment === "object" ? toFieldText(segment) : "").filter(Boolean).join(" ") : "";
2201
+ const metadataFocus = [
2202
+ metadata.sheetName,
2203
+ metadata.sheetNames,
2204
+ metadata.slideTitle,
2205
+ metadata.slideTitles,
2206
+ metadata.threadTopic,
2207
+ metadata.speaker,
2208
+ metadata.fileKind,
2209
+ metadata.transcriptSource,
2210
+ metadata.archiveType
2211
+ ].flatMap((value) => collectMetadataStrings(value)).join(" ");
2212
+ return {
2213
+ archivePath,
2214
+ chunkText: result.text,
2215
+ mediaSegments,
2216
+ metadataFocus,
2217
+ metadataText: toFieldText(metadata),
2218
+ source: source ? normalizeSourceForLexical(source) : "",
2219
+ title: result.title ?? ""
2220
+ };
2221
+ };
2222
+ var FIELD_WEIGHTS = {
2223
+ archivePath: 4.2,
2224
+ chunkText: 1,
2225
+ mediaSegments: 3.8,
2226
+ metadataFocus: 3.2,
2227
+ metadataText: 1.4,
2228
+ source: 3.4,
2229
+ title: 2
2230
+ };
2231
+ var getWeightedFieldTokens = (result) => {
2232
+ const fields = extractWeightedLexicalFields({
2233
+ metadata: result.metadata,
2234
+ source: result.source,
2235
+ text: result.text,
2236
+ title: result.title
2237
+ });
2238
+ return {
2239
+ archivePath: tokenize(fields.archivePath ?? ""),
2240
+ chunkText: tokenize(fields.chunkText ?? ""),
2241
+ mediaSegments: tokenize(fields.mediaSegments ?? ""),
2242
+ metadataFocus: tokenize(fields.metadataFocus ?? ""),
2243
+ metadataText: tokenize(fields.metadataText ?? ""),
2244
+ source: tokenize(fields.source ?? ""),
2245
+ title: tokenize(fields.title ?? "")
2246
+ };
2247
+ };
2248
+ var countWeightedTermFrequency = (fieldTokens, token) => Object.keys(FIELD_WEIGHTS).reduce((total, fieldName) => total + fieldTokens[fieldName].filter((value) => value === token).length * FIELD_WEIGHTS[fieldName], 0);
2249
+ var computeWeightedDocumentLength = (fieldTokens) => Object.keys(FIELD_WEIGHTS).reduce((total, fieldName) => total + fieldTokens[fieldName].length * FIELD_WEIGHTS[fieldName], 0);
2164
2250
  var buildRAGLexicalHaystack = (result) => [
2165
2251
  result.title,
2166
2252
  result.source,
2167
- typeof result.source === "string" ? result.source.replace(/[#/_.-]+/g, " ").replace(/\bmd\b/g, "markdown").replace(/\bpptx\b/g, "presentation").replace(/\bxlsx\b/g, "spreadsheet workbook sheet").replace(/\bmp3\b/g, "audio transcript media").replace(/\bmp4\b/g, "video transcript media").replace(/\bzip\b/g, "archive bundle") : undefined,
2253
+ typeof result.source === "string" ? normalizeSourceForLexical(result.source) : undefined,
2168
2254
  result.chunkText,
2169
2255
  ...collectMetadataStrings(result.metadata)
2170
2256
  ].filter((value) => Boolean(value)).join(" ");
@@ -2173,20 +2259,131 @@ var scoreRAGLexicalMatch = (query, result) => {
2173
2259
  if (queryTokens.length === 0) {
2174
2260
  return 0;
2175
2261
  }
2262
+ const fields = extractWeightedLexicalFields({
2263
+ metadata: result.metadata,
2264
+ source: result.source,
2265
+ text: result.chunkText,
2266
+ title: result.title
2267
+ });
2176
2268
  const haystack = buildRAGLexicalHaystack(result).toLowerCase();
2177
- const haystackTokens = tokenize(haystack);
2178
- const haystackSet = new Set(haystackTokens);
2179
- const overlap = queryTokens.filter((token) => haystackSet.has(token)).length;
2180
- if (overlap === 0) {
2269
+ const overallCoverage = scoreTokenCoverage(queryTokens, haystack);
2270
+ if (overallCoverage === 0) {
2181
2271
  return 0;
2182
2272
  }
2183
- const exactPhraseBoost = haystack.includes(query.toLowerCase()) ? 1 : 0;
2184
- const sourceBoost = typeof result.source === "string" && queryTokens.some((token) => result.source?.toLowerCase().includes(token)) ? 0.5 : 0;
2185
- const coverageBoost = overlap / queryTokens.length;
2273
+ const titleScore = scoreWeightedField({
2274
+ coverageWeight: 1.8,
2275
+ phraseWeight: 1.2,
2276
+ query,
2277
+ queryTokens,
2278
+ text: fields.title
2279
+ });
2280
+ const sourceScore = scoreWeightedField({
2281
+ coverageWeight: 2.6,
2282
+ phraseWeight: 1.4,
2283
+ query,
2284
+ queryTokens,
2285
+ text: fields.source
2286
+ });
2287
+ const metadataFocusScore = scoreWeightedField({
2288
+ coverageWeight: 2.8,
2289
+ phraseWeight: 1.6,
2290
+ query,
2291
+ queryTokens,
2292
+ text: fields.metadataFocus
2293
+ });
2294
+ const archivePathScore = scoreWeightedField({
2295
+ coverageWeight: 3.2,
2296
+ phraseWeight: 2.2,
2297
+ query,
2298
+ queryTokens,
2299
+ text: fields.archivePath
2300
+ });
2301
+ const mediaSegmentScore = scoreWeightedField({
2302
+ coverageWeight: 3,
2303
+ phraseWeight: 1.8,
2304
+ query,
2305
+ queryTokens,
2306
+ text: fields.mediaSegments
2307
+ });
2308
+ const metadataScore = scoreWeightedField({
2309
+ coverageWeight: 1.2,
2310
+ phraseWeight: 0.8,
2311
+ query,
2312
+ queryTokens,
2313
+ text: fields.metadataText
2314
+ });
2315
+ const chunkScore = scoreWeightedField({
2316
+ coverageWeight: 0.9,
2317
+ phraseWeight: 0.6,
2318
+ query,
2319
+ queryTokens,
2320
+ text: fields.chunkText
2321
+ });
2322
+ const exactPhraseBoost = scorePhraseMatch(query, haystack);
2323
+ const coverageBoost = overallCoverage;
2186
2324
  const fileKindBoost = resolveFileKindBoost(queryTokens, result.metadata);
2187
2325
  const transcriptBoost = resolveTranscriptBoost(queryTokens, result.metadata);
2188
2326
  const archiveBoost = resolveArchiveBoost(queryTokens, result);
2189
- return coverageBoost + exactPhraseBoost + sourceBoost + fileKindBoost + transcriptBoost + archiveBoost;
2327
+ return titleScore + sourceScore + metadataFocusScore + archivePathScore + mediaSegmentScore + metadataScore + chunkScore + coverageBoost + exactPhraseBoost + fileKindBoost + transcriptBoost + archiveBoost;
2328
+ };
2329
+ var rankRAGLexicalMatches = (query, results) => {
2330
+ const queryTokens = tokenize(query);
2331
+ if (queryTokens.length === 0 || results.length === 0) {
2332
+ return [];
2333
+ }
2334
+ const candidates = results.map((result) => {
2335
+ const fieldTokens = getWeightedFieldTokens(result);
2336
+ return {
2337
+ fieldTokens,
2338
+ length: computeWeightedDocumentLength(fieldTokens),
2339
+ result
2340
+ };
2341
+ });
2342
+ const averageDocumentLength = candidates.reduce((total, candidate) => total + candidate.length, 0) / Math.max(1, candidates.length);
2343
+ const uniqueQueryTokens = [...new Set(queryTokens)];
2344
+ const documentFrequency = new Map;
2345
+ for (const token of uniqueQueryTokens) {
2346
+ let seen = 0;
2347
+ for (const candidate of candidates) {
2348
+ const tf = countWeightedTermFrequency(candidate.fieldTokens, token);
2349
+ if (tf > 0) {
2350
+ seen += 1;
2351
+ }
2352
+ }
2353
+ documentFrequency.set(token, seen);
2354
+ }
2355
+ return candidates.map((candidate, index) => {
2356
+ let bm25Score = 0;
2357
+ for (const token of uniqueQueryTokens) {
2358
+ const termFrequency = countWeightedTermFrequency(candidate.fieldTokens, token);
2359
+ if (termFrequency <= 0) {
2360
+ continue;
2361
+ }
2362
+ const df = documentFrequency.get(token) ?? 0;
2363
+ const idf = Math.log(1 + (candidates.length - df + 0.5) / (df + 0.5));
2364
+ const denominator = termFrequency + BM25_K1 * (1 - BM25_B + BM25_B * (candidate.length / Math.max(1, averageDocumentLength)));
2365
+ bm25Score += idf * (termFrequency * (BM25_K1 + 1) / Math.max(0.000000001, denominator));
2366
+ }
2367
+ const heuristicScore = scoreRAGLexicalMatch(query, {
2368
+ chunkText: candidate.result.text,
2369
+ metadata: candidate.result.metadata,
2370
+ source: candidate.result.source,
2371
+ title: candidate.result.title
2372
+ });
2373
+ return {
2374
+ index,
2375
+ result: candidate.result,
2376
+ score: bm25Score + heuristicScore * 0.35
2377
+ };
2378
+ }).filter((entry) => entry.score > 0).sort((left, right) => {
2379
+ if (right.score !== left.score) {
2380
+ return right.score - left.score;
2381
+ }
2382
+ return left.index - right.index;
2383
+ }).map(({ result, score }) => ({
2384
+ result,
2385
+ score
2386
+ }));
2190
2387
  };
2191
2388
  var hasAnyToken = (tokens, values) => values.some((value) => tokens.includes(value));
2192
2389
  var resolveFileKindBoost = (queryTokens, metadata) => {
@@ -6757,27 +6954,15 @@ var createInMemoryRAGStore = (options = {}) => {
6757
6954
  }));
6758
6955
  };
6759
6956
  const queryLexical = async (input) => {
6760
- const results = chunks.map((chunk) => ({
6761
- chunk,
6762
- score: scoreRAGLexicalMatch(input.query, {
6763
- chunkText: chunk.text,
6764
- metadata: chunk.metadata,
6765
- source: chunk.source,
6766
- title: chunk.title
6767
- })
6768
- })).filter(({ chunk }) => matchesFilter(chunk, input.filter)).filter(({ score }) => score > 0).sort((left, right) => {
6769
- if (right.score !== left.score) {
6770
- return right.score - left.score;
6771
- }
6772
- return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
6773
- });
6774
- return results.slice(0, input.topK).map((entry) => ({
6775
- chunkId: entry.chunk.chunkId,
6776
- chunkText: entry.chunk.text,
6777
- metadata: entry.chunk.metadata,
6778
- score: entry.score,
6779
- source: entry.chunk.source,
6780
- title: entry.chunk.title
6957
+ const filtered = chunks.filter((chunk) => matchesFilter(chunk, input.filter));
6958
+ const ranked = rankRAGLexicalMatches(input.query, filtered);
6959
+ return ranked.slice(0, input.topK).map(({ result, score }) => ({
6960
+ chunkId: result.chunkId,
6961
+ chunkText: result.text,
6962
+ metadata: result.metadata,
6963
+ score,
6964
+ source: result.source,
6965
+ title: result.title
6781
6966
  }));
6782
6967
  };
6783
6968
  const upsert = async (input) => {
@@ -7408,27 +7593,15 @@ var createSQLiteRAGStore = (options = {}) => {
7408
7593
  };
7409
7594
  const queryLexical = async (input) => {
7410
7595
  const rawRows = toStoredRows(jsonStatements.query.all());
7411
- const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter)).map((chunk) => ({
7412
- chunk,
7413
- score: scoreRAGLexicalMatch(input.query, {
7414
- chunkText: chunk.text,
7415
- metadata: chunk.metadata,
7416
- source: chunk.source,
7417
- title: chunk.title
7418
- })
7419
- })).filter(({ score }) => score > 0).sort((left, right) => {
7420
- if (right.score !== left.score) {
7421
- return right.score - left.score;
7422
- }
7423
- return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
7424
- });
7425
- return chunks.slice(0, input.topK).map(({ chunk, score }) => ({
7426
- chunkId: chunk.chunkId,
7427
- chunkText: chunk.text,
7428
- metadata: chunk.metadata,
7596
+ const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter));
7597
+ const ranked = rankRAGLexicalMatches(input.query, chunks);
7598
+ return ranked.slice(0, input.topK).map(({ result, score }) => ({
7599
+ chunkId: result.chunkId,
7600
+ chunkText: result.text,
7601
+ metadata: result.metadata,
7429
7602
  score,
7430
- source: chunk.source,
7431
- title: chunk.title
7603
+ source: result.source,
7604
+ title: result.title
7432
7605
  }));
7433
7606
  };
7434
7607
  const upsert = async (input) => {
@@ -8600,5 +8773,5 @@ export {
8600
8773
  aiChat
8601
8774
  };
8602
8775
 
8603
- //# debugId=8B383E0793D06CEF64756E2164756E21
8776
+ //# debugId=F37A373F20F3691864756E2164756E21
8604
8777
  //# sourceMappingURL=index.js.map