@absolutejs/absolute 0.19.0-beta.493 → 0.19.0-beta.494

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/ai/index.js CHANGED
@@ -2149,6 +2149,8 @@ var STOP_WORDS = new Set([
2149
2149
  "why"
2150
2150
  ]);
2151
2151
  var tokenize = (value) => value.toLowerCase().split(/[^a-z0-9]+/i).map((token) => token.trim()).filter((token) => !STOP_WORDS.has(token)).map((token) => token.endsWith("ies") && token.length > 3 ? `${token.slice(0, -3)}y` : token.endsWith("ing") && token.length > 5 ? token.slice(0, -3) : token.endsWith("ed") && token.length > 4 ? token.slice(0, -2) : token.endsWith("es") && token.length > 4 ? token.slice(0, -2) : token.endsWith("s") && token.length > 3 ? token.slice(0, -1) : token).filter((token) => token.length > 1);
2152
+ var BM25_K1 = 1.2;
2153
+ var BM25_B = 0.75;
2152
2154
  var collectMetadataStrings = (value) => {
2153
2155
  if (typeof value === "string" || typeof value === "number") {
2154
2156
  return [String(value)];
@@ -2164,7 +2166,7 @@ var collectMetadataStrings = (value) => {
2164
2166
  var normalizeSourceForLexical = (source) => source.replace(/[#/_.-]+/g, " ").replace(/\bmd\b/g, "markdown").replace(/\bpptx\b/g, "presentation").replace(/\bxlsx\b/g, "spreadsheet workbook sheet").replace(/\bmp3\b/g, "audio transcript media").replace(/\bmp4\b/g, "video transcript media").replace(/\bzip\b/g, "archive bundle");
2165
2167
  var toFieldText = (value) => collectMetadataStrings(value).filter(Boolean).join(" ");
2166
2168
  var scoreTokenCoverage = (queryTokens, text) => {
2167
- const normalizedText = text.toLowerCase();
2169
+ const normalizedText = (text ?? "").toLowerCase();
2168
2170
  if (normalizedText.length === 0) {
2169
2171
  return 0;
2170
2172
  }
@@ -2178,7 +2180,7 @@ var scoreTokenCoverage = (queryTokens, text) => {
2178
2180
  };
2179
2181
  var scorePhraseMatch = (query, text) => {
2180
2182
  const normalizedQuery = tokenize(query).join(" ");
2181
- const normalizedText = tokenize(text).join(" ");
2183
+ const normalizedText = tokenize(text ?? "").join(" ");
2182
2184
  if (normalizedQuery.length === 0 || normalizedText.length === 0) {
2183
2185
  return 0;
2184
2186
  }
@@ -2190,7 +2192,7 @@ var scoreWeightedField = ({
2190
2192
  query,
2191
2193
  queryTokens,
2192
2194
  text
2193
- }) => scoreTokenCoverage(queryTokens, text) * coverageWeight + scorePhraseMatch(query, text) * phraseWeight;
2195
+ }) => scoreTokenCoverage(queryTokens, text ?? "") * coverageWeight + scorePhraseMatch(query, text ?? "") * phraseWeight;
2194
2196
  var extractWeightedLexicalFields = (result) => {
2195
2197
  const metadata = result.metadata ?? {};
2196
2198
  const source = result.source ?? "";
@@ -2209,7 +2211,7 @@ var extractWeightedLexicalFields = (result) => {
2209
2211
  ].flatMap((value) => collectMetadataStrings(value)).join(" ");
2210
2212
  return {
2211
2213
  archivePath,
2212
- chunkText: result.chunkText,
2214
+ chunkText: result.text,
2213
2215
  mediaSegments,
2214
2216
  metadataFocus,
2215
2217
  metadataText: toFieldText(metadata),
@@ -2217,6 +2219,34 @@ var extractWeightedLexicalFields = (result) => {
2217
2219
  title: result.title ?? ""
2218
2220
  };
2219
2221
  };
2222
+ var FIELD_WEIGHTS = {
2223
+ archivePath: 4.2,
2224
+ chunkText: 1,
2225
+ mediaSegments: 3.8,
2226
+ metadataFocus: 3.2,
2227
+ metadataText: 1.4,
2228
+ source: 3.4,
2229
+ title: 2
2230
+ };
2231
+ var getWeightedFieldTokens = (result) => {
2232
+ const fields = extractWeightedLexicalFields({
2233
+ metadata: result.metadata,
2234
+ source: result.source,
2235
+ text: result.text,
2236
+ title: result.title
2237
+ });
2238
+ return {
2239
+ archivePath: tokenize(fields.archivePath ?? ""),
2240
+ chunkText: tokenize(fields.chunkText ?? ""),
2241
+ mediaSegments: tokenize(fields.mediaSegments ?? ""),
2242
+ metadataFocus: tokenize(fields.metadataFocus ?? ""),
2243
+ metadataText: tokenize(fields.metadataText ?? ""),
2244
+ source: tokenize(fields.source ?? ""),
2245
+ title: tokenize(fields.title ?? "")
2246
+ };
2247
+ };
2248
+ var countWeightedTermFrequency = (fieldTokens, token) => Object.keys(FIELD_WEIGHTS).reduce((total, fieldName) => total + fieldTokens[fieldName].filter((value) => value === token).length * FIELD_WEIGHTS[fieldName], 0);
2249
+ var computeWeightedDocumentLength = (fieldTokens) => Object.keys(FIELD_WEIGHTS).reduce((total, fieldName) => total + fieldTokens[fieldName].length * FIELD_WEIGHTS[fieldName], 0);
2220
2250
  var buildRAGLexicalHaystack = (result) => [
2221
2251
  result.title,
2222
2252
  result.source,
@@ -2229,7 +2259,12 @@ var scoreRAGLexicalMatch = (query, result) => {
2229
2259
  if (queryTokens.length === 0) {
2230
2260
  return 0;
2231
2261
  }
2232
- const fields = extractWeightedLexicalFields(result);
2262
+ const fields = extractWeightedLexicalFields({
2263
+ metadata: result.metadata,
2264
+ source: result.source,
2265
+ text: result.chunkText,
2266
+ title: result.title
2267
+ });
2233
2268
  const haystack = buildRAGLexicalHaystack(result).toLowerCase();
2234
2269
  const overallCoverage = scoreTokenCoverage(queryTokens, haystack);
2235
2270
  if (overallCoverage === 0) {
@@ -2291,6 +2326,65 @@ var scoreRAGLexicalMatch = (query, result) => {
2291
2326
  const archiveBoost = resolveArchiveBoost(queryTokens, result);
2292
2327
  return titleScore + sourceScore + metadataFocusScore + archivePathScore + mediaSegmentScore + metadataScore + chunkScore + coverageBoost + exactPhraseBoost + fileKindBoost + transcriptBoost + archiveBoost;
2293
2328
  };
2329
+ var rankRAGLexicalMatches = (query, results) => {
2330
+ const queryTokens = tokenize(query);
2331
+ if (queryTokens.length === 0 || results.length === 0) {
2332
+ return [];
2333
+ }
2334
+ const candidates = results.map((result) => {
2335
+ const fieldTokens = getWeightedFieldTokens(result);
2336
+ return {
2337
+ fieldTokens,
2338
+ length: computeWeightedDocumentLength(fieldTokens),
2339
+ result
2340
+ };
2341
+ });
2342
+ const averageDocumentLength = candidates.reduce((total, candidate) => total + candidate.length, 0) / Math.max(1, candidates.length);
2343
+ const uniqueQueryTokens = [...new Set(queryTokens)];
2344
+ const documentFrequency = new Map;
2345
+ for (const token of uniqueQueryTokens) {
2346
+ let seen = 0;
2347
+ for (const candidate of candidates) {
2348
+ const tf = countWeightedTermFrequency(candidate.fieldTokens, token);
2349
+ if (tf > 0) {
2350
+ seen += 1;
2351
+ }
2352
+ }
2353
+ documentFrequency.set(token, seen);
2354
+ }
2355
+ return candidates.map((candidate, index) => {
2356
+ let bm25Score = 0;
2357
+ for (const token of uniqueQueryTokens) {
2358
+ const termFrequency = countWeightedTermFrequency(candidate.fieldTokens, token);
2359
+ if (termFrequency <= 0) {
2360
+ continue;
2361
+ }
2362
+ const df = documentFrequency.get(token) ?? 0;
2363
+ const idf = Math.log(1 + (candidates.length - df + 0.5) / (df + 0.5));
2364
+ const denominator = termFrequency + BM25_K1 * (1 - BM25_B + BM25_B * (candidate.length / Math.max(1, averageDocumentLength)));
2365
+ bm25Score += idf * (termFrequency * (BM25_K1 + 1) / Math.max(0.000000001, denominator));
2366
+ }
2367
+ const heuristicScore = scoreRAGLexicalMatch(query, {
2368
+ chunkText: candidate.result.text,
2369
+ metadata: candidate.result.metadata,
2370
+ source: candidate.result.source,
2371
+ title: candidate.result.title
2372
+ });
2373
+ return {
2374
+ index,
2375
+ result: candidate.result,
2376
+ score: bm25Score + heuristicScore * 0.35
2377
+ };
2378
+ }).filter((entry) => entry.score > 0).sort((left, right) => {
2379
+ if (right.score !== left.score) {
2380
+ return right.score - left.score;
2381
+ }
2382
+ return left.index - right.index;
2383
+ }).map(({ result, score }) => ({
2384
+ result,
2385
+ score
2386
+ }));
2387
+ };
2294
2388
  var hasAnyToken = (tokens, values) => values.some((value) => tokens.includes(value));
2295
2389
  var resolveFileKindBoost = (queryTokens, metadata) => {
2296
2390
  const fileKind = typeof metadata?.fileKind === "string" ? metadata.fileKind : "";
@@ -6860,27 +6954,15 @@ var createInMemoryRAGStore = (options = {}) => {
6860
6954
  }));
6861
6955
  };
6862
6956
  const queryLexical = async (input) => {
6863
- const results = chunks.map((chunk) => ({
6864
- chunk,
6865
- score: scoreRAGLexicalMatch(input.query, {
6866
- chunkText: chunk.text,
6867
- metadata: chunk.metadata,
6868
- source: chunk.source,
6869
- title: chunk.title
6870
- })
6871
- })).filter(({ chunk }) => matchesFilter(chunk, input.filter)).filter(({ score }) => score > 0).sort((left, right) => {
6872
- if (right.score !== left.score) {
6873
- return right.score - left.score;
6874
- }
6875
- return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
6876
- });
6877
- return results.slice(0, input.topK).map((entry) => ({
6878
- chunkId: entry.chunk.chunkId,
6879
- chunkText: entry.chunk.text,
6880
- metadata: entry.chunk.metadata,
6881
- score: entry.score,
6882
- source: entry.chunk.source,
6883
- title: entry.chunk.title
6957
+ const filtered = chunks.filter((chunk) => matchesFilter(chunk, input.filter));
6958
+ const ranked = rankRAGLexicalMatches(input.query, filtered);
6959
+ return ranked.slice(0, input.topK).map(({ result, score }) => ({
6960
+ chunkId: result.chunkId,
6961
+ chunkText: result.text,
6962
+ metadata: result.metadata,
6963
+ score,
6964
+ source: result.source,
6965
+ title: result.title
6884
6966
  }));
6885
6967
  };
6886
6968
  const upsert = async (input) => {
@@ -7511,27 +7593,15 @@ var createSQLiteRAGStore = (options = {}) => {
7511
7593
  };
7512
7594
  const queryLexical = async (input) => {
7513
7595
  const rawRows = toStoredRows(jsonStatements.query.all());
7514
- const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter)).map((chunk) => ({
7515
- chunk,
7516
- score: scoreRAGLexicalMatch(input.query, {
7517
- chunkText: chunk.text,
7518
- metadata: chunk.metadata,
7519
- source: chunk.source,
7520
- title: chunk.title
7521
- })
7522
- })).filter(({ score }) => score > 0).sort((left, right) => {
7523
- if (right.score !== left.score) {
7524
- return right.score - left.score;
7525
- }
7526
- return left.chunk.chunkId.localeCompare(right.chunk.chunkId);
7527
- });
7528
- return chunks.slice(0, input.topK).map(({ chunk, score }) => ({
7529
- chunkId: chunk.chunkId,
7530
- chunkText: chunk.text,
7531
- metadata: chunk.metadata,
7596
+ const chunks = mapFilterToRows(rawRows).filter((chunk) => matchesFilter(chunk, input.filter));
7597
+ const ranked = rankRAGLexicalMatches(input.query, chunks);
7598
+ return ranked.slice(0, input.topK).map(({ result, score }) => ({
7599
+ chunkId: result.chunkId,
7600
+ chunkText: result.text,
7601
+ metadata: result.metadata,
7532
7602
  score,
7533
- source: chunk.source,
7534
- title: chunk.title
7603
+ source: result.source,
7604
+ title: result.title
7535
7605
  }));
7536
7606
  };
7537
7607
  const upsert = async (input) => {
@@ -8703,5 +8773,5 @@ export {
8703
8773
  aiChat
8704
8774
  };
8705
8775
 
8706
- //# debugId=36F6407CE8163A4F64756E2164756E21
8776
+ //# debugId=F37A373F20F3691864756E2164756E21
8707
8777
  //# sourceMappingURL=index.js.map