@astro-minimax/ai 0.8.2 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cache/global-cache.js +145 -0
- package/dist/cache/index.js +96 -0
- package/dist/cache/kv-adapter.js +99 -0
- package/dist/cache/memory-adapter.js +97 -0
- package/dist/cache/response-cache.js +87 -0
- package/dist/cache/types.js +8 -0
- package/dist/data/metadata-loader.js +48 -0
- package/dist/data/types.js +0 -0
- package/dist/fact-registry/fact-matcher.js +128 -0
- package/dist/fact-registry/prompt-injector.js +54 -0
- package/dist/fact-registry/registry.js +41 -0
- package/dist/fact-registry/types.js +0 -0
- package/dist/intelligence/citation-appender.js +63 -0
- package/dist/intelligence/citation-guard.js +108 -0
- package/dist/intelligence/evidence-analysis.js +79 -0
- package/dist/intelligence/intent-detect.js +93 -0
- package/dist/intelligence/keyword-extract.js +89 -0
- package/dist/intelligence/response-templates.js +117 -0
- package/dist/intelligence/types.js +0 -0
- package/dist/middleware/rate-limiter.js +110 -0
- package/dist/prompt/dynamic-layer.js +64 -0
- package/dist/prompt/prompt-builder.js +15 -0
- package/dist/prompt/semi-static-layer.js +28 -0
- package/dist/prompt/static-layer.js +153 -0
- package/dist/prompt/types.js +0 -0
- package/dist/provider-manager/base.js +53 -0
- package/dist/provider-manager/config.js +135 -0
- package/dist/provider-manager/index.js +19 -0
- package/dist/provider-manager/manager.js +122 -0
- package/dist/provider-manager/mock.js +77 -0
- package/dist/provider-manager/openai.js +106 -0
- package/dist/provider-manager/types.js +0 -0
- package/dist/provider-manager/workers.js +76 -0
- package/dist/providers/mock.js +227 -0
- package/dist/search/idf.js +24 -0
- package/dist/search/search-api.js +94 -0
- package/dist/search/search-index.js +32 -0
- package/dist/search/search-utils.js +81 -0
- package/dist/search/session-cache.js +96 -0
- package/dist/search/types.js +0 -0
- package/dist/search/vector-reranker.js +103 -0
- package/dist/server/chat-handler.js +603 -0
- package/dist/server/errors.js +46 -0
- package/dist/server/metadata-init.js +49 -0
- package/dist/server/notify.js +70 -0
- package/dist/server/stream-helpers.js +202 -0
- package/dist/server/types.js +16 -0
- package/dist/stream/mock-stream.js +26 -0
- package/dist/stream/response.js +21 -0
- package/dist/utils/i18n.js +154 -0
- package/package.json +3 -3
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { scoreDocument, filterLowRelevance, tokenize, pickAnchorTerms, normalizeText } from "./search-utils.js";
|
|
2
|
+
import { buildSearchIndex, getIDFMapForIndex } from "./search-index.js";
|
|
3
|
+
import { hasVectorIndex, rerankWithVectors } from "./vector-reranker.js";
|
|
4
|
+
// Module-level search indexes; populated via initArticleIndex()/initProjectIndex().
let articleIndex = null;
let projectIndex = null;
// Result caps: short ("broad") queries are allowed more article hits.
const ARTICLE_LIMIT = 10;
const ARTICLE_LIMIT_BROAD = 20;
const PROJECT_LIMIT = 5;
// A top hit must reach this score AND clearly dominate the runner-up before
// its full content is attached (see isDeepHit in searchArticles).
const DEEP_CONTENT_SCORE_THRESHOLD = 8;
// Max characters of article body shipped with a deep hit.
const DEEP_CONTENT_MAX_LENGTH = 1500;
|
|
11
|
+
// Build (or rebuild) the module-level article index from raw documents.
function initArticleIndex(documents) {
  articleIndex = buildSearchIndex(documents);
}
|
|
14
|
+
// Build (or rebuild) the module-level project index from raw documents.
// NOTE(review): buildSearchIndex refreshes a single shared IDF map, so building
// this index after the article index overwrites the article IDF stats used by
// scoreDocs — confirm this is intentional.
function initProjectIndex(documents) {
  projectIndex = buildSearchIndex(documents);
}
|
|
17
|
+
/**
 * Full-text search over the article index.
 *
 * @param {string} query - Raw user query; blank queries return [].
 * @param {object} [options]
 * @param {boolean} [options.enableDeepContent] - When true, a single dominant
 *   top hit may carry a truncated `fullContent` excerpt.
 * @param {string} [options.siteUrl] - Base URL prepended to relative result URLs.
 * @returns {Array<object>} Ranked article summaries (possibly empty).
 */
function searchArticles(query, options = {}) {
  // Requires initArticleIndex() to have been called first.
  if (!query.trim() || !articleIndex) return [];
  const tokens = tokenize(query);
  if (!tokens.length) return [];
  // Short (<= 2 token) queries are treated as broad and get a larger limit.
  const limit = tokens.length <= 2 ? ARTICLE_LIMIT_BROAD : ARTICLE_LIMIT;
  // Over-fetch so anchor filtering / relevance filtering still leave enough hits.
  const rawResults = scoreDocs(articleIndex, tokens, limit * 2);
  const filtered = applyAnchorFilter(rawResults, query, tokens);
  // Fall back to the unfiltered list when anchor filtering removed everything.
  const deduplicated = filterLowRelevance(filtered.length > 0 ? filtered : rawResults);
  const results = deduplicated.slice(0, limit);
  const topScore = results[0]?.score ?? 0;
  const secondScore = results[1]?.score ?? 0;
  // "Deep hit": one clearly dominant result (high absolute score AND at least
  // 1.5x the runner-up's score).
  const isDeepHit = options.enableDeepContent && topScore >= DEEP_CONTENT_SCORE_THRESHOLD && topScore > secondScore * 1.5;
  let articles = results.map((result, index) => {
    const baseUrl = options.siteUrl ?? "";
    const url = result.url.startsWith("http") ? result.url : `${baseUrl}${result.url}`;
    // Only the single top result receives the (truncated) full content.
    const fullContent = isDeepHit && index === 0 && result.content ? result.content.slice(0, DEEP_CONTENT_MAX_LENGTH) : void 0;
    return {
      title: result.title,
      url,
      summary: result.summary ?? result.excerpt,
      keyPoints: result.keyPoints,
      categories: result.categories,
      dateTime: result.dateTime,
      fullContent,
      score: result.score
    };
  });
  // Optional second-stage rerank when a TF-IDF vector index is loaded.
  if (hasVectorIndex() && articles.length > 1) {
    articles = rerankWithVectors(query, articles);
  }
  return articles;
}
|
|
49
|
+
/**
 * Keyword search over the project index.
 *
 * @param {string} query - Raw user query; blank queries return [].
 * @param {object} [options]
 * @param {string} [options.siteUrl] - Base URL prepended to relative URLs.
 * @returns {Array<{name: string, url: string, description: string, score: number}>}
 */
function searchProjects(query, options = {}) {
  if (!projectIndex || !query.trim()) return [];
  const tokens = tokenize(query);
  if (tokens.length === 0) return [];
  const scored = scoreDocs(projectIndex, tokens, PROJECT_LIMIT * 2);
  if (scored.length === 0) return [];
  const baseUrl = options.siteUrl ?? "";
  // Absolutize relative URLs against the configured site origin.
  const toAbsolute = (url) => (url.startsWith("http") ? url : `${baseUrl}${url}`);
  const projects = [];
  for (const doc of scored.slice(0, PROJECT_LIMIT)) {
    projects.push({
      name: doc.title,
      url: toAbsolute(doc.url),
      // Prefer the curated excerpt; fall back to a content snippet.
      description: doc.excerpt || doc.content.slice(0, 200),
      score: doc.score
    });
  }
  return projects;
}
|
|
63
|
+
/**
 * Append `secondary` items to `primary`, skipping any whose `url` already
 * appears in `primary` (or earlier in `secondary`). Primary order is kept
 * untouched; neither input array is mutated.
 *
 * @param {Array<{url: string}>} primary - Preferred results, kept verbatim.
 * @param {Array<{url: string}>} secondary - Fallback results to append.
 * @returns {Array} New merged array.
 */
function mergeResults(primary, secondary) {
  const merged = [...primary];
  const knownUrls = new Set();
  for (const item of primary) {
    knownUrls.add(item.url);
  }
  for (const item of secondary) {
    if (knownUrls.has(item.url)) continue;
    knownUrls.add(item.url);
    merged.push(item);
  }
  return merged;
}
|
|
74
|
+
// Score every document in `index` against the query tokens, keep positive
// scores only, and return the top `limit` by descending score.
// NOTE(review): getIDFMapForIndex() returns a single module-level IDF map,
// but this helper is called with both the article and the project index —
// whichever index was built last supplies the IDF weights for both. Confirm
// this sharing is intentional.
function scoreDocs(index, tokens, limit) {
  const idfMap = getIDFMapForIndex();
  return index.map((doc) => ({ ...doc, score: scoreDocument(tokens, doc, idfMap) })).filter((doc) => doc.score > 0).sort((a, b) => b.score - a.score).slice(0, limit);
}
|
|
78
|
+
// For short (<= 2 token) queries, keep only results whose title/keyPoints/
// categories contain at least one "anchor" term picked by pickAnchorTerms.
// Falls back to the unfiltered list when the strict pass would empty results.
function applyAnchorFilter(results, query, tokens) {
  if (tokens.length > 2) return results;
  const anchorTerms = pickAnchorTerms(query, results, 2, 2);
  if (!anchorTerms.length) return results;
  const strict = results.filter((r) => {
    // Match anchors against the high-signal fields only, not the body text.
    const text = normalizeText([r.title, ...r.keyPoints, ...r.categories].join(" "));
    return anchorTerms.some((term) => text.includes(term));
  });
  return strict.length > 0 ? strict : results;
}
|
|
88
|
+
// Public API of the search entry-point module.
export {
  initArticleIndex,
  initProjectIndex,
  mergeResults,
  searchArticles,
  searchProjects
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { normalizeText } from "./search-utils.js";
|
|
2
|
+
import { buildIDFMap } from "./idf.js";
|
|
3
|
+
// IDF map of the most recently built index (shared across all indexes).
let cachedIDFMap = null;
|
|
4
|
+
/**
 * Attach a deduplicated token list to every document and refresh the
 * module-level IDF map from the new index.
 * NOTE(review): cachedIDFMap is shared module state — building a second index
 * (e.g. projects after articles) overwrites the first index's IDF statistics.
 *
 * @param {Array<object>} documents - Raw documents (title/excerpt/content/...).
 * @returns {Array<object>} Documents augmented with a `tokens` array.
 */
function buildSearchIndex(documents) {
  const indexed = documents.map((doc) => ({
    ...doc,
    tokens: buildDocumentTokens(doc)
  }));
  // An empty document list keeps the previous IDF map instead of clearing it.
  if (indexed.length > 0) {
    cachedIDFMap = buildIDFMap(indexed);
  }
  return indexed;
}
|
|
14
|
+
// IDF map of the most recently built index, or null if none was built yet.
function getIDFMapForIndex() {
  return cachedIDFMap;
}
|
|
17
|
+
/**
 * Collect the searchable fields of a document, normalize them, and return the
 * unique whitespace-separated tokens. Only the first 1000 characters of the
 * body participate.
 *
 * @param {object} doc - Document with title/excerpt/content/keyPoints/categories/tags.
 * @returns {string[]} Deduplicated token list.
 */
function buildDocumentTokens(doc) {
  const fields = [
    doc.title,
    doc.excerpt,
    doc.content.slice(0, 1e3),
    ...doc.keyPoints,
    ...doc.categories,
    ...doc.tags,
    doc.summary ?? ""
  ];
  const normalized = fields.map((field) => normalizeText(field));
  const tokens = normalized.join(" ").split(/\s+/).filter(Boolean);
  return [...new Set(tokens)];
}
|
|
29
|
+
// Public API of the index-builder module.
export {
  buildSearchIndex,
  getIDFMapForIndex
};
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import { getIDFWeight } from "./idf.js";
|
|
2
|
+
/**
 * Lowercase `text`, replace everything except CJK ideographs (U+4E00–U+9FA5),
 * word characters, and whitespace with spaces, then collapse runs of
 * whitespace and trim.
 *
 * @param {string} text
 * @returns {string} Normalized text.
 */
function normalizeText(text) {
  const lowered = text.toLowerCase();
  const stripped = lowered.replace(/[^\u4e00-\u9fa5\w\s]/g, " ");
  return stripped.replace(/\s+/g, " ").trim();
}
|
|
5
|
+
// Normalize the query, split on whitespace, and drop terms that are
// substrings of a longer kept term (see dedupeByContainment).
function tokenize(text) {
  const normalized = normalizeText(text);
  const parts = normalized.split(/\s+/).filter(Boolean);
  return dedupeByContainment(parts);
}
|
|
10
|
+
/**
 * Deduplicate terms, then drop any term that is contained in an already-kept
 * longer term (e.g. "cat" is dropped when "cats" is kept). Longest terms are
 * considered first; ties keep first-seen order (stable sort).
 *
 * @param {string[]} terms
 * @returns {string[]} Surviving terms, longest first.
 */
function dedupeByContainment(terms) {
  const byLengthDesc = [...new Set(terms)].sort((a, b) => b.length - a.length);
  const kept = [];
  for (const candidate of byLengthDesc) {
    const subsumed = kept.some((longer) => longer.includes(candidate));
    if (!subsumed) kept.push(candidate);
  }
  return kept;
}
|
|
20
|
+
// Per-field multipliers used by scoreDocument: a title match is worth 8x a
// body-content match, key points 5x, and so on.
const FIELD_WEIGHTS = {
  title: 8,
  keyPoints: 5,
  categories: 4,
  tags: 3,
  excerpt: 3,
  content: 1
};
|
|
28
|
+
/**
 * Weighted relevance score of one document for the given query tokens.
 * Each token contributes FIELD_WEIGHTS[field] * idf for every field whose
 * normalized text contains it (substring containment, not word-boundary match).
 *
 * @param {string[]} tokens - Normalized query tokens (from tokenize()).
 * @param {object} doc - Document with title/excerpt/keyPoints/categories/tags/content.
 * @param {Map<string, number>|null} [idfMap] - Optional IDF weights (see getIDFWeight).
 * @returns {number} Non-negative score; 0 when there are no tokens or no hits.
 */
function scoreDocument(tokens, doc, idfMap) {
  if (!tokens.length) return 0;
  let score = 0;
  const title = normalizeText(doc.title);
  const excerpt = normalizeText(doc.excerpt);
  const keyPointsText = normalizeText(doc.keyPoints.join(" "));
  const categoriesText = normalizeText(doc.categories.join(" "));
  const tagsText = normalizeText(doc.tags.join(" "));
  // Only the first 500 characters of the body participate in scoring.
  const contentSample = normalizeText(doc.content.slice(0, 500));
  for (const token of tokens) {
    if (!token) continue;
    const idf = getIDFWeight(idfMap ?? null, token);
    if (title.includes(token)) score += FIELD_WEIGHTS.title * idf;
    if (keyPointsText.includes(token)) score += FIELD_WEIGHTS.keyPoints * idf;
    if (categoriesText.includes(token)) score += FIELD_WEIGHTS.categories * idf;
    if (tagsText.includes(token)) score += FIELD_WEIGHTS.tags * idf;
    if (excerpt.includes(token)) score += FIELD_WEIGHTS.excerpt * idf;
    if (contentSample.includes(token)) score += FIELD_WEIGHTS.content * idf;
  }
  return score;
}
|
|
49
|
+
/**
 * Drop trailing results that score far below the best hit. The top 3 results
 * are always kept; the rest must reach max(minAbsoluteScore,
 * topScore * relativeThreshold). Lists of 3 or fewer — or lists whose top
 * score is non-positive — are returned unchanged.
 *
 * @param {Array<{score: number}>} results - Sorted results, best first.
 * @param {number} [relativeThreshold=0.35] - Fraction of the top score required.
 * @param {number} [minAbsoluteScore=2] - Floor on the cutoff.
 * @returns {Array} Filtered results (new array when filtering applies).
 */
function filterLowRelevance(results, relativeThreshold = 0.35, minAbsoluteScore = 2) {
  if (results.length <= 3) return results;
  const best = results[0]?.score ?? 0;
  if (best <= 0) return results;
  const cutoff = Math.max(minAbsoluteScore, best * relativeThreshold);
  const kept = [];
  results.forEach((item, position) => {
    if (position < 3 || item.score >= cutoff) kept.push(item);
  });
  return kept;
}
|
|
56
|
+
/**
 * Pick up to `maxTerms` query terms that best "anchor" the query against the
 * candidate results: terms that appear in some — but not most — candidates
 * score highest (specificity), with a small bonus for longer terms.
 * Terms that hit no candidate are discarded.
 *
 * @param {string} query - Raw user query.
 * @param {Array<{title: string, keyPoints: string[], categories: string[]}>} candidates
 * @param {number} [maxTerms=2] - Maximum anchors returned.
 * @param {number} [minTermLength=2] - Minimum term length considered.
 * @returns {string[]} Chosen anchor terms (possibly fewer than maxTerms).
 */
function pickAnchorTerms(query, candidates, maxTerms = 2, minTermLength = 2) {
  const terms = tokenize(query).filter((t) => t.length >= minTermLength);
  if (terms.length <= maxTerms) return terms.slice(0, maxTerms);
  if (!candidates.length) return terms.slice(0, maxTerms);
  // Hoisted out of the per-term loop: normalize each candidate's searchable
  // text once instead of once per term (was O(terms * candidates) joins).
  const candidateTexts = candidates.map(
    (c) => normalizeText([c.title, ...c.keyPoints, ...c.categories].join(" "))
  );
  const scored = terms.map((term) => {
    let hitCount = 0;
    for (const text of candidateTexts) {
      if (text.includes(term)) hitCount++;
    }
    // A term matching nothing cannot anchor; mark it unusable.
    if (hitCount <= 0) return { term, score: Number.NEGATIVE_INFINITY };
    const coverage = hitCount / candidates.length;
    // Rarer terms discriminate better between candidates.
    const specificity = 1 - coverage;
    // Length bonus saturates at 8 characters.
    const lengthScore = Math.min(term.length, 8) / 8;
    return { term, score: specificity * 2 + lengthScore };
  });
  return scored.filter((s) => Number.isFinite(s.score)).sort((a, b) => b.score - a.score).map((s) => s.term).slice(0, maxTerms);
}
|
|
74
|
+
// Public API of the search utilities module.
export {
  dedupeByContainment,
  filterLowRelevance,
  normalizeText,
  pickAnchorTerms,
  scoreDocument,
  tokenize
};
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { MemoryCacheAdapter } from "../cache/memory-adapter.js";
|
|
2
|
+
// Accepted client session ids: 8-64 chars, alphanumeric first char, then
// letters/digits/underscore/hyphen (case-insensitive).
const SESSION_ID_PATTERN = /^[a-z0-9][a-z0-9_-]{7,63}$/i;
// Session contexts live for 10 minutes.
const SESSION_CACHE_TTL_SECONDS = 600;
const SESSION_CACHE_TTL_MS = SESSION_CACHE_TTL_SECONDS * 1e3;
// Lazily created default adapter; replaceable via setCacheAdapter().
let defaultCache = null;
|
|
6
|
+
// Lazily instantiate the default in-memory session cache
// (max 400 entries, 10-minute default TTL).
function getDefaultCache() {
  if (!defaultCache) {
    defaultCache = new MemoryCacheAdapter({
      defaultTtl: SESSION_CACHE_TTL_SECONDS,
      maxEntries: 400
    });
  }
  return defaultCache;
}
|
|
15
|
+
/**
 * Derive a cache key from the request's `x-session-id` header.
 * Returns `sid:<id>` for well-formed ids, otherwise null (no caching).
 *
 * @param {Request} req - Fetch-style request with a `headers.get` interface.
 * @returns {string|null}
 */
function getSessionCacheKey(req) {
  const rawHeader = req.headers.get("x-session-id");
  const sessionId = rawHeader?.trim();
  if (!sessionId) return null;
  // Mirrors SESSION_ID_PATTERN: 8-64 chars, alphanumeric start.
  const wellFormed = /^[a-z0-9][a-z0-9_-]{7,63}$/i.test(sessionId);
  return wellFormed ? `sid:${sessionId}` : null;
}
|
|
22
|
+
// Inject a custom cache adapter (e.g. KV-backed), replacing the default.
function setCacheAdapter(cache) {
  defaultCache = cache;
}
|
|
25
|
+
// Current adapter, creating the in-memory default on first use.
function getCacheAdapter() {
  return getDefaultCache();
}
|
|
28
|
+
// Read a session context; resolves to undefined on a cache miss.
// NOTE(review): assumes adapter.get resolves to an entry shaped { value } —
// confirm against the cache adapter contract.
async function getCachedContext(key, cache) {
  const adapter = cache ?? getDefaultCache();
  const entry = await adapter.get(key);
  return entry?.value;
}
|
|
33
|
+
// Persist a session context with the standard 10-minute TTL.
async function setCachedContext(key, ctx, cache) {
  const adapter = cache ?? getDefaultCache();
  await adapter.set(key, ctx, { ttl: SESSION_CACHE_TTL_SECONDS });
}
|
|
37
|
+
// Remove a session context, returning the adapter's delete result.
// NOTE(review): presumably resolves to a "was deleted" boolean — confirm
// against the cache adapter contract.
async function deleteCachedContext(key, cache) {
  const adapter = cache ?? getDefaultCache();
  return adapter.delete(key);
}
|
|
41
|
+
// Intentionally a no-op: TTL expiry is presumably handled by the adapter
// itself (the default MemoryCacheAdapter receives defaultTtl). The legacy
// (now, ) signature is kept for backward compatibility with existing callers.
function cleanupCache(_now) {
}
|
|
43
|
+
// Legacy synchronous cache path: a plain Map with manual TTL and size checks.
const legacyCache = /* @__PURE__ */ new Map();
// Same 10-minute lifetime as the async path.
const LEGACY_TTL_MS = 10 * 60 * 1e3;
const MAX_CACHE_SIZE = 400;
|
|
46
|
+
/**
 * Synchronous read from the legacy cache with lazy expiry: an entry older
 * than LEGACY_TTL_MS (per its `updatedAt` timestamp) is deleted on access
 * and treated as a miss.
 *
 * @param {string} key
 * @returns {object|undefined} The cached entry, or undefined on miss/expiry.
 */
function getCachedContextSync(key) {
  const entry = legacyCache.get(key);
  if (!entry) return void 0;
  const age = Date.now() - entry.updatedAt;
  if (age <= LEGACY_TTL_MS) return entry;
  legacyCache.delete(key);
  return void 0;
}
|
|
55
|
+
/**
 * Synchronous write to the legacy cache. When the cache grows past
 * MAX_CACHE_SIZE, the oldest-inserted entries (Map iteration order) are
 * evicted until the cap is met again.
 *
 * @param {string} key
 * @param {object} ctx - Context object (expected to carry `updatedAt`).
 */
function setCachedContextSync(key, ctx) {
  legacyCache.set(key, ctx);
  let excess = legacyCache.size - MAX_CACHE_SIZE;
  if (excess <= 0) return;
  // Deleting during Map key iteration is safe in JS.
  for (const oldestKey of legacyCache.keys()) {
    if (excess <= 0) break;
    legacyCache.delete(oldestKey);
    excess--;
  }
}
|
|
67
|
+
/**
 * Sweep the legacy cache: remove entries older than LEGACY_TTL_MS relative
 * to `now`, then enforce MAX_CACHE_SIZE by evicting oldest-inserted entries.
 *
 * @param {number} now - Current timestamp in milliseconds (epoch).
 */
function cleanupCacheLegacy(now) {
  // Pass 1: drop expired entries.
  for (const [key, value] of legacyCache) {
    const age = now - value.updatedAt;
    if (age > LEGACY_TTL_MS) {
      legacyCache.delete(key);
    }
  }
  // Pass 2: enforce the size cap (Map iteration order = insertion order).
  let excess = legacyCache.size - MAX_CACHE_SIZE;
  if (excess <= 0) return;
  for (const key of legacyCache.keys()) {
    if (excess <= 0) break;
    legacyCache.delete(key);
    excess--;
  }
}
|
|
83
|
+
// Public API of the session-cache module (async adapter path + legacy sync path).
export {
  SESSION_CACHE_TTL_MS,
  SESSION_CACHE_TTL_SECONDS,
  cleanupCache,
  cleanupCacheLegacy,
  deleteCachedContext,
  getCacheAdapter,
  getCachedContext,
  getCachedContextSync,
  getSessionCacheKey,
  setCacheAdapter,
  setCachedContext,
  setCachedContextSync
};
|
|
File without changes
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
// Module-level TF-IDF vector index, installed via loadVectorIndex().
let loadedIndex = null;
// IDF map derived from the loaded index's vocabulary (null until computed).
let idfCache = null;
|
|
3
|
+
/**
 * Install (or replace) the vector index used by rerankWithVectors.
 * Passing null/undefined effectively disables vector reranking.
 *
 * @param {{vocabulary?: string[], chunks: Array<{text: string}>}|null} data
 */
function loadVectorIndex(data) {
  loadedIndex = data;
  idfCache = null;
  // IDF stats are only computable with a vocabulary and at least one chunk.
  if (data?.vocabulary && data.chunks.length > 0) {
    idfCache = buildIDFFromVocab(data.vocabulary, data.chunks);
  }
}
|
|
10
|
+
// Drop the loaded vector index and its derived IDF cache.
function clearVectorIndex() {
  loadedIndex = null;
  idfCache = null;
}
|
|
14
|
+
// True when a vector index with at least one chunk is loaded.
function hasVectorIndex() {
  return loadedIndex !== null && loadedIndex.chunks.length > 0;
}
|
|
17
|
+
/**
 * Re-rank keyword results by blending their scores with TF-IDF cosine
 * similarity against the loaded vector index.
 *
 * @param {string} query - Raw user query.
 * @param {Array<{url: string, score?: number}>} candidates - Keyword results.
 * @param {number} [alpha=0.3] - Weight of the vector similarity in the blend.
 * @returns {Array} Re-scored candidates sorted by blended score, or the input
 *   unchanged when no usable index or query vector exists.
 */
function rerankWithVectors(query, candidates, alpha = 0.3) {
  if (!loadedIndex || !idfCache || !loadedIndex.vocabulary) {
    return candidates;
  }
  const queryVector = computeQueryVector(query, loadedIndex.vocabulary, idfCache);
  if (!queryVector) return candidates;
  // Best (max) chunk similarity per article, keyed by chunk.postId.
  const articleScores = /* @__PURE__ */ new Map();
  for (const chunk of loadedIndex.chunks) {
    if (!chunk.vector) continue;
    const sim = cosineSimilarity(queryVector, chunk.vector);
    const current = articleScores.get(chunk.postId) ?? 0;
    if (sim > current) {
      articleScores.set(chunk.postId, sim);
    }
  }
  if (articleScores.size === 0) return candidates;
  // Normalize original scores into [0, 1]; the trailing 1 guards against a
  // zero divisor when every candidate score is 0.
  const maxOriginal = Math.max(...candidates.map((c) => c.score ?? 0), 1);
  const reranked = candidates.map((article) => {
    // NOTE(review): assumes chunk.postId equals the slug derived from the
    // article URL — confirm against the vector-index builder.
    const slug = extractSlugFromUrl(article.url);
    const vectorScore = articleScores.get(slug) ?? 0;
    const originalNorm = (article.score ?? 0) / maxOriginal;
    const blended = originalNorm * (1 - alpha) + vectorScore * alpha;
    return { ...article, score: blended };
  });
  return reranked.sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
}
|
|
43
|
+
/**
 * Cosine similarity of two equal-length numeric vectors.
 * Returns 0 on length mismatch or when either vector has zero magnitude.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} Similarity in [-1, 1] (non-negative for TF-IDF vectors).
 */
function cosineSimilarity(a, b) {
  if (a.length !== b.length) return 0;
  let dot = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  a.forEach((value, i) => {
    const other = b[i];
    dot += value * other;
    sumSqA += value * value;
    sumSqB += other * other;
  });
  const denominator = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
  if (denominator === 0) return 0;
  return dot / denominator;
}
|
|
54
|
+
/**
 * TF-IDF vector of the query over the index vocabulary.
 * Returns null when the query yields no tokens or has no overlap with the
 * vocabulary (all-zero vector).
 *
 * @param {string} query
 * @param {string[]} vocabulary - Ordered vocabulary defining vector dimensions.
 * @param {Map<string, number>} idf - Per-term IDF weights.
 * @returns {number[]|null}
 */
function computeQueryVector(query, vocabulary, idf) {
  const tokens = tokenizeForVector(query);
  if (!tokens.length) return null;
  // Raw term frequencies, later normalized by the max frequency.
  const tf = /* @__PURE__ */ new Map();
  for (const t of tokens) tf.set(t, (tf.get(t) || 0) + 1);
  const maxTf = Math.max(...tf.values(), 1);
  const vector = vocabulary.map((term) => {
    const termTf = (tf.get(term) || 0) / maxTf;
    const termIdf = idf.get(term) || 0;
    return termTf * termIdf;
  });
  // An all-zero vector carries no signal; treat it as "no vector".
  if (vector.every((v) => v === 0)) return null;
  return vector;
}
|
|
68
|
+
/**
 * Tokenizer for the vector index: each CJK ideograph becomes its own token;
 * the remaining text is lowercased and split into words longer than 2 chars.
 *
 * @param {string} text
 * @returns {string[]} CJK tokens followed by latin tokens.
 */
function tokenizeForVector(text) {
  const CJK_PATTERN = /[\u4e00-\u9fff\u3400-\u4dbf]/g;
  const cjkTokens = text.match(CJK_PATTERN) ?? [];
  const withoutCjk = text.replace(CJK_PATTERN, " ");
  const latinTokens = withoutCjk
    .toLowerCase()
    .split(/\W+/)
    .filter((word) => word.length > 2);
  return cjkTokens.concat(latinTokens);
}
|
|
74
|
+
/**
 * Build a smoothed-IDF map for the vocabulary from chunk document frequencies:
 * idf(term) = ln(N / (df + 1)) + 1, where N is the chunk count.
 *
 * @param {string[]} vocabulary - Terms defining the vector dimensions.
 * @param {Array<{text: string}>} chunks - Indexed text chunks.
 * @returns {Map<string, number>} IDF weight per vocabulary term.
 */
function buildIDFFromVocab(vocabulary, chunks) {
  const totalChunks = chunks.length;
  // O(1) membership checks: iterate each chunk's (typically small) token set
  // instead of scanning the whole vocabulary once per chunk.
  const vocabSet = new Set(vocabulary);
  const df = /* @__PURE__ */ new Map();
  for (const chunk of chunks) {
    // Set() dedupes tokens so a chunk counts at most once per term.
    for (const token of new Set(tokenizeForVector(chunk.text))) {
      if (vocabSet.has(token)) {
        df.set(token, (df.get(token) || 0) + 1);
      }
    }
  }
  const idf = /* @__PURE__ */ new Map();
  for (const term of vocabulary) {
    const docCount = df.get(term) || 0;
    // +1 smoothing keeps unseen terms finite; +1 offset keeps weights positive.
    idf.set(term, Math.log(totalChunks / (docCount + 1)) + 1);
  }
  return idf;
}
|
|
92
|
+
/**
 * Derive an article slug from a URL. `/<collection>/posts/<slug>/` paths
 * become `<collection>/<slug>`; any other path is returned with leading and
 * trailing slashes stripped. The scheme-and-host prefix is ignored.
 *
 * @param {string} url - Absolute or path-only URL.
 * @returns {string} Slug string (possibly empty for "/").
 */
function extractSlugFromUrl(url) {
  const path = url.replace(/^https?:\/\/[^/]+/, "");
  const postsMatch = /^\/([\w-]+)\/posts\/(.+?)\/?$/.exec(path);
  if (postsMatch !== null) {
    const [, collection, slug] = postsMatch;
    return `${collection}/${slug}`;
  }
  return path.replace(/^\/|\/$/g, "");
}
|
|
98
|
+
// Public API of the vector-reranker module.
export {
  clearVectorIndex,
  hasVectorIndex,
  loadVectorIndex,
  rerankWithVectors
};
|