searchsocket 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +456 -187
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +590 -169
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +590 -170
- package/dist/sveltekit.cjs +380 -82
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +380 -82
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -5025,32 +5025,32 @@ var require_URL = __commonJS({
|
|
|
5025
5025
|
else
|
|
5026
5026
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5027
5027
|
}
|
|
5028
|
-
function remove_dot_segments(
|
|
5029
|
-
if (!
|
|
5028
|
+
function remove_dot_segments(path15) {
|
|
5029
|
+
if (!path15) return path15;
|
|
5030
5030
|
var output = "";
|
|
5031
|
-
while (
|
|
5032
|
-
if (
|
|
5033
|
-
|
|
5031
|
+
while (path15.length > 0) {
|
|
5032
|
+
if (path15 === "." || path15 === "..") {
|
|
5033
|
+
path15 = "";
|
|
5034
5034
|
break;
|
|
5035
5035
|
}
|
|
5036
|
-
var twochars =
|
|
5037
|
-
var threechars =
|
|
5038
|
-
var fourchars =
|
|
5036
|
+
var twochars = path15.substring(0, 2);
|
|
5037
|
+
var threechars = path15.substring(0, 3);
|
|
5038
|
+
var fourchars = path15.substring(0, 4);
|
|
5039
5039
|
if (threechars === "../") {
|
|
5040
|
-
|
|
5040
|
+
path15 = path15.substring(3);
|
|
5041
5041
|
} else if (twochars === "./") {
|
|
5042
|
-
|
|
5042
|
+
path15 = path15.substring(2);
|
|
5043
5043
|
} else if (threechars === "/./") {
|
|
5044
|
-
|
|
5045
|
-
} else if (twochars === "/." &&
|
|
5046
|
-
|
|
5047
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5048
|
-
|
|
5044
|
+
path15 = "/" + path15.substring(3);
|
|
5045
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5046
|
+
path15 = "/";
|
|
5047
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5048
|
+
path15 = "/" + path15.substring(4);
|
|
5049
5049
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5050
5050
|
} else {
|
|
5051
|
-
var segment =
|
|
5051
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5052
5052
|
output += segment;
|
|
5053
|
-
|
|
5053
|
+
path15 = path15.substring(segment.length);
|
|
5054
5054
|
}
|
|
5055
5055
|
}
|
|
5056
5056
|
return output;
|
|
@@ -16614,6 +16614,8 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16614
16614
|
envVar: zod.z.string().min(1).optional(),
|
|
16615
16615
|
sanitize: zod.z.boolean().optional()
|
|
16616
16616
|
}).optional(),
|
|
16617
|
+
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16618
|
+
respectRobotsTxt: zod.z.boolean().optional(),
|
|
16617
16619
|
source: zod.z.object({
|
|
16618
16620
|
mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16619
16621
|
staticOutputDir: zod.z.string().min(1).optional(),
|
|
@@ -16744,6 +16746,8 @@ function createDefaultConfig(projectId) {
|
|
|
16744
16746
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16745
16747
|
sanitize: true
|
|
16746
16748
|
},
|
|
16749
|
+
exclude: [],
|
|
16750
|
+
respectRobotsTxt: true,
|
|
16747
16751
|
source: {
|
|
16748
16752
|
mode: "static-output",
|
|
16749
16753
|
staticOutputDir: "build",
|
|
@@ -16774,7 +16778,7 @@ function createDefaultConfig(projectId) {
|
|
|
16774
16778
|
},
|
|
16775
16779
|
embeddings: {
|
|
16776
16780
|
provider: "jina",
|
|
16777
|
-
model: "jina-embeddings-
|
|
16781
|
+
model: "jina-embeddings-v5-text-small",
|
|
16778
16782
|
apiKeyEnv: "JINA_API_KEY",
|
|
16779
16783
|
batchSize: 64,
|
|
16780
16784
|
concurrency: 4
|
|
@@ -16787,9 +16791,9 @@ function createDefaultConfig(projectId) {
|
|
|
16787
16791
|
}
|
|
16788
16792
|
},
|
|
16789
16793
|
rerank: {
|
|
16790
|
-
enabled:
|
|
16794
|
+
enabled: true,
|
|
16791
16795
|
topN: 20,
|
|
16792
|
-
model: "jina-reranker-
|
|
16796
|
+
model: "jina-reranker-v3"
|
|
16793
16797
|
},
|
|
16794
16798
|
ranking: {
|
|
16795
16799
|
enableIncomingLinkBoost: true,
|
|
@@ -16908,6 +16912,8 @@ ${issues}`
|
|
|
16908
16912
|
...defaults.scope,
|
|
16909
16913
|
...parsed.scope
|
|
16910
16914
|
},
|
|
16915
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16916
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16911
16917
|
source: {
|
|
16912
16918
|
...defaults.source,
|
|
16913
16919
|
...parsed.source,
|
|
@@ -17273,7 +17279,7 @@ var JinaReranker = class {
|
|
|
17273
17279
|
constructor(options) {
|
|
17274
17280
|
this.apiKey = options.apiKey;
|
|
17275
17281
|
this.model = options.model;
|
|
17276
|
-
this.maxRetries = options.maxRetries ??
|
|
17282
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17277
17283
|
}
|
|
17278
17284
|
async rerank(query, candidates, topN) {
|
|
17279
17285
|
if (candidates.length === 0) {
|
|
@@ -17283,7 +17289,8 @@ var JinaReranker = class {
|
|
|
17283
17289
|
model: this.model,
|
|
17284
17290
|
query,
|
|
17285
17291
|
documents: candidates.map((candidate) => candidate.text),
|
|
17286
|
-
top_n: topN ?? candidates.length
|
|
17292
|
+
top_n: topN ?? candidates.length,
|
|
17293
|
+
return_documents: false
|
|
17287
17294
|
};
|
|
17288
17295
|
let attempt = 0;
|
|
17289
17296
|
while (attempt <= this.maxRetries) {
|
|
@@ -19048,6 +19055,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19048
19055
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19049
19056
|
return null;
|
|
19050
19057
|
}
|
|
19058
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19059
|
+
let weight;
|
|
19060
|
+
if (weightRaw !== void 0) {
|
|
19061
|
+
const parsed = Number(weightRaw);
|
|
19062
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19063
|
+
weight = parsed;
|
|
19064
|
+
}
|
|
19065
|
+
}
|
|
19066
|
+
if (weight === 0) {
|
|
19067
|
+
return null;
|
|
19068
|
+
}
|
|
19051
19069
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19052
19070
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19053
19071
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19103,7 +19121,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19103
19121
|
noindex: false,
|
|
19104
19122
|
tags,
|
|
19105
19123
|
description,
|
|
19106
|
-
keywords
|
|
19124
|
+
keywords,
|
|
19125
|
+
weight
|
|
19107
19126
|
};
|
|
19108
19127
|
}
|
|
19109
19128
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19116,6 +19135,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19116
19135
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19117
19136
|
return null;
|
|
19118
19137
|
}
|
|
19138
|
+
let mdWeight;
|
|
19139
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19140
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19141
|
+
mdWeight = rawWeight;
|
|
19142
|
+
}
|
|
19143
|
+
if (mdWeight === 0) {
|
|
19144
|
+
return null;
|
|
19145
|
+
}
|
|
19119
19146
|
const content = parsed.content;
|
|
19120
19147
|
const normalized = normalizeMarkdown(content);
|
|
19121
19148
|
if (!normalizeText(normalized)) {
|
|
@@ -19138,7 +19165,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19138
19165
|
noindex: false,
|
|
19139
19166
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19140
19167
|
description: fmDescription,
|
|
19141
|
-
keywords: fmKeywords
|
|
19168
|
+
keywords: fmKeywords,
|
|
19169
|
+
weight: mdWeight
|
|
19142
19170
|
};
|
|
19143
19171
|
}
|
|
19144
19172
|
function yamlString(value) {
|
|
@@ -19334,6 +19362,38 @@ var Logger = class {
|
|
|
19334
19362
|
`);
|
|
19335
19363
|
}
|
|
19336
19364
|
};
|
|
19365
|
+
|
|
19366
|
+
// src/utils/pattern.ts
|
|
19367
|
+
/**
 * Test whether a URL path matches a single pattern.
 *
 * Supported pattern forms (a trailing slash on either argument is ignored,
 * except for the root path "/"):
 *   - "<prefix>/**"  matches <prefix> itself and anything nested below it;
 *                    a bare "/**" matches every path.
 *   - "<prefix>/*"   matches exactly one extra path segment below <prefix>;
 *                    a bare "/*" matches single-segment paths but not "/".
 *   - anything else  must equal the path exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} True when the path matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Deep wildcard: the base path itself or any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(base + "/");
  }

  // Shallow wildcard: exactly one additional, non-empty segment.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      return !(target === "/" || target.slice(1).includes("/"));
    }
    const lead = base + "/";
    if (!target.startsWith(lead)) {
      return false;
    }
    const tail = target.slice(lead.length);
    return tail.length > 0 && !tail.includes("/");
  }

  return target === glob;
}
|
|
19389
|
+
/**
 * Test whether a URL path matches at least one pattern in a list.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True as soon as any pattern matches; false for an empty list.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
|
|
19395
|
+
|
|
19396
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19337
19397
|
function routeIdToFile(routeId) {
|
|
19338
19398
|
if (routeId === "/") {
|
|
19339
19399
|
return "src/routes/+page.svelte";
|
|
@@ -19407,15 +19467,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19407
19467
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19408
19468
|
}
|
|
19409
19469
|
function isExcluded(url, patterns) {
|
|
19410
|
-
|
|
19411
|
-
if (pattern.endsWith("/*")) {
|
|
19412
|
-
const prefix = pattern.slice(0, -1);
|
|
19413
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19414
|
-
} else if (url === pattern) {
|
|
19415
|
-
return true;
|
|
19416
|
-
}
|
|
19417
|
-
}
|
|
19418
|
-
return false;
|
|
19470
|
+
return matchUrlPatterns(url, patterns);
|
|
19419
19471
|
}
|
|
19420
19472
|
function findFreePort() {
|
|
19421
19473
|
return new Promise((resolve, reject) => {
|
|
@@ -19831,6 +19883,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19831
19883
|
}
|
|
19832
19884
|
return pages;
|
|
19833
19885
|
}
|
|
19886
|
+
/**
 * Parse robots.txt content and return the rule set that applies to a crawler.
 *
 * Grouping follows the Robots Exclusion Protocol: one or more consecutive
 * `User-agent` lines open a group, and the `Allow` / `Disallow` lines that
 * follow belong to every agent named in that group. A `User-agent` line that
 * appears after rule lines starts a NEW group, so rules of later groups never
 * leak into earlier agents. Directives other than `User-agent`, `Allow` and
 * `Disallow` (for example `Crawl-delay` or `Sitemap`) are ignored and do not
 * interrupt the current group.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name to look up (case-insensitive).
 * @returns {{disallow: string[], allow: string[]}} Rules of the agent's own group
 *   when it declares at least one rule, otherwise the `*` group, otherwise empty rules.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  // Accept CRLF, LF and lone-CR line endings.
  for (const rawLine of content.split(/\r\n|\r|\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Rules that appear before any User-agent line have no owner.
      if (currentAgents.length === 0) continue;
      groupHasRules = true;
      // An empty value adds no path rule, but it still closes the agent list.
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is deliberately ignored so that it cannot detach
    // the rules that follow it from their group.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
19921
|
+
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * Rules are plain path prefixes. The longest matching Disallow prefix is
 * compared with the longest matching Allow prefix; the path is blocked only
 * when the Disallow prefix is strictly longer, so a tie favours Allow.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rule set.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Length of the longest prefix in `prefixes` that urlPath starts with (0 if none).
  const longestMatchLength = (prefixes) => {
    let longest = 0;
    for (const prefix of prefixes) {
      if (urlPath.startsWith(prefix) && prefix.length > longest) {
        longest = prefix.length;
      }
    }
    return longest;
  };

  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) {
    return false;
  }
  return longestMatchLength(rules3.allow) < disallowLength;
}
|
|
19937
|
+
/**
 * Read `robots.txt` from a directory and parse it with parseRobotsTxt.
 *
 * This is best-effort: any failure while building the path, reading the
 * file, or parsing it results in `null` rather than an exception.
 *
 * @param {string} dir - Directory expected to contain robots.txt.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules, or null on failure.
 */
async function loadRobotsTxtFromDir(dir) {
  try {
    const robotsPath = path__default.default.join(dir, "robots.txt");
    const rawText = await fs4__default.default.readFile(robotsPath, "utf8");
    return parseRobotsTxt(rawText);
  } catch {
    return null;
  }
}
|
|
19945
|
+
/**
 * Download `/robots.txt` from a site and parse it with parseRobotsTxt.
 *
 * This is best-effort: an invalid base URL, a network error, or a non-OK
 * HTTP status all result in `null` rather than an exception.
 *
 * @param {string} baseUrl - Base URL of the site; "/robots.txt" is resolved against it.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules, or null on failure.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const reply = await fetch(robotsUrl.href);
    if (reply.ok) {
      const body = await reply.text();
      return parseRobotsTxt(body);
    }
    return null;
  } catch {
    return null;
  }
}
|
|
19956
|
+
|
|
19957
|
+
// src/search/ranking.ts
|
|
19958
|
+
/**
 * Clamp a value to a non-negative finite number.
 *
 * @param {*} value - Candidate value; anything that is not a finite number yields 0.
 * @returns {number} `value` when it is finite and positive, otherwise 0.
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
19964
|
+
/**
 * Score vector-store hits and order them best-first.
 *
 * Each hit starts from its own `score` (negative infinity when that is not a
 * finite number). When enabled in `config.ranking`, two additive boosts are
 * applied: a logarithmic boost from `metadata.incomingLinks` and an
 * inverse boost from `metadata.depth`, each scaled by its configured weight.
 *
 * @param {Array<{score: number, metadata: object}>} hits - Raw hits.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<{hit: object, finalScore: number}>} Hits wrapped with their
 *   final score, sorted by descending score.
 */
function rankHits(hits, config) {
  const scoreHit = (hit) => {
    const { ranking } = config;
    let total = Number.NEGATIVE_INFINITY;
    if (Number.isFinite(hit.score)) {
      total = hit.score;
    }
    if (ranking.enableIncomingLinkBoost) {
      const linkCount = nonNegativeOrZero(hit.metadata.incomingLinks);
      total += Math.log(1 + linkCount) * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const depth = nonNegativeOrZero(hit.metadata.depth);
      total += (1 / (1 + depth)) * ranking.weights.depth;
    }
    return Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY;
  };

  // Descending by finalScore; a NaN difference (e.g. -Infinity minus
  // -Infinity) is treated as a tie so the comparator stays consistent.
  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };

  return hits
    .map((hit) => ({ hit, finalScore: scoreHit(hit) }))
    .sort(byFinalScoreDesc);
}
|
|
19984
|
+
/**
 * Look up the ranking weight configured for a URL path.
 *
 * Resolution order:
 *   1. An exact (non-wildcard) pattern equal to the path always wins. Trailing
 *      slashes are ignored on both sides, except for the root path "/".
 *   2. Otherwise, among wildcard patterns (ending in "/*" or "/**") that match
 *      the path, the longest pattern wins; on equal length the first one in
 *      the object's key order wins.
 *   3. With no match at all the neutral weight 1 is returned.
 *
 * Step 1 exists because comparing raw pattern lengths alone lets a wildcard
 * such as "/docs/**" override a shorter exact entry such as "/docs/a".
 *
 * @param {string} url - URL path of the page.
 * @param {Record<string, number>} pageWeights - Map of pattern to weight.
 * @returns {number} The configured weight, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  const entries = Object.entries(pageWeights);
  if (entries.length === 0) {
    return 1;
  }

  const norm = (p) => (p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p);
  const isWildcard = (p) => p.endsWith("/*") || p.endsWith("/**");
  const normalizedUrl = norm(url);

  // Pass 1: an exact pattern is the most specific match possible.
  for (const [pattern, weight] of entries) {
    const normalizedPattern = norm(pattern);
    if (!isWildcard(normalizedPattern) && normalizedPattern === normalizedUrl) {
      return weight;
    }
  }

  // Pass 2: the longest matching wildcard pattern.
  let bestPattern = "";
  let bestWeight = 1;
  for (const [pattern, weight] of entries) {
    // Non-wildcard patterns can only match exactly, which pass 1 ruled out.
    if (!isWildcard(norm(pattern))) continue;
    if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
      bestPattern = pattern;
      bestWeight = weight;
    }
  }
  return bestWeight;
}
|
|
19995
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL.
 *
 * For every URL the chunks are sorted best-first. The page score is the best
 * chunk's score plus a decayed bonus from the next chunks (at most
 * `aggregationCap` chunks are considered, the i-th one scaled by
 * `aggregationDecay ** i`), with the bonus scaled by
 * `config.ranking.weights.aggregation`. The result is then multiplied by the
 * page weight from findPageWeight; pages whose weight is exactly 0 are dropped.
 *
 * Note: each page's chunk array is sorted in place and returned as
 * `matchingChunks`.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Scored chunk hits.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} Page entries sorted by descending `pageScore`.
 */
function aggregateByPage(ranked, config) {
  // Builds a descending comparator on a numeric field; NaN differences tie.
  const descendingBy = (field) => (left, right) => {
    const diff = right[field] - left[field];
    return Number.isNaN(diff) ? 0 : diff;
  };

  // Bucket chunks by page URL, preserving first-seen URL order.
  const chunksByUrl = new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    if (chunksByUrl.has(pageUrl)) {
      chunksByUrl.get(pageUrl).push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const aggregated = [];
  for (const [pageUrl, chunks] of chunksByUrl) {
    chunks.sort(descendingBy("finalScore"));
    const [best] = chunks;
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // The best chunk (rank 0) is already counted in maxScore; only the
    // runners-up contribute to the bonus.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, rank) => {
      if (rank === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, rank);
    }, 0);

    const unweightedScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(pageUrl, config.ranking.pageWeights);
    if (pageWeight === 0) {
      continue;
    }
    const pageScore = pageWeight === 1 ? unweightedScore : unweightedScore * pageWeight;

    aggregated.push({
      url: pageUrl,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  }

  return aggregated.sort(descendingBy("pageScore"));
}
|
|
19834
20038
|
|
|
19835
20039
|
// src/utils/time.ts
|
|
19836
20040
|
function nowIso() {
|
|
@@ -19842,9 +20046,10 @@ function hrTimeMs(start) {
|
|
|
19842
20046
|
|
|
19843
20047
|
// src/indexing/pipeline.ts
|
|
19844
20048
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19845
|
-
"jina-embeddings-v3": 2e-5
|
|
20049
|
+
"jina-embeddings-v3": 2e-5,
|
|
20050
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
19846
20051
|
};
|
|
19847
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20052
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
19848
20053
|
var IndexPipeline = class _IndexPipeline {
|
|
19849
20054
|
cwd;
|
|
19850
20055
|
config;
|
|
@@ -19922,6 +20127,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19922
20127
|
}
|
|
19923
20128
|
stageEnd("source", sourceStart);
|
|
19924
20129
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20130
|
+
const filterStart = stageStart();
|
|
20131
|
+
let filteredSourcePages = sourcePages;
|
|
20132
|
+
if (this.config.exclude.length > 0) {
|
|
20133
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20134
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20135
|
+
const url = normalizeUrlPath(p.url);
|
|
20136
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20137
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20138
|
+
return false;
|
|
20139
|
+
}
|
|
20140
|
+
return true;
|
|
20141
|
+
});
|
|
20142
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20143
|
+
if (excludedCount > 0) {
|
|
20144
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20145
|
+
}
|
|
20146
|
+
}
|
|
20147
|
+
if (this.config.respectRobotsTxt) {
|
|
20148
|
+
let robotsRules = null;
|
|
20149
|
+
if (sourceMode === "static-output") {
|
|
20150
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20151
|
+
path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20152
|
+
);
|
|
20153
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20154
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20155
|
+
path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20156
|
+
);
|
|
20157
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20158
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20159
|
+
}
|
|
20160
|
+
if (robotsRules) {
|
|
20161
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20162
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20163
|
+
const url = normalizeUrlPath(p.url);
|
|
20164
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20165
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20166
|
+
return false;
|
|
20167
|
+
}
|
|
20168
|
+
return true;
|
|
20169
|
+
});
|
|
20170
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20171
|
+
if (robotsExcluded > 0) {
|
|
20172
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20173
|
+
}
|
|
20174
|
+
}
|
|
20175
|
+
}
|
|
20176
|
+
stageEnd("filter", filterStart);
|
|
19925
20177
|
const routeStart = stageStart();
|
|
19926
20178
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19927
20179
|
stageEnd("route_map", routeStart);
|
|
@@ -19929,7 +20181,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19929
20181
|
const extractStart = stageStart();
|
|
19930
20182
|
this.logger.info("Extracting content...");
|
|
19931
20183
|
const extractedPages = [];
|
|
19932
|
-
for (const sourcePage of
|
|
20184
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19933
20185
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19934
20186
|
if (!extracted) {
|
|
19935
20187
|
this.logger.warn(
|
|
@@ -19955,16 +20207,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19955
20207
|
seenUrls.add(page.url);
|
|
19956
20208
|
uniquePages.push(page);
|
|
19957
20209
|
}
|
|
20210
|
+
const indexablePages = [];
|
|
20211
|
+
for (const page of uniquePages) {
|
|
20212
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20213
|
+
if (effectiveWeight === 0) {
|
|
20214
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20215
|
+
continue;
|
|
20216
|
+
}
|
|
20217
|
+
indexablePages.push(page);
|
|
20218
|
+
}
|
|
20219
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20220
|
+
if (zeroWeightCount > 0) {
|
|
20221
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20222
|
+
}
|
|
19958
20223
|
stageEnd("extract", extractStart);
|
|
19959
|
-
const skippedPages =
|
|
19960
|
-
this.logger.info(`Extracted ${
|
|
20224
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20225
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19961
20226
|
const linkStart = stageStart();
|
|
19962
|
-
const pageSet = new Set(
|
|
20227
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19963
20228
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19964
|
-
for (const page of
|
|
20229
|
+
for (const page of indexablePages) {
|
|
19965
20230
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19966
20231
|
}
|
|
19967
|
-
for (const page of
|
|
20232
|
+
for (const page of indexablePages) {
|
|
19968
20233
|
for (const outgoing of page.outgoingLinks) {
|
|
19969
20234
|
if (!pageSet.has(outgoing)) {
|
|
19970
20235
|
continue;
|
|
@@ -19988,7 +20253,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19988
20253
|
});
|
|
19989
20254
|
}
|
|
19990
20255
|
}
|
|
19991
|
-
for (const page of
|
|
20256
|
+
for (const page of indexablePages) {
|
|
19992
20257
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19993
20258
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19994
20259
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20205,100 +20470,6 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20205
20470
|
};
|
|
20206
20471
|
}
|
|
20207
20472
|
};
|
|
20208
|
-
|
|
20209
|
-
// src/search/ranking.ts
|
|
20210
|
-
function nonNegativeOrZero(value) {
|
|
20211
|
-
if (!Number.isFinite(value)) {
|
|
20212
|
-
return 0;
|
|
20213
|
-
}
|
|
20214
|
-
return Math.max(0, value);
|
|
20215
|
-
}
|
|
20216
|
-
function rankHits(hits, config) {
|
|
20217
|
-
return hits.map((hit) => {
|
|
20218
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20219
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20220
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20221
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20222
|
-
}
|
|
20223
|
-
if (config.ranking.enableDepthBoost) {
|
|
20224
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20225
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20226
|
-
}
|
|
20227
|
-
return {
|
|
20228
|
-
hit,
|
|
20229
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20230
|
-
};
|
|
20231
|
-
}).sort((a, b) => {
|
|
20232
|
-
const delta = b.finalScore - a.finalScore;
|
|
20233
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20234
|
-
});
|
|
20235
|
-
}
|
|
20236
|
-
function findPageWeight(url, pageWeights) {
|
|
20237
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20238
|
-
const normalizedUrl = norm(url);
|
|
20239
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20240
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20241
|
-
return weight;
|
|
20242
|
-
}
|
|
20243
|
-
}
|
|
20244
|
-
let bestPrefix = "";
|
|
20245
|
-
let bestWeight = 1;
|
|
20246
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20247
|
-
const normalizedPattern = norm(pattern);
|
|
20248
|
-
if (normalizedPattern === "/") continue;
|
|
20249
|
-
const prefix = `${normalizedPattern}/`;
|
|
20250
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20251
|
-
bestPrefix = prefix;
|
|
20252
|
-
bestWeight = weight;
|
|
20253
|
-
}
|
|
20254
|
-
}
|
|
20255
|
-
return bestWeight;
|
|
20256
|
-
}
|
|
20257
|
-
function aggregateByPage(ranked, config) {
|
|
20258
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20259
|
-
for (const hit of ranked) {
|
|
20260
|
-
const url = hit.hit.metadata.url;
|
|
20261
|
-
const group = groups.get(url);
|
|
20262
|
-
if (group) group.push(hit);
|
|
20263
|
-
else groups.set(url, [hit]);
|
|
20264
|
-
}
|
|
20265
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20266
|
-
const pages = [];
|
|
20267
|
-
for (const [url, chunks] of groups) {
|
|
20268
|
-
chunks.sort((a, b) => {
|
|
20269
|
-
const delta = b.finalScore - a.finalScore;
|
|
20270
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20271
|
-
});
|
|
20272
|
-
const best = chunks[0];
|
|
20273
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20274
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20275
|
-
let aggregationBonus = 0;
|
|
20276
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20277
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20278
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20279
|
-
}
|
|
20280
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20281
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20282
|
-
if (pageWeight === 0) continue;
|
|
20283
|
-
if (pageWeight !== 1) {
|
|
20284
|
-
pageScore *= pageWeight;
|
|
20285
|
-
}
|
|
20286
|
-
pages.push({
|
|
20287
|
-
url,
|
|
20288
|
-
title: best.hit.metadata.title,
|
|
20289
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20290
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20291
|
-
bestChunk: best,
|
|
20292
|
-
matchingChunks: chunks
|
|
20293
|
-
});
|
|
20294
|
-
}
|
|
20295
|
-
return pages.sort((a, b) => {
|
|
20296
|
-
const delta = b.pageScore - a.pageScore;
|
|
20297
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20298
|
-
});
|
|
20299
|
-
}
|
|
20300
|
-
|
|
20301
|
-
// src/search/engine.ts
|
|
20302
20473
|
var requestSchema = zod.z.object({
|
|
20303
20474
|
q: zod.z.string().trim().min(1),
|
|
20304
20475
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
@@ -20306,7 +20477,8 @@ var requestSchema = zod.z.object({
|
|
|
20306
20477
|
pathPrefix: zod.z.string().optional(),
|
|
20307
20478
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20308
20479
|
rerank: zod.z.boolean().optional(),
|
|
20309
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
20480
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
20481
|
+
stream: zod.z.boolean().optional()
|
|
20310
20482
|
});
|
|
20311
20483
|
var SearchEngine = class _SearchEngine {
|
|
20312
20484
|
cwd;
|
|
@@ -20379,7 +20551,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
20379
20551
|
rerankMs = hrTimeMs(rerankStart);
|
|
20380
20552
|
usedRerank = true;
|
|
20381
20553
|
}
|
|
20382
|
-
|
|
20554
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
20555
|
+
return {
|
|
20556
|
+
q: input.q,
|
|
20557
|
+
scope: resolvedScope.scopeName,
|
|
20558
|
+
results,
|
|
20559
|
+
meta: {
|
|
20560
|
+
timingsMs: {
|
|
20561
|
+
embed: Math.round(embedMs),
|
|
20562
|
+
vector: Math.round(vectorMs),
|
|
20563
|
+
rerank: Math.round(rerankMs),
|
|
20564
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20565
|
+
},
|
|
20566
|
+
usedRerank,
|
|
20567
|
+
modelId: this.config.embeddings.model
|
|
20568
|
+
}
|
|
20569
|
+
};
|
|
20570
|
+
}
|
|
20571
|
+
async *searchStreaming(request) {
|
|
20572
|
+
const parsed = requestSchema.safeParse(request);
|
|
20573
|
+
if (!parsed.success) {
|
|
20574
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
20575
|
+
}
|
|
20576
|
+
const input = parsed.data;
|
|
20577
|
+
const wantsRerank = Boolean(input.rerank);
|
|
20578
|
+
if (!wantsRerank) {
|
|
20579
|
+
const response = await this.search(request);
|
|
20580
|
+
yield { phase: "initial", data: response };
|
|
20581
|
+
return;
|
|
20582
|
+
}
|
|
20583
|
+
const totalStart = process.hrtime.bigint();
|
|
20584
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20585
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
20586
|
+
const topK = input.topK ?? 10;
|
|
20587
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20588
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20589
|
+
const embedStart = process.hrtime.bigint();
|
|
20590
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20591
|
+
const queryVector = queryEmbeddings[0];
|
|
20592
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20593
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20594
|
+
}
|
|
20595
|
+
const embedMs = hrTimeMs(embedStart);
|
|
20596
|
+
const vectorStart = process.hrtime.bigint();
|
|
20597
|
+
const hits = await this.vectorStore.query(
|
|
20598
|
+
queryVector,
|
|
20599
|
+
{
|
|
20600
|
+
topK: candidateK,
|
|
20601
|
+
pathPrefix: input.pathPrefix,
|
|
20602
|
+
tags: input.tags
|
|
20603
|
+
},
|
|
20604
|
+
resolvedScope
|
|
20605
|
+
);
|
|
20606
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
20607
|
+
const ranked = rankHits(hits, this.config);
|
|
20608
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
20609
|
+
yield {
|
|
20610
|
+
phase: "initial",
|
|
20611
|
+
data: {
|
|
20612
|
+
q: input.q,
|
|
20613
|
+
scope: resolvedScope.scopeName,
|
|
20614
|
+
results: initialResults,
|
|
20615
|
+
meta: {
|
|
20616
|
+
timingsMs: {
|
|
20617
|
+
embed: Math.round(embedMs),
|
|
20618
|
+
vector: Math.round(vectorMs),
|
|
20619
|
+
rerank: 0,
|
|
20620
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20621
|
+
},
|
|
20622
|
+
usedRerank: false,
|
|
20623
|
+
modelId: this.config.embeddings.model
|
|
20624
|
+
}
|
|
20625
|
+
}
|
|
20626
|
+
};
|
|
20627
|
+
const rerankStart = process.hrtime.bigint();
|
|
20628
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
20629
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
20630
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
20631
|
+
yield {
|
|
20632
|
+
phase: "reranked",
|
|
20633
|
+
data: {
|
|
20634
|
+
q: input.q,
|
|
20635
|
+
scope: resolvedScope.scopeName,
|
|
20636
|
+
results: rerankedResults,
|
|
20637
|
+
meta: {
|
|
20638
|
+
timingsMs: {
|
|
20639
|
+
embed: Math.round(embedMs),
|
|
20640
|
+
vector: Math.round(vectorMs),
|
|
20641
|
+
rerank: Math.round(rerankMs),
|
|
20642
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20643
|
+
},
|
|
20644
|
+
usedRerank: true,
|
|
20645
|
+
modelId: this.config.embeddings.model
|
|
20646
|
+
}
|
|
20647
|
+
}
|
|
20648
|
+
};
|
|
20649
|
+
}
|
|
20650
|
+
buildResults(ordered, topK, groupByPage) {
|
|
20383
20651
|
const minScore = this.config.ranking.minScore;
|
|
20384
20652
|
if (groupByPage) {
|
|
20385
20653
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -20387,10 +20655,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
20387
20655
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20388
20656
|
}
|
|
20389
20657
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20390
|
-
|
|
20658
|
+
return pages.slice(0, topK).map((page) => {
|
|
20391
20659
|
const bestScore = page.bestChunk.finalScore;
|
|
20392
|
-
const
|
|
20393
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20660
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20661
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20394
20662
|
return {
|
|
20395
20663
|
url: page.url,
|
|
20396
20664
|
title: page.title,
|
|
@@ -20407,10 +20675,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
20407
20675
|
};
|
|
20408
20676
|
});
|
|
20409
20677
|
} else {
|
|
20678
|
+
let filtered = ordered;
|
|
20410
20679
|
if (minScore > 0) {
|
|
20411
|
-
|
|
20680
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20412
20681
|
}
|
|
20413
|
-
|
|
20682
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20414
20683
|
url: hit.metadata.url,
|
|
20415
20684
|
title: hit.metadata.title,
|
|
20416
20685
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -20419,21 +20688,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20419
20688
|
routeFile: hit.metadata.routeFile
|
|
20420
20689
|
}));
|
|
20421
20690
|
}
|
|
20422
|
-
return {
|
|
20423
|
-
q: input.q,
|
|
20424
|
-
scope: resolvedScope.scopeName,
|
|
20425
|
-
results,
|
|
20426
|
-
meta: {
|
|
20427
|
-
timingsMs: {
|
|
20428
|
-
embed: Math.round(embedMs),
|
|
20429
|
-
vector: Math.round(vectorMs),
|
|
20430
|
-
rerank: Math.round(rerankMs),
|
|
20431
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20432
|
-
},
|
|
20433
|
-
usedRerank,
|
|
20434
|
-
modelId: this.config.embeddings.model
|
|
20435
|
-
}
|
|
20436
|
-
};
|
|
20437
20691
|
}
|
|
20438
20692
|
async getPage(pathOrUrl, scope) {
|
|
20439
20693
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -20505,6 +20759,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20505
20759
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20506
20760
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20507
20761
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20762
|
+
const MAX_DOC_CHARS = 2e3;
|
|
20508
20763
|
const pageCandidates = [];
|
|
20509
20764
|
for (const [url, chunks] of pageGroups) {
|
|
20510
20765
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -20524,12 +20779,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
20524
20779
|
}
|
|
20525
20780
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20526
20781
|
parts.push(body);
|
|
20527
|
-
|
|
20782
|
+
let text = parts.join("\n\n");
|
|
20783
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
20784
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
20785
|
+
}
|
|
20786
|
+
pageCandidates.push({ id: url, text });
|
|
20528
20787
|
}
|
|
20788
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20789
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20529
20790
|
const reranked = await this.reranker.rerank(
|
|
20530
20791
|
query,
|
|
20531
|
-
|
|
20532
|
-
|
|
20792
|
+
cappedCandidates,
|
|
20793
|
+
maxCandidates
|
|
20533
20794
|
);
|
|
20534
20795
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20535
20796
|
return ranked.map((entry) => {
|
|
@@ -20549,7 +20810,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20549
20810
|
});
|
|
20550
20811
|
}
|
|
20551
20812
|
};
|
|
20552
|
-
function createServer(engine) {
|
|
20813
|
+
function createServer(engine, config) {
|
|
20553
20814
|
const server = new mcp_js.McpServer({
|
|
20554
20815
|
name: "searchsocket-mcp",
|
|
20555
20816
|
version: "0.1.0"
|
|
@@ -20557,14 +20818,15 @@ function createServer(engine) {
|
|
|
20557
20818
|
server.registerTool(
|
|
20558
20819
|
"search",
|
|
20559
20820
|
{
|
|
20560
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20821
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
20561
20822
|
inputSchema: {
|
|
20562
20823
|
query: zod.z.string().min(1),
|
|
20563
20824
|
scope: zod.z.string().optional(),
|
|
20564
20825
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20565
20826
|
pathPrefix: zod.z.string().optional(),
|
|
20566
20827
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20567
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
20828
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
20829
|
+
rerank: zod.z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
20568
20830
|
}
|
|
20569
20831
|
},
|
|
20570
20832
|
async (input) => {
|
|
@@ -20574,7 +20836,8 @@ function createServer(engine) {
|
|
|
20574
20836
|
scope: input.scope,
|
|
20575
20837
|
pathPrefix: input.pathPrefix,
|
|
20576
20838
|
tags: input.tags,
|
|
20577
|
-
groupBy: input.groupBy
|
|
20839
|
+
groupBy: input.groupBy,
|
|
20840
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
20578
20841
|
});
|
|
20579
20842
|
return {
|
|
20580
20843
|
content: [
|
|
@@ -20700,10 +20963,10 @@ async function runMcpServer(options = {}) {
|
|
|
20700
20963
|
config
|
|
20701
20964
|
});
|
|
20702
20965
|
if (resolvedTransport === "http") {
|
|
20703
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
20966
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
20704
20967
|
return;
|
|
20705
20968
|
}
|
|
20706
|
-
const server = createServer(engine);
|
|
20969
|
+
const server = createServer(engine, config);
|
|
20707
20970
|
const stdioTransport = new stdio_js.StdioServerTransport();
|
|
20708
20971
|
await server.connect(stdioTransport);
|
|
20709
20972
|
}
|
|
@@ -20859,7 +21122,44 @@ function searchsocketHandle(options = {}) {
|
|
|
20859
21122
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20860
21123
|
}
|
|
20861
21124
|
const engine = await getEngine();
|
|
20862
|
-
const
|
|
21125
|
+
const searchRequest = body;
|
|
21126
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
21127
|
+
const encoder = new TextEncoder();
|
|
21128
|
+
const stream = new ReadableStream({
|
|
21129
|
+
async start(controller) {
|
|
21130
|
+
try {
|
|
21131
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
21132
|
+
const line = JSON.stringify(event2) + "\n";
|
|
21133
|
+
controller.enqueue(encoder.encode(line));
|
|
21134
|
+
}
|
|
21135
|
+
} catch (streamError) {
|
|
21136
|
+
const errorEvent = {
|
|
21137
|
+
phase: "error",
|
|
21138
|
+
data: {
|
|
21139
|
+
error: {
|
|
21140
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
21141
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
21142
|
+
}
|
|
21143
|
+
}
|
|
21144
|
+
};
|
|
21145
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
21146
|
+
} finally {
|
|
21147
|
+
controller.close();
|
|
21148
|
+
}
|
|
21149
|
+
}
|
|
21150
|
+
});
|
|
21151
|
+
return withCors(
|
|
21152
|
+
new Response(stream, {
|
|
21153
|
+
status: 200,
|
|
21154
|
+
headers: {
|
|
21155
|
+
"content-type": "application/x-ndjson"
|
|
21156
|
+
}
|
|
21157
|
+
}),
|
|
21158
|
+
event.request,
|
|
21159
|
+
config
|
|
21160
|
+
);
|
|
21161
|
+
}
|
|
21162
|
+
const result = await engine.search(searchRequest);
|
|
20863
21163
|
return withCors(
|
|
20864
21164
|
new Response(JSON.stringify(result), {
|
|
20865
21165
|
status: 200,
|
|
@@ -20972,7 +21272,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20972
21272
|
});
|
|
20973
21273
|
const stats = await pipeline.run({
|
|
20974
21274
|
changedOnly: options.changedOnly ?? true,
|
|
20975
|
-
force: options.force ?? false,
|
|
21275
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20976
21276
|
dryRun: options.dryRun ?? false,
|
|
20977
21277
|
scopeOverride: options.scope,
|
|
20978
21278
|
verbose: options.verbose
|
|
@@ -20989,6 +21289,60 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20989
21289
|
};
|
|
20990
21290
|
}
|
|
20991
21291
|
|
|
21292
|
+
// src/merge.ts
|
|
21293
|
+
/**
 * Decide whether a reranked response should replace the order of the initial
 * response, or whether the initial order can be kept with refreshed scores.
 *
 * For every URL in `initial.results` the absolute difference between its
 * position in the initial list and its position in the reranked list is
 * recorded; a URL absent from the reranked list counts as displacement 0.
 * If the initial list is empty, or any displacement exceeds the threshold,
 * the reranked response is returned untouched. Otherwise the initial results
 * (in their original order) are returned inside a copy of the reranked
 * response, each with its reranked score when one exists.
 *
 * @param initial  Response holding the first-pass `results`.
 * @param reranked Response holding the reranked `results`.
 * @param options  Optional `{ maxDisplacement }`; defaults to 3.
 * @returns `{ response, usedRerankedOrder, displacements }`.
 */
function mergeSearchResults(initial, reranked, options) {
  const threshold = options?.maxDisplacement ?? 3;

  // Later duplicates of a URL overwrite earlier ones, so each map holds the last position.
  const positionsOf = (results) => new Map(results.map((entry, index) => [entry.url, index]));
  const before = positionsOf(initial.results);
  const after = positionsOf(reranked.results);

  const displacements = initial.results.map(({ url }) => {
    const target = after.get(url);
    const displacement = target === undefined ? 0 : Math.abs(before.get(url) - target);
    return { url, displacement };
  });

  const adoptReranked = displacements.length === 0 || displacements.some((entry) => entry.displacement > threshold);
  if (adoptReranked) {
    return { response: reranked, usedRerankedOrder: true, displacements };
  }

  // Order is stable enough: keep the initial layout, only refresh the scores.
  const scoreByUrl = new Map(reranked.results.map((entry) => [entry.url, entry.score]));
  const results = initial.results.map((entry) => ({
    ...entry,
    score: scoreByUrl.get(entry.url) ?? entry.score
  }));
  return {
    response: { ...reranked, results },
    usedRerankedOrder: false,
    displacements
  };
}
|
|
21345
|
+
|
|
20992
21346
|
// src/client.ts
|
|
20993
21347
|
function createSearchClient(options = {}) {
|
|
20994
21348
|
const endpoint = options.endpoint ?? "/api/search";
|
|
@@ -21016,6 +21370,72 @@ function createSearchClient(options = {}) {
|
|
|
21016
21370
|
throw new Error(message);
|
|
21017
21371
|
}
|
|
21018
21372
|
return payload;
|
|
21373
|
+
},
|
|
21374
|
+
async streamSearch(request, onPhase) {
|
|
21375
|
+
const response = await fetchImpl(endpoint, {
|
|
21376
|
+
method: "POST",
|
|
21377
|
+
headers: {
|
|
21378
|
+
"content-type": "application/json"
|
|
21379
|
+
},
|
|
21380
|
+
body: JSON.stringify(request)
|
|
21381
|
+
});
|
|
21382
|
+
if (!response.ok) {
|
|
21383
|
+
let payload;
|
|
21384
|
+
try {
|
|
21385
|
+
payload = await response.json();
|
|
21386
|
+
} catch {
|
|
21387
|
+
throw new Error("Search failed");
|
|
21388
|
+
}
|
|
21389
|
+
const message = payload.error?.message ?? "Search failed";
|
|
21390
|
+
throw new Error(message);
|
|
21391
|
+
}
|
|
21392
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
21393
|
+
if (contentType.includes("application/json")) {
|
|
21394
|
+
const data = await response.json();
|
|
21395
|
+
onPhase({ phase: "initial", data });
|
|
21396
|
+
return data;
|
|
21397
|
+
}
|
|
21398
|
+
if (!response.body) {
|
|
21399
|
+
throw new Error("Response body is not readable");
|
|
21400
|
+
}
|
|
21401
|
+
const reader = response.body.getReader();
|
|
21402
|
+
const decoder = new TextDecoder();
|
|
21403
|
+
let buffer = "";
|
|
21404
|
+
let lastResponse = null;
|
|
21405
|
+
for (; ; ) {
|
|
21406
|
+
const { done, value } = await reader.read();
|
|
21407
|
+
if (done) break;
|
|
21408
|
+
buffer += decoder.decode(value, { stream: true });
|
|
21409
|
+
let newlineIdx;
|
|
21410
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
21411
|
+
const line = buffer.slice(0, newlineIdx).trim();
|
|
21412
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
21413
|
+
if (line.length === 0) continue;
|
|
21414
|
+
const event = JSON.parse(line);
|
|
21415
|
+
if (event.phase === "error") {
|
|
21416
|
+
const errData = event.data;
|
|
21417
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21418
|
+
}
|
|
21419
|
+
const searchEvent = event;
|
|
21420
|
+
onPhase(searchEvent);
|
|
21421
|
+
lastResponse = searchEvent.data;
|
|
21422
|
+
}
|
|
21423
|
+
}
|
|
21424
|
+
const remaining = buffer.trim();
|
|
21425
|
+
if (remaining.length > 0) {
|
|
21426
|
+
const event = JSON.parse(remaining);
|
|
21427
|
+
if (event.phase === "error") {
|
|
21428
|
+
const errData = event.data;
|
|
21429
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21430
|
+
}
|
|
21431
|
+
const searchEvent = event;
|
|
21432
|
+
onPhase(searchEvent);
|
|
21433
|
+
lastResponse = searchEvent.data;
|
|
21434
|
+
}
|
|
21435
|
+
if (!lastResponse) {
|
|
21436
|
+
throw new Error("No search results received");
|
|
21437
|
+
}
|
|
21438
|
+
return lastResponse;
|
|
21019
21439
|
}
|
|
21020
21440
|
};
|
|
21021
21441
|
}
|
|
@@ -21042,6 +21462,7 @@ exports.isServerless = isServerless;
|
|
|
21042
21462
|
exports.loadConfig = loadConfig;
|
|
21043
21463
|
exports.mergeConfig = mergeConfig;
|
|
21044
21464
|
exports.mergeConfigServerless = mergeConfigServerless;
|
|
21465
|
+
exports.mergeSearchResults = mergeSearchResults;
|
|
21045
21466
|
exports.resolveScope = resolveScope;
|
|
21046
21467
|
exports.runMcpServer = runMcpServer;
|
|
21047
21468
|
exports.searchsocketHandle = searchsocketHandle;
|