searchsocket 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +443 -182
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +577 -164
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +577 -165
- package/dist/sveltekit.cjs +367 -77
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +367 -77
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -5025,32 +5025,32 @@ var require_URL = __commonJS({
|
|
|
5025
5025
|
else
|
|
5026
5026
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5027
5027
|
}
|
|
5028
|
-
function remove_dot_segments(
|
|
5029
|
-
if (!
|
|
5028
|
+
function remove_dot_segments(path15) {
|
|
5029
|
+
if (!path15) return path15;
|
|
5030
5030
|
var output = "";
|
|
5031
|
-
while (
|
|
5032
|
-
if (
|
|
5033
|
-
|
|
5031
|
+
while (path15.length > 0) {
|
|
5032
|
+
if (path15 === "." || path15 === "..") {
|
|
5033
|
+
path15 = "";
|
|
5034
5034
|
break;
|
|
5035
5035
|
}
|
|
5036
|
-
var twochars =
|
|
5037
|
-
var threechars =
|
|
5038
|
-
var fourchars =
|
|
5036
|
+
var twochars = path15.substring(0, 2);
|
|
5037
|
+
var threechars = path15.substring(0, 3);
|
|
5038
|
+
var fourchars = path15.substring(0, 4);
|
|
5039
5039
|
if (threechars === "../") {
|
|
5040
|
-
|
|
5040
|
+
path15 = path15.substring(3);
|
|
5041
5041
|
} else if (twochars === "./") {
|
|
5042
|
-
|
|
5042
|
+
path15 = path15.substring(2);
|
|
5043
5043
|
} else if (threechars === "/./") {
|
|
5044
|
-
|
|
5045
|
-
} else if (twochars === "/." &&
|
|
5046
|
-
|
|
5047
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5048
|
-
|
|
5044
|
+
path15 = "/" + path15.substring(3);
|
|
5045
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5046
|
+
path15 = "/";
|
|
5047
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5048
|
+
path15 = "/" + path15.substring(4);
|
|
5049
5049
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5050
5050
|
} else {
|
|
5051
|
-
var segment =
|
|
5051
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5052
5052
|
output += segment;
|
|
5053
|
-
|
|
5053
|
+
path15 = path15.substring(segment.length);
|
|
5054
5054
|
}
|
|
5055
5055
|
}
|
|
5056
5056
|
return output;
|
|
@@ -16614,6 +16614,8 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16614
16614
|
envVar: zod.z.string().min(1).optional(),
|
|
16615
16615
|
sanitize: zod.z.boolean().optional()
|
|
16616
16616
|
}).optional(),
|
|
16617
|
+
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16618
|
+
respectRobotsTxt: zod.z.boolean().optional(),
|
|
16617
16619
|
source: zod.z.object({
|
|
16618
16620
|
mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16619
16621
|
staticOutputDir: zod.z.string().min(1).optional(),
|
|
@@ -16744,6 +16746,8 @@ function createDefaultConfig(projectId) {
|
|
|
16744
16746
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16745
16747
|
sanitize: true
|
|
16746
16748
|
},
|
|
16749
|
+
exclude: [],
|
|
16750
|
+
respectRobotsTxt: true,
|
|
16747
16751
|
source: {
|
|
16748
16752
|
mode: "static-output",
|
|
16749
16753
|
staticOutputDir: "build",
|
|
@@ -16774,7 +16778,7 @@ function createDefaultConfig(projectId) {
|
|
|
16774
16778
|
},
|
|
16775
16779
|
embeddings: {
|
|
16776
16780
|
provider: "jina",
|
|
16777
|
-
model: "jina-embeddings-
|
|
16781
|
+
model: "jina-embeddings-v5-text-small",
|
|
16778
16782
|
apiKeyEnv: "JINA_API_KEY",
|
|
16779
16783
|
batchSize: 64,
|
|
16780
16784
|
concurrency: 4
|
|
@@ -16787,9 +16791,9 @@ function createDefaultConfig(projectId) {
|
|
|
16787
16791
|
}
|
|
16788
16792
|
},
|
|
16789
16793
|
rerank: {
|
|
16790
|
-
enabled:
|
|
16794
|
+
enabled: true,
|
|
16791
16795
|
topN: 20,
|
|
16792
|
-
model: "jina-reranker-
|
|
16796
|
+
model: "jina-reranker-v3"
|
|
16793
16797
|
},
|
|
16794
16798
|
ranking: {
|
|
16795
16799
|
enableIncomingLinkBoost: true,
|
|
@@ -16908,6 +16912,8 @@ ${issues}`
|
|
|
16908
16912
|
...defaults.scope,
|
|
16909
16913
|
...parsed.scope
|
|
16910
16914
|
},
|
|
16915
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16916
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16911
16917
|
source: {
|
|
16912
16918
|
...defaults.source,
|
|
16913
16919
|
...parsed.source,
|
|
@@ -19049,6 +19055,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19049
19055
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19050
19056
|
return null;
|
|
19051
19057
|
}
|
|
19058
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19059
|
+
let weight;
|
|
19060
|
+
if (weightRaw !== void 0) {
|
|
19061
|
+
const parsed = Number(weightRaw);
|
|
19062
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19063
|
+
weight = parsed;
|
|
19064
|
+
}
|
|
19065
|
+
}
|
|
19066
|
+
if (weight === 0) {
|
|
19067
|
+
return null;
|
|
19068
|
+
}
|
|
19052
19069
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19053
19070
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19054
19071
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19104,7 +19121,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19104
19121
|
noindex: false,
|
|
19105
19122
|
tags,
|
|
19106
19123
|
description,
|
|
19107
|
-
keywords
|
|
19124
|
+
keywords,
|
|
19125
|
+
weight
|
|
19108
19126
|
};
|
|
19109
19127
|
}
|
|
19110
19128
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19117,6 +19135,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19117
19135
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19118
19136
|
return null;
|
|
19119
19137
|
}
|
|
19138
|
+
let mdWeight;
|
|
19139
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19140
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19141
|
+
mdWeight = rawWeight;
|
|
19142
|
+
}
|
|
19143
|
+
if (mdWeight === 0) {
|
|
19144
|
+
return null;
|
|
19145
|
+
}
|
|
19120
19146
|
const content = parsed.content;
|
|
19121
19147
|
const normalized = normalizeMarkdown(content);
|
|
19122
19148
|
if (!normalizeText(normalized)) {
|
|
@@ -19139,7 +19165,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19139
19165
|
noindex: false,
|
|
19140
19166
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19141
19167
|
description: fmDescription,
|
|
19142
|
-
keywords: fmKeywords
|
|
19168
|
+
keywords: fmKeywords,
|
|
19169
|
+
weight: mdWeight
|
|
19143
19170
|
};
|
|
19144
19171
|
}
|
|
19145
19172
|
function yamlString(value) {
|
|
@@ -19335,6 +19362,38 @@ var Logger = class {
|
|
|
19335
19362
|
`);
|
|
19336
19363
|
}
|
|
19337
19364
|
};
|
|
19365
|
+
|
|
19366
|
+
// src/utils/pattern.ts
|
|
19367
|
+
/**
 * Test a URL path against a single pattern.
 *
 * A single trailing slash is ignored on both arguments (except for the bare
 * root "/"). Supported pattern forms:
 *   - "<prefix>/**"  matches <prefix> itself and everything below it;
 *                    "/**" matches every URL.
 *   - "<prefix>/*"   matches direct children of <prefix> only (exactly one
 *                    extra, non-empty segment).
 *   - anything else  must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} True when the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Deep wildcard: the prefix itself or any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  // Shallow wildcard: exactly one additional segment.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // Root-level children: anything but "/" with no further slash after
      // the first character.
      return target !== "/" && !target.slice(1).includes("/");
    }
    if (!target.startsWith(`${base}/`)) {
      return false;
    }
    const child = target.slice(base.length + 1);
    return child !== "" && !child.includes("/");
  }

  return target === glob;
}
|
|
19389
|
+
/**
 * Report whether a URL path matches at least one of the given patterns.
 * Patterns are tried in iteration order and evaluation stops at the first hit.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns accepted by matchUrlPattern.
 * @returns {boolean} True when any pattern matches; false for an empty list.
 */
function matchUrlPatterns(url, patterns) {
  let matched = false;
  for (const candidate of patterns) {
    matched = matchUrlPattern(url, candidate);
    if (matched) {
      break;
    }
  }
  return matched;
}
|
|
19395
|
+
|
|
19396
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19338
19397
|
function routeIdToFile(routeId) {
|
|
19339
19398
|
if (routeId === "/") {
|
|
19340
19399
|
return "src/routes/+page.svelte";
|
|
@@ -19408,15 +19467,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19408
19467
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19409
19468
|
}
|
|
19410
19469
|
/**
 * Decide whether a URL is covered by any exclusion pattern.
 * Delegates the matching itself to matchUrlPatterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Exclusion patterns.
 * @returns {boolean} True when the URL matches at least one pattern.
 */
function isExcluded(url, patterns) {
  const excluded = matchUrlPatterns(url, patterns);
  return excluded;
}
|
|
19421
19472
|
function findFreePort() {
|
|
19422
19473
|
return new Promise((resolve, reject) => {
|
|
@@ -19832,6 +19883,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19832
19883
|
}
|
|
19833
19884
|
return pages;
|
|
19834
19885
|
}
|
|
19886
|
+
/**
 * Parse robots.txt content and return the rule set that applies to `userAgent`.
 *
 * Parsing rules:
 *   - `#` comments and blank lines are ignored; lines without ":" are skipped.
 *   - Consecutive `User-agent` lines share one group. A `User-agent` line that
 *     follows an `Allow`/`Disallow` line starts a NEW group, so rules written
 *     for one agent never leak into a group declared earlier in the file.
 *   - Directives other than `User-agent`, `Allow` and `Disallow` (for example
 *     `Sitemap` or `Crawl-delay`) are ignored and do not end the current group.
 *   - An `Allow`/`Disallow` line with an empty value contributes no pattern.
 *   - Several groups naming the same agent are merged.
 *
 * Group selection: the group whose name equals `userAgent` (case-insensitive)
 * is used whenever it exists, even if it holds no patterns (an empty
 * `Disallow:` means "nothing is blocked for this agent"). Otherwise the `*`
 * group is used, and if that is missing too an empty rule set is returned.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name to select rules for.
 * @returns {{disallow: string[], allow: string[]}} Path patterns for the agent.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // Set once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line closes the run of User-agent lines.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is not part of the access rules and is ignored.
  }

  return agentGroups.get(userAgent.toLowerCase()) ?? agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
19921
|
+
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * Each pattern is matched from the start of the path. In addition to literal
 * prefixes, `*` matches any run of characters (including none) and a trailing
 * `$` anchors the pattern to the end of the path. The longest matching
 * Disallow pattern is compared with the longest matching Allow pattern: the
 * path is blocked only when the Disallow pattern is strictly longer, so a tie
 * resolves to "allowed". Empty patterns never match.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rule set.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Wildcard-aware matcher; implemented with plain string searches so that
  // patterns from an untrusted robots.txt can never become a costly regex.
  const matches = (pattern) => {
    const anchored = pattern.endsWith("$");
    if (!anchored && !pattern.includes("*")) {
      return urlPath.startsWith(pattern);
    }
    const parts = (anchored ? pattern.slice(0, -1) : pattern).split("*");
    if (!urlPath.startsWith(parts[0])) return false;
    let pos = parts[0].length;
    if (parts.length === 1) {
      return pos === urlPath.length;
    }
    const last = parts.length - 1;
    // Middle literals: take the leftmost occurrence to leave maximum room
    // for the literals that follow.
    for (let i = 1; i < last; i++) {
      const idx = urlPath.indexOf(parts[i], pos);
      if (idx === -1) return false;
      pos = idx + parts[i].length;
    }
    const tail = parts[last];
    if (!anchored) {
      return urlPath.indexOf(tail, pos) !== -1;
    }
    return urlPath.length - tail.length >= pos && urlPath.endsWith(tail);
  };

  // Length of the longest matching pattern, or 0 when none matches.
  const longestMatch = (patterns) => {
    let best = 0;
    for (const pattern of patterns) {
      if (pattern.length > best && matches(pattern)) {
        best = pattern.length;
      }
    }
    return best;
  };

  const disallowLength = longestMatch(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatch(rules3.allow) < disallowLength;
}
|
|
19937
|
+
/**
 * Read `robots.txt` from a directory and parse it with the default user agent.
 * This is best-effort: any failure while building the path, reading the file
 * or parsing it yields `null` instead of an exception.
 *
 * @param {string} dir - Directory expected to contain `robots.txt`.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or `null` when the file could not be loaded.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path__default.default.join(dir, "robots.txt");
    const text = await fs4__default.default.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    // A missing or unreadable robots.txt simply means "no rules".
    rules = null;
  }
  return rules;
}
|
|
19945
|
+
/**
 * Download `/robots.txt` relative to `baseUrl` and parse it with the default
 * user agent. This is best-effort: an invalid URL, a network failure, a non-OK
 * HTTP status or a parse error all yield `null` instead of an exception.
 *
 * @param {string} baseUrl - Base URL the `/robots.txt` path is resolved against.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or `null` when they could not be fetched.
 */
async function fetchRobotsTxt(baseUrl) {
  let rules = null;
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const response = await fetch(robotsUrl.href);
    if (response.ok) {
      rules = parseRobotsTxt(await response.text());
    }
  } catch {
    // Treat every failure as "no robots.txt available".
    rules = null;
  }
  return rules;
}
|
|
19956
|
+
|
|
19957
|
+
// src/search/ranking.ts
|
|
19958
|
+
/**
 * Clamp a value to a non-negative finite number.
 * Anything that is not a finite number (NaN, ±Infinity, undefined, strings,
 * ...) becomes 0, and negative numbers are raised to 0.
 *
 * @param {*} value - Value to sanitize.
 * @returns {number} `value` when it is a finite number >= 0, otherwise 0.
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
19964
|
+
/**
 * Compute a final score for every vector hit and sort the hits best-first.
 *
 * The hit's own score is the base (a non-finite score counts as -Infinity).
 * When enabled in `config.ranking`, two boosts are added:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 * Non-finite or negative metadata values are treated as 0 for the boosts, and
 * a non-finite total is reported as -Infinity.
 *
 * @param {Array<object>} hits - Hits carrying `score` and
 *   `metadata.incomingLinks` / `metadata.depth`.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<{hit: object, finalScore: number}>} Scored hits, descending.
 */
function rankHits(hits, config) {
  const clampToNonNegative = (value) => (Number.isFinite(value) ? Math.max(0, value) : 0);

  const scoreHit = (hit) => {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      total += Math.log(1 + clampToNonNegative(hit.metadata.incomingLinks)) * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      total += (1 / (1 + clampToNonNegative(hit.metadata.depth))) * config.ranking.weights.depth;
    }
    return {
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    };
  };

  // -Infinity minus -Infinity is NaN; treat such pairs as equal.
  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };

  return hits.map(scoreHit).sort(byFinalScoreDesc);
}
|
|
19984
|
+
/**
 * Look up the weight configured for a URL.
 *
 * Every key of `pageWeights` is treated as a URL pattern. Among the patterns
 * that match, the one with the longest pattern string wins (on equal length
 * the first one encountered is kept). When nothing matches the weight is 1.
 *
 * @param {string} url - URL path to look up.
 * @param {Object<string, number>} pageWeights - Pattern-to-weight mapping.
 * @returns {number} Weight of the longest matching pattern, or 1.
 */
function findPageWeight(url, pageWeights) {
  const winner = Object.entries(pageWeights).reduce(
    (best, [pattern, weight]) => {
      const isLonger = pattern.length > best.pattern.length;
      return isLonger && matchUrlPattern(url, pattern) ? { pattern, weight } : best;
    },
    { pattern: "", weight: 1 }
  );
  return winner.weight;
}
|
|
19995
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL, best page first.
 *
 * For each URL the chunks are sorted by `finalScore` (descending). The page
 * score is the best chunk's score plus an aggregation bonus: the scores of the
 * next chunks (up to `aggregationCap` chunks in total, non-finite scores count
 * as 0), each damped by `aggregationDecay ** rank`, times
 * `weights.aggregation`. The result is then multiplied by the page weight
 * found for the URL; pages whose weight is exactly 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Scored chunk hits.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} Page entries with `url`, `title`, `routeFile`,
 *   `pageScore`, `bestChunk` and `matchingChunks`, sorted by `pageScore`.
 */
function aggregateByPage(ranked, config) {
  // Descending comparator on a numeric property; NaN differences (for example
  // -Infinity minus -Infinity) are treated as ties.
  const descendingBy = (key) => (left, right) => {
    const diff = right[key] - left[key];
    return Number.isNaN(diff) ? 0 : diff;
  };

  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    if (!chunksByUrl.has(pageUrl)) {
      chunksByUrl.set(pageUrl, []);
    }
    chunksByUrl.get(pageUrl).push(entry);
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  chunksByUrl.forEach((chunks, url) => {
    chunks.sort(descendingBy("finalScore"));
    const [best] = chunks;
    const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // The best chunk (index 0) is the base score; only the following chunks
    // contribute to the bonus.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce((sum, chunk, index) => {
      if (index === 0) {
        return sum;
      }
      const chunkScore = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
      return sum + chunkScore * Math.pow(aggregationDecay, index);
    }, 0);

    const unweighted = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      return;
    }
    const pageScore = pageWeight === 1 ? unweighted : unweighted * pageWeight;

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  });

  return pages.sort(descendingBy("pageScore"));
}
|
|
19835
20038
|
|
|
19836
20039
|
// src/utils/time.ts
|
|
19837
20040
|
function nowIso() {
|
|
@@ -19843,9 +20046,10 @@ function hrTimeMs(start) {
|
|
|
19843
20046
|
|
|
19844
20047
|
// src/indexing/pipeline.ts
|
|
19845
20048
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19846
|
-
"jina-embeddings-v3": 2e-5
|
|
20049
|
+
"jina-embeddings-v3": 2e-5,
|
|
20050
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
19847
20051
|
};
|
|
19848
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20052
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
19849
20053
|
var IndexPipeline = class _IndexPipeline {
|
|
19850
20054
|
cwd;
|
|
19851
20055
|
config;
|
|
@@ -19923,6 +20127,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19923
20127
|
}
|
|
19924
20128
|
stageEnd("source", sourceStart);
|
|
19925
20129
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20130
|
+
const filterStart = stageStart();
|
|
20131
|
+
let filteredSourcePages = sourcePages;
|
|
20132
|
+
if (this.config.exclude.length > 0) {
|
|
20133
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20134
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20135
|
+
const url = normalizeUrlPath(p.url);
|
|
20136
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20137
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20138
|
+
return false;
|
|
20139
|
+
}
|
|
20140
|
+
return true;
|
|
20141
|
+
});
|
|
20142
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20143
|
+
if (excludedCount > 0) {
|
|
20144
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20145
|
+
}
|
|
20146
|
+
}
|
|
20147
|
+
if (this.config.respectRobotsTxt) {
|
|
20148
|
+
let robotsRules = null;
|
|
20149
|
+
if (sourceMode === "static-output") {
|
|
20150
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20151
|
+
path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20152
|
+
);
|
|
20153
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20154
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20155
|
+
path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20156
|
+
);
|
|
20157
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20158
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20159
|
+
}
|
|
20160
|
+
if (robotsRules) {
|
|
20161
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20162
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20163
|
+
const url = normalizeUrlPath(p.url);
|
|
20164
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20165
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20166
|
+
return false;
|
|
20167
|
+
}
|
|
20168
|
+
return true;
|
|
20169
|
+
});
|
|
20170
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20171
|
+
if (robotsExcluded > 0) {
|
|
20172
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20173
|
+
}
|
|
20174
|
+
}
|
|
20175
|
+
}
|
|
20176
|
+
stageEnd("filter", filterStart);
|
|
19926
20177
|
const routeStart = stageStart();
|
|
19927
20178
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19928
20179
|
stageEnd("route_map", routeStart);
|
|
@@ -19930,7 +20181,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19930
20181
|
const extractStart = stageStart();
|
|
19931
20182
|
this.logger.info("Extracting content...");
|
|
19932
20183
|
const extractedPages = [];
|
|
19933
|
-
for (const sourcePage of
|
|
20184
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19934
20185
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19935
20186
|
if (!extracted) {
|
|
19936
20187
|
this.logger.warn(
|
|
@@ -19956,16 +20207,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19956
20207
|
seenUrls.add(page.url);
|
|
19957
20208
|
uniquePages.push(page);
|
|
19958
20209
|
}
|
|
20210
|
+
const indexablePages = [];
|
|
20211
|
+
for (const page of uniquePages) {
|
|
20212
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20213
|
+
if (effectiveWeight === 0) {
|
|
20214
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20215
|
+
continue;
|
|
20216
|
+
}
|
|
20217
|
+
indexablePages.push(page);
|
|
20218
|
+
}
|
|
20219
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20220
|
+
if (zeroWeightCount > 0) {
|
|
20221
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20222
|
+
}
|
|
19959
20223
|
stageEnd("extract", extractStart);
|
|
19960
|
-
const skippedPages =
|
|
19961
|
-
this.logger.info(`Extracted ${
|
|
20224
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20225
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19962
20226
|
const linkStart = stageStart();
|
|
19963
|
-
const pageSet = new Set(
|
|
20227
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19964
20228
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19965
|
-
for (const page of
|
|
20229
|
+
for (const page of indexablePages) {
|
|
19966
20230
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19967
20231
|
}
|
|
19968
|
-
for (const page of
|
|
20232
|
+
for (const page of indexablePages) {
|
|
19969
20233
|
for (const outgoing of page.outgoingLinks) {
|
|
19970
20234
|
if (!pageSet.has(outgoing)) {
|
|
19971
20235
|
continue;
|
|
@@ -19989,7 +20253,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19989
20253
|
});
|
|
19990
20254
|
}
|
|
19991
20255
|
}
|
|
19992
|
-
for (const page of
|
|
20256
|
+
for (const page of indexablePages) {
|
|
19993
20257
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19994
20258
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19995
20259
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20206,100 +20470,6 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20206
20470
|
};
|
|
20207
20471
|
}
|
|
20208
20472
|
};
|
|
20209
|
-
|
|
20210
|
-
// src/search/ranking.ts
|
|
20211
|
-
function nonNegativeOrZero(value) {
|
|
20212
|
-
if (!Number.isFinite(value)) {
|
|
20213
|
-
return 0;
|
|
20214
|
-
}
|
|
20215
|
-
return Math.max(0, value);
|
|
20216
|
-
}
|
|
20217
|
-
function rankHits(hits, config) {
|
|
20218
|
-
return hits.map((hit) => {
|
|
20219
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20220
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20221
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20222
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20223
|
-
}
|
|
20224
|
-
if (config.ranking.enableDepthBoost) {
|
|
20225
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20226
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20227
|
-
}
|
|
20228
|
-
return {
|
|
20229
|
-
hit,
|
|
20230
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20231
|
-
};
|
|
20232
|
-
}).sort((a, b) => {
|
|
20233
|
-
const delta = b.finalScore - a.finalScore;
|
|
20234
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20235
|
-
});
|
|
20236
|
-
}
|
|
20237
|
-
function findPageWeight(url, pageWeights) {
|
|
20238
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20239
|
-
const normalizedUrl = norm(url);
|
|
20240
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20241
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20242
|
-
return weight;
|
|
20243
|
-
}
|
|
20244
|
-
}
|
|
20245
|
-
let bestPrefix = "";
|
|
20246
|
-
let bestWeight = 1;
|
|
20247
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20248
|
-
const normalizedPattern = norm(pattern);
|
|
20249
|
-
if (normalizedPattern === "/") continue;
|
|
20250
|
-
const prefix = `${normalizedPattern}/`;
|
|
20251
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20252
|
-
bestPrefix = prefix;
|
|
20253
|
-
bestWeight = weight;
|
|
20254
|
-
}
|
|
20255
|
-
}
|
|
20256
|
-
return bestWeight;
|
|
20257
|
-
}
|
|
20258
|
-
function aggregateByPage(ranked, config) {
|
|
20259
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20260
|
-
for (const hit of ranked) {
|
|
20261
|
-
const url = hit.hit.metadata.url;
|
|
20262
|
-
const group = groups.get(url);
|
|
20263
|
-
if (group) group.push(hit);
|
|
20264
|
-
else groups.set(url, [hit]);
|
|
20265
|
-
}
|
|
20266
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20267
|
-
const pages = [];
|
|
20268
|
-
for (const [url, chunks] of groups) {
|
|
20269
|
-
chunks.sort((a, b) => {
|
|
20270
|
-
const delta = b.finalScore - a.finalScore;
|
|
20271
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20272
|
-
});
|
|
20273
|
-
const best = chunks[0];
|
|
20274
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20275
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20276
|
-
let aggregationBonus = 0;
|
|
20277
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20278
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20279
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20280
|
-
}
|
|
20281
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20282
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20283
|
-
if (pageWeight === 0) continue;
|
|
20284
|
-
if (pageWeight !== 1) {
|
|
20285
|
-
pageScore *= pageWeight;
|
|
20286
|
-
}
|
|
20287
|
-
pages.push({
|
|
20288
|
-
url,
|
|
20289
|
-
title: best.hit.metadata.title,
|
|
20290
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20291
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20292
|
-
bestChunk: best,
|
|
20293
|
-
matchingChunks: chunks
|
|
20294
|
-
});
|
|
20295
|
-
}
|
|
20296
|
-
return pages.sort((a, b) => {
|
|
20297
|
-
const delta = b.pageScore - a.pageScore;
|
|
20298
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20299
|
-
});
|
|
20300
|
-
}
|
|
20301
|
-
|
|
20302
|
-
// src/search/engine.ts
|
|
20303
20473
|
var requestSchema = zod.z.object({
|
|
20304
20474
|
q: zod.z.string().trim().min(1),
|
|
20305
20475
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
@@ -20307,7 +20477,8 @@ var requestSchema = zod.z.object({
|
|
|
20307
20477
|
pathPrefix: zod.z.string().optional(),
|
|
20308
20478
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20309
20479
|
rerank: zod.z.boolean().optional(),
|
|
20310
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
20480
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
20481
|
+
stream: zod.z.boolean().optional()
|
|
20311
20482
|
});
|
|
20312
20483
|
var SearchEngine = class _SearchEngine {
|
|
20313
20484
|
cwd;
|
|
@@ -20380,7 +20551,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
20380
20551
|
rerankMs = hrTimeMs(rerankStart);
|
|
20381
20552
|
usedRerank = true;
|
|
20382
20553
|
}
|
|
20383
|
-
|
|
20554
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
20555
|
+
return {
|
|
20556
|
+
q: input.q,
|
|
20557
|
+
scope: resolvedScope.scopeName,
|
|
20558
|
+
results,
|
|
20559
|
+
meta: {
|
|
20560
|
+
timingsMs: {
|
|
20561
|
+
embed: Math.round(embedMs),
|
|
20562
|
+
vector: Math.round(vectorMs),
|
|
20563
|
+
rerank: Math.round(rerankMs),
|
|
20564
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20565
|
+
},
|
|
20566
|
+
usedRerank,
|
|
20567
|
+
modelId: this.config.embeddings.model
|
|
20568
|
+
}
|
|
20569
|
+
};
|
|
20570
|
+
}
|
|
20571
|
+
async *searchStreaming(request) {
|
|
20572
|
+
const parsed = requestSchema.safeParse(request);
|
|
20573
|
+
if (!parsed.success) {
|
|
20574
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
20575
|
+
}
|
|
20576
|
+
const input = parsed.data;
|
|
20577
|
+
const wantsRerank = Boolean(input.rerank);
|
|
20578
|
+
if (!wantsRerank) {
|
|
20579
|
+
const response = await this.search(request);
|
|
20580
|
+
yield { phase: "initial", data: response };
|
|
20581
|
+
return;
|
|
20582
|
+
}
|
|
20583
|
+
const totalStart = process.hrtime.bigint();
|
|
20584
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20585
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
20586
|
+
const topK = input.topK ?? 10;
|
|
20587
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20588
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20589
|
+
const embedStart = process.hrtime.bigint();
|
|
20590
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20591
|
+
const queryVector = queryEmbeddings[0];
|
|
20592
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20593
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20594
|
+
}
|
|
20595
|
+
const embedMs = hrTimeMs(embedStart);
|
|
20596
|
+
const vectorStart = process.hrtime.bigint();
|
|
20597
|
+
const hits = await this.vectorStore.query(
|
|
20598
|
+
queryVector,
|
|
20599
|
+
{
|
|
20600
|
+
topK: candidateK,
|
|
20601
|
+
pathPrefix: input.pathPrefix,
|
|
20602
|
+
tags: input.tags
|
|
20603
|
+
},
|
|
20604
|
+
resolvedScope
|
|
20605
|
+
);
|
|
20606
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
20607
|
+
const ranked = rankHits(hits, this.config);
|
|
20608
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
20609
|
+
yield {
|
|
20610
|
+
phase: "initial",
|
|
20611
|
+
data: {
|
|
20612
|
+
q: input.q,
|
|
20613
|
+
scope: resolvedScope.scopeName,
|
|
20614
|
+
results: initialResults,
|
|
20615
|
+
meta: {
|
|
20616
|
+
timingsMs: {
|
|
20617
|
+
embed: Math.round(embedMs),
|
|
20618
|
+
vector: Math.round(vectorMs),
|
|
20619
|
+
rerank: 0,
|
|
20620
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20621
|
+
},
|
|
20622
|
+
usedRerank: false,
|
|
20623
|
+
modelId: this.config.embeddings.model
|
|
20624
|
+
}
|
|
20625
|
+
}
|
|
20626
|
+
};
|
|
20627
|
+
const rerankStart = process.hrtime.bigint();
|
|
20628
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
20629
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
20630
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
20631
|
+
yield {
|
|
20632
|
+
phase: "reranked",
|
|
20633
|
+
data: {
|
|
20634
|
+
q: input.q,
|
|
20635
|
+
scope: resolvedScope.scopeName,
|
|
20636
|
+
results: rerankedResults,
|
|
20637
|
+
meta: {
|
|
20638
|
+
timingsMs: {
|
|
20639
|
+
embed: Math.round(embedMs),
|
|
20640
|
+
vector: Math.round(vectorMs),
|
|
20641
|
+
rerank: Math.round(rerankMs),
|
|
20642
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20643
|
+
},
|
|
20644
|
+
usedRerank: true,
|
|
20645
|
+
modelId: this.config.embeddings.model
|
|
20646
|
+
}
|
|
20647
|
+
}
|
|
20648
|
+
};
|
|
20649
|
+
}
|
|
20650
|
+
buildResults(ordered, topK, groupByPage) {
|
|
20384
20651
|
const minScore = this.config.ranking.minScore;
|
|
20385
20652
|
if (groupByPage) {
|
|
20386
20653
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -20388,10 +20655,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
20388
20655
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20389
20656
|
}
|
|
20390
20657
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20391
|
-
|
|
20658
|
+
return pages.slice(0, topK).map((page) => {
|
|
20392
20659
|
const bestScore = page.bestChunk.finalScore;
|
|
20393
|
-
const
|
|
20394
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20660
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20661
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20395
20662
|
return {
|
|
20396
20663
|
url: page.url,
|
|
20397
20664
|
title: page.title,
|
|
@@ -20408,10 +20675,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
20408
20675
|
};
|
|
20409
20676
|
});
|
|
20410
20677
|
} else {
|
|
20678
|
+
let filtered = ordered;
|
|
20411
20679
|
if (minScore > 0) {
|
|
20412
|
-
|
|
20680
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20413
20681
|
}
|
|
20414
|
-
|
|
20682
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20415
20683
|
url: hit.metadata.url,
|
|
20416
20684
|
title: hit.metadata.title,
|
|
20417
20685
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -20420,21 +20688,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20420
20688
|
routeFile: hit.metadata.routeFile
|
|
20421
20689
|
}));
|
|
20422
20690
|
}
|
|
20423
|
-
return {
|
|
20424
|
-
q: input.q,
|
|
20425
|
-
scope: resolvedScope.scopeName,
|
|
20426
|
-
results,
|
|
20427
|
-
meta: {
|
|
20428
|
-
timingsMs: {
|
|
20429
|
-
embed: Math.round(embedMs),
|
|
20430
|
-
vector: Math.round(vectorMs),
|
|
20431
|
-
rerank: Math.round(rerankMs),
|
|
20432
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20433
|
-
},
|
|
20434
|
-
usedRerank,
|
|
20435
|
-
modelId: this.config.embeddings.model
|
|
20436
|
-
}
|
|
20437
|
-
};
|
|
20438
20691
|
}
|
|
20439
20692
|
async getPage(pathOrUrl, scope) {
|
|
20440
20693
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -20557,7 +20810,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20557
20810
|
});
|
|
20558
20811
|
}
|
|
20559
20812
|
};
|
|
20560
|
-
function createServer(engine) {
|
|
20813
|
+
function createServer(engine, config) {
|
|
20561
20814
|
const server = new mcp_js.McpServer({
|
|
20562
20815
|
name: "searchsocket-mcp",
|
|
20563
20816
|
version: "0.1.0"
|
|
@@ -20565,14 +20818,15 @@ function createServer(engine) {
|
|
|
20565
20818
|
server.registerTool(
|
|
20566
20819
|
"search",
|
|
20567
20820
|
{
|
|
20568
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20821
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
20569
20822
|
inputSchema: {
|
|
20570
20823
|
query: zod.z.string().min(1),
|
|
20571
20824
|
scope: zod.z.string().optional(),
|
|
20572
20825
|
topK: zod.z.number().int().positive().max(100).optional(),
|
|
20573
20826
|
pathPrefix: zod.z.string().optional(),
|
|
20574
20827
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
20575
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
20828
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
20829
|
+
rerank: zod.z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
20576
20830
|
}
|
|
20577
20831
|
},
|
|
20578
20832
|
async (input) => {
|
|
@@ -20582,7 +20836,8 @@ function createServer(engine) {
|
|
|
20582
20836
|
scope: input.scope,
|
|
20583
20837
|
pathPrefix: input.pathPrefix,
|
|
20584
20838
|
tags: input.tags,
|
|
20585
|
-
groupBy: input.groupBy
|
|
20839
|
+
groupBy: input.groupBy,
|
|
20840
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
20586
20841
|
});
|
|
20587
20842
|
return {
|
|
20588
20843
|
content: [
|
|
@@ -20708,10 +20963,10 @@ async function runMcpServer(options = {}) {
|
|
|
20708
20963
|
config
|
|
20709
20964
|
});
|
|
20710
20965
|
if (resolvedTransport === "http") {
|
|
20711
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
20966
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
20712
20967
|
return;
|
|
20713
20968
|
}
|
|
20714
|
-
const server = createServer(engine);
|
|
20969
|
+
const server = createServer(engine, config);
|
|
20715
20970
|
const stdioTransport = new stdio_js.StdioServerTransport();
|
|
20716
20971
|
await server.connect(stdioTransport);
|
|
20717
20972
|
}
|
|
@@ -20867,7 +21122,44 @@ function searchsocketHandle(options = {}) {
|
|
|
20867
21122
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20868
21123
|
}
|
|
20869
21124
|
const engine = await getEngine();
|
|
20870
|
-
const
|
|
21125
|
+
const searchRequest = body;
|
|
21126
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
21127
|
+
const encoder = new TextEncoder();
|
|
21128
|
+
const stream = new ReadableStream({
|
|
21129
|
+
async start(controller) {
|
|
21130
|
+
try {
|
|
21131
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
21132
|
+
const line = JSON.stringify(event2) + "\n";
|
|
21133
|
+
controller.enqueue(encoder.encode(line));
|
|
21134
|
+
}
|
|
21135
|
+
} catch (streamError) {
|
|
21136
|
+
const errorEvent = {
|
|
21137
|
+
phase: "error",
|
|
21138
|
+
data: {
|
|
21139
|
+
error: {
|
|
21140
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
21141
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
21142
|
+
}
|
|
21143
|
+
}
|
|
21144
|
+
};
|
|
21145
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
21146
|
+
} finally {
|
|
21147
|
+
controller.close();
|
|
21148
|
+
}
|
|
21149
|
+
}
|
|
21150
|
+
});
|
|
21151
|
+
return withCors(
|
|
21152
|
+
new Response(stream, {
|
|
21153
|
+
status: 200,
|
|
21154
|
+
headers: {
|
|
21155
|
+
"content-type": "application/x-ndjson"
|
|
21156
|
+
}
|
|
21157
|
+
}),
|
|
21158
|
+
event.request,
|
|
21159
|
+
config
|
|
21160
|
+
);
|
|
21161
|
+
}
|
|
21162
|
+
const result = await engine.search(searchRequest);
|
|
20871
21163
|
return withCors(
|
|
20872
21164
|
new Response(JSON.stringify(result), {
|
|
20873
21165
|
status: 200,
|
|
@@ -20980,7 +21272,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20980
21272
|
});
|
|
20981
21273
|
const stats = await pipeline.run({
|
|
20982
21274
|
changedOnly: options.changedOnly ?? true,
|
|
20983
|
-
force: options.force ?? false,
|
|
21275
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20984
21276
|
dryRun: options.dryRun ?? false,
|
|
20985
21277
|
scopeOverride: options.scope,
|
|
20986
21278
|
verbose: options.verbose
|
|
@@ -20997,6 +21289,60 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20997
21289
|
};
|
|
20998
21290
|
}
|
|
20999
21291
|
|
|
21292
|
+
// src/merge.ts
|
|
21293
|
+
/**
 * Reconciles an initial search response with its reranked counterpart.
 *
 * Results are matched by `url`. For every initial result the absolute change
 * in position between the two responses is measured (a URL absent from the
 * reranked response counts as a displacement of 0). If any result moved by
 * more than `maxDisplacement` places, or the initial response has no results,
 * the reranked response is returned untouched. Otherwise the initial ordering
 * is kept and only the scores are replaced by the reranked ones.
 *
 * Because matching is by URL alone, a URL that appears more than once in a
 * response is represented by its last occurrence (later `Map` entries
 * overwrite earlier ones).
 *
 * @param {{results: Array<{url: string, score: number}>}} initial - Response from the first phase.
 * @param {{results: Array<{url: string, score: number}>}} reranked - Response from the rerank phase.
 * @param {{maxDisplacement?: number}} [options] - `maxDisplacement` defaults to 3.
 * @returns {{response: object, usedRerankedOrder: boolean, displacements: Array<{url: string, displacement: number}>}}
 */
function mergeSearchResults(initial, reranked, options) {
  const maxDisplacement = options?.maxDisplacement ?? 3;

  // URL -> index lookup; a repeated URL keeps the index of its last occurrence.
  const indexByUrl = (entries) => new Map(entries.map((entry, index) => [entry.url, index]));
  const initialPos = indexByUrl(initial.results);
  const rerankedPos = indexByUrl(reranked.results);

  const displacements = initial.results.map((entry) => {
    const url = entry.url;
    const before = initialPos.get(url);
    const after = rerankedPos.get(url);
    return {
      url,
      displacement: after === void 0 ? 0 : Math.abs(before - after)
    };
  });

  // Nothing to stabilise, or the reranker moved something far enough that the
  // new ordering should simply win.
  const movedTooFar = (item) => item.displacement > maxDisplacement;
  if (displacements.length === 0 || displacements.some(movedTooFar)) {
    return {
      response: reranked,
      usedRerankedOrder: true,
      displacements
    };
  }

  // Keep the initial ordering, but surface the reranked scores where available.
  const rerankedScoreByUrl = new Map(reranked.results.map((entry) => [entry.url, entry.score]));
  const stableResults = initial.results.map((entry) => ({
    ...entry,
    score: rerankedScoreByUrl.get(entry.url) ?? entry.score
  }));

  return {
    response: {
      ...reranked,
      results: stableResults
    },
    usedRerankedOrder: false,
    displacements
  };
}
|
|
21345
|
+
|
|
21000
21346
|
// src/client.ts
|
|
21001
21347
|
function createSearchClient(options = {}) {
|
|
21002
21348
|
const endpoint = options.endpoint ?? "/api/search";
|
|
@@ -21024,6 +21370,72 @@ function createSearchClient(options = {}) {
|
|
|
21024
21370
|
throw new Error(message);
|
|
21025
21371
|
}
|
|
21026
21372
|
return payload;
|
|
21373
|
+
},
|
|
21374
|
+
async streamSearch(request, onPhase) {
|
|
21375
|
+
const response = await fetchImpl(endpoint, {
|
|
21376
|
+
method: "POST",
|
|
21377
|
+
headers: {
|
|
21378
|
+
"content-type": "application/json"
|
|
21379
|
+
},
|
|
21380
|
+
body: JSON.stringify(request)
|
|
21381
|
+
});
|
|
21382
|
+
if (!response.ok) {
|
|
21383
|
+
let payload;
|
|
21384
|
+
try {
|
|
21385
|
+
payload = await response.json();
|
|
21386
|
+
} catch {
|
|
21387
|
+
throw new Error("Search failed");
|
|
21388
|
+
}
|
|
21389
|
+
const message = payload.error?.message ?? "Search failed";
|
|
21390
|
+
throw new Error(message);
|
|
21391
|
+
}
|
|
21392
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
21393
|
+
if (contentType.includes("application/json")) {
|
|
21394
|
+
const data = await response.json();
|
|
21395
|
+
onPhase({ phase: "initial", data });
|
|
21396
|
+
return data;
|
|
21397
|
+
}
|
|
21398
|
+
if (!response.body) {
|
|
21399
|
+
throw new Error("Response body is not readable");
|
|
21400
|
+
}
|
|
21401
|
+
const reader = response.body.getReader();
|
|
21402
|
+
const decoder = new TextDecoder();
|
|
21403
|
+
let buffer = "";
|
|
21404
|
+
let lastResponse = null;
|
|
21405
|
+
for (; ; ) {
|
|
21406
|
+
const { done, value } = await reader.read();
|
|
21407
|
+
if (done) break;
|
|
21408
|
+
buffer += decoder.decode(value, { stream: true });
|
|
21409
|
+
let newlineIdx;
|
|
21410
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
21411
|
+
const line = buffer.slice(0, newlineIdx).trim();
|
|
21412
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
21413
|
+
if (line.length === 0) continue;
|
|
21414
|
+
const event = JSON.parse(line);
|
|
21415
|
+
if (event.phase === "error") {
|
|
21416
|
+
const errData = event.data;
|
|
21417
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21418
|
+
}
|
|
21419
|
+
const searchEvent = event;
|
|
21420
|
+
onPhase(searchEvent);
|
|
21421
|
+
lastResponse = searchEvent.data;
|
|
21422
|
+
}
|
|
21423
|
+
}
|
|
21424
|
+
const remaining = buffer.trim();
|
|
21425
|
+
if (remaining.length > 0) {
|
|
21426
|
+
const event = JSON.parse(remaining);
|
|
21427
|
+
if (event.phase === "error") {
|
|
21428
|
+
const errData = event.data;
|
|
21429
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21430
|
+
}
|
|
21431
|
+
const searchEvent = event;
|
|
21432
|
+
onPhase(searchEvent);
|
|
21433
|
+
lastResponse = searchEvent.data;
|
|
21434
|
+
}
|
|
21435
|
+
if (!lastResponse) {
|
|
21436
|
+
throw new Error("No search results received");
|
|
21437
|
+
}
|
|
21438
|
+
return lastResponse;
|
|
21027
21439
|
}
|
|
21028
21440
|
};
|
|
21029
21441
|
}
|
|
@@ -21050,6 +21462,7 @@ exports.isServerless = isServerless;
|
|
|
21050
21462
|
exports.loadConfig = loadConfig;
|
|
21051
21463
|
exports.mergeConfig = mergeConfig;
|
|
21052
21464
|
exports.mergeConfigServerless = mergeConfigServerless;
|
|
21465
|
+
exports.mergeSearchResults = mergeSearchResults;
|
|
21053
21466
|
exports.resolveScope = resolveScope;
|
|
21054
21467
|
exports.runMcpServer = runMcpServer;
|
|
21055
21468
|
exports.searchsocketHandle = searchsocketHandle;
|