searchsocket 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +443 -182
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +577 -164
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +577 -165
- package/dist/sveltekit.cjs +367 -77
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +367 -77
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -5013,32 +5013,32 @@ var require_URL = __commonJS({
|
|
|
5013
5013
|
else
|
|
5014
5014
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5015
5015
|
}
|
|
5016
|
-
function remove_dot_segments(
|
|
5017
|
-
if (!
|
|
5016
|
+
function remove_dot_segments(path15) {
|
|
5017
|
+
if (!path15) return path15;
|
|
5018
5018
|
var output = "";
|
|
5019
|
-
while (
|
|
5020
|
-
if (
|
|
5021
|
-
|
|
5019
|
+
while (path15.length > 0) {
|
|
5020
|
+
if (path15 === "." || path15 === "..") {
|
|
5021
|
+
path15 = "";
|
|
5022
5022
|
break;
|
|
5023
5023
|
}
|
|
5024
|
-
var twochars =
|
|
5025
|
-
var threechars =
|
|
5026
|
-
var fourchars =
|
|
5024
|
+
var twochars = path15.substring(0, 2);
|
|
5025
|
+
var threechars = path15.substring(0, 3);
|
|
5026
|
+
var fourchars = path15.substring(0, 4);
|
|
5027
5027
|
if (threechars === "../") {
|
|
5028
|
-
|
|
5028
|
+
path15 = path15.substring(3);
|
|
5029
5029
|
} else if (twochars === "./") {
|
|
5030
|
-
|
|
5030
|
+
path15 = path15.substring(2);
|
|
5031
5031
|
} else if (threechars === "/./") {
|
|
5032
|
-
|
|
5033
|
-
} else if (twochars === "/." &&
|
|
5034
|
-
|
|
5035
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5036
|
-
|
|
5032
|
+
path15 = "/" + path15.substring(3);
|
|
5033
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5034
|
+
path15 = "/";
|
|
5035
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5036
|
+
path15 = "/" + path15.substring(4);
|
|
5037
5037
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5038
5038
|
} else {
|
|
5039
|
-
var segment =
|
|
5039
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5040
5040
|
output += segment;
|
|
5041
|
-
|
|
5041
|
+
path15 = path15.substring(segment.length);
|
|
5042
5042
|
}
|
|
5043
5043
|
}
|
|
5044
5044
|
return output;
|
|
@@ -16602,6 +16602,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
16602
16602
|
envVar: z.string().min(1).optional(),
|
|
16603
16603
|
sanitize: z.boolean().optional()
|
|
16604
16604
|
}).optional(),
|
|
16605
|
+
exclude: z.array(z.string()).optional(),
|
|
16606
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
16605
16607
|
source: z.object({
|
|
16606
16608
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16607
16609
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -16732,6 +16734,8 @@ function createDefaultConfig(projectId) {
|
|
|
16732
16734
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16733
16735
|
sanitize: true
|
|
16734
16736
|
},
|
|
16737
|
+
exclude: [],
|
|
16738
|
+
respectRobotsTxt: true,
|
|
16735
16739
|
source: {
|
|
16736
16740
|
mode: "static-output",
|
|
16737
16741
|
staticOutputDir: "build",
|
|
@@ -16762,7 +16766,7 @@ function createDefaultConfig(projectId) {
|
|
|
16762
16766
|
},
|
|
16763
16767
|
embeddings: {
|
|
16764
16768
|
provider: "jina",
|
|
16765
|
-
model: "jina-embeddings-
|
|
16769
|
+
model: "jina-embeddings-v5-text-small",
|
|
16766
16770
|
apiKeyEnv: "JINA_API_KEY",
|
|
16767
16771
|
batchSize: 64,
|
|
16768
16772
|
concurrency: 4
|
|
@@ -16775,9 +16779,9 @@ function createDefaultConfig(projectId) {
|
|
|
16775
16779
|
}
|
|
16776
16780
|
},
|
|
16777
16781
|
rerank: {
|
|
16778
|
-
enabled:
|
|
16782
|
+
enabled: true,
|
|
16779
16783
|
topN: 20,
|
|
16780
|
-
model: "jina-reranker-
|
|
16784
|
+
model: "jina-reranker-v3"
|
|
16781
16785
|
},
|
|
16782
16786
|
ranking: {
|
|
16783
16787
|
enableIncomingLinkBoost: true,
|
|
@@ -16896,6 +16900,8 @@ ${issues}`
|
|
|
16896
16900
|
...defaults.scope,
|
|
16897
16901
|
...parsed.scope
|
|
16898
16902
|
},
|
|
16903
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16904
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16899
16905
|
source: {
|
|
16900
16906
|
...defaults.source,
|
|
16901
16907
|
...parsed.source,
|
|
@@ -19037,6 +19043,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19037
19043
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19038
19044
|
return null;
|
|
19039
19045
|
}
|
|
19046
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19047
|
+
let weight;
|
|
19048
|
+
if (weightRaw !== void 0) {
|
|
19049
|
+
const parsed = Number(weightRaw);
|
|
19050
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19051
|
+
weight = parsed;
|
|
19052
|
+
}
|
|
19053
|
+
}
|
|
19054
|
+
if (weight === 0) {
|
|
19055
|
+
return null;
|
|
19056
|
+
}
|
|
19040
19057
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19041
19058
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19042
19059
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19092,7 +19109,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19092
19109
|
noindex: false,
|
|
19093
19110
|
tags,
|
|
19094
19111
|
description,
|
|
19095
|
-
keywords
|
|
19112
|
+
keywords,
|
|
19113
|
+
weight
|
|
19096
19114
|
};
|
|
19097
19115
|
}
|
|
19098
19116
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19105,6 +19123,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19105
19123
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19106
19124
|
return null;
|
|
19107
19125
|
}
|
|
19126
|
+
let mdWeight;
|
|
19127
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19128
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19129
|
+
mdWeight = rawWeight;
|
|
19130
|
+
}
|
|
19131
|
+
if (mdWeight === 0) {
|
|
19132
|
+
return null;
|
|
19133
|
+
}
|
|
19108
19134
|
const content = parsed.content;
|
|
19109
19135
|
const normalized = normalizeMarkdown(content);
|
|
19110
19136
|
if (!normalizeText(normalized)) {
|
|
@@ -19127,7 +19153,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19127
19153
|
noindex: false,
|
|
19128
19154
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19129
19155
|
description: fmDescription,
|
|
19130
|
-
keywords: fmKeywords
|
|
19156
|
+
keywords: fmKeywords,
|
|
19157
|
+
weight: mdWeight
|
|
19131
19158
|
};
|
|
19132
19159
|
}
|
|
19133
19160
|
function yamlString(value) {
|
|
@@ -19323,6 +19350,38 @@ var Logger = class {
|
|
|
19323
19350
|
`);
|
|
19324
19351
|
}
|
|
19325
19352
|
};
|
|
19353
|
+
|
|
19354
|
+
// src/utils/pattern.ts
|
|
19355
|
+
function matchUrlPattern(url, pattern) {
|
|
19356
|
+
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
19357
|
+
const normalizedUrl = norm(url);
|
|
19358
|
+
const normalizedPattern = norm(pattern);
|
|
19359
|
+
if (normalizedPattern.endsWith("/**")) {
|
|
19360
|
+
const prefix = normalizedPattern.slice(0, -3);
|
|
19361
|
+
if (prefix === "") {
|
|
19362
|
+
return true;
|
|
19363
|
+
}
|
|
19364
|
+
return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/");
|
|
19365
|
+
}
|
|
19366
|
+
if (normalizedPattern.endsWith("/*")) {
|
|
19367
|
+
const prefix = normalizedPattern.slice(0, -2);
|
|
19368
|
+
if (prefix === "") {
|
|
19369
|
+
return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
|
|
19370
|
+
}
|
|
19371
|
+
if (!normalizedUrl.startsWith(prefix + "/")) return false;
|
|
19372
|
+
const rest = normalizedUrl.slice(prefix.length + 1);
|
|
19373
|
+
return rest.length > 0 && !rest.includes("/");
|
|
19374
|
+
}
|
|
19375
|
+
return normalizedUrl === normalizedPattern;
|
|
19376
|
+
}
|
|
19377
|
+
function matchUrlPatterns(url, patterns) {
|
|
19378
|
+
for (const pattern of patterns) {
|
|
19379
|
+
if (matchUrlPattern(url, pattern)) return true;
|
|
19380
|
+
}
|
|
19381
|
+
return false;
|
|
19382
|
+
}
|
|
19383
|
+
|
|
19384
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19326
19385
|
function routeIdToFile(routeId) {
|
|
19327
19386
|
if (routeId === "/") {
|
|
19328
19387
|
return "src/routes/+page.svelte";
|
|
@@ -19396,15 +19455,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19396
19455
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19397
19456
|
}
|
|
19398
19457
|
function isExcluded(url, patterns) {
|
|
19399
|
-
|
|
19400
|
-
if (pattern.endsWith("/*")) {
|
|
19401
|
-
const prefix = pattern.slice(0, -1);
|
|
19402
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19403
|
-
} else if (url === pattern) {
|
|
19404
|
-
return true;
|
|
19405
|
-
}
|
|
19406
|
-
}
|
|
19407
|
-
return false;
|
|
19458
|
+
return matchUrlPatterns(url, patterns);
|
|
19408
19459
|
}
|
|
19409
19460
|
function findFreePort() {
|
|
19410
19461
|
return new Promise((resolve, reject) => {
|
|
@@ -19820,6 +19871,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19820
19871
|
}
|
|
19821
19872
|
return pages;
|
|
19822
19873
|
}
|
|
19874
|
+
function parseRobotsTxt(content, userAgent = "Searchsocket") {
|
|
19875
|
+
const lines = content.split(/\r?\n/);
|
|
19876
|
+
const agentGroups = /* @__PURE__ */ new Map();
|
|
19877
|
+
let currentAgents = [];
|
|
19878
|
+
for (const rawLine of lines) {
|
|
19879
|
+
const line = rawLine.replace(/#.*$/, "").trim();
|
|
19880
|
+
if (!line) continue;
|
|
19881
|
+
const colonIdx = line.indexOf(":");
|
|
19882
|
+
if (colonIdx === -1) continue;
|
|
19883
|
+
const directive = line.slice(0, colonIdx).trim().toLowerCase();
|
|
19884
|
+
const value = line.slice(colonIdx + 1).trim();
|
|
19885
|
+
if (directive === "user-agent") {
|
|
19886
|
+
const agentName = value.toLowerCase();
|
|
19887
|
+
currentAgents.push(agentName);
|
|
19888
|
+
if (!agentGroups.has(agentName)) {
|
|
19889
|
+
agentGroups.set(agentName, { disallow: [], allow: [] });
|
|
19890
|
+
}
|
|
19891
|
+
} else if (directive === "disallow" && value && currentAgents.length > 0) {
|
|
19892
|
+
for (const agent of currentAgents) {
|
|
19893
|
+
agentGroups.get(agent).disallow.push(value);
|
|
19894
|
+
}
|
|
19895
|
+
} else if (directive === "allow" && value && currentAgents.length > 0) {
|
|
19896
|
+
for (const agent of currentAgents) {
|
|
19897
|
+
agentGroups.get(agent).allow.push(value);
|
|
19898
|
+
}
|
|
19899
|
+
} else if (directive !== "disallow" && directive !== "allow") {
|
|
19900
|
+
currentAgents = [];
|
|
19901
|
+
}
|
|
19902
|
+
}
|
|
19903
|
+
const specific = agentGroups.get(userAgent.toLowerCase());
|
|
19904
|
+
if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
|
|
19905
|
+
return specific;
|
|
19906
|
+
}
|
|
19907
|
+
return agentGroups.get("*") ?? { disallow: [], allow: [] };
|
|
19908
|
+
}
|
|
19909
|
+
function isBlockedByRobots(urlPath, rules3) {
|
|
19910
|
+
let longestDisallow = "";
|
|
19911
|
+
for (const pattern of rules3.disallow) {
|
|
19912
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestDisallow.length) {
|
|
19913
|
+
longestDisallow = pattern;
|
|
19914
|
+
}
|
|
19915
|
+
}
|
|
19916
|
+
if (!longestDisallow) return false;
|
|
19917
|
+
let longestAllow = "";
|
|
19918
|
+
for (const pattern of rules3.allow) {
|
|
19919
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestAllow.length) {
|
|
19920
|
+
longestAllow = pattern;
|
|
19921
|
+
}
|
|
19922
|
+
}
|
|
19923
|
+
return longestAllow.length < longestDisallow.length;
|
|
19924
|
+
}
|
|
19925
|
+
async function loadRobotsTxtFromDir(dir) {
|
|
19926
|
+
try {
|
|
19927
|
+
const content = await fs4.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
19928
|
+
return parseRobotsTxt(content);
|
|
19929
|
+
} catch {
|
|
19930
|
+
return null;
|
|
19931
|
+
}
|
|
19932
|
+
}
|
|
19933
|
+
async function fetchRobotsTxt(baseUrl) {
|
|
19934
|
+
try {
|
|
19935
|
+
const url = new URL("/robots.txt", baseUrl).href;
|
|
19936
|
+
const response = await fetch(url);
|
|
19937
|
+
if (!response.ok) return null;
|
|
19938
|
+
const content = await response.text();
|
|
19939
|
+
return parseRobotsTxt(content);
|
|
19940
|
+
} catch {
|
|
19941
|
+
return null;
|
|
19942
|
+
}
|
|
19943
|
+
}
|
|
19944
|
+
|
|
19945
|
+
// src/search/ranking.ts
|
|
19946
|
+
function nonNegativeOrZero(value) {
|
|
19947
|
+
if (!Number.isFinite(value)) {
|
|
19948
|
+
return 0;
|
|
19949
|
+
}
|
|
19950
|
+
return Math.max(0, value);
|
|
19951
|
+
}
|
|
19952
|
+
function rankHits(hits, config) {
|
|
19953
|
+
return hits.map((hit) => {
|
|
19954
|
+
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
19955
|
+
if (config.ranking.enableIncomingLinkBoost) {
|
|
19956
|
+
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
19957
|
+
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
19958
|
+
}
|
|
19959
|
+
if (config.ranking.enableDepthBoost) {
|
|
19960
|
+
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
19961
|
+
score += depthBoost * config.ranking.weights.depth;
|
|
19962
|
+
}
|
|
19963
|
+
return {
|
|
19964
|
+
hit,
|
|
19965
|
+
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
19966
|
+
};
|
|
19967
|
+
}).sort((a, b) => {
|
|
19968
|
+
const delta = b.finalScore - a.finalScore;
|
|
19969
|
+
return Number.isNaN(delta) ? 0 : delta;
|
|
19970
|
+
});
|
|
19971
|
+
}
|
|
19972
|
+
function findPageWeight(url, pageWeights) {
|
|
19973
|
+
let bestPattern = "";
|
|
19974
|
+
let bestWeight = 1;
|
|
19975
|
+
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
19976
|
+
if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
|
|
19977
|
+
bestPattern = pattern;
|
|
19978
|
+
bestWeight = weight;
|
|
19979
|
+
}
|
|
19980
|
+
}
|
|
19981
|
+
return bestWeight;
|
|
19982
|
+
}
|
|
19983
|
+
function aggregateByPage(ranked, config) {
|
|
19984
|
+
const groups = /* @__PURE__ */ new Map();
|
|
19985
|
+
for (const hit of ranked) {
|
|
19986
|
+
const url = hit.hit.metadata.url;
|
|
19987
|
+
const group = groups.get(url);
|
|
19988
|
+
if (group) group.push(hit);
|
|
19989
|
+
else groups.set(url, [hit]);
|
|
19990
|
+
}
|
|
19991
|
+
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
19992
|
+
const pages = [];
|
|
19993
|
+
for (const [url, chunks] of groups) {
|
|
19994
|
+
chunks.sort((a, b) => {
|
|
19995
|
+
const delta = b.finalScore - a.finalScore;
|
|
19996
|
+
return Number.isNaN(delta) ? 0 : delta;
|
|
19997
|
+
});
|
|
19998
|
+
const best = chunks[0];
|
|
19999
|
+
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20000
|
+
const topChunks = chunks.slice(0, aggregationCap);
|
|
20001
|
+
let aggregationBonus = 0;
|
|
20002
|
+
for (let i = 1; i < topChunks.length; i++) {
|
|
20003
|
+
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20004
|
+
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20005
|
+
}
|
|
20006
|
+
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20007
|
+
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20008
|
+
if (pageWeight === 0) continue;
|
|
20009
|
+
if (pageWeight !== 1) {
|
|
20010
|
+
pageScore *= pageWeight;
|
|
20011
|
+
}
|
|
20012
|
+
pages.push({
|
|
20013
|
+
url,
|
|
20014
|
+
title: best.hit.metadata.title,
|
|
20015
|
+
routeFile: best.hit.metadata.routeFile,
|
|
20016
|
+
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20017
|
+
bestChunk: best,
|
|
20018
|
+
matchingChunks: chunks
|
|
20019
|
+
});
|
|
20020
|
+
}
|
|
20021
|
+
return pages.sort((a, b) => {
|
|
20022
|
+
const delta = b.pageScore - a.pageScore;
|
|
20023
|
+
return Number.isNaN(delta) ? 0 : delta;
|
|
20024
|
+
});
|
|
20025
|
+
}
|
|
19823
20026
|
|
|
19824
20027
|
// src/utils/time.ts
|
|
19825
20028
|
function nowIso() {
|
|
@@ -19831,9 +20034,10 @@ function hrTimeMs(start) {
|
|
|
19831
20034
|
|
|
19832
20035
|
// src/indexing/pipeline.ts
|
|
19833
20036
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
19834
|
-
"jina-embeddings-v3": 2e-5
|
|
20037
|
+
"jina-embeddings-v3": 2e-5,
|
|
20038
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
19835
20039
|
};
|
|
19836
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20040
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
19837
20041
|
var IndexPipeline = class _IndexPipeline {
|
|
19838
20042
|
cwd;
|
|
19839
20043
|
config;
|
|
@@ -19911,6 +20115,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19911
20115
|
}
|
|
19912
20116
|
stageEnd("source", sourceStart);
|
|
19913
20117
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20118
|
+
const filterStart = stageStart();
|
|
20119
|
+
let filteredSourcePages = sourcePages;
|
|
20120
|
+
if (this.config.exclude.length > 0) {
|
|
20121
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20122
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20123
|
+
const url = normalizeUrlPath(p.url);
|
|
20124
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20125
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20126
|
+
return false;
|
|
20127
|
+
}
|
|
20128
|
+
return true;
|
|
20129
|
+
});
|
|
20130
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20131
|
+
if (excludedCount > 0) {
|
|
20132
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20133
|
+
}
|
|
20134
|
+
}
|
|
20135
|
+
if (this.config.respectRobotsTxt) {
|
|
20136
|
+
let robotsRules = null;
|
|
20137
|
+
if (sourceMode === "static-output") {
|
|
20138
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20139
|
+
path.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20140
|
+
);
|
|
20141
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20142
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20143
|
+
path.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20144
|
+
);
|
|
20145
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20146
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20147
|
+
}
|
|
20148
|
+
if (robotsRules) {
|
|
20149
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20150
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20151
|
+
const url = normalizeUrlPath(p.url);
|
|
20152
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20153
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20154
|
+
return false;
|
|
20155
|
+
}
|
|
20156
|
+
return true;
|
|
20157
|
+
});
|
|
20158
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20159
|
+
if (robotsExcluded > 0) {
|
|
20160
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20161
|
+
}
|
|
20162
|
+
}
|
|
20163
|
+
}
|
|
20164
|
+
stageEnd("filter", filterStart);
|
|
19914
20165
|
const routeStart = stageStart();
|
|
19915
20166
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19916
20167
|
stageEnd("route_map", routeStart);
|
|
@@ -19918,7 +20169,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19918
20169
|
const extractStart = stageStart();
|
|
19919
20170
|
this.logger.info("Extracting content...");
|
|
19920
20171
|
const extractedPages = [];
|
|
19921
|
-
for (const sourcePage of
|
|
20172
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19922
20173
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19923
20174
|
if (!extracted) {
|
|
19924
20175
|
this.logger.warn(
|
|
@@ -19944,16 +20195,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19944
20195
|
seenUrls.add(page.url);
|
|
19945
20196
|
uniquePages.push(page);
|
|
19946
20197
|
}
|
|
20198
|
+
const indexablePages = [];
|
|
20199
|
+
for (const page of uniquePages) {
|
|
20200
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20201
|
+
if (effectiveWeight === 0) {
|
|
20202
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20203
|
+
continue;
|
|
20204
|
+
}
|
|
20205
|
+
indexablePages.push(page);
|
|
20206
|
+
}
|
|
20207
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20208
|
+
if (zeroWeightCount > 0) {
|
|
20209
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20210
|
+
}
|
|
19947
20211
|
stageEnd("extract", extractStart);
|
|
19948
|
-
const skippedPages =
|
|
19949
|
-
this.logger.info(`Extracted ${
|
|
20212
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20213
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19950
20214
|
const linkStart = stageStart();
|
|
19951
|
-
const pageSet = new Set(
|
|
20215
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19952
20216
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19953
|
-
for (const page of
|
|
20217
|
+
for (const page of indexablePages) {
|
|
19954
20218
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19955
20219
|
}
|
|
19956
|
-
for (const page of
|
|
20220
|
+
for (const page of indexablePages) {
|
|
19957
20221
|
for (const outgoing of page.outgoingLinks) {
|
|
19958
20222
|
if (!pageSet.has(outgoing)) {
|
|
19959
20223
|
continue;
|
|
@@ -19977,7 +20241,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19977
20241
|
});
|
|
19978
20242
|
}
|
|
19979
20243
|
}
|
|
19980
|
-
for (const page of
|
|
20244
|
+
for (const page of indexablePages) {
|
|
19981
20245
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19982
20246
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19983
20247
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20194,100 +20458,6 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20194
20458
|
};
|
|
20195
20459
|
}
|
|
20196
20460
|
};
|
|
20197
|
-
|
|
20198
|
-
// src/search/ranking.ts
|
|
20199
|
-
function nonNegativeOrZero(value) {
|
|
20200
|
-
if (!Number.isFinite(value)) {
|
|
20201
|
-
return 0;
|
|
20202
|
-
}
|
|
20203
|
-
return Math.max(0, value);
|
|
20204
|
-
}
|
|
20205
|
-
function rankHits(hits, config) {
|
|
20206
|
-
return hits.map((hit) => {
|
|
20207
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20208
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20209
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20210
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20211
|
-
}
|
|
20212
|
-
if (config.ranking.enableDepthBoost) {
|
|
20213
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20214
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20215
|
-
}
|
|
20216
|
-
return {
|
|
20217
|
-
hit,
|
|
20218
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20219
|
-
};
|
|
20220
|
-
}).sort((a, b) => {
|
|
20221
|
-
const delta = b.finalScore - a.finalScore;
|
|
20222
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20223
|
-
});
|
|
20224
|
-
}
|
|
20225
|
-
function findPageWeight(url, pageWeights) {
|
|
20226
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20227
|
-
const normalizedUrl = norm(url);
|
|
20228
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20229
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20230
|
-
return weight;
|
|
20231
|
-
}
|
|
20232
|
-
}
|
|
20233
|
-
let bestPrefix = "";
|
|
20234
|
-
let bestWeight = 1;
|
|
20235
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20236
|
-
const normalizedPattern = norm(pattern);
|
|
20237
|
-
if (normalizedPattern === "/") continue;
|
|
20238
|
-
const prefix = `${normalizedPattern}/`;
|
|
20239
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20240
|
-
bestPrefix = prefix;
|
|
20241
|
-
bestWeight = weight;
|
|
20242
|
-
}
|
|
20243
|
-
}
|
|
20244
|
-
return bestWeight;
|
|
20245
|
-
}
|
|
20246
|
-
function aggregateByPage(ranked, config) {
|
|
20247
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20248
|
-
for (const hit of ranked) {
|
|
20249
|
-
const url = hit.hit.metadata.url;
|
|
20250
|
-
const group = groups.get(url);
|
|
20251
|
-
if (group) group.push(hit);
|
|
20252
|
-
else groups.set(url, [hit]);
|
|
20253
|
-
}
|
|
20254
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20255
|
-
const pages = [];
|
|
20256
|
-
for (const [url, chunks] of groups) {
|
|
20257
|
-
chunks.sort((a, b) => {
|
|
20258
|
-
const delta = b.finalScore - a.finalScore;
|
|
20259
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20260
|
-
});
|
|
20261
|
-
const best = chunks[0];
|
|
20262
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20263
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20264
|
-
let aggregationBonus = 0;
|
|
20265
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20266
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20267
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20268
|
-
}
|
|
20269
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20270
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20271
|
-
if (pageWeight === 0) continue;
|
|
20272
|
-
if (pageWeight !== 1) {
|
|
20273
|
-
pageScore *= pageWeight;
|
|
20274
|
-
}
|
|
20275
|
-
pages.push({
|
|
20276
|
-
url,
|
|
20277
|
-
title: best.hit.metadata.title,
|
|
20278
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20279
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20280
|
-
bestChunk: best,
|
|
20281
|
-
matchingChunks: chunks
|
|
20282
|
-
});
|
|
20283
|
-
}
|
|
20284
|
-
return pages.sort((a, b) => {
|
|
20285
|
-
const delta = b.pageScore - a.pageScore;
|
|
20286
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20287
|
-
});
|
|
20288
|
-
}
|
|
20289
|
-
|
|
20290
|
-
// src/search/engine.ts
|
|
20291
20461
|
var requestSchema = z.object({
|
|
20292
20462
|
q: z.string().trim().min(1),
|
|
20293
20463
|
topK: z.number().int().positive().max(100).optional(),
|
|
@@ -20295,7 +20465,8 @@ var requestSchema = z.object({
|
|
|
20295
20465
|
pathPrefix: z.string().optional(),
|
|
20296
20466
|
tags: z.array(z.string()).optional(),
|
|
20297
20467
|
rerank: z.boolean().optional(),
|
|
20298
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
20468
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
20469
|
+
stream: z.boolean().optional()
|
|
20299
20470
|
});
|
|
20300
20471
|
var SearchEngine = class _SearchEngine {
|
|
20301
20472
|
cwd;
|
|
@@ -20368,7 +20539,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
20368
20539
|
rerankMs = hrTimeMs(rerankStart);
|
|
20369
20540
|
usedRerank = true;
|
|
20370
20541
|
}
|
|
20371
|
-
|
|
20542
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
20543
|
+
return {
|
|
20544
|
+
q: input.q,
|
|
20545
|
+
scope: resolvedScope.scopeName,
|
|
20546
|
+
results,
|
|
20547
|
+
meta: {
|
|
20548
|
+
timingsMs: {
|
|
20549
|
+
embed: Math.round(embedMs),
|
|
20550
|
+
vector: Math.round(vectorMs),
|
|
20551
|
+
rerank: Math.round(rerankMs),
|
|
20552
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20553
|
+
},
|
|
20554
|
+
usedRerank,
|
|
20555
|
+
modelId: this.config.embeddings.model
|
|
20556
|
+
}
|
|
20557
|
+
};
|
|
20558
|
+
}
|
|
20559
|
+
async *searchStreaming(request) {
|
|
20560
|
+
const parsed = requestSchema.safeParse(request);
|
|
20561
|
+
if (!parsed.success) {
|
|
20562
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
20563
|
+
}
|
|
20564
|
+
const input = parsed.data;
|
|
20565
|
+
const wantsRerank = Boolean(input.rerank);
|
|
20566
|
+
if (!wantsRerank) {
|
|
20567
|
+
const response = await this.search(request);
|
|
20568
|
+
yield { phase: "initial", data: response };
|
|
20569
|
+
return;
|
|
20570
|
+
}
|
|
20571
|
+
const totalStart = process.hrtime.bigint();
|
|
20572
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20573
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
20574
|
+
const topK = input.topK ?? 10;
|
|
20575
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20576
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20577
|
+
const embedStart = process.hrtime.bigint();
|
|
20578
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20579
|
+
const queryVector = queryEmbeddings[0];
|
|
20580
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20581
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20582
|
+
}
|
|
20583
|
+
const embedMs = hrTimeMs(embedStart);
|
|
20584
|
+
const vectorStart = process.hrtime.bigint();
|
|
20585
|
+
const hits = await this.vectorStore.query(
|
|
20586
|
+
queryVector,
|
|
20587
|
+
{
|
|
20588
|
+
topK: candidateK,
|
|
20589
|
+
pathPrefix: input.pathPrefix,
|
|
20590
|
+
tags: input.tags
|
|
20591
|
+
},
|
|
20592
|
+
resolvedScope
|
|
20593
|
+
);
|
|
20594
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
20595
|
+
const ranked = rankHits(hits, this.config);
|
|
20596
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
20597
|
+
yield {
|
|
20598
|
+
phase: "initial",
|
|
20599
|
+
data: {
|
|
20600
|
+
q: input.q,
|
|
20601
|
+
scope: resolvedScope.scopeName,
|
|
20602
|
+
results: initialResults,
|
|
20603
|
+
meta: {
|
|
20604
|
+
timingsMs: {
|
|
20605
|
+
embed: Math.round(embedMs),
|
|
20606
|
+
vector: Math.round(vectorMs),
|
|
20607
|
+
rerank: 0,
|
|
20608
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20609
|
+
},
|
|
20610
|
+
usedRerank: false,
|
|
20611
|
+
modelId: this.config.embeddings.model
|
|
20612
|
+
}
|
|
20613
|
+
}
|
|
20614
|
+
};
|
|
20615
|
+
const rerankStart = process.hrtime.bigint();
|
|
20616
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
20617
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
20618
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
20619
|
+
yield {
|
|
20620
|
+
phase: "reranked",
|
|
20621
|
+
data: {
|
|
20622
|
+
q: input.q,
|
|
20623
|
+
scope: resolvedScope.scopeName,
|
|
20624
|
+
results: rerankedResults,
|
|
20625
|
+
meta: {
|
|
20626
|
+
timingsMs: {
|
|
20627
|
+
embed: Math.round(embedMs),
|
|
20628
|
+
vector: Math.round(vectorMs),
|
|
20629
|
+
rerank: Math.round(rerankMs),
|
|
20630
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20631
|
+
},
|
|
20632
|
+
usedRerank: true,
|
|
20633
|
+
modelId: this.config.embeddings.model
|
|
20634
|
+
}
|
|
20635
|
+
}
|
|
20636
|
+
};
|
|
20637
|
+
}
|
|
20638
|
+
buildResults(ordered, topK, groupByPage) {
|
|
20372
20639
|
const minScore = this.config.ranking.minScore;
|
|
20373
20640
|
if (groupByPage) {
|
|
20374
20641
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -20376,10 +20643,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
20376
20643
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20377
20644
|
}
|
|
20378
20645
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20379
|
-
|
|
20646
|
+
return pages.slice(0, topK).map((page) => {
|
|
20380
20647
|
const bestScore = page.bestChunk.finalScore;
|
|
20381
|
-
const
|
|
20382
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20648
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20649
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20383
20650
|
return {
|
|
20384
20651
|
url: page.url,
|
|
20385
20652
|
title: page.title,
|
|
@@ -20396,10 +20663,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
20396
20663
|
};
|
|
20397
20664
|
});
|
|
20398
20665
|
} else {
|
|
20666
|
+
let filtered = ordered;
|
|
20399
20667
|
if (minScore > 0) {
|
|
20400
|
-
|
|
20668
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20401
20669
|
}
|
|
20402
|
-
|
|
20670
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20403
20671
|
url: hit.metadata.url,
|
|
20404
20672
|
title: hit.metadata.title,
|
|
20405
20673
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -20408,21 +20676,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20408
20676
|
routeFile: hit.metadata.routeFile
|
|
20409
20677
|
}));
|
|
20410
20678
|
}
|
|
20411
|
-
return {
|
|
20412
|
-
q: input.q,
|
|
20413
|
-
scope: resolvedScope.scopeName,
|
|
20414
|
-
results,
|
|
20415
|
-
meta: {
|
|
20416
|
-
timingsMs: {
|
|
20417
|
-
embed: Math.round(embedMs),
|
|
20418
|
-
vector: Math.round(vectorMs),
|
|
20419
|
-
rerank: Math.round(rerankMs),
|
|
20420
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20421
|
-
},
|
|
20422
|
-
usedRerank,
|
|
20423
|
-
modelId: this.config.embeddings.model
|
|
20424
|
-
}
|
|
20425
|
-
};
|
|
20426
20679
|
}
|
|
20427
20680
|
async getPage(pathOrUrl, scope) {
|
|
20428
20681
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -20545,7 +20798,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20545
20798
|
});
|
|
20546
20799
|
}
|
|
20547
20800
|
};
|
|
20548
|
-
function createServer(engine) {
|
|
20801
|
+
function createServer(engine, config) {
|
|
20549
20802
|
const server = new McpServer({
|
|
20550
20803
|
name: "searchsocket-mcp",
|
|
20551
20804
|
version: "0.1.0"
|
|
@@ -20553,14 +20806,15 @@ function createServer(engine) {
|
|
|
20553
20806
|
server.registerTool(
|
|
20554
20807
|
"search",
|
|
20555
20808
|
{
|
|
20556
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20809
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
20557
20810
|
inputSchema: {
|
|
20558
20811
|
query: z.string().min(1),
|
|
20559
20812
|
scope: z.string().optional(),
|
|
20560
20813
|
topK: z.number().int().positive().max(100).optional(),
|
|
20561
20814
|
pathPrefix: z.string().optional(),
|
|
20562
20815
|
tags: z.array(z.string()).optional(),
|
|
20563
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
20816
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
20817
|
+
rerank: z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
20564
20818
|
}
|
|
20565
20819
|
},
|
|
20566
20820
|
async (input) => {
|
|
@@ -20570,7 +20824,8 @@ function createServer(engine) {
|
|
|
20570
20824
|
scope: input.scope,
|
|
20571
20825
|
pathPrefix: input.pathPrefix,
|
|
20572
20826
|
tags: input.tags,
|
|
20573
|
-
groupBy: input.groupBy
|
|
20827
|
+
groupBy: input.groupBy,
|
|
20828
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
20574
20829
|
});
|
|
20575
20830
|
return {
|
|
20576
20831
|
content: [
|
|
@@ -20696,10 +20951,10 @@ async function runMcpServer(options = {}) {
|
|
|
20696
20951
|
config
|
|
20697
20952
|
});
|
|
20698
20953
|
if (resolvedTransport === "http") {
|
|
20699
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
20954
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
20700
20955
|
return;
|
|
20701
20956
|
}
|
|
20702
|
-
const server = createServer(engine);
|
|
20957
|
+
const server = createServer(engine, config);
|
|
20703
20958
|
const stdioTransport = new StdioServerTransport();
|
|
20704
20959
|
await server.connect(stdioTransport);
|
|
20705
20960
|
}
|
|
@@ -20855,7 +21110,44 @@ function searchsocketHandle(options = {}) {
|
|
|
20855
21110
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20856
21111
|
}
|
|
20857
21112
|
const engine = await getEngine();
|
|
20858
|
-
const
|
|
21113
|
+
const searchRequest = body;
|
|
21114
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
21115
|
+
const encoder = new TextEncoder();
|
|
21116
|
+
const stream = new ReadableStream({
|
|
21117
|
+
async start(controller) {
|
|
21118
|
+
try {
|
|
21119
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
21120
|
+
const line = JSON.stringify(event2) + "\n";
|
|
21121
|
+
controller.enqueue(encoder.encode(line));
|
|
21122
|
+
}
|
|
21123
|
+
} catch (streamError) {
|
|
21124
|
+
const errorEvent = {
|
|
21125
|
+
phase: "error",
|
|
21126
|
+
data: {
|
|
21127
|
+
error: {
|
|
21128
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
21129
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
21130
|
+
}
|
|
21131
|
+
}
|
|
21132
|
+
};
|
|
21133
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
21134
|
+
} finally {
|
|
21135
|
+
controller.close();
|
|
21136
|
+
}
|
|
21137
|
+
}
|
|
21138
|
+
});
|
|
21139
|
+
return withCors(
|
|
21140
|
+
new Response(stream, {
|
|
21141
|
+
status: 200,
|
|
21142
|
+
headers: {
|
|
21143
|
+
"content-type": "application/x-ndjson"
|
|
21144
|
+
}
|
|
21145
|
+
}),
|
|
21146
|
+
event.request,
|
|
21147
|
+
config
|
|
21148
|
+
);
|
|
21149
|
+
}
|
|
21150
|
+
const result = await engine.search(searchRequest);
|
|
20859
21151
|
return withCors(
|
|
20860
21152
|
new Response(JSON.stringify(result), {
|
|
20861
21153
|
status: 200,
|
|
@@ -20968,7 +21260,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20968
21260
|
});
|
|
20969
21261
|
const stats = await pipeline.run({
|
|
20970
21262
|
changedOnly: options.changedOnly ?? true,
|
|
20971
|
-
force: options.force ?? false,
|
|
21263
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20972
21264
|
dryRun: options.dryRun ?? false,
|
|
20973
21265
|
scopeOverride: options.scope,
|
|
20974
21266
|
verbose: options.verbose
|
|
@@ -20985,6 +21277,60 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20985
21277
|
};
|
|
20986
21278
|
}
|
|
20987
21279
|
|
|
21280
|
+
// src/merge.ts
|
|
21281
|
+
/**
 * Reconcile the initial (pre-rerank) response with the reranked response.
 *
 * If any result that appears in both lists moved by more than
 * `maxDisplacement` positions, the reranked response is returned untouched.
 * Otherwise the initial ordering is kept and only the scores are refreshed
 * from the reranked response.
 *
 * Results are paired by URL *and* occurrence order: the k-th result with a
 * given URL in `initial` is matched with the k-th result with that URL in
 * `reranked`. Keying by URL alone would collapse repeated URLs onto the last
 * occurrence, giving wrong displacements and identical scores.
 *
 * @param {{ results: Array<{ url: string, score: number }> }} initial
 *   Response produced before reranking.
 * @param {{ results: Array<{ url: string, score: number }> }} reranked
 *   Response produced after reranking.
 * @param {{ maxDisplacement?: number }} [options]
 *   `maxDisplacement` defaults to 3.
 * @returns {{ response: object, usedRerankedOrder: boolean, displacements: Array<{ url: string, displacement: number }> }}
 *   `response` is either `reranked` itself or a copy of it carrying the
 *   initial ordering with refreshed scores.
 */
function mergeSearchResults(initial, reranked, options) {
  const maxDisplacement = options?.maxDisplacement ?? 3;

  // Every reranked occurrence of a URL, in rank order.
  const rerankedByUrl = /* @__PURE__ */ new Map();
  reranked.results.forEach((result, position) => {
    const entry = { position, score: result.score };
    const bucket = rerankedByUrl.get(result.url);
    if (bucket) {
      bucket.push(entry);
    } else {
      rerankedByUrl.set(result.url, [entry]);
    }
  });

  // For each initial result, the reranked entry it pairs with (or undefined).
  const occurrencesSeen = /* @__PURE__ */ new Map();
  const matches = initial.results.map((result) => {
    const occurrence = occurrencesSeen.get(result.url) ?? 0;
    occurrencesSeen.set(result.url, occurrence + 1);
    return rerankedByUrl.get(result.url)?.[occurrence];
  });

  // A result absent from the reranked list counts as "not moved".
  const displacements = initial.results.map((result, position) => {
    const match = matches[position];
    return {
      url: result.url,
      displacement: match !== void 0 ? Math.abs(position - match.position) : 0
    };
  });

  const preferReranked = displacements.length === 0 || displacements.some((d) => d.displacement > maxDisplacement);
  if (preferReranked) {
    return {
      response: reranked,
      usedRerankedOrder: true,
      displacements
    };
  }

  // Keep the initial order; only refresh scores where a reranked one exists.
  const mergedResults = initial.results.map((result, position) => ({
    ...result,
    score: matches[position]?.score ?? result.score
  }));
  return {
    response: {
      ...reranked,
      results: mergedResults
    },
    usedRerankedOrder: false,
    displacements
  };
}
|
|
21333
|
+
|
|
20988
21334
|
// src/client.ts
|
|
20989
21335
|
function createSearchClient(options = {}) {
|
|
20990
21336
|
const endpoint = options.endpoint ?? "/api/search";
|
|
@@ -21012,6 +21358,72 @@ function createSearchClient(options = {}) {
|
|
|
21012
21358
|
throw new Error(message);
|
|
21013
21359
|
}
|
|
21014
21360
|
return payload;
|
|
21361
|
+
},
|
|
21362
|
+
async streamSearch(request, onPhase) {
|
|
21363
|
+
const response = await fetchImpl(endpoint, {
|
|
21364
|
+
method: "POST",
|
|
21365
|
+
headers: {
|
|
21366
|
+
"content-type": "application/json"
|
|
21367
|
+
},
|
|
21368
|
+
body: JSON.stringify(request)
|
|
21369
|
+
});
|
|
21370
|
+
if (!response.ok) {
|
|
21371
|
+
let payload;
|
|
21372
|
+
try {
|
|
21373
|
+
payload = await response.json();
|
|
21374
|
+
} catch {
|
|
21375
|
+
throw new Error("Search failed");
|
|
21376
|
+
}
|
|
21377
|
+
const message = payload.error?.message ?? "Search failed";
|
|
21378
|
+
throw new Error(message);
|
|
21379
|
+
}
|
|
21380
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
21381
|
+
if (contentType.includes("application/json")) {
|
|
21382
|
+
const data = await response.json();
|
|
21383
|
+
onPhase({ phase: "initial", data });
|
|
21384
|
+
return data;
|
|
21385
|
+
}
|
|
21386
|
+
if (!response.body) {
|
|
21387
|
+
throw new Error("Response body is not readable");
|
|
21388
|
+
}
|
|
21389
|
+
const reader = response.body.getReader();
|
|
21390
|
+
const decoder = new TextDecoder();
|
|
21391
|
+
let buffer = "";
|
|
21392
|
+
let lastResponse = null;
|
|
21393
|
+
for (; ; ) {
|
|
21394
|
+
const { done, value } = await reader.read();
|
|
21395
|
+
if (done) break;
|
|
21396
|
+
buffer += decoder.decode(value, { stream: true });
|
|
21397
|
+
let newlineIdx;
|
|
21398
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
21399
|
+
const line = buffer.slice(0, newlineIdx).trim();
|
|
21400
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
21401
|
+
if (line.length === 0) continue;
|
|
21402
|
+
const event = JSON.parse(line);
|
|
21403
|
+
if (event.phase === "error") {
|
|
21404
|
+
const errData = event.data;
|
|
21405
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21406
|
+
}
|
|
21407
|
+
const searchEvent = event;
|
|
21408
|
+
onPhase(searchEvent);
|
|
21409
|
+
lastResponse = searchEvent.data;
|
|
21410
|
+
}
|
|
21411
|
+
}
|
|
21412
|
+
const remaining = buffer.trim();
|
|
21413
|
+
if (remaining.length > 0) {
|
|
21414
|
+
const event = JSON.parse(remaining);
|
|
21415
|
+
if (event.phase === "error") {
|
|
21416
|
+
const errData = event.data;
|
|
21417
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21418
|
+
}
|
|
21419
|
+
const searchEvent = event;
|
|
21420
|
+
onPhase(searchEvent);
|
|
21421
|
+
lastResponse = searchEvent.data;
|
|
21422
|
+
}
|
|
21423
|
+
if (!lastResponse) {
|
|
21424
|
+
throw new Error("No search results received");
|
|
21425
|
+
}
|
|
21426
|
+
return lastResponse;
|
|
21015
21427
|
}
|
|
21016
21428
|
};
|
|
21017
21429
|
}
|
|
@@ -21027,6 +21439,6 @@ function createSearchClient(options = {}) {
|
|
|
21027
21439
|
*)
|
|
21028
21440
|
*/
|
|
21029
21441
|
|
|
21030
|
-
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21442
|
+
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, mergeSearchResults, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21031
21443
|
//# sourceMappingURL=index.js.map
|
|
21032
21444
|
//# sourceMappingURL=index.js.map
|