searchsocket 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +456 -187
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +590 -169
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +590 -170
- package/dist/sveltekit.cjs +380 -82
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +380 -82
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -5013,32 +5013,32 @@ var require_URL = __commonJS({
|
|
|
5013
5013
|
else
|
|
5014
5014
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5015
5015
|
}
|
|
5016
|
-
function remove_dot_segments(
|
|
5017
|
-
if (!
|
|
5016
|
+
function remove_dot_segments(path15) {
|
|
5017
|
+
if (!path15) return path15;
|
|
5018
5018
|
var output = "";
|
|
5019
|
-
while (
|
|
5020
|
-
if (
|
|
5021
|
-
|
|
5019
|
+
while (path15.length > 0) {
|
|
5020
|
+
if (path15 === "." || path15 === "..") {
|
|
5021
|
+
path15 = "";
|
|
5022
5022
|
break;
|
|
5023
5023
|
}
|
|
5024
|
-
var twochars =
|
|
5025
|
-
var threechars =
|
|
5026
|
-
var fourchars =
|
|
5024
|
+
var twochars = path15.substring(0, 2);
|
|
5025
|
+
var threechars = path15.substring(0, 3);
|
|
5026
|
+
var fourchars = path15.substring(0, 4);
|
|
5027
5027
|
if (threechars === "../") {
|
|
5028
|
-
|
|
5028
|
+
path15 = path15.substring(3);
|
|
5029
5029
|
} else if (twochars === "./") {
|
|
5030
|
-
|
|
5030
|
+
path15 = path15.substring(2);
|
|
5031
5031
|
} else if (threechars === "/./") {
|
|
5032
|
-
|
|
5033
|
-
} else if (twochars === "/." &&
|
|
5034
|
-
|
|
5035
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5036
|
-
|
|
5032
|
+
path15 = "/" + path15.substring(3);
|
|
5033
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5034
|
+
path15 = "/";
|
|
5035
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5036
|
+
path15 = "/" + path15.substring(4);
|
|
5037
5037
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5038
5038
|
} else {
|
|
5039
|
-
var segment =
|
|
5039
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5040
5040
|
output += segment;
|
|
5041
|
-
|
|
5041
|
+
path15 = path15.substring(segment.length);
|
|
5042
5042
|
}
|
|
5043
5043
|
}
|
|
5044
5044
|
return output;
|
|
@@ -16602,6 +16602,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
16602
16602
|
envVar: z.string().min(1).optional(),
|
|
16603
16603
|
sanitize: z.boolean().optional()
|
|
16604
16604
|
}).optional(),
|
|
16605
|
+
exclude: z.array(z.string()).optional(),
|
|
16606
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
16605
16607
|
source: z.object({
|
|
16606
16608
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16607
16609
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -16732,6 +16734,8 @@ function createDefaultConfig(projectId) {
|
|
|
16732
16734
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16733
16735
|
sanitize: true
|
|
16734
16736
|
},
|
|
16737
|
+
exclude: [],
|
|
16738
|
+
respectRobotsTxt: true,
|
|
16735
16739
|
source: {
|
|
16736
16740
|
mode: "static-output",
|
|
16737
16741
|
staticOutputDir: "build",
|
|
@@ -16762,7 +16766,7 @@ function createDefaultConfig(projectId) {
|
|
|
16762
16766
|
},
|
|
16763
16767
|
embeddings: {
|
|
16764
16768
|
provider: "jina",
|
|
16765
|
-
model: "jina-embeddings-
|
|
16769
|
+
model: "jina-embeddings-v5-text-small",
|
|
16766
16770
|
apiKeyEnv: "JINA_API_KEY",
|
|
16767
16771
|
batchSize: 64,
|
|
16768
16772
|
concurrency: 4
|
|
@@ -16775,9 +16779,9 @@ function createDefaultConfig(projectId) {
|
|
|
16775
16779
|
}
|
|
16776
16780
|
},
|
|
16777
16781
|
rerank: {
|
|
16778
|
-
enabled:
|
|
16782
|
+
enabled: true,
|
|
16779
16783
|
topN: 20,
|
|
16780
|
-
model: "jina-reranker-
|
|
16784
|
+
model: "jina-reranker-v3"
|
|
16781
16785
|
},
|
|
16782
16786
|
ranking: {
|
|
16783
16787
|
enableIncomingLinkBoost: true,
|
|
@@ -16896,6 +16900,8 @@ ${issues}`
|
|
|
16896
16900
|
...defaults.scope,
|
|
16897
16901
|
...parsed.scope
|
|
16898
16902
|
},
|
|
16903
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16904
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16899
16905
|
source: {
|
|
16900
16906
|
...defaults.source,
|
|
16901
16907
|
...parsed.source,
|
|
@@ -17261,7 +17267,7 @@ var JinaReranker = class {
|
|
|
17261
17267
|
constructor(options) {
|
|
17262
17268
|
this.apiKey = options.apiKey;
|
|
17263
17269
|
this.model = options.model;
|
|
17264
|
-
this.maxRetries = options.maxRetries ??
|
|
17270
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17265
17271
|
}
|
|
17266
17272
|
async rerank(query, candidates, topN) {
|
|
17267
17273
|
if (candidates.length === 0) {
|
|
@@ -17271,7 +17277,8 @@ var JinaReranker = class {
|
|
|
17271
17277
|
model: this.model,
|
|
17272
17278
|
query,
|
|
17273
17279
|
documents: candidates.map((candidate) => candidate.text),
|
|
17274
|
-
top_n: topN ?? candidates.length
|
|
17280
|
+
top_n: topN ?? candidates.length,
|
|
17281
|
+
return_documents: false
|
|
17275
17282
|
};
|
|
17276
17283
|
let attempt = 0;
|
|
17277
17284
|
while (attempt <= this.maxRetries) {
|
|
@@ -19036,6 +19043,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19036
19043
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19037
19044
|
return null;
|
|
19038
19045
|
}
|
|
19046
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19047
|
+
let weight;
|
|
19048
|
+
if (weightRaw !== void 0) {
|
|
19049
|
+
const parsed = Number(weightRaw);
|
|
19050
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19051
|
+
weight = parsed;
|
|
19052
|
+
}
|
|
19053
|
+
}
|
|
19054
|
+
if (weight === 0) {
|
|
19055
|
+
return null;
|
|
19056
|
+
}
|
|
19039
19057
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19040
19058
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19041
19059
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19091,7 +19109,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19091
19109
|
noindex: false,
|
|
19092
19110
|
tags,
|
|
19093
19111
|
description,
|
|
19094
|
-
keywords
|
|
19112
|
+
keywords,
|
|
19113
|
+
weight
|
|
19095
19114
|
};
|
|
19096
19115
|
}
|
|
19097
19116
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19104,6 +19123,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19104
19123
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19105
19124
|
return null;
|
|
19106
19125
|
}
|
|
19126
|
+
let mdWeight;
|
|
19127
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19128
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19129
|
+
mdWeight = rawWeight;
|
|
19130
|
+
}
|
|
19131
|
+
if (mdWeight === 0) {
|
|
19132
|
+
return null;
|
|
19133
|
+
}
|
|
19107
19134
|
const content = parsed.content;
|
|
19108
19135
|
const normalized = normalizeMarkdown(content);
|
|
19109
19136
|
if (!normalizeText(normalized)) {
|
|
@@ -19126,7 +19153,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19126
19153
|
noindex: false,
|
|
19127
19154
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19128
19155
|
description: fmDescription,
|
|
19129
|
-
keywords: fmKeywords
|
|
19156
|
+
keywords: fmKeywords,
|
|
19157
|
+
weight: mdWeight
|
|
19130
19158
|
};
|
|
19131
19159
|
}
|
|
19132
19160
|
function yamlString(value) {
|
|
@@ -19322,6 +19350,38 @@ var Logger = class {
|
|
|
19322
19350
|
`);
|
|
19323
19351
|
}
|
|
19324
19352
|
};
|
|
19353
|
+
|
|
19354
|
+
// src/utils/pattern.ts
|
|
19355
|
+
/**
 * Test a URL path against one pattern.
 *
 * Supported pattern shapes (a single trailing slash is ignored on both the
 * URL and the pattern, except for the root "/"):
 *   - "<prefix>/**"  matches <prefix> itself and everything below it;
 *                    "/**" matches every URL.
 *   - "<prefix>/*"   matches exactly one extra path segment below <prefix>;
 *                    "/*" matches any single-segment path but not "/".
 *   - anything else  must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} True when the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => value === "/" || !value.endsWith("/") ? value : value.slice(0, -1);
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    // An empty base means the pattern was "/**": everything matches.
    return base === "" || target === base || target.startsWith(base + "/");
  }

  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      // "/*": one segment directly under the root.
      return target !== "/" && !target.slice(1).includes("/");
    }
    const childPrefix = base + "/";
    if (!target.startsWith(childPrefix)) {
      return false;
    }
    const remainder = target.slice(childPrefix.length);
    return remainder.length > 0 && !remainder.includes("/");
  }

  return target === glob;
}
|
|
19377
|
+
/**
 * Report whether a URL path matches at least one pattern in a list.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True on the first matching pattern, false if none match.
 */
function matchUrlPatterns(url, patterns) {
  for (const candidate of patterns) {
    if (!matchUrlPattern(url, candidate)) {
      continue;
    }
    return true;
  }
  return false;
}
|
|
19383
|
+
|
|
19384
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
19325
19385
|
function routeIdToFile(routeId) {
|
|
19326
19386
|
if (routeId === "/") {
|
|
19327
19387
|
return "src/routes/+page.svelte";
|
|
@@ -19395,15 +19455,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19395
19455
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19396
19456
|
}
|
|
19397
19457
|
/**
 * Report whether a URL is covered by any of the given exclude patterns.
 * Delegates entirely to the shared URL pattern matcher.
 *
 * @param {string} url - URL path to test.
 * @param {string[]} patterns - Exclude patterns.
 * @returns {boolean} True when the URL matches at least one pattern.
 */
function isExcluded(url, patterns) {
  const excluded = matchUrlPatterns(url, patterns);
  return excluded;
}
|
|
19408
19460
|
function findFreePort() {
|
|
19409
19461
|
return new Promise((resolve, reject) => {
|
|
@@ -19819,6 +19871,158 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
19819
19871
|
}
|
|
19820
19872
|
return pages;
|
|
19821
19873
|
}
|
|
19874
|
+
/**
 * Parse robots.txt content and return the rule set that applies to one crawler.
 *
 * Lines are split on LF/CRLF, "#" comments are stripped, and only
 * "user-agent", "allow" and "disallow" records are interpreted. Consecutive
 * "User-agent" lines share one group; a "User-agent" line that follows a rule
 * line starts a new group. Unrelated records (e.g. "Sitemap", "Crawl-delay")
 * are ignored and do not terminate the current group, as RFC 9309 requires.
 * Rules for an agent that appears in several groups are combined.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Crawler name, matched case-insensitively
 *   against the full "User-agent" value.
 * @returns {{disallow: string[], allow: string[]}} Rules of the group naming
 *   `userAgent` when that group has at least one rule, otherwise the "*"
 *   group, otherwise empty rule lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // Set once the current group has seen an allow/disallow line; the next
  // "User-agent" line then opens a fresh group instead of extending this one.
  let groupHasRules = false;
  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();
    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty "Disallow:" is a rule line and closes the agent list.
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is deliberately ignored so it cannot split a group.
  }
  const specific = agentGroups.get(userAgent.toLowerCase());
  // NOTE(review): a specific group with no rules (e.g. only "Disallow:") falls
  // back to "*" here; RFC 9309 would treat it as "allow everything". Kept as-is
  // to preserve existing behavior - confirm intent.
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
19909
|
+
/**
 * Decide whether a URL path is disallowed by a parsed robots.txt rule set.
 *
 * The longest matching pattern wins; when the longest matching allow and
 * disallow patterns have the same length, the URL is allowed. Patterns are
 * prefix matches and additionally support the two special characters from
 * RFC 9309: "*" (any run of characters) and a trailing "$" (end of path).
 * Patterns without those characters behave as plain prefixes.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rules.
 * @returns {boolean} True when the path is blocked.
 */
function isBlockedByRobots(urlPath, rules3) {
  // Linear wildcard matcher (no RegExp, so hostile patterns cannot cause
  // catastrophic backtracking): the first literal part must be a prefix, each
  // following part is located left-to-right, and with a "$" anchor the final
  // part must sit at the very end of the path.
  const matchesPattern = (pattern) => {
    const anchored = pattern.endsWith("$");
    const parts = (anchored ? pattern.slice(0, -1) : pattern).split("*");
    if (!urlPath.startsWith(parts[0])) return false;
    let cursor = parts[0].length;
    if (parts.length === 1) {
      return !anchored || cursor === urlPath.length;
    }
    for (let i = 1; i < parts.length - 1; i++) {
      const found = urlPath.indexOf(parts[i], cursor);
      if (found === -1) return false;
      cursor = found + parts[i].length;
    }
    const tail = parts[parts.length - 1];
    if (anchored) {
      return urlPath.length - tail.length >= cursor && urlPath.endsWith(tail);
    }
    return urlPath.indexOf(tail, cursor) !== -1;
  };
  // Length of the longest matching pattern in a list, 0 when nothing matches.
  const longestMatchLength = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && matchesPattern(pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };
  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatchLength(rules3.allow) < disallowLength;
}
|
|
19925
|
+
/**
 * Read and parse "robots.txt" from a directory.
 *
 * Best-effort: any failure while reading or parsing (for example a missing
 * file) yields null instead of an error.
 *
 * @param {string} dir - Directory expected to contain robots.txt.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or null when the file could not be loaded.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path.join(dir, "robots.txt");
    const text = await fs4.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    rules = null;
  }
  return rules;
}
|
|
19933
|
+
/**
 * Fetch and parse "/robots.txt" from the origin of a base URL.
 *
 * Best-effort: a non-OK HTTP response, an invalid base URL or a network
 * failure all yield null instead of an error.
 *
 * @param {string} baseUrl - Base URL of the site; "/robots.txt" is resolved against it.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed
 *   rules, or null when robots.txt could not be retrieved.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    return res.ok ? parseRobotsTxt(await res.text()) : null;
  } catch {
    return null;
  }
}
|
|
19944
|
+
|
|
19945
|
+
// src/search/ranking.ts
|
|
19946
|
+
/**
 * Clamp a value to a finite, non-negative number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0; otherwise 0
 *   (covers negatives, NaN, infinities and non-number inputs).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
19952
|
+
/**
 * Score vector hits and order them best-first.
 *
 * Each hit starts from its own `score` (non-finite scores become -Infinity).
 * When enabled in `config.ranking`, two boosts are added:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 * A non-finite final score is reported as -Infinity.
 *
 * @param {Array<object>} hits - Hits carrying `score` and `metadata`.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<{hit: object, finalScore: number}>} New array sorted by
 *   descending finalScore.
 */
function rankHits(hits, config) {
  const toRanked = (hit) => {
    const { ranking } = config;
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * ranking.weights.depth;
    }
    return {
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    };
  };
  // -Infinity minus -Infinity is NaN; treat such pairs as equal.
  const byFinalScoreDesc = (left, right) => {
    const diff = right.finalScore - left.finalScore;
    return Number.isNaN(diff) ? 0 : diff;
  };
  return hits.map((hit) => toRanked(hit)).sort(byFinalScoreDesc);
}
|
|
19972
|
+
/**
 * Look up the ranking weight configured for a URL.
 *
 * Every key of `pageWeights` is treated as a URL pattern; among the patterns
 * that match, the one with the longest pattern string wins (the first one
 * encountered wins a tie).
 *
 * @param {string} url - URL path of the page.
 * @param {Object<string, number>} pageWeights - Map of pattern to weight.
 * @returns {number} Weight of the winning pattern, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  let winner = { pattern: "", weight: 1 };
  for (const candidate of Object.keys(pageWeights)) {
    // Only a strictly longer pattern can replace the current winner.
    if (candidate.length <= winner.pattern.length) {
      continue;
    }
    if (!matchUrlPattern(url, candidate)) {
      continue;
    }
    winner = { pattern: candidate, weight: pageWeights[candidate] };
  }
  return winner.weight;
}
|
|
19983
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL.
 *
 * For each page the chunks are sorted best-first. The page score is the best
 * chunk's score plus a decayed bonus from the next chunks (up to
 * `aggregationCap` chunks in total, chunk i contributing
 * score * aggregationDecay^i), scaled by `weights.aggregation`, and finally
 * multiplied by the page's configured weight. Pages whose weight is exactly 0
 * are omitted.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Output of rankHits.
 * @param {object} config - Configuration; only `config.ranking` is read.
 * @returns {Array<object>} Page entries (url, title, routeFile, pageScore,
 *   bestChunk, matchingChunks) sorted by descending pageScore.
 */
function aggregateByPage(ranked, config) {
  // Descending numeric order; NaN differences (e.g. -Infinity vs -Infinity) count as ties.
  const descending = (x, y) => {
    const diff = y - x;
    return Number.isNaN(diff) ? 0 : diff;
  };
  const finiteOr = (value, fallback) => Number.isFinite(value) ? value : fallback;

  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(pageUrl);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(pageUrl, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  chunksByUrl.forEach((chunks, url) => {
    chunks.sort((a, b) => descending(a.finalScore, b.finalScore));
    const [best] = chunks;
    const maxScore = finiteOr(best.finalScore, Number.NEGATIVE_INFINITY);
    // The best chunk (rank 0) is already counted in maxScore, so it adds no bonus.
    const aggregationBonus = chunks.slice(0, aggregationCap).reduce(
      (sum, chunk, rank) => rank === 0 ? sum : sum + finiteOr(chunk.finalScore, 0) * Math.pow(aggregationDecay, rank),
      0
    );
    const baseScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      return;
    }
    const pageScore = pageWeight !== 1 ? baseScore * pageWeight : baseScore;
    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: finiteOr(pageScore, Number.NEGATIVE_INFINITY),
      bestChunk: best,
      matchingChunks: chunks
    });
  });
  return pages.sort((a, b) => descending(a.pageScore, b.pageScore));
}
|
|
19822
20026
|
|
|
19823
20027
|
// src/utils/time.ts
|
|
19824
20028
|
function nowIso() {
|
|
@@ -19830,9 +20034,10 @@ function hrTimeMs(start) {
|
|
|
19830
20034
|
|
|
19831
20035
|
// src/indexing/pipeline.ts
|
|
19832
20036
|
// Embedding price table keyed by model id; per the constant's name the values
// are USD per 1K tokens.
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
  "jina-embeddings-v3": 0.00002,
  "jina-embeddings-v5-text-small": 0.00005
};
// Presumably the price applied to models missing from the table above - verify against callers.
var DEFAULT_EMBEDDING_PRICE_PER_1K = 0.00005;
|
|
19836
20041
|
var IndexPipeline = class _IndexPipeline {
|
|
19837
20042
|
cwd;
|
|
19838
20043
|
config;
|
|
@@ -19910,6 +20115,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19910
20115
|
}
|
|
19911
20116
|
stageEnd("source", sourceStart);
|
|
19912
20117
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20118
|
+
const filterStart = stageStart();
|
|
20119
|
+
let filteredSourcePages = sourcePages;
|
|
20120
|
+
if (this.config.exclude.length > 0) {
|
|
20121
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20122
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20123
|
+
const url = normalizeUrlPath(p.url);
|
|
20124
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20125
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20126
|
+
return false;
|
|
20127
|
+
}
|
|
20128
|
+
return true;
|
|
20129
|
+
});
|
|
20130
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20131
|
+
if (excludedCount > 0) {
|
|
20132
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20133
|
+
}
|
|
20134
|
+
}
|
|
20135
|
+
if (this.config.respectRobotsTxt) {
|
|
20136
|
+
let robotsRules = null;
|
|
20137
|
+
if (sourceMode === "static-output") {
|
|
20138
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20139
|
+
path.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20140
|
+
);
|
|
20141
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20142
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20143
|
+
path.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20144
|
+
);
|
|
20145
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20146
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20147
|
+
}
|
|
20148
|
+
if (robotsRules) {
|
|
20149
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20150
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20151
|
+
const url = normalizeUrlPath(p.url);
|
|
20152
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20153
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20154
|
+
return false;
|
|
20155
|
+
}
|
|
20156
|
+
return true;
|
|
20157
|
+
});
|
|
20158
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20159
|
+
if (robotsExcluded > 0) {
|
|
20160
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20161
|
+
}
|
|
20162
|
+
}
|
|
20163
|
+
}
|
|
20164
|
+
stageEnd("filter", filterStart);
|
|
19913
20165
|
const routeStart = stageStart();
|
|
19914
20166
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
19915
20167
|
stageEnd("route_map", routeStart);
|
|
@@ -19917,7 +20169,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19917
20169
|
const extractStart = stageStart();
|
|
19918
20170
|
this.logger.info("Extracting content...");
|
|
19919
20171
|
const extractedPages = [];
|
|
19920
|
-
for (const sourcePage of
|
|
20172
|
+
for (const sourcePage of filteredSourcePages) {
|
|
19921
20173
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
19922
20174
|
if (!extracted) {
|
|
19923
20175
|
this.logger.warn(
|
|
@@ -19943,16 +20195,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19943
20195
|
seenUrls.add(page.url);
|
|
19944
20196
|
uniquePages.push(page);
|
|
19945
20197
|
}
|
|
20198
|
+
const indexablePages = [];
|
|
20199
|
+
for (const page of uniquePages) {
|
|
20200
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20201
|
+
if (effectiveWeight === 0) {
|
|
20202
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20203
|
+
continue;
|
|
20204
|
+
}
|
|
20205
|
+
indexablePages.push(page);
|
|
20206
|
+
}
|
|
20207
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20208
|
+
if (zeroWeightCount > 0) {
|
|
20209
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20210
|
+
}
|
|
19946
20211
|
stageEnd("extract", extractStart);
|
|
19947
|
-
const skippedPages =
|
|
19948
|
-
this.logger.info(`Extracted ${
|
|
20212
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20213
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
19949
20214
|
const linkStart = stageStart();
|
|
19950
|
-
const pageSet = new Set(
|
|
20215
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
19951
20216
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
19952
|
-
for (const page of
|
|
20217
|
+
for (const page of indexablePages) {
|
|
19953
20218
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
19954
20219
|
}
|
|
19955
|
-
for (const page of
|
|
20220
|
+
for (const page of indexablePages) {
|
|
19956
20221
|
for (const outgoing of page.outgoingLinks) {
|
|
19957
20222
|
if (!pageSet.has(outgoing)) {
|
|
19958
20223
|
continue;
|
|
@@ -19976,7 +20241,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
19976
20241
|
});
|
|
19977
20242
|
}
|
|
19978
20243
|
}
|
|
19979
|
-
for (const page of
|
|
20244
|
+
for (const page of indexablePages) {
|
|
19980
20245
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
19981
20246
|
if (routeMatch.routeResolution === "best-effort") {
|
|
19982
20247
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20193,100 +20458,6 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20193
20458
|
};
|
|
20194
20459
|
}
|
|
20195
20460
|
};
|
|
20196
|
-
|
|
20197
|
-
// src/search/ranking.ts
|
|
20198
|
-
function nonNegativeOrZero(value) {
|
|
20199
|
-
if (!Number.isFinite(value)) {
|
|
20200
|
-
return 0;
|
|
20201
|
-
}
|
|
20202
|
-
return Math.max(0, value);
|
|
20203
|
-
}
|
|
20204
|
-
function rankHits(hits, config) {
|
|
20205
|
-
return hits.map((hit) => {
|
|
20206
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
20207
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
20208
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
20209
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
20210
|
-
}
|
|
20211
|
-
if (config.ranking.enableDepthBoost) {
|
|
20212
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
20213
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
20214
|
-
}
|
|
20215
|
-
return {
|
|
20216
|
-
hit,
|
|
20217
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
20218
|
-
};
|
|
20219
|
-
}).sort((a, b) => {
|
|
20220
|
-
const delta = b.finalScore - a.finalScore;
|
|
20221
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20222
|
-
});
|
|
20223
|
-
}
|
|
20224
|
-
function findPageWeight(url, pageWeights) {
|
|
20225
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
20226
|
-
const normalizedUrl = norm(url);
|
|
20227
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20228
|
-
if (norm(pattern) === normalizedUrl) {
|
|
20229
|
-
return weight;
|
|
20230
|
-
}
|
|
20231
|
-
}
|
|
20232
|
-
let bestPrefix = "";
|
|
20233
|
-
let bestWeight = 1;
|
|
20234
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
20235
|
-
const normalizedPattern = norm(pattern);
|
|
20236
|
-
if (normalizedPattern === "/") continue;
|
|
20237
|
-
const prefix = `${normalizedPattern}/`;
|
|
20238
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
20239
|
-
bestPrefix = prefix;
|
|
20240
|
-
bestWeight = weight;
|
|
20241
|
-
}
|
|
20242
|
-
}
|
|
20243
|
-
return bestWeight;
|
|
20244
|
-
}
|
|
20245
|
-
function aggregateByPage(ranked, config) {
|
|
20246
|
-
const groups = /* @__PURE__ */ new Map();
|
|
20247
|
-
for (const hit of ranked) {
|
|
20248
|
-
const url = hit.hit.metadata.url;
|
|
20249
|
-
const group = groups.get(url);
|
|
20250
|
-
if (group) group.push(hit);
|
|
20251
|
-
else groups.set(url, [hit]);
|
|
20252
|
-
}
|
|
20253
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
20254
|
-
const pages = [];
|
|
20255
|
-
for (const [url, chunks] of groups) {
|
|
20256
|
-
chunks.sort((a, b) => {
|
|
20257
|
-
const delta = b.finalScore - a.finalScore;
|
|
20258
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20259
|
-
});
|
|
20260
|
-
const best = chunks[0];
|
|
20261
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
20262
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
20263
|
-
let aggregationBonus = 0;
|
|
20264
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
20265
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
20266
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
20267
|
-
}
|
|
20268
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20269
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20270
|
-
if (pageWeight === 0) continue;
|
|
20271
|
-
if (pageWeight !== 1) {
|
|
20272
|
-
pageScore *= pageWeight;
|
|
20273
|
-
}
|
|
20274
|
-
pages.push({
|
|
20275
|
-
url,
|
|
20276
|
-
title: best.hit.metadata.title,
|
|
20277
|
-
routeFile: best.hit.metadata.routeFile,
|
|
20278
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
20279
|
-
bestChunk: best,
|
|
20280
|
-
matchingChunks: chunks
|
|
20281
|
-
});
|
|
20282
|
-
}
|
|
20283
|
-
return pages.sort((a, b) => {
|
|
20284
|
-
const delta = b.pageScore - a.pageScore;
|
|
20285
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
20286
|
-
});
|
|
20287
|
-
}
|
|
20288
|
-
|
|
20289
|
-
// src/search/engine.ts
|
|
20290
20461
|
var requestSchema = z.object({
|
|
20291
20462
|
q: z.string().trim().min(1),
|
|
20292
20463
|
topK: z.number().int().positive().max(100).optional(),
|
|
@@ -20294,7 +20465,8 @@ var requestSchema = z.object({
|
|
|
20294
20465
|
pathPrefix: z.string().optional(),
|
|
20295
20466
|
tags: z.array(z.string()).optional(),
|
|
20296
20467
|
rerank: z.boolean().optional(),
|
|
20297
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
20468
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
20469
|
+
stream: z.boolean().optional()
|
|
20298
20470
|
});
|
|
20299
20471
|
var SearchEngine = class _SearchEngine {
|
|
20300
20472
|
cwd;
|
|
@@ -20367,7 +20539,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
20367
20539
|
rerankMs = hrTimeMs(rerankStart);
|
|
20368
20540
|
usedRerank = true;
|
|
20369
20541
|
}
|
|
20370
|
-
|
|
20542
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
20543
|
+
return {
|
|
20544
|
+
q: input.q,
|
|
20545
|
+
scope: resolvedScope.scopeName,
|
|
20546
|
+
results,
|
|
20547
|
+
meta: {
|
|
20548
|
+
timingsMs: {
|
|
20549
|
+
embed: Math.round(embedMs),
|
|
20550
|
+
vector: Math.round(vectorMs),
|
|
20551
|
+
rerank: Math.round(rerankMs),
|
|
20552
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20553
|
+
},
|
|
20554
|
+
usedRerank,
|
|
20555
|
+
modelId: this.config.embeddings.model
|
|
20556
|
+
}
|
|
20557
|
+
};
|
|
20558
|
+
}
|
|
20559
|
+
async *searchStreaming(request) {
|
|
20560
|
+
const parsed = requestSchema.safeParse(request);
|
|
20561
|
+
if (!parsed.success) {
|
|
20562
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
20563
|
+
}
|
|
20564
|
+
const input = parsed.data;
|
|
20565
|
+
const wantsRerank = Boolean(input.rerank);
|
|
20566
|
+
if (!wantsRerank) {
|
|
20567
|
+
const response = await this.search(request);
|
|
20568
|
+
yield { phase: "initial", data: response };
|
|
20569
|
+
return;
|
|
20570
|
+
}
|
|
20571
|
+
const totalStart = process.hrtime.bigint();
|
|
20572
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
20573
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
20574
|
+
const topK = input.topK ?? 10;
|
|
20575
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
20576
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
20577
|
+
const embedStart = process.hrtime.bigint();
|
|
20578
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
20579
|
+
const queryVector = queryEmbeddings[0];
|
|
20580
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
20581
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
20582
|
+
}
|
|
20583
|
+
const embedMs = hrTimeMs(embedStart);
|
|
20584
|
+
const vectorStart = process.hrtime.bigint();
|
|
20585
|
+
const hits = await this.vectorStore.query(
|
|
20586
|
+
queryVector,
|
|
20587
|
+
{
|
|
20588
|
+
topK: candidateK,
|
|
20589
|
+
pathPrefix: input.pathPrefix,
|
|
20590
|
+
tags: input.tags
|
|
20591
|
+
},
|
|
20592
|
+
resolvedScope
|
|
20593
|
+
);
|
|
20594
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
20595
|
+
const ranked = rankHits(hits, this.config);
|
|
20596
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
20597
|
+
yield {
|
|
20598
|
+
phase: "initial",
|
|
20599
|
+
data: {
|
|
20600
|
+
q: input.q,
|
|
20601
|
+
scope: resolvedScope.scopeName,
|
|
20602
|
+
results: initialResults,
|
|
20603
|
+
meta: {
|
|
20604
|
+
timingsMs: {
|
|
20605
|
+
embed: Math.round(embedMs),
|
|
20606
|
+
vector: Math.round(vectorMs),
|
|
20607
|
+
rerank: 0,
|
|
20608
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20609
|
+
},
|
|
20610
|
+
usedRerank: false,
|
|
20611
|
+
modelId: this.config.embeddings.model
|
|
20612
|
+
}
|
|
20613
|
+
}
|
|
20614
|
+
};
|
|
20615
|
+
const rerankStart = process.hrtime.bigint();
|
|
20616
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
20617
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
20618
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
20619
|
+
yield {
|
|
20620
|
+
phase: "reranked",
|
|
20621
|
+
data: {
|
|
20622
|
+
q: input.q,
|
|
20623
|
+
scope: resolvedScope.scopeName,
|
|
20624
|
+
results: rerankedResults,
|
|
20625
|
+
meta: {
|
|
20626
|
+
timingsMs: {
|
|
20627
|
+
embed: Math.round(embedMs),
|
|
20628
|
+
vector: Math.round(vectorMs),
|
|
20629
|
+
rerank: Math.round(rerankMs),
|
|
20630
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
20631
|
+
},
|
|
20632
|
+
usedRerank: true,
|
|
20633
|
+
modelId: this.config.embeddings.model
|
|
20634
|
+
}
|
|
20635
|
+
}
|
|
20636
|
+
};
|
|
20637
|
+
}
|
|
20638
|
+
buildResults(ordered, topK, groupByPage) {
|
|
20371
20639
|
const minScore = this.config.ranking.minScore;
|
|
20372
20640
|
if (groupByPage) {
|
|
20373
20641
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -20375,10 +20643,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
20375
20643
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
20376
20644
|
}
|
|
20377
20645
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
20378
|
-
|
|
20646
|
+
return pages.slice(0, topK).map((page) => {
|
|
20379
20647
|
const bestScore = page.bestChunk.finalScore;
|
|
20380
|
-
const
|
|
20381
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
20648
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
20649
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
20382
20650
|
return {
|
|
20383
20651
|
url: page.url,
|
|
20384
20652
|
title: page.title,
|
|
@@ -20395,10 +20663,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
20395
20663
|
};
|
|
20396
20664
|
});
|
|
20397
20665
|
} else {
|
|
20666
|
+
let filtered = ordered;
|
|
20398
20667
|
if (minScore > 0) {
|
|
20399
|
-
|
|
20668
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
20400
20669
|
}
|
|
20401
|
-
|
|
20670
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
20402
20671
|
url: hit.metadata.url,
|
|
20403
20672
|
title: hit.metadata.title,
|
|
20404
20673
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -20407,21 +20676,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
20407
20676
|
routeFile: hit.metadata.routeFile
|
|
20408
20677
|
}));
|
|
20409
20678
|
}
|
|
20410
|
-
return {
|
|
20411
|
-
q: input.q,
|
|
20412
|
-
scope: resolvedScope.scopeName,
|
|
20413
|
-
results,
|
|
20414
|
-
meta: {
|
|
20415
|
-
timingsMs: {
|
|
20416
|
-
embed: Math.round(embedMs),
|
|
20417
|
-
vector: Math.round(vectorMs),
|
|
20418
|
-
rerank: Math.round(rerankMs),
|
|
20419
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
20420
|
-
},
|
|
20421
|
-
usedRerank,
|
|
20422
|
-
modelId: this.config.embeddings.model
|
|
20423
|
-
}
|
|
20424
|
-
};
|
|
20425
20679
|
}
|
|
20426
20680
|
async getPage(pathOrUrl, scope) {
|
|
20427
20681
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -20493,6 +20747,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20493
20747
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20494
20748
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20495
20749
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20750
|
+
const MAX_DOC_CHARS = 2e3;
|
|
20496
20751
|
const pageCandidates = [];
|
|
20497
20752
|
for (const [url, chunks] of pageGroups) {
|
|
20498
20753
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -20512,12 +20767,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
20512
20767
|
}
|
|
20513
20768
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20514
20769
|
parts.push(body);
|
|
20515
|
-
|
|
20770
|
+
let text = parts.join("\n\n");
|
|
20771
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
20772
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
20773
|
+
}
|
|
20774
|
+
pageCandidates.push({ id: url, text });
|
|
20516
20775
|
}
|
|
20776
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20777
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20517
20778
|
const reranked = await this.reranker.rerank(
|
|
20518
20779
|
query,
|
|
20519
|
-
|
|
20520
|
-
|
|
20780
|
+
cappedCandidates,
|
|
20781
|
+
maxCandidates
|
|
20521
20782
|
);
|
|
20522
20783
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20523
20784
|
return ranked.map((entry) => {
|
|
@@ -20537,7 +20798,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20537
20798
|
});
|
|
20538
20799
|
}
|
|
20539
20800
|
};
|
|
20540
|
-
function createServer(engine) {
|
|
20801
|
+
function createServer(engine, config) {
|
|
20541
20802
|
const server = new McpServer({
|
|
20542
20803
|
name: "searchsocket-mcp",
|
|
20543
20804
|
version: "0.1.0"
|
|
@@ -20545,14 +20806,15 @@ function createServer(engine) {
|
|
|
20545
20806
|
server.registerTool(
|
|
20546
20807
|
"search",
|
|
20547
20808
|
{
|
|
20548
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
20809
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
20549
20810
|
inputSchema: {
|
|
20550
20811
|
query: z.string().min(1),
|
|
20551
20812
|
scope: z.string().optional(),
|
|
20552
20813
|
topK: z.number().int().positive().max(100).optional(),
|
|
20553
20814
|
pathPrefix: z.string().optional(),
|
|
20554
20815
|
tags: z.array(z.string()).optional(),
|
|
20555
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
20816
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
20817
|
+
rerank: z.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
20556
20818
|
}
|
|
20557
20819
|
},
|
|
20558
20820
|
async (input) => {
|
|
@@ -20562,7 +20824,8 @@ function createServer(engine) {
|
|
|
20562
20824
|
scope: input.scope,
|
|
20563
20825
|
pathPrefix: input.pathPrefix,
|
|
20564
20826
|
tags: input.tags,
|
|
20565
|
-
groupBy: input.groupBy
|
|
20827
|
+
groupBy: input.groupBy,
|
|
20828
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
20566
20829
|
});
|
|
20567
20830
|
return {
|
|
20568
20831
|
content: [
|
|
@@ -20688,10 +20951,10 @@ async function runMcpServer(options = {}) {
|
|
|
20688
20951
|
config
|
|
20689
20952
|
});
|
|
20690
20953
|
if (resolvedTransport === "http") {
|
|
20691
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
20954
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
20692
20955
|
return;
|
|
20693
20956
|
}
|
|
20694
|
-
const server = createServer(engine);
|
|
20957
|
+
const server = createServer(engine, config);
|
|
20695
20958
|
const stdioTransport = new StdioServerTransport();
|
|
20696
20959
|
await server.connect(stdioTransport);
|
|
20697
20960
|
}
|
|
@@ -20847,7 +21110,44 @@ function searchsocketHandle(options = {}) {
|
|
|
20847
21110
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
20848
21111
|
}
|
|
20849
21112
|
const engine = await getEngine();
|
|
20850
|
-
const
|
|
21113
|
+
const searchRequest = body;
|
|
21114
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
21115
|
+
const encoder = new TextEncoder();
|
|
21116
|
+
const stream = new ReadableStream({
|
|
21117
|
+
async start(controller) {
|
|
21118
|
+
try {
|
|
21119
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
21120
|
+
const line = JSON.stringify(event2) + "\n";
|
|
21121
|
+
controller.enqueue(encoder.encode(line));
|
|
21122
|
+
}
|
|
21123
|
+
} catch (streamError) {
|
|
21124
|
+
const errorEvent = {
|
|
21125
|
+
phase: "error",
|
|
21126
|
+
data: {
|
|
21127
|
+
error: {
|
|
21128
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
21129
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
21130
|
+
}
|
|
21131
|
+
}
|
|
21132
|
+
};
|
|
21133
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
21134
|
+
} finally {
|
|
21135
|
+
controller.close();
|
|
21136
|
+
}
|
|
21137
|
+
}
|
|
21138
|
+
});
|
|
21139
|
+
return withCors(
|
|
21140
|
+
new Response(stream, {
|
|
21141
|
+
status: 200,
|
|
21142
|
+
headers: {
|
|
21143
|
+
"content-type": "application/x-ndjson"
|
|
21144
|
+
}
|
|
21145
|
+
}),
|
|
21146
|
+
event.request,
|
|
21147
|
+
config
|
|
21148
|
+
);
|
|
21149
|
+
}
|
|
21150
|
+
const result = await engine.search(searchRequest);
|
|
20851
21151
|
return withCors(
|
|
20852
21152
|
new Response(JSON.stringify(result), {
|
|
20853
21153
|
status: 200,
|
|
@@ -20960,7 +21260,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20960
21260
|
});
|
|
20961
21261
|
const stats = await pipeline.run({
|
|
20962
21262
|
changedOnly: options.changedOnly ?? true,
|
|
20963
|
-
force: options.force ?? false,
|
|
21263
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20964
21264
|
dryRun: options.dryRun ?? false,
|
|
20965
21265
|
scopeOverride: options.scope,
|
|
20966
21266
|
verbose: options.verbose
|
|
@@ -20977,6 +21277,60 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20977
21277
|
};
|
|
20978
21278
|
}
|
|
20979
21279
|
|
|
21280
|
+
// src/merge.ts
|
|
21281
|
+
/**
 * Decides whether a reranked response should replace the order of an
 * already-delivered initial response.
 *
 * For every result of `initial`, the displacement is the absolute difference
 * between its position in `initial` and in `reranked` (0 when the URL is
 * absent from `reranked`). If `initial` has no results, or any displacement
 * exceeds `options.maxDisplacement` (default 3), `reranked` is returned
 * untouched. Otherwise the initial order is kept and each result only takes
 * over its reranked score, when one exists.
 *
 * @param {{results: Array<{url: string, score: number}>}} initial - First-phase response.
 * @param {{results: Array<{url: string, score: number}>}} reranked - Second-phase response.
 * @param {{maxDisplacement?: number}} [options] - Tolerated position shift.
 * @returns {{response: object, usedRerankedOrder: boolean, displacements: Array<{url: string, displacement: number}>}}
 */
function mergeSearchResults(initial, reranked, options) {
  const maxDisplacement = options?.maxDisplacement ?? 3;

  // url -> index; when a URL occurs more than once the last occurrence wins.
  const indexByUrl = (results) => new Map(results.map((result, index) => [result.url, index]));
  const initialIndex = indexByUrl(initial.results);
  const rerankedIndex = indexByUrl(reranked.results);

  const displacements = initial.results.map(({ url }) => {
    const target = rerankedIndex.get(url);
    const displacement = target === undefined ? 0 : Math.abs(initialIndex.get(url) - target);
    return { url, displacement };
  });

  const keepInitialOrder =
    displacements.length > 0 &&
    displacements.every((entry) => !(entry.displacement > maxDisplacement));

  if (!keepInitialOrder) {
    return { response: reranked, usedRerankedOrder: true, displacements };
  }

  const rerankedScores = new Map(reranked.results.map((result) => [result.url, result.score]));
  const results = initial.results.map((result) => ({
    ...result,
    score: rerankedScores.get(result.url) ?? result.score
  }));

  return {
    response: { ...reranked, results },
    usedRerankedOrder: false,
    displacements
  };
}
|
|
21333
|
+
|
|
20980
21334
|
// src/client.ts
|
|
20981
21335
|
function createSearchClient(options = {}) {
|
|
20982
21336
|
const endpoint = options.endpoint ?? "/api/search";
|
|
@@ -21004,6 +21358,72 @@ function createSearchClient(options = {}) {
|
|
|
21004
21358
|
throw new Error(message);
|
|
21005
21359
|
}
|
|
21006
21360
|
return payload;
|
|
21361
|
+
},
|
|
21362
|
+
async streamSearch(request, onPhase) {
|
|
21363
|
+
const response = await fetchImpl(endpoint, {
|
|
21364
|
+
method: "POST",
|
|
21365
|
+
headers: {
|
|
21366
|
+
"content-type": "application/json"
|
|
21367
|
+
},
|
|
21368
|
+
body: JSON.stringify(request)
|
|
21369
|
+
});
|
|
21370
|
+
if (!response.ok) {
|
|
21371
|
+
let payload;
|
|
21372
|
+
try {
|
|
21373
|
+
payload = await response.json();
|
|
21374
|
+
} catch {
|
|
21375
|
+
throw new Error("Search failed");
|
|
21376
|
+
}
|
|
21377
|
+
const message = payload.error?.message ?? "Search failed";
|
|
21378
|
+
throw new Error(message);
|
|
21379
|
+
}
|
|
21380
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
21381
|
+
if (contentType.includes("application/json")) {
|
|
21382
|
+
const data = await response.json();
|
|
21383
|
+
onPhase({ phase: "initial", data });
|
|
21384
|
+
return data;
|
|
21385
|
+
}
|
|
21386
|
+
if (!response.body) {
|
|
21387
|
+
throw new Error("Response body is not readable");
|
|
21388
|
+
}
|
|
21389
|
+
const reader = response.body.getReader();
|
|
21390
|
+
const decoder = new TextDecoder();
|
|
21391
|
+
let buffer = "";
|
|
21392
|
+
let lastResponse = null;
|
|
21393
|
+
for (; ; ) {
|
|
21394
|
+
const { done, value } = await reader.read();
|
|
21395
|
+
if (done) break;
|
|
21396
|
+
buffer += decoder.decode(value, { stream: true });
|
|
21397
|
+
let newlineIdx;
|
|
21398
|
+
while ((newlineIdx = buffer.indexOf("\n")) !== -1) {
|
|
21399
|
+
const line = buffer.slice(0, newlineIdx).trim();
|
|
21400
|
+
buffer = buffer.slice(newlineIdx + 1);
|
|
21401
|
+
if (line.length === 0) continue;
|
|
21402
|
+
const event = JSON.parse(line);
|
|
21403
|
+
if (event.phase === "error") {
|
|
21404
|
+
const errData = event.data;
|
|
21405
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21406
|
+
}
|
|
21407
|
+
const searchEvent = event;
|
|
21408
|
+
onPhase(searchEvent);
|
|
21409
|
+
lastResponse = searchEvent.data;
|
|
21410
|
+
}
|
|
21411
|
+
}
|
|
21412
|
+
const remaining = buffer.trim();
|
|
21413
|
+
if (remaining.length > 0) {
|
|
21414
|
+
const event = JSON.parse(remaining);
|
|
21415
|
+
if (event.phase === "error") {
|
|
21416
|
+
const errData = event.data;
|
|
21417
|
+
throw new Error(errData.error.message ?? "Streaming search error");
|
|
21418
|
+
}
|
|
21419
|
+
const searchEvent = event;
|
|
21420
|
+
onPhase(searchEvent);
|
|
21421
|
+
lastResponse = searchEvent.data;
|
|
21422
|
+
}
|
|
21423
|
+
if (!lastResponse) {
|
|
21424
|
+
throw new Error("No search results received");
|
|
21425
|
+
}
|
|
21426
|
+
return lastResponse;
|
|
21007
21427
|
}
|
|
21008
21428
|
};
|
|
21009
21429
|
}
|
|
@@ -21019,6 +21439,6 @@ function createSearchClient(options = {}) {
|
|
|
21019
21439
|
*)
|
|
21020
21440
|
*/
|
|
21021
21441
|
|
|
21022
|
-
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21442
|
+
export { IndexPipeline, JinaReranker, SearchEngine, createEmbeddingsProvider, createReranker, createSearchClient, createVectorStore, isServerless, loadConfig, mergeConfig, mergeConfigServerless, mergeSearchResults, resolveScope, runMcpServer, searchsocketHandle, searchsocketVitePlugin };
|
|
21023
21443
|
//# sourceMappingURL=index.js.map
|
|
21024
21444
|
//# sourceMappingURL=index.js.map
|