searchsocket 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +456 -187
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +590 -169
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +590 -170
- package/dist/sveltekit.cjs +380 -82
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +380 -82
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
|
-
import
|
|
4
|
+
import fs10 from "fs";
|
|
5
5
|
import fsp from "fs/promises";
|
|
6
|
-
import
|
|
6
|
+
import path14 from "path";
|
|
7
7
|
import { execSync as execSync2 } from "child_process";
|
|
8
8
|
import { config as dotenvConfig } from "dotenv";
|
|
9
9
|
import chokidar from "chokidar";
|
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.
|
|
15
|
+
version: "0.4.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -115,6 +115,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
115
115
|
envVar: z.string().min(1).optional(),
|
|
116
116
|
sanitize: z.boolean().optional()
|
|
117
117
|
}).optional(),
|
|
118
|
+
exclude: z.array(z.string()).optional(),
|
|
119
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
118
120
|
source: z.object({
|
|
119
121
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
120
122
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -245,6 +247,8 @@ function createDefaultConfig(projectId) {
|
|
|
245
247
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
246
248
|
sanitize: true
|
|
247
249
|
},
|
|
250
|
+
exclude: [],
|
|
251
|
+
respectRobotsTxt: true,
|
|
248
252
|
source: {
|
|
249
253
|
mode: "static-output",
|
|
250
254
|
staticOutputDir: "build",
|
|
@@ -275,7 +279,7 @@ function createDefaultConfig(projectId) {
|
|
|
275
279
|
},
|
|
276
280
|
embeddings: {
|
|
277
281
|
provider: "jina",
|
|
278
|
-
model: "jina-embeddings-
|
|
282
|
+
model: "jina-embeddings-v5-text-small",
|
|
279
283
|
apiKeyEnv: "JINA_API_KEY",
|
|
280
284
|
batchSize: 64,
|
|
281
285
|
concurrency: 4
|
|
@@ -288,9 +292,9 @@ function createDefaultConfig(projectId) {
|
|
|
288
292
|
}
|
|
289
293
|
},
|
|
290
294
|
rerank: {
|
|
291
|
-
enabled:
|
|
295
|
+
enabled: true,
|
|
292
296
|
topN: 20,
|
|
293
|
-
model: "jina-reranker-
|
|
297
|
+
model: "jina-reranker-v3"
|
|
294
298
|
},
|
|
295
299
|
ranking: {
|
|
296
300
|
enableIncomingLinkBoost: true,
|
|
@@ -393,6 +397,8 @@ ${issues}`
|
|
|
393
397
|
...defaults.scope,
|
|
394
398
|
...parsed.scope
|
|
395
399
|
},
|
|
400
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
401
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
396
402
|
source: {
|
|
397
403
|
...defaults.source,
|
|
398
404
|
...parsed.source,
|
|
@@ -829,7 +835,7 @@ function createEmbeddingsProvider(config) {
|
|
|
829
835
|
}
|
|
830
836
|
|
|
831
837
|
// src/indexing/pipeline.ts
|
|
832
|
-
import
|
|
838
|
+
import path12 from "path";
|
|
833
839
|
|
|
834
840
|
// src/vector/factory.ts
|
|
835
841
|
import fs3 from "fs";
|
|
@@ -1710,6 +1716,17 @@ function extractFromHtml(url, html, config) {
|
|
|
1710
1716
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
1711
1717
|
return null;
|
|
1712
1718
|
}
|
|
1719
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
1720
|
+
let weight;
|
|
1721
|
+
if (weightRaw !== void 0) {
|
|
1722
|
+
const parsed = Number(weightRaw);
|
|
1723
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
1724
|
+
weight = parsed;
|
|
1725
|
+
}
|
|
1726
|
+
}
|
|
1727
|
+
if (weight === 0) {
|
|
1728
|
+
return null;
|
|
1729
|
+
}
|
|
1713
1730
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
1714
1731
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
1715
1732
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -1765,7 +1782,8 @@ function extractFromHtml(url, html, config) {
|
|
|
1765
1782
|
noindex: false,
|
|
1766
1783
|
tags,
|
|
1767
1784
|
description,
|
|
1768
|
-
keywords
|
|
1785
|
+
keywords,
|
|
1786
|
+
weight
|
|
1769
1787
|
};
|
|
1770
1788
|
}
|
|
1771
1789
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -1778,6 +1796,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1778
1796
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
1779
1797
|
return null;
|
|
1780
1798
|
}
|
|
1799
|
+
let mdWeight;
|
|
1800
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
1801
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
1802
|
+
mdWeight = rawWeight;
|
|
1803
|
+
}
|
|
1804
|
+
if (mdWeight === 0) {
|
|
1805
|
+
return null;
|
|
1806
|
+
}
|
|
1781
1807
|
const content = parsed.content;
|
|
1782
1808
|
const normalized = normalizeMarkdown(content);
|
|
1783
1809
|
if (!normalizeText(normalized)) {
|
|
@@ -1800,7 +1826,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1800
1826
|
noindex: false,
|
|
1801
1827
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
1802
1828
|
description: fmDescription,
|
|
1803
|
-
keywords: fmKeywords
|
|
1829
|
+
keywords: fmKeywords,
|
|
1830
|
+
weight: mdWeight
|
|
1804
1831
|
};
|
|
1805
1832
|
}
|
|
1806
1833
|
|
|
@@ -1937,6 +1964,38 @@ import pLimit2 from "p-limit";
|
|
|
1937
1964
|
// src/indexing/sources/build/manifest-parser.ts
|
|
1938
1965
|
import fs5 from "fs/promises";
|
|
1939
1966
|
import path7 from "path";
|
|
1967
|
+
|
|
1968
|
+
// src/utils/pattern.ts
|
|
1969
|
+
/**
 * Test a URL path against a single pattern.
 *
 * Supported pattern forms (a trailing slash on either argument is ignored,
 * except for the root path "/"):
 *   - "/prefix/**"  matches "/prefix" itself and anything below it;
 *                   "/**" matches every URL.
 *   - "/prefix/*"   matches exactly one extra path segment below "/prefix";
 *                   "/*" matches any single-segment path except "/".
 *   - anything else must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} True when the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) =>
    value === "/" || !value.endsWith("/") ? value : value.slice(0, -1);
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  const isDeep = glob.endsWith("/**");
  const isShallow = !isDeep && glob.endsWith("/*");
  if (!isDeep && !isShallow) {
    // Plain pattern: exact comparison only.
    return target === glob;
  }

  // Drop the wildcard suffix ("/**" is three characters, "/*" is two).
  const base = glob.slice(0, isDeep ? -3 : -2);

  if (isDeep) {
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  if (base === "") {
    // "/*": a single segment directly under the root.
    return target !== "/" && !target.slice(1).includes("/");
  }
  if (!target.startsWith(`${base}/`)) {
    return false;
  }
  const tail = target.slice(base.length + 1);
  return tail !== "" && tail.indexOf("/") === -1;
}
|
|
1991
|
+
/**
 * Report whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns, each in the form accepted by matchUrlPattern.
 * @returns {boolean} True on the first matching pattern, false when none match.
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
|
|
1997
|
+
|
|
1998
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
1940
1999
|
function routeIdToFile(routeId) {
|
|
1941
2000
|
if (routeId === "/") {
|
|
1942
2001
|
return "src/routes/+page.svelte";
|
|
@@ -2010,15 +2069,7 @@ function expandDynamicUrl(url, value) {
|
|
|
2010
2069
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
2011
2070
|
}
|
|
2012
2071
|
function isExcluded(url, patterns) {
|
|
2013
|
-
|
|
2014
|
-
if (pattern.endsWith("/*")) {
|
|
2015
|
-
const prefix = pattern.slice(0, -1);
|
|
2016
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
2017
|
-
} else if (url === pattern) {
|
|
2018
|
-
return true;
|
|
2019
|
-
}
|
|
2020
|
-
}
|
|
2021
|
-
return false;
|
|
2072
|
+
return matchUrlPatterns(url, patterns);
|
|
2022
2073
|
}
|
|
2023
2074
|
|
|
2024
2075
|
// src/indexing/sources/build/preview-server.ts
|
|
@@ -2456,6 +2507,162 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
2456
2507
|
return pages;
|
|
2457
2508
|
}
|
|
2458
2509
|
|
|
2510
|
+
// src/indexing/robots.ts
|
|
2511
|
+
import fs9 from "fs/promises";
|
|
2512
|
+
import path11 from "path";
|
|
2513
|
+
/**
 * Parse robots.txt content and return the rule set that applies to one crawler.
 *
 * Grouping rules:
 *   - One or more consecutive `User-agent` lines open a group; the
 *     `Allow`/`Disallow` lines that follow belong to every agent in it.
 *   - A `User-agent` line that appears after at least one rule line starts a
 *     NEW group, so rules never leak from a later group into an earlier one.
 *   - Other directives (`Sitemap`, `Crawl-delay`, ...) are ignored and do not
 *     interrupt the current group.
 *   - `#` comments and lines without a colon are skipped.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name to look up (case-insensitive).
 * @returns {{disallow: string[], allow: string[]}} Rules for `userAgent` when
 *   its group holds at least one rule, otherwise the `*` group, otherwise
 *   empty lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then begins a fresh group instead of joining this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      if (!currentAgents.includes(agentName)) {
        currentAgents.push(agentName);
      }
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule value ends the run of User-agent lines.
      groupHasRules = true;
      if (!value) continue;
      for (const agent of currentAgents) {
        agentGroups.get(agent)[directive].push(value);
      }
    }
    // Any other directive is deliberately ignored so it cannot detach the
    // rules that follow it from their group.
  }

  const specific = agentGroups.get(userAgent.toLowerCase());
  if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
2548
|
+
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * Rules are treated as plain path prefixes. The longest matching Disallow
 * prefix is compared with the longest matching Allow prefix; the path is
 * blocked only when the Disallow prefix is strictly longer, so a tie goes to
 * Allow. With no matching Disallow rule the path is never blocked.
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules - Parsed rule set.
 * @returns {boolean} True when the path is blocked.
 */
function isBlockedByRobots(urlPath, rules) {
  // Length of the longest entry in `prefixes` that urlPath starts with (0 if none).
  const longestMatchLength = (prefixes) =>
    prefixes.reduce(
      (best, prefix) => (urlPath.startsWith(prefix) && prefix.length > best ? prefix.length : best),
      0
    );

  const disallowLength = longestMatchLength(rules.disallow);
  if (disallowLength === 0) {
    return false;
  }
  return longestMatchLength(rules.allow) < disallowLength;
}
|
|
2564
|
+
/**
 * Read `robots.txt` from a directory and parse it with the default user agent.
 *
 * Best-effort: any failure while reading or parsing yields `null` rather than
 * an exception.
 *
 * @param {string} dir - Directory expected to contain robots.txt.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules, or null on failure.
 */
async function loadRobotsTxtFromDir(dir) {
  try {
    const robotsPath = path11.join(dir, "robots.txt");
    const text = await fs9.readFile(robotsPath, "utf8");
    return parseRobotsTxt(text);
  } catch {
    return null;
  }
}
|
|
2572
|
+
/**
 * Fetch `/robots.txt` relative to a base URL and parse it with the default
 * user agent.
 *
 * Best-effort: an invalid base URL, a failed request, a non-OK response, or a
 * parse failure all yield `null` rather than an exception.
 *
 * @param {string} baseUrl - Base URL of the site.
 * @returns {Promise<{disallow: string[], allow: string[]} | null>} Parsed rules, or null on failure.
 */
async function fetchRobotsTxt(baseUrl) {
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    return res.ok ? parseRobotsTxt(await res.text()) : null;
  } catch {
    return null;
  }
}
|
|
2583
|
+
|
|
2584
|
+
// src/search/ranking.ts
|
|
2585
|
+
/**
 * Clamp a value to a finite, non-negative number.
 *
 * @param {*} value - Candidate value.
 * @returns {number} `value` when it is a finite number >= 0; otherwise 0
 *   (covers negatives, NaN, infinities and non-number inputs).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(value, 0) : 0;
}
|
|
2591
|
+
/**
 * Score vector hits and order them best-first.
 *
 * Each hit starts from its own `score` (negative infinity when that is not a
 * finite number). When enabled in `config.ranking`, two boosts are added:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 * Non-finite or negative metadata values count as 0 in both boosts.
 *
 * @param {Array<{score: number, metadata: {incomingLinks: number, depth: number}}>} hits
 * @param {object} config - Reads `config.ranking` flags and weights.
 * @returns {Array<{hit: object, finalScore: number}>} New array sorted by
 *   descending `finalScore`; a non-finite total becomes negative infinity.
 */
function rankHits(hits, config) {
  const clamp = (value) => (Number.isFinite(value) ? Math.max(0, value) : 0);
  // Comparing two -Infinity scores yields NaN; treat that as "equal".
  const byFinalScoreDesc = (left, right) => {
    const gap = right.finalScore - left.finalScore;
    return Number.isNaN(gap) ? 0 : gap;
  };

  const scored = [];
  for (const hit of hits) {
    const ranking = config.ranking;
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (ranking.enableIncomingLinkBoost) {
      total += Math.log(1 + clamp(hit.metadata.incomingLinks)) * ranking.weights.incomingLinks;
    }
    if (ranking.enableDepthBoost) {
      total += (1 / (1 + clamp(hit.metadata.depth))) * ranking.weights.depth;
    }
    scored.push({
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    });
  }
  return scored.sort(byFinalScoreDesc);
}
|
|
2611
|
+
/**
 * Look up the weight configured for a URL.
 *
 * Every key of `pageWeights` is a pattern checked with matchUrlPattern. Among
 * the matching patterns the one with the longest pattern string wins; on equal
 * length the earlier key is kept.
 *
 * @param {string} url - URL path of the page.
 * @param {Record<string, number>} pageWeights - Pattern → weight map.
 * @returns {number} Weight of the winning pattern, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  let winner = { pattern: "", weight: 1 };
  for (const candidate of Object.keys(pageWeights)) {
    // Only a strictly longer pattern can displace the current winner.
    if (candidate.length <= winner.pattern.length) continue;
    if (!matchUrlPattern(url, candidate)) continue;
    winner = { pattern: candidate, weight: pageWeights[candidate] };
  }
  return winner.weight;
}
|
|
2622
|
+
/**
 * Collapse ranked chunk hits into one entry per page URL.
 *
 * For every URL the chunks are sorted by descending `finalScore`. The page
 * score is the best chunk's score plus a decayed bonus from the next chunks
 * (up to `aggregationCap` chunks in total, chunk i scaled by
 * `aggregationDecay ** i`), the bonus weighted by `weights.aggregation`.
 * The result is multiplied by the page weight from `ranking.pageWeights`;
 * pages whose weight is exactly 0 are dropped.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Output of rankHits.
 * @param {object} config - Reads `config.ranking`.
 * @returns {Array<object>} Page entries (url, title, routeFile, pageScore,
 *   bestChunk, matchingChunks) sorted by descending `pageScore`.
 */
function aggregateByPage(ranked, config) {
  // Descending numeric order; NaN differences (e.g. -Infinity vs -Infinity) count as ties.
  const descending = (left, right) => {
    const gap = right - left;
    return Number.isNaN(gap) ? 0 : gap;
  };

  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const key = entry.hit.metadata.url;
    const bucket = chunksByUrl.get(key);
    if (bucket) {
      bucket.push(entry);
    } else {
      chunksByUrl.set(key, [entry]);
    }
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  const pages = [];
  for (const [url, chunks] of chunksByUrl) {
    chunks.sort((a, b) => descending(a.finalScore, b.finalScore));
    const best = chunks[0];
    const topScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;

    // Bonus from the runner-up chunks; non-finite scores contribute nothing.
    const bonus = chunks
      .slice(0, aggregationCap)
      .slice(1)
      .reduce((sum, chunk, offset) => {
        const contribution = Number.isFinite(chunk.finalScore) ? chunk.finalScore : 0;
        return sum + contribution * Math.pow(aggregationDecay, offset + 1);
      }, 0);

    let pageScore = topScore + bonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) {
      continue;
    }
    if (pageWeight !== 1) {
      pageScore = pageScore * pageWeight;
    }

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
      bestChunk: best,
      matchingChunks: chunks
    });
  }
  return pages.sort((a, b) => descending(a.pageScore, b.pageScore));
}
|
|
2665
|
+
|
|
2459
2666
|
// src/utils/time.ts
|
|
2460
2667
|
function nowIso() {
|
|
2461
2668
|
return (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -2466,9 +2673,10 @@ function hrTimeMs(start) {
|
|
|
2466
2673
|
|
|
2467
2674
|
// src/indexing/pipeline.ts
|
|
2468
2675
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
2469
|
-
"jina-embeddings-v3": 2e-5
|
|
2676
|
+
"jina-embeddings-v3": 2e-5,
|
|
2677
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
2470
2678
|
};
|
|
2471
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
2679
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
2472
2680
|
var IndexPipeline = class _IndexPipeline {
|
|
2473
2681
|
cwd;
|
|
2474
2682
|
config;
|
|
@@ -2483,7 +2691,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2483
2691
|
this.logger = options.logger;
|
|
2484
2692
|
}
|
|
2485
2693
|
static async create(options = {}) {
|
|
2486
|
-
const cwd =
|
|
2694
|
+
const cwd = path12.resolve(options.cwd ?? process.cwd());
|
|
2487
2695
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
2488
2696
|
const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
|
|
2489
2697
|
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
@@ -2546,6 +2754,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2546
2754
|
}
|
|
2547
2755
|
stageEnd("source", sourceStart);
|
|
2548
2756
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2757
|
+
const filterStart = stageStart();
|
|
2758
|
+
let filteredSourcePages = sourcePages;
|
|
2759
|
+
if (this.config.exclude.length > 0) {
|
|
2760
|
+
const beforeExclude = filteredSourcePages.length;
|
|
2761
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2762
|
+
const url = normalizeUrlPath(p.url);
|
|
2763
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
2764
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
2765
|
+
return false;
|
|
2766
|
+
}
|
|
2767
|
+
return true;
|
|
2768
|
+
});
|
|
2769
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
2770
|
+
if (excludedCount > 0) {
|
|
2771
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
2772
|
+
}
|
|
2773
|
+
}
|
|
2774
|
+
if (this.config.respectRobotsTxt) {
|
|
2775
|
+
let robotsRules = null;
|
|
2776
|
+
if (sourceMode === "static-output") {
|
|
2777
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2778
|
+
path12.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
2779
|
+
);
|
|
2780
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
2781
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2782
|
+
path12.resolve(this.cwd, this.config.source.build.outputDir)
|
|
2783
|
+
);
|
|
2784
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
2785
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
2786
|
+
}
|
|
2787
|
+
if (robotsRules) {
|
|
2788
|
+
const beforeRobots = filteredSourcePages.length;
|
|
2789
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2790
|
+
const url = normalizeUrlPath(p.url);
|
|
2791
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
2792
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
2793
|
+
return false;
|
|
2794
|
+
}
|
|
2795
|
+
return true;
|
|
2796
|
+
});
|
|
2797
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
2798
|
+
if (robotsExcluded > 0) {
|
|
2799
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
2800
|
+
}
|
|
2801
|
+
}
|
|
2802
|
+
}
|
|
2803
|
+
stageEnd("filter", filterStart);
|
|
2549
2804
|
const routeStart = stageStart();
|
|
2550
2805
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2551
2806
|
stageEnd("route_map", routeStart);
|
|
@@ -2553,7 +2808,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2553
2808
|
const extractStart = stageStart();
|
|
2554
2809
|
this.logger.info("Extracting content...");
|
|
2555
2810
|
const extractedPages = [];
|
|
2556
|
-
for (const sourcePage of
|
|
2811
|
+
for (const sourcePage of filteredSourcePages) {
|
|
2557
2812
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
2558
2813
|
if (!extracted) {
|
|
2559
2814
|
this.logger.warn(
|
|
@@ -2579,16 +2834,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2579
2834
|
seenUrls.add(page.url);
|
|
2580
2835
|
uniquePages.push(page);
|
|
2581
2836
|
}
|
|
2837
|
+
const indexablePages = [];
|
|
2838
|
+
for (const page of uniquePages) {
|
|
2839
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
2840
|
+
if (effectiveWeight === 0) {
|
|
2841
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
2842
|
+
continue;
|
|
2843
|
+
}
|
|
2844
|
+
indexablePages.push(page);
|
|
2845
|
+
}
|
|
2846
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
2847
|
+
if (zeroWeightCount > 0) {
|
|
2848
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
2849
|
+
}
|
|
2582
2850
|
stageEnd("extract", extractStart);
|
|
2583
|
-
const skippedPages =
|
|
2584
|
-
this.logger.info(`Extracted ${
|
|
2851
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
2852
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2585
2853
|
const linkStart = stageStart();
|
|
2586
|
-
const pageSet = new Set(
|
|
2854
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
2587
2855
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
2588
|
-
for (const page of
|
|
2856
|
+
for (const page of indexablePages) {
|
|
2589
2857
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
2590
2858
|
}
|
|
2591
|
-
for (const page of
|
|
2859
|
+
for (const page of indexablePages) {
|
|
2592
2860
|
for (const outgoing of page.outgoingLinks) {
|
|
2593
2861
|
if (!pageSet.has(outgoing)) {
|
|
2594
2862
|
continue;
|
|
@@ -2612,7 +2880,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2612
2880
|
});
|
|
2613
2881
|
}
|
|
2614
2882
|
}
|
|
2615
|
-
for (const page of
|
|
2883
|
+
for (const page of indexablePages) {
|
|
2616
2884
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
2617
2885
|
if (routeMatch.routeResolution === "best-effort") {
|
|
2618
2886
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -2838,7 +3106,7 @@ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js
|
|
|
2838
3106
|
import { z as z3 } from "zod";
|
|
2839
3107
|
|
|
2840
3108
|
// src/search/engine.ts
|
|
2841
|
-
import
|
|
3109
|
+
import path13 from "path";
|
|
2842
3110
|
import { z as z2 } from "zod";
|
|
2843
3111
|
|
|
2844
3112
|
// src/rerank/jina.ts
|
|
@@ -2854,7 +3122,7 @@ var JinaReranker = class {
|
|
|
2854
3122
|
constructor(options) {
|
|
2855
3123
|
this.apiKey = options.apiKey;
|
|
2856
3124
|
this.model = options.model;
|
|
2857
|
-
this.maxRetries = options.maxRetries ??
|
|
3125
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
2858
3126
|
}
|
|
2859
3127
|
async rerank(query, candidates, topN) {
|
|
2860
3128
|
if (candidates.length === 0) {
|
|
@@ -2864,7 +3132,8 @@ var JinaReranker = class {
|
|
|
2864
3132
|
model: this.model,
|
|
2865
3133
|
query,
|
|
2866
3134
|
documents: candidates.map((candidate) => candidate.text),
|
|
2867
|
-
top_n: topN ?? candidates.length
|
|
3135
|
+
top_n: topN ?? candidates.length,
|
|
3136
|
+
return_documents: false
|
|
2868
3137
|
};
|
|
2869
3138
|
let attempt = 0;
|
|
2870
3139
|
while (attempt <= this.maxRetries) {
|
|
@@ -2937,98 +3206,6 @@ function createReranker(config) {
|
|
|
2937
3206
|
});
|
|
2938
3207
|
}
|
|
2939
3208
|
|
|
2940
|
-
// src/search/ranking.ts
|
|
2941
|
-
function nonNegativeOrZero(value) {
|
|
2942
|
-
if (!Number.isFinite(value)) {
|
|
2943
|
-
return 0;
|
|
2944
|
-
}
|
|
2945
|
-
return Math.max(0, value);
|
|
2946
|
-
}
|
|
2947
|
-
function rankHits(hits, config) {
|
|
2948
|
-
return hits.map((hit) => {
|
|
2949
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
2950
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
2951
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
2952
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
2953
|
-
}
|
|
2954
|
-
if (config.ranking.enableDepthBoost) {
|
|
2955
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
2956
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
2957
|
-
}
|
|
2958
|
-
return {
|
|
2959
|
-
hit,
|
|
2960
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
2961
|
-
};
|
|
2962
|
-
}).sort((a, b) => {
|
|
2963
|
-
const delta = b.finalScore - a.finalScore;
|
|
2964
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
2965
|
-
});
|
|
2966
|
-
}
|
|
2967
|
-
function findPageWeight(url, pageWeights) {
|
|
2968
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
2969
|
-
const normalizedUrl = norm(url);
|
|
2970
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2971
|
-
if (norm(pattern) === normalizedUrl) {
|
|
2972
|
-
return weight;
|
|
2973
|
-
}
|
|
2974
|
-
}
|
|
2975
|
-
let bestPrefix = "";
|
|
2976
|
-
let bestWeight = 1;
|
|
2977
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2978
|
-
const normalizedPattern = norm(pattern);
|
|
2979
|
-
if (normalizedPattern === "/") continue;
|
|
2980
|
-
const prefix = `${normalizedPattern}/`;
|
|
2981
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
2982
|
-
bestPrefix = prefix;
|
|
2983
|
-
bestWeight = weight;
|
|
2984
|
-
}
|
|
2985
|
-
}
|
|
2986
|
-
return bestWeight;
|
|
2987
|
-
}
|
|
2988
|
-
function aggregateByPage(ranked, config) {
|
|
2989
|
-
const groups = /* @__PURE__ */ new Map();
|
|
2990
|
-
for (const hit of ranked) {
|
|
2991
|
-
const url = hit.hit.metadata.url;
|
|
2992
|
-
const group = groups.get(url);
|
|
2993
|
-
if (group) group.push(hit);
|
|
2994
|
-
else groups.set(url, [hit]);
|
|
2995
|
-
}
|
|
2996
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
2997
|
-
const pages = [];
|
|
2998
|
-
for (const [url, chunks] of groups) {
|
|
2999
|
-
chunks.sort((a, b) => {
|
|
3000
|
-
const delta = b.finalScore - a.finalScore;
|
|
3001
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3002
|
-
});
|
|
3003
|
-
const best = chunks[0];
|
|
3004
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
3005
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
3006
|
-
let aggregationBonus = 0;
|
|
3007
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
3008
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
3009
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
3010
|
-
}
|
|
3011
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
3012
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
3013
|
-
if (pageWeight === 0) continue;
|
|
3014
|
-
if (pageWeight !== 1) {
|
|
3015
|
-
pageScore *= pageWeight;
|
|
3016
|
-
}
|
|
3017
|
-
pages.push({
|
|
3018
|
-
url,
|
|
3019
|
-
title: best.hit.metadata.title,
|
|
3020
|
-
routeFile: best.hit.metadata.routeFile,
|
|
3021
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
3022
|
-
bestChunk: best,
|
|
3023
|
-
matchingChunks: chunks
|
|
3024
|
-
});
|
|
3025
|
-
}
|
|
3026
|
-
return pages.sort((a, b) => {
|
|
3027
|
-
const delta = b.pageScore - a.pageScore;
|
|
3028
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3029
|
-
});
|
|
3030
|
-
}
|
|
3031
|
-
|
|
3032
3209
|
// src/search/engine.ts
|
|
3033
3210
|
var requestSchema = z2.object({
|
|
3034
3211
|
q: z2.string().trim().min(1),
|
|
@@ -3037,7 +3214,8 @@ var requestSchema = z2.object({
|
|
|
3037
3214
|
pathPrefix: z2.string().optional(),
|
|
3038
3215
|
tags: z2.array(z2.string()).optional(),
|
|
3039
3216
|
rerank: z2.boolean().optional(),
|
|
3040
|
-
groupBy: z2.enum(["page", "chunk"]).optional()
|
|
3217
|
+
groupBy: z2.enum(["page", "chunk"]).optional(),
|
|
3218
|
+
stream: z2.boolean().optional()
|
|
3041
3219
|
});
|
|
3042
3220
|
var SearchEngine = class _SearchEngine {
|
|
3043
3221
|
cwd;
|
|
@@ -3053,7 +3231,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3053
3231
|
this.reranker = options.reranker;
|
|
3054
3232
|
}
|
|
3055
3233
|
static async create(options = {}) {
|
|
3056
|
-
const cwd =
|
|
3234
|
+
const cwd = path13.resolve(options.cwd ?? process.cwd());
|
|
3057
3235
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
3058
3236
|
const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
|
|
3059
3237
|
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
@@ -3110,7 +3288,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
3110
3288
|
rerankMs = hrTimeMs(rerankStart);
|
|
3111
3289
|
usedRerank = true;
|
|
3112
3290
|
}
|
|
3113
|
-
|
|
3291
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
3292
|
+
return {
|
|
3293
|
+
q: input.q,
|
|
3294
|
+
scope: resolvedScope.scopeName,
|
|
3295
|
+
results,
|
|
3296
|
+
meta: {
|
|
3297
|
+
timingsMs: {
|
|
3298
|
+
embed: Math.round(embedMs),
|
|
3299
|
+
vector: Math.round(vectorMs),
|
|
3300
|
+
rerank: Math.round(rerankMs),
|
|
3301
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3302
|
+
},
|
|
3303
|
+
usedRerank,
|
|
3304
|
+
modelId: this.config.embeddings.model
|
|
3305
|
+
}
|
|
3306
|
+
};
|
|
3307
|
+
}
|
|
3308
|
+
async *searchStreaming(request) {
|
|
3309
|
+
const parsed = requestSchema.safeParse(request);
|
|
3310
|
+
if (!parsed.success) {
|
|
3311
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
3312
|
+
}
|
|
3313
|
+
const input = parsed.data;
|
|
3314
|
+
const wantsRerank = Boolean(input.rerank);
|
|
3315
|
+
if (!wantsRerank) {
|
|
3316
|
+
const response = await this.search(request);
|
|
3317
|
+
yield { phase: "initial", data: response };
|
|
3318
|
+
return;
|
|
3319
|
+
}
|
|
3320
|
+
const totalStart = process.hrtime.bigint();
|
|
3321
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
3322
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
3323
|
+
const topK = input.topK ?? 10;
|
|
3324
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
3325
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
3326
|
+
const embedStart = process.hrtime.bigint();
|
|
3327
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
3328
|
+
const queryVector = queryEmbeddings[0];
|
|
3329
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
3330
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
3331
|
+
}
|
|
3332
|
+
const embedMs = hrTimeMs(embedStart);
|
|
3333
|
+
const vectorStart = process.hrtime.bigint();
|
|
3334
|
+
const hits = await this.vectorStore.query(
|
|
3335
|
+
queryVector,
|
|
3336
|
+
{
|
|
3337
|
+
topK: candidateK,
|
|
3338
|
+
pathPrefix: input.pathPrefix,
|
|
3339
|
+
tags: input.tags
|
|
3340
|
+
},
|
|
3341
|
+
resolvedScope
|
|
3342
|
+
);
|
|
3343
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
3344
|
+
const ranked = rankHits(hits, this.config);
|
|
3345
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
3346
|
+
yield {
|
|
3347
|
+
phase: "initial",
|
|
3348
|
+
data: {
|
|
3349
|
+
q: input.q,
|
|
3350
|
+
scope: resolvedScope.scopeName,
|
|
3351
|
+
results: initialResults,
|
|
3352
|
+
meta: {
|
|
3353
|
+
timingsMs: {
|
|
3354
|
+
embed: Math.round(embedMs),
|
|
3355
|
+
vector: Math.round(vectorMs),
|
|
3356
|
+
rerank: 0,
|
|
3357
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3358
|
+
},
|
|
3359
|
+
usedRerank: false,
|
|
3360
|
+
modelId: this.config.embeddings.model
|
|
3361
|
+
}
|
|
3362
|
+
}
|
|
3363
|
+
};
|
|
3364
|
+
const rerankStart = process.hrtime.bigint();
|
|
3365
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
3366
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
3367
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
3368
|
+
yield {
|
|
3369
|
+
phase: "reranked",
|
|
3370
|
+
data: {
|
|
3371
|
+
q: input.q,
|
|
3372
|
+
scope: resolvedScope.scopeName,
|
|
3373
|
+
results: rerankedResults,
|
|
3374
|
+
meta: {
|
|
3375
|
+
timingsMs: {
|
|
3376
|
+
embed: Math.round(embedMs),
|
|
3377
|
+
vector: Math.round(vectorMs),
|
|
3378
|
+
rerank: Math.round(rerankMs),
|
|
3379
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3380
|
+
},
|
|
3381
|
+
usedRerank: true,
|
|
3382
|
+
modelId: this.config.embeddings.model
|
|
3383
|
+
}
|
|
3384
|
+
}
|
|
3385
|
+
};
|
|
3386
|
+
}
|
|
3387
|
+
buildResults(ordered, topK, groupByPage) {
|
|
3114
3388
|
const minScore = this.config.ranking.minScore;
|
|
3115
3389
|
if (groupByPage) {
|
|
3116
3390
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -3118,10 +3392,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
3118
3392
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3119
3393
|
}
|
|
3120
3394
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
3121
|
-
|
|
3395
|
+
return pages.slice(0, topK).map((page) => {
|
|
3122
3396
|
const bestScore = page.bestChunk.finalScore;
|
|
3123
|
-
const
|
|
3124
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
3397
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
3398
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
3125
3399
|
return {
|
|
3126
3400
|
url: page.url,
|
|
3127
3401
|
title: page.title,
|
|
@@ -3138,10 +3412,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
3138
3412
|
};
|
|
3139
3413
|
});
|
|
3140
3414
|
} else {
|
|
3415
|
+
let filtered = ordered;
|
|
3141
3416
|
if (minScore > 0) {
|
|
3142
|
-
|
|
3417
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3143
3418
|
}
|
|
3144
|
-
|
|
3419
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
3145
3420
|
url: hit.metadata.url,
|
|
3146
3421
|
title: hit.metadata.title,
|
|
3147
3422
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -3150,21 +3425,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
3150
3425
|
routeFile: hit.metadata.routeFile
|
|
3151
3426
|
}));
|
|
3152
3427
|
}
|
|
3153
|
-
return {
|
|
3154
|
-
q: input.q,
|
|
3155
|
-
scope: resolvedScope.scopeName,
|
|
3156
|
-
results,
|
|
3157
|
-
meta: {
|
|
3158
|
-
timingsMs: {
|
|
3159
|
-
embed: Math.round(embedMs),
|
|
3160
|
-
vector: Math.round(vectorMs),
|
|
3161
|
-
rerank: Math.round(rerankMs),
|
|
3162
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
3163
|
-
},
|
|
3164
|
-
usedRerank,
|
|
3165
|
-
modelId: this.config.embeddings.model
|
|
3166
|
-
}
|
|
3167
|
-
};
|
|
3168
3428
|
}
|
|
3169
3429
|
async getPage(pathOrUrl, scope) {
|
|
3170
3430
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -3236,6 +3496,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3236
3496
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3237
3497
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3238
3498
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3499
|
+
const MAX_DOC_CHARS = 2e3;
|
|
3239
3500
|
const pageCandidates = [];
|
|
3240
3501
|
for (const [url, chunks] of pageGroups) {
|
|
3241
3502
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -3255,12 +3516,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
3255
3516
|
}
|
|
3256
3517
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3257
3518
|
parts.push(body);
|
|
3258
|
-
|
|
3519
|
+
let text = parts.join("\n\n");
|
|
3520
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
3521
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
3522
|
+
}
|
|
3523
|
+
pageCandidates.push({ id: url, text });
|
|
3259
3524
|
}
|
|
3525
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
3526
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
3260
3527
|
const reranked = await this.reranker.rerank(
|
|
3261
3528
|
query,
|
|
3262
|
-
|
|
3263
|
-
|
|
3529
|
+
cappedCandidates,
|
|
3530
|
+
maxCandidates
|
|
3264
3531
|
);
|
|
3265
3532
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3266
3533
|
return ranked.map((entry) => {
|
|
@@ -3282,7 +3549,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3282
3549
|
};
|
|
3283
3550
|
|
|
3284
3551
|
// src/mcp/server.ts
|
|
3285
|
-
function createServer(engine) {
|
|
3552
|
+
function createServer(engine, config) {
|
|
3286
3553
|
const server = new McpServer({
|
|
3287
3554
|
name: "searchsocket-mcp",
|
|
3288
3555
|
version: "0.1.0"
|
|
@@ -3290,14 +3557,15 @@ function createServer(engine) {
|
|
|
3290
3557
|
server.registerTool(
|
|
3291
3558
|
"search",
|
|
3292
3559
|
{
|
|
3293
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
3560
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
3294
3561
|
inputSchema: {
|
|
3295
3562
|
query: z3.string().min(1),
|
|
3296
3563
|
scope: z3.string().optional(),
|
|
3297
3564
|
topK: z3.number().int().positive().max(100).optional(),
|
|
3298
3565
|
pathPrefix: z3.string().optional(),
|
|
3299
3566
|
tags: z3.array(z3.string()).optional(),
|
|
3300
|
-
groupBy: z3.enum(["page", "chunk"]).optional()
|
|
3567
|
+
groupBy: z3.enum(["page", "chunk"]).optional(),
|
|
3568
|
+
rerank: z3.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
3301
3569
|
}
|
|
3302
3570
|
},
|
|
3303
3571
|
async (input) => {
|
|
@@ -3307,7 +3575,8 @@ function createServer(engine) {
|
|
|
3307
3575
|
scope: input.scope,
|
|
3308
3576
|
pathPrefix: input.pathPrefix,
|
|
3309
3577
|
tags: input.tags,
|
|
3310
|
-
groupBy: input.groupBy
|
|
3578
|
+
groupBy: input.groupBy,
|
|
3579
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
3311
3580
|
});
|
|
3312
3581
|
return {
|
|
3313
3582
|
content: [
|
|
@@ -3435,10 +3704,10 @@ async function runMcpServer(options = {}) {
|
|
|
3435
3704
|
config
|
|
3436
3705
|
});
|
|
3437
3706
|
if (resolvedTransport === "http") {
|
|
3438
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
3707
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
3439
3708
|
return;
|
|
3440
3709
|
}
|
|
3441
|
-
const server = createServer(engine);
|
|
3710
|
+
const server = createServer(engine, config);
|
|
3442
3711
|
const stdioTransport = new StdioServerTransport();
|
|
3443
3712
|
await server.connect(stdioTransport);
|
|
3444
3713
|
}
|
|
@@ -3507,7 +3776,7 @@ function collectWatchPaths(config, cwd) {
|
|
|
3507
3776
|
const paths = ["src/routes/**"];
|
|
3508
3777
|
if (config.source.mode === "content-files" && config.source.contentFiles) {
|
|
3509
3778
|
for (const pattern of config.source.contentFiles.globs) {
|
|
3510
|
-
paths.push(
|
|
3779
|
+
paths.push(path14.join(config.source.contentFiles.baseDir, pattern));
|
|
3511
3780
|
}
|
|
3512
3781
|
}
|
|
3513
3782
|
if (config.source.mode === "static-output") {
|
|
@@ -3520,15 +3789,15 @@ function collectWatchPaths(config, cwd) {
|
|
|
3520
3789
|
paths.push("searchsocket.config.ts");
|
|
3521
3790
|
paths.push(config.source.build.outputDir);
|
|
3522
3791
|
}
|
|
3523
|
-
return paths.map((value) =>
|
|
3792
|
+
return paths.map((value) => path14.resolve(cwd, value));
|
|
3524
3793
|
}
|
|
3525
3794
|
function ensureStateDir(cwd) {
|
|
3526
|
-
const target =
|
|
3527
|
-
|
|
3795
|
+
const target = path14.join(cwd, ".searchsocket");
|
|
3796
|
+
fs10.mkdirSync(target, { recursive: true });
|
|
3528
3797
|
return target;
|
|
3529
3798
|
}
|
|
3530
3799
|
function ensureGitignore(cwd) {
|
|
3531
|
-
const gitignorePath =
|
|
3800
|
+
const gitignorePath = path14.join(cwd, ".gitignore");
|
|
3532
3801
|
const entries = [
|
|
3533
3802
|
".searchsocket/vectors.db",
|
|
3534
3803
|
".searchsocket/vectors.db-shm",
|
|
@@ -3537,8 +3806,8 @@ function ensureGitignore(cwd) {
|
|
|
3537
3806
|
".searchsocket/registry.json"
|
|
3538
3807
|
];
|
|
3539
3808
|
let content = "";
|
|
3540
|
-
if (
|
|
3541
|
-
content =
|
|
3809
|
+
if (fs10.existsSync(gitignorePath)) {
|
|
3810
|
+
content = fs10.readFileSync(gitignorePath, "utf8");
|
|
3542
3811
|
}
|
|
3543
3812
|
const lines = content.split("\n");
|
|
3544
3813
|
const missing = entries.filter((entry) => !lines.some((line) => line.trim() === entry));
|
|
@@ -3549,10 +3818,10 @@ function ensureGitignore(cwd) {
|
|
|
3549
3818
|
# SearchSocket local state
|
|
3550
3819
|
${missing.join("\n")}
|
|
3551
3820
|
`;
|
|
3552
|
-
|
|
3821
|
+
fs10.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
|
|
3553
3822
|
}
|
|
3554
3823
|
function readScopesFromFile(filePath) {
|
|
3555
|
-
const raw =
|
|
3824
|
+
const raw = fs10.readFileSync(filePath, "utf8");
|
|
3556
3825
|
return new Set(
|
|
3557
3826
|
raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
|
|
3558
3827
|
);
|
|
@@ -3576,8 +3845,8 @@ function readRemoteGitBranches(cwd) {
|
|
|
3576
3845
|
}
|
|
3577
3846
|
}
|
|
3578
3847
|
async function loadResolvedConfigForDev(cwd, configPath) {
|
|
3579
|
-
const resolvedConfigPath =
|
|
3580
|
-
if (
|
|
3848
|
+
const resolvedConfigPath = path14.resolve(cwd, configPath ?? "searchsocket.config.ts");
|
|
3849
|
+
if (fs10.existsSync(resolvedConfigPath)) {
|
|
3581
3850
|
return loadConfig({ cwd, configPath });
|
|
3582
3851
|
}
|
|
3583
3852
|
return mergeConfig(cwd, {});
|
|
@@ -3624,7 +3893,7 @@ var program = new Command();
|
|
|
3624
3893
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
3625
3894
|
program.command("init").description("Create searchsocket.config.ts and .searchsocket state directory").action(async (_opts, command) => {
|
|
3626
3895
|
const root = getRootOptions(command).cwd ?? process.cwd();
|
|
3627
|
-
const cwd =
|
|
3896
|
+
const cwd = path14.resolve(root);
|
|
3628
3897
|
const configPath = writeMinimalConfig(cwd);
|
|
3629
3898
|
const stateDir = ensureStateDir(cwd);
|
|
3630
3899
|
ensureGitignore(cwd);
|
|
@@ -3644,13 +3913,13 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3644
3913
|
});
|
|
3645
3914
|
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3646
3915
|
const rootOpts = getRootOptions(command);
|
|
3647
|
-
const cwd =
|
|
3916
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3648
3917
|
await runIndexCommand({
|
|
3649
3918
|
cwd,
|
|
3650
3919
|
configPath: rootOpts?.config,
|
|
3651
3920
|
scope: opts.scope,
|
|
3652
3921
|
changedOnly: opts.changedOnly,
|
|
3653
|
-
force: opts.force,
|
|
3922
|
+
force: opts.force || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
3654
3923
|
dryRun: opts.dryRun,
|
|
3655
3924
|
source: opts.source,
|
|
3656
3925
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
@@ -3662,7 +3931,7 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3662
3931
|
});
|
|
3663
3932
|
program.command("status").description("Show scope, indexing state, backend health, and recent cost estimate").option("--scope <name>", "scope override").action(async (opts, command) => {
|
|
3664
3933
|
const rootOpts = getRootOptions(command);
|
|
3665
|
-
const cwd =
|
|
3934
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3666
3935
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3667
3936
|
const scope = resolveScope(config, opts.scope);
|
|
3668
3937
|
let vectorStore;
|
|
@@ -3740,7 +4009,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3740
4009
|
});
|
|
3741
4010
|
program.command("dev").description("Watch content files/routes and incrementally reindex on changes").option("--scope <name>", "scope override").option("--mcp", "start MCP server (http transport) alongside watcher", false).option("--mcp-port <n>", "MCP HTTP port", "3338").option("--mcp-path <path>", "MCP HTTP path", "/mcp").option("--verbose", "verbose logs", false).action(async (opts, command) => {
|
|
3742
4011
|
const rootOpts = getRootOptions(command);
|
|
3743
|
-
const cwd =
|
|
4012
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3744
4013
|
const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
|
|
3745
4014
|
const watchPaths = collectWatchPaths(config, cwd);
|
|
3746
4015
|
process.stdout.write("starting searchsocket dev watcher...\n");
|
|
@@ -3809,10 +4078,10 @@ ${watchPaths.map((entry) => ` - ${entry}`).join("\n")}
|
|
|
3809
4078
|
});
|
|
3810
4079
|
program.command("clean").description("Delete local state and optionally delete remote vectors for a scope").option("--scope <name>", "scope override").option("--remote", "delete remote scope vectors", false).action(async (opts, command) => {
|
|
3811
4080
|
const rootOpts = getRootOptions(command);
|
|
3812
|
-
const cwd =
|
|
4081
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3813
4082
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3814
4083
|
const scope = resolveScope(config, opts.scope);
|
|
3815
|
-
const statePath =
|
|
4084
|
+
const statePath = path14.join(cwd, config.state.dir);
|
|
3816
4085
|
await fsp.rm(statePath, { recursive: true, force: true });
|
|
3817
4086
|
process.stdout.write(`deleted local state directory: ${statePath}
|
|
3818
4087
|
`);
|
|
@@ -3825,7 +4094,7 @@ program.command("clean").description("Delete local state and optionally delete r
|
|
|
3825
4094
|
});
|
|
3826
4095
|
program.command("prune").description("List/delete stale scopes (dry-run by default)").option("--apply", "apply deletions", false).option("--scopes-file <path>", "file containing active scopes").option("--older-than <duration>", "ttl cutoff like 30d").action(async (opts, command) => {
|
|
3827
4096
|
const rootOpts = getRootOptions(command);
|
|
3828
|
-
const cwd =
|
|
4097
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3829
4098
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3830
4099
|
const baseScope = resolveScope(config);
|
|
3831
4100
|
let vectorStore;
|
|
@@ -3845,7 +4114,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3845
4114
|
`);
|
|
3846
4115
|
let keepScopes = /* @__PURE__ */ new Set();
|
|
3847
4116
|
if (opts.scopesFile) {
|
|
3848
|
-
keepScopes = readScopesFromFile(
|
|
4117
|
+
keepScopes = readScopesFromFile(path14.resolve(cwd, opts.scopesFile));
|
|
3849
4118
|
} else {
|
|
3850
4119
|
keepScopes = readRemoteGitBranches(cwd);
|
|
3851
4120
|
}
|
|
@@ -3916,7 +4185,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3916
4185
|
});
|
|
3917
4186
|
program.command("doctor").description("Validate config, env vars, provider connectivity, and local write access").action(async (_opts, command) => {
|
|
3918
4187
|
const rootOpts = getRootOptions(command);
|
|
3919
|
-
const cwd =
|
|
4188
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3920
4189
|
const checks = [];
|
|
3921
4190
|
let config = null;
|
|
3922
4191
|
try {
|
|
@@ -3945,8 +4214,8 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3945
4214
|
});
|
|
3946
4215
|
}
|
|
3947
4216
|
if (config.source.mode === "static-output") {
|
|
3948
|
-
const outputDir =
|
|
3949
|
-
const exists =
|
|
4217
|
+
const outputDir = path14.resolve(cwd, config.source.staticOutputDir);
|
|
4218
|
+
const exists = fs10.existsSync(outputDir);
|
|
3950
4219
|
checks.push({
|
|
3951
4220
|
name: "source: static output dir",
|
|
3952
4221
|
ok: exists,
|
|
@@ -3955,15 +4224,15 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3955
4224
|
} else if (config.source.mode === "build") {
|
|
3956
4225
|
const buildConfig = config.source.build;
|
|
3957
4226
|
if (buildConfig) {
|
|
3958
|
-
const manifestPath =
|
|
3959
|
-
const manifestExists =
|
|
4227
|
+
const manifestPath = path14.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
|
|
4228
|
+
const manifestExists = fs10.existsSync(manifestPath);
|
|
3960
4229
|
checks.push({
|
|
3961
4230
|
name: "source: build manifest",
|
|
3962
4231
|
ok: manifestExists,
|
|
3963
4232
|
details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
|
|
3964
4233
|
});
|
|
3965
|
-
const viteBin =
|
|
3966
|
-
const viteExists =
|
|
4234
|
+
const viteBin = path14.resolve(cwd, "node_modules", ".bin", "vite");
|
|
4235
|
+
const viteExists = fs10.existsSync(viteBin);
|
|
3967
4236
|
checks.push({
|
|
3968
4237
|
name: "source: vite binary",
|
|
3969
4238
|
ok: viteExists,
|
|
@@ -3980,7 +4249,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3980
4249
|
const contentConfig = config.source.contentFiles;
|
|
3981
4250
|
if (contentConfig) {
|
|
3982
4251
|
const fg4 = await import("fast-glob");
|
|
3983
|
-
const baseDir =
|
|
4252
|
+
const baseDir = path14.resolve(cwd, contentConfig.baseDir);
|
|
3984
4253
|
const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
|
|
3985
4254
|
checks.push({
|
|
3986
4255
|
name: "source: content files",
|
|
@@ -4049,7 +4318,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4049
4318
|
try {
|
|
4050
4319
|
const scope = resolveScope(config);
|
|
4051
4320
|
const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
|
|
4052
|
-
const testPath =
|
|
4321
|
+
const testPath = path14.join(statePath, ".write-test");
|
|
4053
4322
|
await fsp.writeFile(testPath, "ok\n", "utf8");
|
|
4054
4323
|
await fsp.rm(testPath, { force: true });
|
|
4055
4324
|
checks.push({ name: "state directory writable", ok: true });
|
|
@@ -4078,7 +4347,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4078
4347
|
});
|
|
4079
4348
|
program.command("mcp").description("Run SearchSocket MCP server").option("--transport <transport>", "stdio|http", "stdio").option("--port <n>", "HTTP port", "3338").option("--path <path>", "HTTP path", "/mcp").action(async (opts, command) => {
|
|
4080
4349
|
const rootOpts = getRootOptions(command);
|
|
4081
|
-
const cwd =
|
|
4350
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4082
4351
|
await runMcpServer({
|
|
4083
4352
|
cwd,
|
|
4084
4353
|
configPath: rootOpts?.config,
|
|
@@ -4089,7 +4358,7 @@ program.command("mcp").description("Run SearchSocket MCP server").option("--tran
|
|
|
4089
4358
|
});
|
|
4090
4359
|
program.command("search").description("Quick local CLI search against indexed vectors").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").option("--rerank", "enable configured reranker", false).action(async (opts, command) => {
|
|
4091
4360
|
const rootOpts = getRootOptions(command);
|
|
4092
|
-
const cwd =
|
|
4361
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4093
4362
|
const engine = await SearchEngine.create({
|
|
4094
4363
|
cwd,
|
|
4095
4364
|
configPath: rootOpts?.config
|
|
@@ -4105,7 +4374,7 @@ program.command("search").description("Quick local CLI search against indexed ve
|
|
|
4105
4374
|
`);
|
|
4106
4375
|
});
|
|
4107
4376
|
async function main() {
|
|
4108
|
-
dotenvConfig({ path:
|
|
4377
|
+
dotenvConfig({ path: path14.resolve(process.cwd(), ".env") });
|
|
4109
4378
|
await program.parseAsync(process.argv);
|
|
4110
4379
|
}
|
|
4111
4380
|
main().catch((error) => {
|