searchsocket 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +443 -182
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +577 -164
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +577 -165
- package/dist/sveltekit.cjs +367 -77
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +367 -77
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli.ts
|
|
4
|
-
import
|
|
4
|
+
import fs10 from "fs";
|
|
5
5
|
import fsp from "fs/promises";
|
|
6
|
-
import
|
|
6
|
+
import path14 from "path";
|
|
7
7
|
import { execSync as execSync2 } from "child_process";
|
|
8
8
|
import { config as dotenvConfig } from "dotenv";
|
|
9
9
|
import chokidar from "chokidar";
|
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.3.3",
|
|
15
|
+
version: "0.4.0",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -115,6 +115,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
115
115
|
envVar: z.string().min(1).optional(),
|
|
116
116
|
sanitize: z.boolean().optional()
|
|
117
117
|
}).optional(),
|
|
118
|
+
exclude: z.array(z.string()).optional(),
|
|
119
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
118
120
|
source: z.object({
|
|
119
121
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
120
122
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -245,6 +247,8 @@ function createDefaultConfig(projectId) {
|
|
|
245
247
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
246
248
|
sanitize: true
|
|
247
249
|
},
|
|
250
|
+
exclude: [],
|
|
251
|
+
respectRobotsTxt: true,
|
|
248
252
|
source: {
|
|
249
253
|
mode: "static-output",
|
|
250
254
|
staticOutputDir: "build",
|
|
@@ -275,7 +279,7 @@ function createDefaultConfig(projectId) {
|
|
|
275
279
|
},
|
|
276
280
|
embeddings: {
|
|
277
281
|
provider: "jina",
|
|
278
|
-
model: "jina-embeddings-
|
|
282
|
+
model: "jina-embeddings-v5-text-small",
|
|
279
283
|
apiKeyEnv: "JINA_API_KEY",
|
|
280
284
|
batchSize: 64,
|
|
281
285
|
concurrency: 4
|
|
@@ -288,9 +292,9 @@ function createDefaultConfig(projectId) {
|
|
|
288
292
|
}
|
|
289
293
|
},
|
|
290
294
|
rerank: {
|
|
291
|
-
enabled:
|
|
295
|
+
enabled: true,
|
|
292
296
|
topN: 20,
|
|
293
|
-
model: "jina-reranker-
|
|
297
|
+
model: "jina-reranker-v3"
|
|
294
298
|
},
|
|
295
299
|
ranking: {
|
|
296
300
|
enableIncomingLinkBoost: true,
|
|
@@ -393,6 +397,8 @@ ${issues}`
|
|
|
393
397
|
...defaults.scope,
|
|
394
398
|
...parsed.scope
|
|
395
399
|
},
|
|
400
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
401
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
396
402
|
source: {
|
|
397
403
|
...defaults.source,
|
|
398
404
|
...parsed.source,
|
|
@@ -829,7 +835,7 @@ function createEmbeddingsProvider(config) {
|
|
|
829
835
|
}
|
|
830
836
|
|
|
831
837
|
// src/indexing/pipeline.ts
|
|
832
|
-
import
|
|
838
|
+
import path12 from "path";
|
|
833
839
|
|
|
834
840
|
// src/vector/factory.ts
|
|
835
841
|
import fs3 from "fs";
|
|
@@ -1710,6 +1716,17 @@ function extractFromHtml(url, html, config) {
|
|
|
1710
1716
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
1711
1717
|
return null;
|
|
1712
1718
|
}
|
|
1719
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
1720
|
+
let weight;
|
|
1721
|
+
if (weightRaw !== void 0) {
|
|
1722
|
+
const parsed = Number(weightRaw);
|
|
1723
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
1724
|
+
weight = parsed;
|
|
1725
|
+
}
|
|
1726
|
+
}
|
|
1727
|
+
if (weight === 0) {
|
|
1728
|
+
return null;
|
|
1729
|
+
}
|
|
1713
1730
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
1714
1731
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
1715
1732
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -1765,7 +1782,8 @@ function extractFromHtml(url, html, config) {
|
|
|
1765
1782
|
noindex: false,
|
|
1766
1783
|
tags,
|
|
1767
1784
|
description,
|
|
1768
|
-
keywords
|
|
1785
|
+
keywords,
|
|
1786
|
+
weight
|
|
1769
1787
|
};
|
|
1770
1788
|
}
|
|
1771
1789
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -1778,6 +1796,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1778
1796
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
1779
1797
|
return null;
|
|
1780
1798
|
}
|
|
1799
|
+
let mdWeight;
|
|
1800
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
1801
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
1802
|
+
mdWeight = rawWeight;
|
|
1803
|
+
}
|
|
1804
|
+
if (mdWeight === 0) {
|
|
1805
|
+
return null;
|
|
1806
|
+
}
|
|
1781
1807
|
const content = parsed.content;
|
|
1782
1808
|
const normalized = normalizeMarkdown(content);
|
|
1783
1809
|
if (!normalizeText(normalized)) {
|
|
@@ -1800,7 +1826,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
1800
1826
|
noindex: false,
|
|
1801
1827
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
1802
1828
|
description: fmDescription,
|
|
1803
|
-
keywords: fmKeywords
|
|
1829
|
+
keywords: fmKeywords,
|
|
1830
|
+
weight: mdWeight
|
|
1804
1831
|
};
|
|
1805
1832
|
}
|
|
1806
1833
|
|
|
@@ -1937,6 +1964,38 @@ import pLimit2 from "p-limit";
|
|
|
1937
1964
|
// src/indexing/sources/build/manifest-parser.ts
|
|
1938
1965
|
import fs5 from "fs/promises";
|
|
1939
1966
|
import path7 from "path";
|
|
1967
|
+
|
|
1968
|
+
// src/utils/pattern.ts
|
|
1969
|
+
/**
 * Tests a URL path against a single pattern.
 *
 * Supported pattern forms (one trailing slash is ignored on both the URL and
 * the pattern, except for the root "/"):
 *   - "<prefix>/**"  matches the prefix itself and everything beneath it;
 *                    "/**" matches every URL.
 *   - "<prefix>/*"   matches exactly one path segment beneath the prefix;
 *                    "/*" matches any non-root URL with no further "/" after
 *                    its first character.
 *   - anything else  must equal the URL exactly.
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} True when the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) =>
    value === "/" || !value.endsWith("/") ? value : value.slice(0, -1);
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Recursive glob: the base path and any descendant.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(`${base}/`);
  }

  // Single-level glob: exactly one segment below the base.
  if (glob.endsWith("/*")) {
    const base = glob.slice(0, -2);
    if (base === "") {
      return target !== "/" && !target.slice(1).includes("/");
    }
    const child = target.startsWith(`${base}/`) ? target.slice(base.length + 1) : "";
    return child.length > 0 && !child.includes("/");
  }

  return target === glob;
}
|
|
1991
|
+
/**
 * Reports whether a URL path matches at least one pattern in a list.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns understood by matchUrlPattern.
 * @returns {boolean} True if any pattern matches, false otherwise (including
 *   when the list is empty).
 */
function matchUrlPatterns(url, patterns) {
  return [...patterns].some((candidate) => matchUrlPattern(url, candidate));
}
|
|
1997
|
+
|
|
1998
|
+
// src/indexing/sources/build/manifest-parser.ts
|
|
1940
1999
|
function routeIdToFile(routeId) {
|
|
1941
2000
|
if (routeId === "/") {
|
|
1942
2001
|
return "src/routes/+page.svelte";
|
|
@@ -2010,15 +2069,7 @@ function expandDynamicUrl(url, value) {
|
|
|
2010
2069
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
2011
2070
|
}
|
|
2012
2071
|
/**
 * Reports whether a URL is covered by any of the given exclude patterns.
 * Delegates the matching to matchUrlPatterns.
 *
 * @param {string} url - URL path to test.
 * @param {string[]} patterns - Exclude patterns.
 * @returns {boolean} True when the URL matches at least one pattern.
 */
function isExcluded(url, patterns) {
  const excluded = matchUrlPatterns(url, patterns);
  return excluded;
}
|
|
2023
2074
|
|
|
2024
2075
|
// src/indexing/sources/build/preview-server.ts
|
|
@@ -2456,6 +2507,162 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
2456
2507
|
return pages;
|
|
2457
2508
|
}
|
|
2458
2509
|
|
|
2510
|
+
// src/indexing/robots.ts
|
|
2511
|
+
import fs9 from "fs/promises";
|
|
2512
|
+
import path11 from "path";
|
|
2513
|
+
/**
 * Parses robots.txt content and returns the rule group that applies to the
 * given user agent.
 *
 * Grouping follows RFC 9309:
 *   - consecutive `User-agent` lines share the rules that follow them;
 *   - a `User-agent` line that appears after rule lines starts a new group,
 *     so rules of a later group never leak into an earlier one;
 *   - directives other than `User-agent`, `Allow` and `Disallow` (for example
 *     `Sitemap` or `Crawl-delay`) are ignored and do not end the group;
 *   - `Allow`/`Disallow` lines with an empty value add no rule.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name, matched
 *   case-insensitively against the full `User-agent` value.
 * @returns {{disallow: string[], allow: string[]}} The group declared for
 *   `userAgent` if the file declares one (even if it has no rules, meaning
 *   nothing is blocked for that agent), otherwise the `*` group, otherwise an
 *   empty rule set.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = /* @__PURE__ */ new Map();
  let currentAgents = [];
  // True once the current group has seen an Allow/Disallow line; the next
  // User-agent line then opens a fresh group instead of extending this one.
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      currentAgents.push(agentName);
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      // Even an empty rule line closes the run of User-agent lines.
      groupHasRules = true;
      if (value) {
        for (const agent of currentAgents) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is ignored and leaves the current group untouched.
  }

  return agentGroups.get(userAgent.toLowerCase()) ?? agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
2548
|
+
/**
 * Tests a URL path against one robots.txt path pattern.
 *
 * The pattern is matched from the start of the path. `*` matches any run of
 * characters (including none) and a trailing `$` anchors the pattern to the
 * end of the path. A pattern without either character is a plain prefix.
 *
 * @param {string} urlPath - URL path to test.
 * @param {string} pattern - Allow/Disallow value from robots.txt.
 * @returns {boolean} True when the pattern matches the path.
 */
function matchesRobotsPattern(urlPath, pattern) {
  const anchored = pattern.endsWith("$");
  const body = anchored ? pattern.slice(0, -1) : pattern;
  const segments = body.split("*");

  // The text before the first wildcard must be a literal prefix.
  if (!urlPath.startsWith(segments[0])) return false;
  let pos = segments[0].length;
  if (segments.length === 1) {
    return !anchored || pos === urlPath.length;
  }

  for (let i = 1; i < segments.length; i++) {
    const segment = segments[i];
    if (anchored && i === segments.length - 1) {
      // The final literal must sit at the very end, after everything matched so far.
      return urlPath.length - segment.length >= pos && urlPath.endsWith(segment);
    }
    // Leftmost match leaves the most room for the remaining segments.
    const idx = urlPath.indexOf(segment, pos);
    if (idx === -1) return false;
    pos = idx + segment.length;
  }
  return true;
}

/**
 * Decides whether a URL path is blocked by a parsed robots.txt rule group.
 *
 * The longest matching Disallow pattern is compared with the longest matching
 * Allow pattern (by pattern length); the path is blocked only when the
 * Disallow pattern is strictly longer, so Allow wins ties. Patterns may use
 * the `*` wildcard and a trailing `$` end anchor.
 *
 * @param {string} urlPath - URL path to test.
 * @param {{disallow: string[], allow: string[]}} rules - Rule group.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules) {
  const longestMatch = (patterns) => {
    let longest = "";
    for (const pattern of patterns) {
      if (pattern.length > longest.length && matchesRobotsPattern(urlPath, pattern)) {
        longest = pattern;
      }
    }
    return longest;
  };

  const longestDisallow = longestMatch(rules.disallow);
  if (!longestDisallow) return false;
  return longestMatch(rules.allow).length < longestDisallow.length;
}
|
|
2564
|
+
/**
 * Reads `robots.txt` from a directory and parses it with parseRobotsTxt.
 *
 * Best-effort: any failure (for example a missing or unreadable file) yields
 * null instead of an error.
 *
 * @param {string} dir - Directory expected to contain robots.txt.
 * @returns {Promise<object|null>} Parsed rules, or null when unavailable.
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsFile = path11.join(dir, "robots.txt");
    const text = await fs9.readFile(robotsFile, "utf8");
    rules = parseRobotsTxt(text);
  } catch {
    rules = null;
  }
  return rules;
}
|
|
2572
|
+
/**
 * Downloads `/robots.txt` from the origin of `baseUrl` and parses it with
 * parseRobotsTxt.
 *
 * Best-effort: an invalid URL, a network failure, a non-2xx response or a
 * timeout all yield null instead of an error. The request is aborted after
 * `timeoutMs` so an unresponsive host cannot stall the caller indefinitely.
 *
 * @param {string} baseUrl - Base URL of the site.
 * @param {number} [timeoutMs=10000] - Maximum time to wait for the response
 *   (headers and body), in milliseconds.
 * @returns {Promise<object|null>} Parsed rules, or null when unavailable.
 */
async function fetchRobotsTxt(baseUrl, timeoutMs = 10000) {
  try {
    const url = new URL("/robots.txt", baseUrl).href;
    const response = await fetch(url, { signal: AbortSignal.timeout(timeoutMs) });
    if (!response.ok) return null;
    const content = await response.text();
    return parseRobotsTxt(content);
  } catch {
    return null;
  }
}
|
|
2583
|
+
|
|
2584
|
+
// src/search/ranking.ts
|
|
2585
|
+
/**
 * Clamps a value to a non-negative number.
 *
 * @param {*} value - Candidate number.
 * @returns {number} `value` when it is a finite number >= 0; 0 for negative
 *   numbers and for anything Number.isFinite rejects (NaN, infinities,
 *   non-numbers).
 */
function nonNegativeOrZero(value) {
  return Number.isFinite(value) ? Math.max(0, value) : 0;
}
|
|
2591
|
+
/**
 * Scores vector hits and returns them ordered by descending final score.
 *
 * The base score is `hit.score` (negative infinity when it is not finite).
 * When enabled in `config.ranking`, two boosts are added:
 *   - incoming links: log(1 + incomingLinks) * weights.incomingLinks
 *   - depth:          1 / (1 + depth)        * weights.depth
 * A non-finite total is reported as negative infinity.
 *
 * @param {Array<object>} hits - Hits exposing `score` and `metadata`.
 * @param {object} config - Configuration exposing `ranking`.
 * @returns {Array<{hit: object, finalScore: number}>} New sorted array.
 */
function rankHits(hits, config) {
  const scored = [];
  for (const hit of hits) {
    let total = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
    if (config.ranking.enableIncomingLinkBoost) {
      const linkBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
      total += linkBoost * config.ranking.weights.incomingLinks;
    }
    if (config.ranking.enableDepthBoost) {
      const shallowBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
      total += shallowBoost * config.ranking.weights.depth;
    }
    scored.push({
      hit,
      finalScore: Number.isFinite(total) ? total : Number.NEGATIVE_INFINITY
    });
  }
  // `|| 0` turns the NaN produced by (-Infinity) - (-Infinity) into "equal".
  return scored.sort((left, right) => right.finalScore - left.finalScore || 0);
}
|
|
2611
|
+
/**
 * Looks up the configured weight for a URL.
 *
 * Every key of `pageWeights` is treated as a pattern for matchUrlPattern;
 * among the patterns that match, the one with the longest pattern string
 * wins (the first such pattern on ties).
 *
 * @param {string} url - URL path of the page.
 * @param {Object<string, number>} pageWeights - Pattern-to-weight map.
 * @returns {number} Weight of the winning pattern, or 1 when nothing matches.
 */
function findPageWeight(url, pageWeights) {
  const winner = Object.entries(pageWeights).reduce(
    (best, [pattern, weight]) =>
      matchUrlPattern(url, pattern) && pattern.length > best.pattern.length
        ? { pattern, weight }
        : best,
    { pattern: "", weight: 1 }
  );
  return winner.weight;
}
|
|
2622
|
+
/**
 * Collapses ranked chunk hits into one entry per page URL.
 *
 * For each page the chunks are sorted by descending `finalScore`. The page
 * score is the best chunk's score plus an aggregation bonus: for the chunks
 * at positions 1..aggregationCap-1, score * aggregationDecay^position, summed
 * and scaled by `weights.aggregation`. The score is then multiplied by the
 * page weight from findPageWeight; pages whose weight is exactly 0 are
 * dropped. Non-finite scores are reported as negative infinity.
 *
 * @param {Array<{hit: object, finalScore: number}>} ranked - Scored chunks.
 * @param {object} config - Configuration exposing `ranking`.
 * @returns {Array<object>} Pages (url, title, routeFile, pageScore, bestChunk,
 *   matchingChunks) ordered by descending pageScore.
 */
function aggregateByPage(ranked, config) {
  // Bucket chunks by page URL; Map keeps first-seen page order.
  const chunksByUrl = /* @__PURE__ */ new Map();
  for (const entry of ranked) {
    const pageUrl = entry.hit.metadata.url;
    if (!chunksByUrl.has(pageUrl)) {
      chunksByUrl.set(pageUrl, []);
    }
    chunksByUrl.get(pageUrl).push(entry);
  }

  const { aggregationCap, aggregationDecay } = config.ranking;
  // Descending comparator; `|| 0` maps a NaN difference to "equal".
  const descendingBy = (scoreOf) => (left, right) => scoreOf(right) - scoreOf(left) || 0;
  const finiteOr = (value, fallback) => (Number.isFinite(value) ? value : fallback);

  const pages = [];
  chunksByUrl.forEach((chunks, url) => {
    chunks.sort(descendingBy((chunk) => chunk.finalScore));
    const [best] = chunks;
    const maxScore = finiteOr(best.finalScore, Number.NEGATIVE_INFINITY);

    // The best chunk (position 0) is already counted through maxScore.
    const aggregationBonus = chunks
      .slice(0, aggregationCap)
      .reduce(
        (sum, chunk, position) =>
          position === 0 ? sum : sum + finiteOr(chunk.finalScore, 0) * Math.pow(aggregationDecay, position),
        0
      );

    const unweighted = maxScore + aggregationBonus * config.ranking.weights.aggregation;
    const pageWeight = findPageWeight(url, config.ranking.pageWeights);
    if (pageWeight === 0) return;
    const pageScore = pageWeight === 1 ? unweighted : unweighted * pageWeight;

    pages.push({
      url,
      title: best.hit.metadata.title,
      routeFile: best.hit.metadata.routeFile,
      pageScore: finiteOr(pageScore, Number.NEGATIVE_INFINITY),
      bestChunk: best,
      matchingChunks: chunks
    });
  });

  return pages.sort(descendingBy((page) => page.pageScore));
}
|
|
2665
|
+
|
|
2459
2666
|
// src/utils/time.ts
|
|
2460
2667
|
function nowIso() {
|
|
2461
2668
|
return (/* @__PURE__ */ new Date()).toISOString();
|
|
@@ -2466,9 +2673,10 @@ function hrTimeMs(start) {
|
|
|
2466
2673
|
|
|
2467
2674
|
// src/indexing/pipeline.ts
|
|
2468
2675
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
2469
|
-
"jina-embeddings-v3": 2e-5
|
|
2676
|
+
"jina-embeddings-v3": 2e-5,
|
|
2677
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
2470
2678
|
};
|
|
2471
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
2679
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
2472
2680
|
var IndexPipeline = class _IndexPipeline {
|
|
2473
2681
|
cwd;
|
|
2474
2682
|
config;
|
|
@@ -2483,7 +2691,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2483
2691
|
this.logger = options.logger;
|
|
2484
2692
|
}
|
|
2485
2693
|
static async create(options = {}) {
|
|
2486
|
-
const cwd =
|
|
2694
|
+
const cwd = path12.resolve(options.cwd ?? process.cwd());
|
|
2487
2695
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
2488
2696
|
const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
|
|
2489
2697
|
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
@@ -2546,6 +2754,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2546
2754
|
}
|
|
2547
2755
|
stageEnd("source", sourceStart);
|
|
2548
2756
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
2757
|
+
const filterStart = stageStart();
|
|
2758
|
+
let filteredSourcePages = sourcePages;
|
|
2759
|
+
if (this.config.exclude.length > 0) {
|
|
2760
|
+
const beforeExclude = filteredSourcePages.length;
|
|
2761
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2762
|
+
const url = normalizeUrlPath(p.url);
|
|
2763
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
2764
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
2765
|
+
return false;
|
|
2766
|
+
}
|
|
2767
|
+
return true;
|
|
2768
|
+
});
|
|
2769
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
2770
|
+
if (excludedCount > 0) {
|
|
2771
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
2772
|
+
}
|
|
2773
|
+
}
|
|
2774
|
+
if (this.config.respectRobotsTxt) {
|
|
2775
|
+
let robotsRules = null;
|
|
2776
|
+
if (sourceMode === "static-output") {
|
|
2777
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2778
|
+
path12.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
2779
|
+
);
|
|
2780
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
2781
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
2782
|
+
path12.resolve(this.cwd, this.config.source.build.outputDir)
|
|
2783
|
+
);
|
|
2784
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
2785
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
2786
|
+
}
|
|
2787
|
+
if (robotsRules) {
|
|
2788
|
+
const beforeRobots = filteredSourcePages.length;
|
|
2789
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
2790
|
+
const url = normalizeUrlPath(p.url);
|
|
2791
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
2792
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
2793
|
+
return false;
|
|
2794
|
+
}
|
|
2795
|
+
return true;
|
|
2796
|
+
});
|
|
2797
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
2798
|
+
if (robotsExcluded > 0) {
|
|
2799
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
2800
|
+
}
|
|
2801
|
+
}
|
|
2802
|
+
}
|
|
2803
|
+
stageEnd("filter", filterStart);
|
|
2549
2804
|
const routeStart = stageStart();
|
|
2550
2805
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
2551
2806
|
stageEnd("route_map", routeStart);
|
|
@@ -2553,7 +2808,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2553
2808
|
const extractStart = stageStart();
|
|
2554
2809
|
this.logger.info("Extracting content...");
|
|
2555
2810
|
const extractedPages = [];
|
|
2556
|
-
for (const sourcePage of
|
|
2811
|
+
for (const sourcePage of filteredSourcePages) {
|
|
2557
2812
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
2558
2813
|
if (!extracted) {
|
|
2559
2814
|
this.logger.warn(
|
|
@@ -2579,16 +2834,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2579
2834
|
seenUrls.add(page.url);
|
|
2580
2835
|
uniquePages.push(page);
|
|
2581
2836
|
}
|
|
2837
|
+
const indexablePages = [];
|
|
2838
|
+
for (const page of uniquePages) {
|
|
2839
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
2840
|
+
if (effectiveWeight === 0) {
|
|
2841
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
2842
|
+
continue;
|
|
2843
|
+
}
|
|
2844
|
+
indexablePages.push(page);
|
|
2845
|
+
}
|
|
2846
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
2847
|
+
if (zeroWeightCount > 0) {
|
|
2848
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
2849
|
+
}
|
|
2582
2850
|
stageEnd("extract", extractStart);
|
|
2583
|
-
const skippedPages =
|
|
2584
|
-
this.logger.info(`Extracted ${
|
|
2851
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
2852
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
2585
2853
|
const linkStart = stageStart();
|
|
2586
|
-
const pageSet = new Set(
|
|
2854
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
2587
2855
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
2588
|
-
for (const page of
|
|
2856
|
+
for (const page of indexablePages) {
|
|
2589
2857
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
2590
2858
|
}
|
|
2591
|
-
for (const page of
|
|
2859
|
+
for (const page of indexablePages) {
|
|
2592
2860
|
for (const outgoing of page.outgoingLinks) {
|
|
2593
2861
|
if (!pageSet.has(outgoing)) {
|
|
2594
2862
|
continue;
|
|
@@ -2612,7 +2880,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2612
2880
|
});
|
|
2613
2881
|
}
|
|
2614
2882
|
}
|
|
2615
|
-
for (const page of
|
|
2883
|
+
for (const page of indexablePages) {
|
|
2616
2884
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
2617
2885
|
if (routeMatch.routeResolution === "best-effort") {
|
|
2618
2886
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -2838,7 +3106,7 @@ import { createMcpExpressApp } from "@modelcontextprotocol/sdk/server/express.js
|
|
|
2838
3106
|
import { z as z3 } from "zod";
|
|
2839
3107
|
|
|
2840
3108
|
// src/search/engine.ts
|
|
2841
|
-
import
|
|
3109
|
+
import path13 from "path";
|
|
2842
3110
|
import { z as z2 } from "zod";
|
|
2843
3111
|
|
|
2844
3112
|
// src/rerank/jina.ts
|
|
@@ -2938,98 +3206,6 @@ function createReranker(config) {
|
|
|
2938
3206
|
});
|
|
2939
3207
|
}
|
|
2940
3208
|
|
|
2941
|
-
// src/search/ranking.ts
|
|
2942
|
-
function nonNegativeOrZero(value) {
|
|
2943
|
-
if (!Number.isFinite(value)) {
|
|
2944
|
-
return 0;
|
|
2945
|
-
}
|
|
2946
|
-
return Math.max(0, value);
|
|
2947
|
-
}
|
|
2948
|
-
function rankHits(hits, config) {
|
|
2949
|
-
return hits.map((hit) => {
|
|
2950
|
-
let score = Number.isFinite(hit.score) ? hit.score : Number.NEGATIVE_INFINITY;
|
|
2951
|
-
if (config.ranking.enableIncomingLinkBoost) {
|
|
2952
|
-
const incomingBoost = Math.log(1 + nonNegativeOrZero(hit.metadata.incomingLinks));
|
|
2953
|
-
score += incomingBoost * config.ranking.weights.incomingLinks;
|
|
2954
|
-
}
|
|
2955
|
-
if (config.ranking.enableDepthBoost) {
|
|
2956
|
-
const depthBoost = 1 / (1 + nonNegativeOrZero(hit.metadata.depth));
|
|
2957
|
-
score += depthBoost * config.ranking.weights.depth;
|
|
2958
|
-
}
|
|
2959
|
-
return {
|
|
2960
|
-
hit,
|
|
2961
|
-
finalScore: Number.isFinite(score) ? score : Number.NEGATIVE_INFINITY
|
|
2962
|
-
};
|
|
2963
|
-
}).sort((a, b) => {
|
|
2964
|
-
const delta = b.finalScore - a.finalScore;
|
|
2965
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
2966
|
-
});
|
|
2967
|
-
}
|
|
2968
|
-
function findPageWeight(url, pageWeights) {
|
|
2969
|
-
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
2970
|
-
const normalizedUrl = norm(url);
|
|
2971
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2972
|
-
if (norm(pattern) === normalizedUrl) {
|
|
2973
|
-
return weight;
|
|
2974
|
-
}
|
|
2975
|
-
}
|
|
2976
|
-
let bestPrefix = "";
|
|
2977
|
-
let bestWeight = 1;
|
|
2978
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
2979
|
-
const normalizedPattern = norm(pattern);
|
|
2980
|
-
if (normalizedPattern === "/") continue;
|
|
2981
|
-
const prefix = `${normalizedPattern}/`;
|
|
2982
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
2983
|
-
bestPrefix = prefix;
|
|
2984
|
-
bestWeight = weight;
|
|
2985
|
-
}
|
|
2986
|
-
}
|
|
2987
|
-
return bestWeight;
|
|
2988
|
-
}
|
|
2989
|
-
function aggregateByPage(ranked, config) {
|
|
2990
|
-
const groups = /* @__PURE__ */ new Map();
|
|
2991
|
-
for (const hit of ranked) {
|
|
2992
|
-
const url = hit.hit.metadata.url;
|
|
2993
|
-
const group = groups.get(url);
|
|
2994
|
-
if (group) group.push(hit);
|
|
2995
|
-
else groups.set(url, [hit]);
|
|
2996
|
-
}
|
|
2997
|
-
const { aggregationCap, aggregationDecay } = config.ranking;
|
|
2998
|
-
const pages = [];
|
|
2999
|
-
for (const [url, chunks] of groups) {
|
|
3000
|
-
chunks.sort((a, b) => {
|
|
3001
|
-
const delta = b.finalScore - a.finalScore;
|
|
3002
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3003
|
-
});
|
|
3004
|
-
const best = chunks[0];
|
|
3005
|
-
const maxScore = Number.isFinite(best.finalScore) ? best.finalScore : Number.NEGATIVE_INFINITY;
|
|
3006
|
-
const topChunks = chunks.slice(0, aggregationCap);
|
|
3007
|
-
let aggregationBonus = 0;
|
|
3008
|
-
for (let i = 1; i < topChunks.length; i++) {
|
|
3009
|
-
const chunkScore = Number.isFinite(topChunks[i].finalScore) ? topChunks[i].finalScore : 0;
|
|
3010
|
-
aggregationBonus += chunkScore * Math.pow(aggregationDecay, i);
|
|
3011
|
-
}
|
|
3012
|
-
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
3013
|
-
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
3014
|
-
if (pageWeight === 0) continue;
|
|
3015
|
-
if (pageWeight !== 1) {
|
|
3016
|
-
pageScore *= pageWeight;
|
|
3017
|
-
}
|
|
3018
|
-
pages.push({
|
|
3019
|
-
url,
|
|
3020
|
-
title: best.hit.metadata.title,
|
|
3021
|
-
routeFile: best.hit.metadata.routeFile,
|
|
3022
|
-
pageScore: Number.isFinite(pageScore) ? pageScore : Number.NEGATIVE_INFINITY,
|
|
3023
|
-
bestChunk: best,
|
|
3024
|
-
matchingChunks: chunks
|
|
3025
|
-
});
|
|
3026
|
-
}
|
|
3027
|
-
return pages.sort((a, b) => {
|
|
3028
|
-
const delta = b.pageScore - a.pageScore;
|
|
3029
|
-
return Number.isNaN(delta) ? 0 : delta;
|
|
3030
|
-
});
|
|
3031
|
-
}
|
|
3032
|
-
|
|
3033
3209
|
// src/search/engine.ts
|
|
3034
3210
|
var requestSchema = z2.object({
|
|
3035
3211
|
q: z2.string().trim().min(1),
|
|
@@ -3038,7 +3214,8 @@ var requestSchema = z2.object({
|
|
|
3038
3214
|
pathPrefix: z2.string().optional(),
|
|
3039
3215
|
tags: z2.array(z2.string()).optional(),
|
|
3040
3216
|
rerank: z2.boolean().optional(),
|
|
3041
|
-
groupBy: z2.enum(["page", "chunk"]).optional()
|
|
3217
|
+
groupBy: z2.enum(["page", "chunk"]).optional(),
|
|
3218
|
+
stream: z2.boolean().optional()
|
|
3042
3219
|
});
|
|
3043
3220
|
var SearchEngine = class _SearchEngine {
|
|
3044
3221
|
cwd;
|
|
@@ -3054,7 +3231,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3054
3231
|
this.reranker = options.reranker;
|
|
3055
3232
|
}
|
|
3056
3233
|
static async create(options = {}) {
|
|
3057
|
-
const cwd =
|
|
3234
|
+
const cwd = path13.resolve(options.cwd ?? process.cwd());
|
|
3058
3235
|
const config = options.config ?? await loadConfig({ cwd, configPath: options.configPath });
|
|
3059
3236
|
const embeddings = options.embeddingsProvider ?? createEmbeddingsProvider(config);
|
|
3060
3237
|
const vectorStore = options.vectorStore ?? await createVectorStore(config, cwd);
|
|
@@ -3111,7 +3288,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
3111
3288
|
rerankMs = hrTimeMs(rerankStart);
|
|
3112
3289
|
usedRerank = true;
|
|
3113
3290
|
}
|
|
3114
|
-
|
|
3291
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
3292
|
+
return {
|
|
3293
|
+
q: input.q,
|
|
3294
|
+
scope: resolvedScope.scopeName,
|
|
3295
|
+
results,
|
|
3296
|
+
meta: {
|
|
3297
|
+
timingsMs: {
|
|
3298
|
+
embed: Math.round(embedMs),
|
|
3299
|
+
vector: Math.round(vectorMs),
|
|
3300
|
+
rerank: Math.round(rerankMs),
|
|
3301
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3302
|
+
},
|
|
3303
|
+
usedRerank,
|
|
3304
|
+
modelId: this.config.embeddings.model
|
|
3305
|
+
}
|
|
3306
|
+
};
|
|
3307
|
+
}
|
|
3308
|
+
async *searchStreaming(request) {
|
|
3309
|
+
const parsed = requestSchema.safeParse(request);
|
|
3310
|
+
if (!parsed.success) {
|
|
3311
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
3312
|
+
}
|
|
3313
|
+
const input = parsed.data;
|
|
3314
|
+
const wantsRerank = Boolean(input.rerank);
|
|
3315
|
+
if (!wantsRerank) {
|
|
3316
|
+
const response = await this.search(request);
|
|
3317
|
+
yield { phase: "initial", data: response };
|
|
3318
|
+
return;
|
|
3319
|
+
}
|
|
3320
|
+
const totalStart = process.hrtime.bigint();
|
|
3321
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
3322
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
3323
|
+
const topK = input.topK ?? 10;
|
|
3324
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
3325
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
3326
|
+
const embedStart = process.hrtime.bigint();
|
|
3327
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
3328
|
+
const queryVector = queryEmbeddings[0];
|
|
3329
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
3330
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
3331
|
+
}
|
|
3332
|
+
const embedMs = hrTimeMs(embedStart);
|
|
3333
|
+
const vectorStart = process.hrtime.bigint();
|
|
3334
|
+
const hits = await this.vectorStore.query(
|
|
3335
|
+
queryVector,
|
|
3336
|
+
{
|
|
3337
|
+
topK: candidateK,
|
|
3338
|
+
pathPrefix: input.pathPrefix,
|
|
3339
|
+
tags: input.tags
|
|
3340
|
+
},
|
|
3341
|
+
resolvedScope
|
|
3342
|
+
);
|
|
3343
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
3344
|
+
const ranked = rankHits(hits, this.config);
|
|
3345
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
3346
|
+
yield {
|
|
3347
|
+
phase: "initial",
|
|
3348
|
+
data: {
|
|
3349
|
+
q: input.q,
|
|
3350
|
+
scope: resolvedScope.scopeName,
|
|
3351
|
+
results: initialResults,
|
|
3352
|
+
meta: {
|
|
3353
|
+
timingsMs: {
|
|
3354
|
+
embed: Math.round(embedMs),
|
|
3355
|
+
vector: Math.round(vectorMs),
|
|
3356
|
+
rerank: 0,
|
|
3357
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3358
|
+
},
|
|
3359
|
+
usedRerank: false,
|
|
3360
|
+
modelId: this.config.embeddings.model
|
|
3361
|
+
}
|
|
3362
|
+
}
|
|
3363
|
+
};
|
|
3364
|
+
const rerankStart = process.hrtime.bigint();
|
|
3365
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
3366
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
3367
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
3368
|
+
yield {
|
|
3369
|
+
phase: "reranked",
|
|
3370
|
+
data: {
|
|
3371
|
+
q: input.q,
|
|
3372
|
+
scope: resolvedScope.scopeName,
|
|
3373
|
+
results: rerankedResults,
|
|
3374
|
+
meta: {
|
|
3375
|
+
timingsMs: {
|
|
3376
|
+
embed: Math.round(embedMs),
|
|
3377
|
+
vector: Math.round(vectorMs),
|
|
3378
|
+
rerank: Math.round(rerankMs),
|
|
3379
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
3380
|
+
},
|
|
3381
|
+
usedRerank: true,
|
|
3382
|
+
modelId: this.config.embeddings.model
|
|
3383
|
+
}
|
|
3384
|
+
}
|
|
3385
|
+
};
|
|
3386
|
+
}
|
|
3387
|
+
buildResults(ordered, topK, groupByPage) {
|
|
3115
3388
|
const minScore = this.config.ranking.minScore;
|
|
3116
3389
|
if (groupByPage) {
|
|
3117
3390
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -3119,10 +3392,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
3119
3392
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
3120
3393
|
}
|
|
3121
3394
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
3122
|
-
|
|
3395
|
+
return pages.slice(0, topK).map((page) => {
|
|
3123
3396
|
const bestScore = page.bestChunk.finalScore;
|
|
3124
|
-
const
|
|
3125
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
3397
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
3398
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
3126
3399
|
return {
|
|
3127
3400
|
url: page.url,
|
|
3128
3401
|
title: page.title,
|
|
@@ -3139,10 +3412,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
3139
3412
|
};
|
|
3140
3413
|
});
|
|
3141
3414
|
} else {
|
|
3415
|
+
let filtered = ordered;
|
|
3142
3416
|
if (minScore > 0) {
|
|
3143
|
-
|
|
3417
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
3144
3418
|
}
|
|
3145
|
-
|
|
3419
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
3146
3420
|
url: hit.metadata.url,
|
|
3147
3421
|
title: hit.metadata.title,
|
|
3148
3422
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -3151,21 +3425,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
3151
3425
|
routeFile: hit.metadata.routeFile
|
|
3152
3426
|
}));
|
|
3153
3427
|
}
|
|
3154
|
-
return {
|
|
3155
|
-
q: input.q,
|
|
3156
|
-
scope: resolvedScope.scopeName,
|
|
3157
|
-
results,
|
|
3158
|
-
meta: {
|
|
3159
|
-
timingsMs: {
|
|
3160
|
-
embed: Math.round(embedMs),
|
|
3161
|
-
vector: Math.round(vectorMs),
|
|
3162
|
-
rerank: Math.round(rerankMs),
|
|
3163
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
3164
|
-
},
|
|
3165
|
-
usedRerank,
|
|
3166
|
-
modelId: this.config.embeddings.model
|
|
3167
|
-
}
|
|
3168
|
-
};
|
|
3169
3428
|
}
|
|
3170
3429
|
async getPage(pathOrUrl, scope) {
|
|
3171
3430
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -3290,7 +3549,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3290
3549
|
};
|
|
3291
3550
|
|
|
3292
3551
|
// src/mcp/server.ts
|
|
3293
|
-
function createServer(engine) {
|
|
3552
|
+
function createServer(engine, config) {
|
|
3294
3553
|
const server = new McpServer({
|
|
3295
3554
|
name: "searchsocket-mcp",
|
|
3296
3555
|
version: "0.1.0"
|
|
@@ -3298,14 +3557,15 @@ function createServer(engine) {
|
|
|
3298
3557
|
server.registerTool(
|
|
3299
3558
|
"search",
|
|
3300
3559
|
{
|
|
3301
|
-
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, and
|
|
3560
|
+
description: "Semantic site search. Returns url/title/snippet/score/routeFile for each match. Supports optional scope, pathPrefix, tags, topK, and rerank. Enable rerank for better relevance on natural-language queries.",
|
|
3302
3561
|
inputSchema: {
|
|
3303
3562
|
query: z3.string().min(1),
|
|
3304
3563
|
scope: z3.string().optional(),
|
|
3305
3564
|
topK: z3.number().int().positive().max(100).optional(),
|
|
3306
3565
|
pathPrefix: z3.string().optional(),
|
|
3307
3566
|
tags: z3.array(z3.string()).optional(),
|
|
3308
|
-
groupBy: z3.enum(["page", "chunk"]).optional()
|
|
3567
|
+
groupBy: z3.enum(["page", "chunk"]).optional(),
|
|
3568
|
+
rerank: z3.boolean().optional().describe("Enable reranking for better relevance (uses Jina Reranker). Defaults to true when rerank is enabled in config.")
|
|
3309
3569
|
}
|
|
3310
3570
|
},
|
|
3311
3571
|
async (input) => {
|
|
@@ -3315,7 +3575,8 @@ function createServer(engine) {
|
|
|
3315
3575
|
scope: input.scope,
|
|
3316
3576
|
pathPrefix: input.pathPrefix,
|
|
3317
3577
|
tags: input.tags,
|
|
3318
|
-
groupBy: input.groupBy
|
|
3578
|
+
groupBy: input.groupBy,
|
|
3579
|
+
rerank: input.rerank ?? config.rerank.enabled
|
|
3319
3580
|
});
|
|
3320
3581
|
return {
|
|
3321
3582
|
content: [
|
|
@@ -3443,10 +3704,10 @@ async function runMcpServer(options = {}) {
|
|
|
3443
3704
|
config
|
|
3444
3705
|
});
|
|
3445
3706
|
if (resolvedTransport === "http") {
|
|
3446
|
-
await startHttpServer(() => createServer(engine), config, options);
|
|
3707
|
+
await startHttpServer(() => createServer(engine, config), config, options);
|
|
3447
3708
|
return;
|
|
3448
3709
|
}
|
|
3449
|
-
const server = createServer(engine);
|
|
3710
|
+
const server = createServer(engine, config);
|
|
3450
3711
|
const stdioTransport = new StdioServerTransport();
|
|
3451
3712
|
await server.connect(stdioTransport);
|
|
3452
3713
|
}
|
|
@@ -3515,7 +3776,7 @@ function collectWatchPaths(config, cwd) {
|
|
|
3515
3776
|
const paths = ["src/routes/**"];
|
|
3516
3777
|
if (config.source.mode === "content-files" && config.source.contentFiles) {
|
|
3517
3778
|
for (const pattern of config.source.contentFiles.globs) {
|
|
3518
|
-
paths.push(
|
|
3779
|
+
paths.push(path14.join(config.source.contentFiles.baseDir, pattern));
|
|
3519
3780
|
}
|
|
3520
3781
|
}
|
|
3521
3782
|
if (config.source.mode === "static-output") {
|
|
@@ -3528,15 +3789,15 @@ function collectWatchPaths(config, cwd) {
|
|
|
3528
3789
|
paths.push("searchsocket.config.ts");
|
|
3529
3790
|
paths.push(config.source.build.outputDir);
|
|
3530
3791
|
}
|
|
3531
|
-
return paths.map((value) =>
|
|
3792
|
+
return paths.map((value) => path14.resolve(cwd, value));
|
|
3532
3793
|
}
|
|
3533
3794
|
function ensureStateDir(cwd) {
|
|
3534
|
-
const target =
|
|
3535
|
-
|
|
3795
|
+
const target = path14.join(cwd, ".searchsocket");
|
|
3796
|
+
fs10.mkdirSync(target, { recursive: true });
|
|
3536
3797
|
return target;
|
|
3537
3798
|
}
|
|
3538
3799
|
function ensureGitignore(cwd) {
|
|
3539
|
-
const gitignorePath =
|
|
3800
|
+
const gitignorePath = path14.join(cwd, ".gitignore");
|
|
3540
3801
|
const entries = [
|
|
3541
3802
|
".searchsocket/vectors.db",
|
|
3542
3803
|
".searchsocket/vectors.db-shm",
|
|
@@ -3545,8 +3806,8 @@ function ensureGitignore(cwd) {
|
|
|
3545
3806
|
".searchsocket/registry.json"
|
|
3546
3807
|
];
|
|
3547
3808
|
let content = "";
|
|
3548
|
-
if (
|
|
3549
|
-
content =
|
|
3809
|
+
if (fs10.existsSync(gitignorePath)) {
|
|
3810
|
+
content = fs10.readFileSync(gitignorePath, "utf8");
|
|
3550
3811
|
}
|
|
3551
3812
|
const lines = content.split("\n");
|
|
3552
3813
|
const missing = entries.filter((entry) => !lines.some((line) => line.trim() === entry));
|
|
@@ -3557,10 +3818,10 @@ function ensureGitignore(cwd) {
|
|
|
3557
3818
|
# SearchSocket local state
|
|
3558
3819
|
${missing.join("\n")}
|
|
3559
3820
|
`;
|
|
3560
|
-
|
|
3821
|
+
fs10.writeFileSync(gitignorePath, content.trimEnd() + block, "utf8");
|
|
3561
3822
|
}
|
|
3562
3823
|
function readScopesFromFile(filePath) {
|
|
3563
|
-
const raw =
|
|
3824
|
+
const raw = fs10.readFileSync(filePath, "utf8");
|
|
3564
3825
|
return new Set(
|
|
3565
3826
|
raw.split(/\r?\n/).map((line) => line.trim()).filter(Boolean)
|
|
3566
3827
|
);
|
|
@@ -3584,8 +3845,8 @@ function readRemoteGitBranches(cwd) {
|
|
|
3584
3845
|
}
|
|
3585
3846
|
}
|
|
3586
3847
|
async function loadResolvedConfigForDev(cwd, configPath) {
|
|
3587
|
-
const resolvedConfigPath =
|
|
3588
|
-
if (
|
|
3848
|
+
const resolvedConfigPath = path14.resolve(cwd, configPath ?? "searchsocket.config.ts");
|
|
3849
|
+
if (fs10.existsSync(resolvedConfigPath)) {
|
|
3589
3850
|
return loadConfig({ cwd, configPath });
|
|
3590
3851
|
}
|
|
3591
3852
|
return mergeConfig(cwd, {});
|
|
@@ -3632,7 +3893,7 @@ var program = new Command();
|
|
|
3632
3893
|
program.name("searchsocket").description("Semantic site search and MCP retrieval for SvelteKit").version(package_default.version).option("-C, --cwd <path>", "working directory", process.cwd()).option("--config <path>", "config path (defaults to searchsocket.config.ts)");
|
|
3633
3894
|
program.command("init").description("Create searchsocket.config.ts and .searchsocket state directory").action(async (_opts, command) => {
|
|
3634
3895
|
const root = getRootOptions(command).cwd ?? process.cwd();
|
|
3635
|
-
const cwd =
|
|
3896
|
+
const cwd = path14.resolve(root);
|
|
3636
3897
|
const configPath = writeMinimalConfig(cwd);
|
|
3637
3898
|
const stateDir = ensureStateDir(cwd);
|
|
3638
3899
|
ensureGitignore(cwd);
|
|
@@ -3652,13 +3913,13 @@ program.command("init").description("Create searchsocket.config.ts and .searchso
|
|
|
3652
3913
|
});
|
|
3653
3914
|
program.command("index").description("Index site content into markdown mirror + vector store").option("--scope <name>", "scope override").option("--changed-only", "only process changed chunks", true).option("--no-changed-only", "re-index regardless of previous manifest").option("--force", "force full mirror rebuild and re-upsert", false).option("--dry-run", "compute plan and cost, no API writes", false).option("--source <mode>", "source mode override: static-output|crawl|content-files|build").option("--max-pages <n>", "limit pages processed").option("--max-chunks <n>", "limit chunks processed").option("--quiet", "suppress all output except errors and warnings", false).option("--verbose", "verbose output", false).option("--json", "emit JSON logs and summary", false).action(async (opts, command) => {
|
|
3654
3915
|
const rootOpts = getRootOptions(command);
|
|
3655
|
-
const cwd =
|
|
3916
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3656
3917
|
await runIndexCommand({
|
|
3657
3918
|
cwd,
|
|
3658
3919
|
configPath: rootOpts?.config,
|
|
3659
3920
|
scope: opts.scope,
|
|
3660
3921
|
changedOnly: opts.changedOnly,
|
|
3661
|
-
force: opts.force,
|
|
3922
|
+
force: opts.force || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
3662
3923
|
dryRun: opts.dryRun,
|
|
3663
3924
|
source: opts.source,
|
|
3664
3925
|
maxPages: opts.maxPages ? parsePositiveInt(opts.maxPages, "--max-pages") : void 0,
|
|
@@ -3670,7 +3931,7 @@ program.command("index").description("Index site content into markdown mirror +
|
|
|
3670
3931
|
});
|
|
3671
3932
|
program.command("status").description("Show scope, indexing state, backend health, and recent cost estimate").option("--scope <name>", "scope override").action(async (opts, command) => {
|
|
3672
3933
|
const rootOpts = getRootOptions(command);
|
|
3673
|
-
const cwd =
|
|
3934
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3674
3935
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3675
3936
|
const scope = resolveScope(config, opts.scope);
|
|
3676
3937
|
let vectorStore;
|
|
@@ -3748,7 +4009,7 @@ program.command("status").description("Show scope, indexing state, backend healt
|
|
|
3748
4009
|
});
|
|
3749
4010
|
program.command("dev").description("Watch content files/routes and incrementally reindex on changes").option("--scope <name>", "scope override").option("--mcp", "start MCP server (http transport) alongside watcher", false).option("--mcp-port <n>", "MCP HTTP port", "3338").option("--mcp-path <path>", "MCP HTTP path", "/mcp").option("--verbose", "verbose logs", false).action(async (opts, command) => {
|
|
3750
4011
|
const rootOpts = getRootOptions(command);
|
|
3751
|
-
const cwd =
|
|
4012
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3752
4013
|
const config = await loadResolvedConfigForDev(cwd, rootOpts?.config);
|
|
3753
4014
|
const watchPaths = collectWatchPaths(config, cwd);
|
|
3754
4015
|
process.stdout.write("starting searchsocket dev watcher...\n");
|
|
@@ -3817,10 +4078,10 @@ ${watchPaths.map((entry) => ` - ${entry}`).join("\n")}
|
|
|
3817
4078
|
});
|
|
3818
4079
|
program.command("clean").description("Delete local state and optionally delete remote vectors for a scope").option("--scope <name>", "scope override").option("--remote", "delete remote scope vectors", false).action(async (opts, command) => {
|
|
3819
4080
|
const rootOpts = getRootOptions(command);
|
|
3820
|
-
const cwd =
|
|
4081
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3821
4082
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3822
4083
|
const scope = resolveScope(config, opts.scope);
|
|
3823
|
-
const statePath =
|
|
4084
|
+
const statePath = path14.join(cwd, config.state.dir);
|
|
3824
4085
|
await fsp.rm(statePath, { recursive: true, force: true });
|
|
3825
4086
|
process.stdout.write(`deleted local state directory: ${statePath}
|
|
3826
4087
|
`);
|
|
@@ -3833,7 +4094,7 @@ program.command("clean").description("Delete local state and optionally delete r
|
|
|
3833
4094
|
});
|
|
3834
4095
|
program.command("prune").description("List/delete stale scopes (dry-run by default)").option("--apply", "apply deletions", false).option("--scopes-file <path>", "file containing active scopes").option("--older-than <duration>", "ttl cutoff like 30d").action(async (opts, command) => {
|
|
3835
4096
|
const rootOpts = getRootOptions(command);
|
|
3836
|
-
const cwd =
|
|
4097
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3837
4098
|
const config = await loadConfig({ cwd, configPath: rootOpts?.config });
|
|
3838
4099
|
const baseScope = resolveScope(config);
|
|
3839
4100
|
let vectorStore;
|
|
@@ -3853,7 +4114,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3853
4114
|
`);
|
|
3854
4115
|
let keepScopes = /* @__PURE__ */ new Set();
|
|
3855
4116
|
if (opts.scopesFile) {
|
|
3856
|
-
keepScopes = readScopesFromFile(
|
|
4117
|
+
keepScopes = readScopesFromFile(path14.resolve(cwd, opts.scopesFile));
|
|
3857
4118
|
} else {
|
|
3858
4119
|
keepScopes = readRemoteGitBranches(cwd);
|
|
3859
4120
|
}
|
|
@@ -3924,7 +4185,7 @@ program.command("prune").description("List/delete stale scopes (dry-run by defau
|
|
|
3924
4185
|
});
|
|
3925
4186
|
program.command("doctor").description("Validate config, env vars, provider connectivity, and local write access").action(async (_opts, command) => {
|
|
3926
4187
|
const rootOpts = getRootOptions(command);
|
|
3927
|
-
const cwd =
|
|
4188
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
3928
4189
|
const checks = [];
|
|
3929
4190
|
let config = null;
|
|
3930
4191
|
try {
|
|
@@ -3953,8 +4214,8 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3953
4214
|
});
|
|
3954
4215
|
}
|
|
3955
4216
|
if (config.source.mode === "static-output") {
|
|
3956
|
-
const outputDir =
|
|
3957
|
-
const exists =
|
|
4217
|
+
const outputDir = path14.resolve(cwd, config.source.staticOutputDir);
|
|
4218
|
+
const exists = fs10.existsSync(outputDir);
|
|
3958
4219
|
checks.push({
|
|
3959
4220
|
name: "source: static output dir",
|
|
3960
4221
|
ok: exists,
|
|
@@ -3963,15 +4224,15 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3963
4224
|
} else if (config.source.mode === "build") {
|
|
3964
4225
|
const buildConfig = config.source.build;
|
|
3965
4226
|
if (buildConfig) {
|
|
3966
|
-
const manifestPath =
|
|
3967
|
-
const manifestExists =
|
|
4227
|
+
const manifestPath = path14.resolve(cwd, buildConfig.outputDir, "server", "manifest-full.js");
|
|
4228
|
+
const manifestExists = fs10.existsSync(manifestPath);
|
|
3968
4229
|
checks.push({
|
|
3969
4230
|
name: "source: build manifest",
|
|
3970
4231
|
ok: manifestExists,
|
|
3971
4232
|
details: manifestExists ? manifestPath : `${manifestPath} not found (run \`vite build\` first)`
|
|
3972
4233
|
});
|
|
3973
|
-
const viteBin =
|
|
3974
|
-
const viteExists =
|
|
4234
|
+
const viteBin = path14.resolve(cwd, "node_modules", ".bin", "vite");
|
|
4235
|
+
const viteExists = fs10.existsSync(viteBin);
|
|
3975
4236
|
checks.push({
|
|
3976
4237
|
name: "source: vite binary",
|
|
3977
4238
|
ok: viteExists,
|
|
@@ -3988,7 +4249,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
3988
4249
|
const contentConfig = config.source.contentFiles;
|
|
3989
4250
|
if (contentConfig) {
|
|
3990
4251
|
const fg4 = await import("fast-glob");
|
|
3991
|
-
const baseDir =
|
|
4252
|
+
const baseDir = path14.resolve(cwd, contentConfig.baseDir);
|
|
3992
4253
|
const files = await fg4.default(contentConfig.globs, { cwd: baseDir, onlyFiles: true });
|
|
3993
4254
|
checks.push({
|
|
3994
4255
|
name: "source: content files",
|
|
@@ -4057,7 +4318,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4057
4318
|
try {
|
|
4058
4319
|
const scope = resolveScope(config);
|
|
4059
4320
|
const { statePath } = ensureStateDirs(cwd, config.state.dir, scope);
|
|
4060
|
-
const testPath =
|
|
4321
|
+
const testPath = path14.join(statePath, ".write-test");
|
|
4061
4322
|
await fsp.writeFile(testPath, "ok\n", "utf8");
|
|
4062
4323
|
await fsp.rm(testPath, { force: true });
|
|
4063
4324
|
checks.push({ name: "state directory writable", ok: true });
|
|
@@ -4086,7 +4347,7 @@ program.command("doctor").description("Validate config, env vars, provider conne
|
|
|
4086
4347
|
});
|
|
4087
4348
|
program.command("mcp").description("Run SearchSocket MCP server").option("--transport <transport>", "stdio|http", "stdio").option("--port <n>", "HTTP port", "3338").option("--path <path>", "HTTP path", "/mcp").action(async (opts, command) => {
|
|
4088
4349
|
const rootOpts = getRootOptions(command);
|
|
4089
|
-
const cwd =
|
|
4350
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4090
4351
|
await runMcpServer({
|
|
4091
4352
|
cwd,
|
|
4092
4353
|
configPath: rootOpts?.config,
|
|
@@ -4097,7 +4358,7 @@ program.command("mcp").description("Run SearchSocket MCP server").option("--tran
|
|
|
4097
4358
|
});
|
|
4098
4359
|
program.command("search").description("Quick local CLI search against indexed vectors").requiredOption("--q <query>", "search query").option("--scope <name>", "scope override").option("--top-k <n>", "top K results", "10").option("--path-prefix <prefix>", "path prefix filter").option("--rerank", "enable configured reranker", false).action(async (opts, command) => {
|
|
4099
4360
|
const rootOpts = getRootOptions(command);
|
|
4100
|
-
const cwd =
|
|
4361
|
+
const cwd = path14.resolve(rootOpts?.cwd ?? process.cwd());
|
|
4101
4362
|
const engine = await SearchEngine.create({
|
|
4102
4363
|
cwd,
|
|
4103
4364
|
configPath: rootOpts?.config
|
|
@@ -4113,7 +4374,7 @@ program.command("search").description("Quick local CLI search against indexed ve
|
|
|
4113
4374
|
`);
|
|
4114
4375
|
});
|
|
4115
4376
|
async function main() {
|
|
4116
|
-
dotenvConfig({ path:
|
|
4377
|
+
dotenvConfig({ path: path14.resolve(process.cwd(), ".env") });
|
|
4117
4378
|
await program.parseAsync(process.argv);
|
|
4118
4379
|
}
|
|
4119
4380
|
main().catch((error) => {
|