searchsocket 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +443 -182
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +577 -164
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +577 -165
- package/dist/sveltekit.cjs +367 -77
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +367 -77
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/sveltekit.cjs
CHANGED
|
@@ -5021,32 +5021,32 @@ var require_URL = __commonJS({
|
|
|
5021
5021
|
else
|
|
5022
5022
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5023
5023
|
}
|
|
5024
|
-
function remove_dot_segments(
|
|
5025
|
-
if (!
|
|
5024
|
+
function remove_dot_segments(path15) {
|
|
5025
|
+
if (!path15) return path15;
|
|
5026
5026
|
var output = "";
|
|
5027
|
-
while (
|
|
5028
|
-
if (
|
|
5029
|
-
|
|
5027
|
+
while (path15.length > 0) {
|
|
5028
|
+
if (path15 === "." || path15 === "..") {
|
|
5029
|
+
path15 = "";
|
|
5030
5030
|
break;
|
|
5031
5031
|
}
|
|
5032
|
-
var twochars =
|
|
5033
|
-
var threechars =
|
|
5034
|
-
var fourchars =
|
|
5032
|
+
var twochars = path15.substring(0, 2);
|
|
5033
|
+
var threechars = path15.substring(0, 3);
|
|
5034
|
+
var fourchars = path15.substring(0, 4);
|
|
5035
5035
|
if (threechars === "../") {
|
|
5036
|
-
|
|
5036
|
+
path15 = path15.substring(3);
|
|
5037
5037
|
} else if (twochars === "./") {
|
|
5038
|
-
|
|
5038
|
+
path15 = path15.substring(2);
|
|
5039
5039
|
} else if (threechars === "/./") {
|
|
5040
|
-
|
|
5041
|
-
} else if (twochars === "/." &&
|
|
5042
|
-
|
|
5043
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5044
|
-
|
|
5040
|
+
path15 = "/" + path15.substring(3);
|
|
5041
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5042
|
+
path15 = "/";
|
|
5043
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5044
|
+
path15 = "/" + path15.substring(4);
|
|
5045
5045
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5046
5046
|
} else {
|
|
5047
|
-
var segment =
|
|
5047
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5048
5048
|
output += segment;
|
|
5049
|
-
|
|
5049
|
+
path15 = path15.substring(segment.length);
|
|
5050
5050
|
}
|
|
5051
5051
|
}
|
|
5052
5052
|
return output;
|
|
@@ -16610,6 +16610,8 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16610
16610
|
envVar: zod.z.string().min(1).optional(),
|
|
16611
16611
|
sanitize: zod.z.boolean().optional()
|
|
16612
16612
|
}).optional(),
|
|
16613
|
+
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16614
|
+
respectRobotsTxt: zod.z.boolean().optional(),
|
|
16613
16615
|
source: zod.z.object({
|
|
16614
16616
|
mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16615
16617
|
staticOutputDir: zod.z.string().min(1).optional(),
|
|
@@ -16740,6 +16742,8 @@ function createDefaultConfig(projectId) {
|
|
|
16740
16742
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16741
16743
|
sanitize: true
|
|
16742
16744
|
},
|
|
16745
|
+
exclude: [],
|
|
16746
|
+
respectRobotsTxt: true,
|
|
16743
16747
|
source: {
|
|
16744
16748
|
mode: "static-output",
|
|
16745
16749
|
staticOutputDir: "build",
|
|
@@ -16770,7 +16774,7 @@ function createDefaultConfig(projectId) {
|
|
|
16770
16774
|
},
|
|
16771
16775
|
embeddings: {
|
|
16772
16776
|
provider: "jina",
|
|
16773
|
-
model: "jina-embeddings-
|
|
16777
|
+
model: "jina-embeddings-v5-text-small",
|
|
16774
16778
|
apiKeyEnv: "JINA_API_KEY",
|
|
16775
16779
|
batchSize: 64,
|
|
16776
16780
|
concurrency: 4
|
|
@@ -16783,9 +16787,9 @@ function createDefaultConfig(projectId) {
|
|
|
16783
16787
|
}
|
|
16784
16788
|
},
|
|
16785
16789
|
rerank: {
|
|
16786
|
-
enabled:
|
|
16790
|
+
enabled: true,
|
|
16787
16791
|
topN: 20,
|
|
16788
|
-
model: "jina-reranker-
|
|
16792
|
+
model: "jina-reranker-v3"
|
|
16789
16793
|
},
|
|
16790
16794
|
ranking: {
|
|
16791
16795
|
enableIncomingLinkBoost: true,
|
|
@@ -16904,6 +16908,8 @@ ${issues}`
|
|
|
16904
16908
|
...defaults.scope,
|
|
16905
16909
|
...parsed.scope
|
|
16906
16910
|
},
|
|
16911
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16912
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16907
16913
|
source: {
|
|
16908
16914
|
...defaults.source,
|
|
16909
16915
|
...parsed.source,
|
|
@@ -17868,6 +17874,36 @@ async function createVectorStore(config, cwd) {
|
|
|
17868
17874
|
});
|
|
17869
17875
|
}
|
|
17870
17876
|
|
|
17877
|
+
// src/utils/pattern.ts
|
|
17878
|
+
function matchUrlPattern(url, pattern) {
|
|
17879
|
+
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
17880
|
+
const normalizedUrl = norm(url);
|
|
17881
|
+
const normalizedPattern = norm(pattern);
|
|
17882
|
+
if (normalizedPattern.endsWith("/**")) {
|
|
17883
|
+
const prefix = normalizedPattern.slice(0, -3);
|
|
17884
|
+
if (prefix === "") {
|
|
17885
|
+
return true;
|
|
17886
|
+
}
|
|
17887
|
+
return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/");
|
|
17888
|
+
}
|
|
17889
|
+
if (normalizedPattern.endsWith("/*")) {
|
|
17890
|
+
const prefix = normalizedPattern.slice(0, -2);
|
|
17891
|
+
if (prefix === "") {
|
|
17892
|
+
return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
|
|
17893
|
+
}
|
|
17894
|
+
if (!normalizedUrl.startsWith(prefix + "/")) return false;
|
|
17895
|
+
const rest = normalizedUrl.slice(prefix.length + 1);
|
|
17896
|
+
return rest.length > 0 && !rest.includes("/");
|
|
17897
|
+
}
|
|
17898
|
+
return normalizedUrl === normalizedPattern;
|
|
17899
|
+
}
|
|
17900
|
+
function matchUrlPatterns(url, patterns) {
|
|
17901
|
+
for (const pattern of patterns) {
|
|
17902
|
+
if (matchUrlPattern(url, pattern)) return true;
|
|
17903
|
+
}
|
|
17904
|
+
return false;
|
|
17905
|
+
}
|
|
17906
|
+
|
|
17871
17907
|
// src/search/ranking.ts
|
|
17872
17908
|
function nonNegativeOrZero(value) {
|
|
17873
17909
|
if (!Number.isFinite(value)) {
|
|
@@ -17896,21 +17932,11 @@ function rankHits(hits, config) {
|
|
|
17896
17932
|
});
|
|
17897
17933
|
}
|
|
17898
17934
|
function findPageWeight(url, pageWeights) {
|
|
17899
|
-
|
|
17900
|
-
const normalizedUrl = norm(url);
|
|
17901
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17902
|
-
if (norm(pattern) === normalizedUrl) {
|
|
17903
|
-
return weight;
|
|
17904
|
-
}
|
|
17905
|
-
}
|
|
17906
|
-
let bestPrefix = "";
|
|
17935
|
+
let bestPattern = "";
|
|
17907
17936
|
let bestWeight = 1;
|
|
17908
17937
|
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17909
|
-
|
|
17910
|
-
|
|
17911
|
-
const prefix = `${normalizedPattern}/`;
|
|
17912
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
17913
|
-
bestPrefix = prefix;
|
|
17938
|
+
if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
|
|
17939
|
+
bestPattern = pattern;
|
|
17914
17940
|
bestWeight = weight;
|
|
17915
17941
|
}
|
|
17916
17942
|
}
|
|
@@ -17968,7 +17994,8 @@ var requestSchema = zod.z.object({
|
|
|
17968
17994
|
pathPrefix: zod.z.string().optional(),
|
|
17969
17995
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
17970
17996
|
rerank: zod.z.boolean().optional(),
|
|
17971
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
17997
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
17998
|
+
stream: zod.z.boolean().optional()
|
|
17972
17999
|
});
|
|
17973
18000
|
var SearchEngine = class _SearchEngine {
|
|
17974
18001
|
cwd;
|
|
@@ -18041,7 +18068,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
18041
18068
|
rerankMs = hrTimeMs(rerankStart);
|
|
18042
18069
|
usedRerank = true;
|
|
18043
18070
|
}
|
|
18044
|
-
|
|
18071
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
18072
|
+
return {
|
|
18073
|
+
q: input.q,
|
|
18074
|
+
scope: resolvedScope.scopeName,
|
|
18075
|
+
results,
|
|
18076
|
+
meta: {
|
|
18077
|
+
timingsMs: {
|
|
18078
|
+
embed: Math.round(embedMs),
|
|
18079
|
+
vector: Math.round(vectorMs),
|
|
18080
|
+
rerank: Math.round(rerankMs),
|
|
18081
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18082
|
+
},
|
|
18083
|
+
usedRerank,
|
|
18084
|
+
modelId: this.config.embeddings.model
|
|
18085
|
+
}
|
|
18086
|
+
};
|
|
18087
|
+
}
|
|
18088
|
+
async *searchStreaming(request) {
|
|
18089
|
+
const parsed = requestSchema.safeParse(request);
|
|
18090
|
+
if (!parsed.success) {
|
|
18091
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
18092
|
+
}
|
|
18093
|
+
const input = parsed.data;
|
|
18094
|
+
const wantsRerank = Boolean(input.rerank);
|
|
18095
|
+
if (!wantsRerank) {
|
|
18096
|
+
const response = await this.search(request);
|
|
18097
|
+
yield { phase: "initial", data: response };
|
|
18098
|
+
return;
|
|
18099
|
+
}
|
|
18100
|
+
const totalStart = process.hrtime.bigint();
|
|
18101
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
18102
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
18103
|
+
const topK = input.topK ?? 10;
|
|
18104
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
18105
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
18106
|
+
const embedStart = process.hrtime.bigint();
|
|
18107
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
18108
|
+
const queryVector = queryEmbeddings[0];
|
|
18109
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
18110
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
18111
|
+
}
|
|
18112
|
+
const embedMs = hrTimeMs(embedStart);
|
|
18113
|
+
const vectorStart = process.hrtime.bigint();
|
|
18114
|
+
const hits = await this.vectorStore.query(
|
|
18115
|
+
queryVector,
|
|
18116
|
+
{
|
|
18117
|
+
topK: candidateK,
|
|
18118
|
+
pathPrefix: input.pathPrefix,
|
|
18119
|
+
tags: input.tags
|
|
18120
|
+
},
|
|
18121
|
+
resolvedScope
|
|
18122
|
+
);
|
|
18123
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
18124
|
+
const ranked = rankHits(hits, this.config);
|
|
18125
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
18126
|
+
yield {
|
|
18127
|
+
phase: "initial",
|
|
18128
|
+
data: {
|
|
18129
|
+
q: input.q,
|
|
18130
|
+
scope: resolvedScope.scopeName,
|
|
18131
|
+
results: initialResults,
|
|
18132
|
+
meta: {
|
|
18133
|
+
timingsMs: {
|
|
18134
|
+
embed: Math.round(embedMs),
|
|
18135
|
+
vector: Math.round(vectorMs),
|
|
18136
|
+
rerank: 0,
|
|
18137
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18138
|
+
},
|
|
18139
|
+
usedRerank: false,
|
|
18140
|
+
modelId: this.config.embeddings.model
|
|
18141
|
+
}
|
|
18142
|
+
}
|
|
18143
|
+
};
|
|
18144
|
+
const rerankStart = process.hrtime.bigint();
|
|
18145
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
18146
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
18147
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
18148
|
+
yield {
|
|
18149
|
+
phase: "reranked",
|
|
18150
|
+
data: {
|
|
18151
|
+
q: input.q,
|
|
18152
|
+
scope: resolvedScope.scopeName,
|
|
18153
|
+
results: rerankedResults,
|
|
18154
|
+
meta: {
|
|
18155
|
+
timingsMs: {
|
|
18156
|
+
embed: Math.round(embedMs),
|
|
18157
|
+
vector: Math.round(vectorMs),
|
|
18158
|
+
rerank: Math.round(rerankMs),
|
|
18159
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18160
|
+
},
|
|
18161
|
+
usedRerank: true,
|
|
18162
|
+
modelId: this.config.embeddings.model
|
|
18163
|
+
}
|
|
18164
|
+
}
|
|
18165
|
+
};
|
|
18166
|
+
}
|
|
18167
|
+
buildResults(ordered, topK, groupByPage) {
|
|
18045
18168
|
const minScore = this.config.ranking.minScore;
|
|
18046
18169
|
if (groupByPage) {
|
|
18047
18170
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -18049,10 +18172,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
18049
18172
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18050
18173
|
}
|
|
18051
18174
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
18052
|
-
|
|
18175
|
+
return pages.slice(0, topK).map((page) => {
|
|
18053
18176
|
const bestScore = page.bestChunk.finalScore;
|
|
18054
|
-
const
|
|
18055
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18177
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18178
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
18056
18179
|
return {
|
|
18057
18180
|
url: page.url,
|
|
18058
18181
|
title: page.title,
|
|
@@ -18069,10 +18192,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
18069
18192
|
};
|
|
18070
18193
|
});
|
|
18071
18194
|
} else {
|
|
18195
|
+
let filtered = ordered;
|
|
18072
18196
|
if (minScore > 0) {
|
|
18073
|
-
|
|
18197
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18074
18198
|
}
|
|
18075
|
-
|
|
18199
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
18076
18200
|
url: hit.metadata.url,
|
|
18077
18201
|
title: hit.metadata.title,
|
|
18078
18202
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -18081,21 +18205,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
18081
18205
|
routeFile: hit.metadata.routeFile
|
|
18082
18206
|
}));
|
|
18083
18207
|
}
|
|
18084
|
-
return {
|
|
18085
|
-
q: input.q,
|
|
18086
|
-
scope: resolvedScope.scopeName,
|
|
18087
|
-
results,
|
|
18088
|
-
meta: {
|
|
18089
|
-
timingsMs: {
|
|
18090
|
-
embed: Math.round(embedMs),
|
|
18091
|
-
vector: Math.round(vectorMs),
|
|
18092
|
-
rerank: Math.round(rerankMs),
|
|
18093
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
18094
|
-
},
|
|
18095
|
-
usedRerank,
|
|
18096
|
-
modelId: this.config.embeddings.model
|
|
18097
|
-
}
|
|
18098
|
-
};
|
|
18099
18208
|
}
|
|
18100
18209
|
async getPage(pathOrUrl, scope) {
|
|
18101
18210
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -18370,7 +18479,44 @@ function searchsocketHandle(options = {}) {
|
|
|
18370
18479
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
18371
18480
|
}
|
|
18372
18481
|
const engine = await getEngine();
|
|
18373
|
-
const
|
|
18482
|
+
const searchRequest = body;
|
|
18483
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
18484
|
+
const encoder = new TextEncoder();
|
|
18485
|
+
const stream = new ReadableStream({
|
|
18486
|
+
async start(controller) {
|
|
18487
|
+
try {
|
|
18488
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
18489
|
+
const line = JSON.stringify(event2) + "\n";
|
|
18490
|
+
controller.enqueue(encoder.encode(line));
|
|
18491
|
+
}
|
|
18492
|
+
} catch (streamError) {
|
|
18493
|
+
const errorEvent = {
|
|
18494
|
+
phase: "error",
|
|
18495
|
+
data: {
|
|
18496
|
+
error: {
|
|
18497
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
18498
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
18499
|
+
}
|
|
18500
|
+
}
|
|
18501
|
+
};
|
|
18502
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
18503
|
+
} finally {
|
|
18504
|
+
controller.close();
|
|
18505
|
+
}
|
|
18506
|
+
}
|
|
18507
|
+
});
|
|
18508
|
+
return withCors(
|
|
18509
|
+
new Response(stream, {
|
|
18510
|
+
status: 200,
|
|
18511
|
+
headers: {
|
|
18512
|
+
"content-type": "application/x-ndjson"
|
|
18513
|
+
}
|
|
18514
|
+
}),
|
|
18515
|
+
event.request,
|
|
18516
|
+
config
|
|
18517
|
+
);
|
|
18518
|
+
}
|
|
18519
|
+
const result = await engine.search(searchRequest);
|
|
18374
18520
|
return withCors(
|
|
18375
18521
|
new Response(JSON.stringify(result), {
|
|
18376
18522
|
status: 200,
|
|
@@ -19599,6 +19745,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19599
19745
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19600
19746
|
return null;
|
|
19601
19747
|
}
|
|
19748
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19749
|
+
let weight;
|
|
19750
|
+
if (weightRaw !== void 0) {
|
|
19751
|
+
const parsed = Number(weightRaw);
|
|
19752
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19753
|
+
weight = parsed;
|
|
19754
|
+
}
|
|
19755
|
+
}
|
|
19756
|
+
if (weight === 0) {
|
|
19757
|
+
return null;
|
|
19758
|
+
}
|
|
19602
19759
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19603
19760
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19604
19761
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19654,7 +19811,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19654
19811
|
noindex: false,
|
|
19655
19812
|
tags,
|
|
19656
19813
|
description,
|
|
19657
|
-
keywords
|
|
19814
|
+
keywords,
|
|
19815
|
+
weight
|
|
19658
19816
|
};
|
|
19659
19817
|
}
|
|
19660
19818
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19667,6 +19825,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19667
19825
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19668
19826
|
return null;
|
|
19669
19827
|
}
|
|
19828
|
+
let mdWeight;
|
|
19829
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19830
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19831
|
+
mdWeight = rawWeight;
|
|
19832
|
+
}
|
|
19833
|
+
if (mdWeight === 0) {
|
|
19834
|
+
return null;
|
|
19835
|
+
}
|
|
19670
19836
|
const content = parsed.content;
|
|
19671
19837
|
const normalized = normalizeMarkdown(content);
|
|
19672
19838
|
if (!normalizeText(normalized)) {
|
|
@@ -19689,7 +19855,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19689
19855
|
noindex: false,
|
|
19690
19856
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19691
19857
|
description: fmDescription,
|
|
19692
|
-
keywords: fmKeywords
|
|
19858
|
+
keywords: fmKeywords,
|
|
19859
|
+
weight: mdWeight
|
|
19693
19860
|
};
|
|
19694
19861
|
}
|
|
19695
19862
|
function yamlString(value) {
|
|
@@ -19958,15 +20125,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19958
20125
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19959
20126
|
}
|
|
19960
20127
|
function isExcluded(url, patterns) {
|
|
19961
|
-
|
|
19962
|
-
if (pattern.endsWith("/*")) {
|
|
19963
|
-
const prefix = pattern.slice(0, -1);
|
|
19964
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19965
|
-
} else if (url === pattern) {
|
|
19966
|
-
return true;
|
|
19967
|
-
}
|
|
19968
|
-
}
|
|
19969
|
-
return false;
|
|
20128
|
+
return matchUrlPatterns(url, patterns);
|
|
19970
20129
|
}
|
|
19971
20130
|
function findFreePort() {
|
|
19972
20131
|
return new Promise((resolve, reject) => {
|
|
@@ -20382,12 +20541,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20382
20541
|
}
|
|
20383
20542
|
return pages;
|
|
20384
20543
|
}
|
|
20544
|
+
function parseRobotsTxt(content, userAgent = "Searchsocket") {
|
|
20545
|
+
const lines = content.split(/\r?\n/);
|
|
20546
|
+
const agentGroups = /* @__PURE__ */ new Map();
|
|
20547
|
+
let currentAgents = [];
|
|
20548
|
+
for (const rawLine of lines) {
|
|
20549
|
+
const line = rawLine.replace(/#.*$/, "").trim();
|
|
20550
|
+
if (!line) continue;
|
|
20551
|
+
const colonIdx = line.indexOf(":");
|
|
20552
|
+
if (colonIdx === -1) continue;
|
|
20553
|
+
const directive = line.slice(0, colonIdx).trim().toLowerCase();
|
|
20554
|
+
const value = line.slice(colonIdx + 1).trim();
|
|
20555
|
+
if (directive === "user-agent") {
|
|
20556
|
+
const agentName = value.toLowerCase();
|
|
20557
|
+
currentAgents.push(agentName);
|
|
20558
|
+
if (!agentGroups.has(agentName)) {
|
|
20559
|
+
agentGroups.set(agentName, { disallow: [], allow: [] });
|
|
20560
|
+
}
|
|
20561
|
+
} else if (directive === "disallow" && value && currentAgents.length > 0) {
|
|
20562
|
+
for (const agent of currentAgents) {
|
|
20563
|
+
agentGroups.get(agent).disallow.push(value);
|
|
20564
|
+
}
|
|
20565
|
+
} else if (directive === "allow" && value && currentAgents.length > 0) {
|
|
20566
|
+
for (const agent of currentAgents) {
|
|
20567
|
+
agentGroups.get(agent).allow.push(value);
|
|
20568
|
+
}
|
|
20569
|
+
} else if (directive !== "disallow" && directive !== "allow") {
|
|
20570
|
+
currentAgents = [];
|
|
20571
|
+
}
|
|
20572
|
+
}
|
|
20573
|
+
const specific = agentGroups.get(userAgent.toLowerCase());
|
|
20574
|
+
if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
|
|
20575
|
+
return specific;
|
|
20576
|
+
}
|
|
20577
|
+
return agentGroups.get("*") ?? { disallow: [], allow: [] };
|
|
20578
|
+
}
|
|
20579
|
+
function isBlockedByRobots(urlPath, rules3) {
|
|
20580
|
+
let longestDisallow = "";
|
|
20581
|
+
for (const pattern of rules3.disallow) {
|
|
20582
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestDisallow.length) {
|
|
20583
|
+
longestDisallow = pattern;
|
|
20584
|
+
}
|
|
20585
|
+
}
|
|
20586
|
+
if (!longestDisallow) return false;
|
|
20587
|
+
let longestAllow = "";
|
|
20588
|
+
for (const pattern of rules3.allow) {
|
|
20589
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestAllow.length) {
|
|
20590
|
+
longestAllow = pattern;
|
|
20591
|
+
}
|
|
20592
|
+
}
|
|
20593
|
+
return longestAllow.length < longestDisallow.length;
|
|
20594
|
+
}
|
|
20595
|
+
async function loadRobotsTxtFromDir(dir) {
|
|
20596
|
+
try {
|
|
20597
|
+
const content = await fs4__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
20598
|
+
return parseRobotsTxt(content);
|
|
20599
|
+
} catch {
|
|
20600
|
+
return null;
|
|
20601
|
+
}
|
|
20602
|
+
}
|
|
20603
|
+
async function fetchRobotsTxt(baseUrl) {
|
|
20604
|
+
try {
|
|
20605
|
+
const url = new URL("/robots.txt", baseUrl).href;
|
|
20606
|
+
const response = await fetch(url);
|
|
20607
|
+
if (!response.ok) return null;
|
|
20608
|
+
const content = await response.text();
|
|
20609
|
+
return parseRobotsTxt(content);
|
|
20610
|
+
} catch {
|
|
20611
|
+
return null;
|
|
20612
|
+
}
|
|
20613
|
+
}
|
|
20385
20614
|
|
|
20386
20615
|
// src/indexing/pipeline.ts
|
|
20387
20616
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20388
|
-
"jina-embeddings-v3": 2e-5
|
|
20617
|
+
"jina-embeddings-v3": 2e-5,
|
|
20618
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
20389
20619
|
};
|
|
20390
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20620
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
20391
20621
|
var IndexPipeline = class _IndexPipeline {
|
|
20392
20622
|
cwd;
|
|
20393
20623
|
config;
|
|
@@ -20465,6 +20695,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20465
20695
|
}
|
|
20466
20696
|
stageEnd("source", sourceStart);
|
|
20467
20697
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20698
|
+
const filterStart = stageStart();
|
|
20699
|
+
let filteredSourcePages = sourcePages;
|
|
20700
|
+
if (this.config.exclude.length > 0) {
|
|
20701
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20702
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20703
|
+
const url = normalizeUrlPath(p.url);
|
|
20704
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20705
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20706
|
+
return false;
|
|
20707
|
+
}
|
|
20708
|
+
return true;
|
|
20709
|
+
});
|
|
20710
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20711
|
+
if (excludedCount > 0) {
|
|
20712
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20713
|
+
}
|
|
20714
|
+
}
|
|
20715
|
+
if (this.config.respectRobotsTxt) {
|
|
20716
|
+
let robotsRules = null;
|
|
20717
|
+
if (sourceMode === "static-output") {
|
|
20718
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20719
|
+
path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20720
|
+
);
|
|
20721
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20722
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20723
|
+
path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20724
|
+
);
|
|
20725
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20726
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20727
|
+
}
|
|
20728
|
+
if (robotsRules) {
|
|
20729
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20730
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20731
|
+
const url = normalizeUrlPath(p.url);
|
|
20732
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20733
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20734
|
+
return false;
|
|
20735
|
+
}
|
|
20736
|
+
return true;
|
|
20737
|
+
});
|
|
20738
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20739
|
+
if (robotsExcluded > 0) {
|
|
20740
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20741
|
+
}
|
|
20742
|
+
}
|
|
20743
|
+
}
|
|
20744
|
+
stageEnd("filter", filterStart);
|
|
20468
20745
|
const routeStart = stageStart();
|
|
20469
20746
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20470
20747
|
stageEnd("route_map", routeStart);
|
|
@@ -20472,7 +20749,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20472
20749
|
const extractStart = stageStart();
|
|
20473
20750
|
this.logger.info("Extracting content...");
|
|
20474
20751
|
const extractedPages = [];
|
|
20475
|
-
for (const sourcePage of
|
|
20752
|
+
for (const sourcePage of filteredSourcePages) {
|
|
20476
20753
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
20477
20754
|
if (!extracted) {
|
|
20478
20755
|
this.logger.warn(
|
|
@@ -20498,16 +20775,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20498
20775
|
seenUrls.add(page.url);
|
|
20499
20776
|
uniquePages.push(page);
|
|
20500
20777
|
}
|
|
20778
|
+
const indexablePages = [];
|
|
20779
|
+
for (const page of uniquePages) {
|
|
20780
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20781
|
+
if (effectiveWeight === 0) {
|
|
20782
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20783
|
+
continue;
|
|
20784
|
+
}
|
|
20785
|
+
indexablePages.push(page);
|
|
20786
|
+
}
|
|
20787
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20788
|
+
if (zeroWeightCount > 0) {
|
|
20789
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20790
|
+
}
|
|
20501
20791
|
stageEnd("extract", extractStart);
|
|
20502
|
-
const skippedPages =
|
|
20503
|
-
this.logger.info(`Extracted ${
|
|
20792
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20793
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20504
20794
|
const linkStart = stageStart();
|
|
20505
|
-
const pageSet = new Set(
|
|
20795
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20506
20796
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20507
|
-
for (const page of
|
|
20797
|
+
for (const page of indexablePages) {
|
|
20508
20798
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20509
20799
|
}
|
|
20510
|
-
for (const page of
|
|
20800
|
+
for (const page of indexablePages) {
|
|
20511
20801
|
for (const outgoing of page.outgoingLinks) {
|
|
20512
20802
|
if (!pageSet.has(outgoing)) {
|
|
20513
20803
|
continue;
|
|
@@ -20531,7 +20821,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20531
20821
|
});
|
|
20532
20822
|
}
|
|
20533
20823
|
}
|
|
20534
|
-
for (const page of
|
|
20824
|
+
for (const page of indexablePages) {
|
|
20535
20825
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20536
20826
|
if (routeMatch.routeResolution === "best-effort") {
|
|
20537
20827
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20817,7 +21107,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20817
21107
|
});
|
|
20818
21108
|
const stats = await pipeline.run({
|
|
20819
21109
|
changedOnly: options.changedOnly ?? true,
|
|
20820
|
-
force: options.force ?? false,
|
|
21110
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20821
21111
|
dryRun: options.dryRun ?? false,
|
|
20822
21112
|
scopeOverride: options.scope,
|
|
20823
21113
|
verbose: options.verbose
|
package/dist/sveltekit.d.cts
CHANGED
package/dist/sveltekit.d.ts
CHANGED