searchsocket 0.3.3 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +443 -182
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +577 -164
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +577 -165
- package/dist/sveltekit.cjs +367 -77
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +367 -77
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/sveltekit.js
CHANGED
|
@@ -5009,32 +5009,32 @@ var require_URL = __commonJS({
|
|
|
5009
5009
|
else
|
|
5010
5010
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5011
5011
|
}
|
|
5012
|
-
function remove_dot_segments(
|
|
5013
|
-
if (!
|
|
5012
|
+
function remove_dot_segments(path15) {
|
|
5013
|
+
if (!path15) return path15;
|
|
5014
5014
|
var output = "";
|
|
5015
|
-
while (
|
|
5016
|
-
if (
|
|
5017
|
-
|
|
5015
|
+
while (path15.length > 0) {
|
|
5016
|
+
if (path15 === "." || path15 === "..") {
|
|
5017
|
+
path15 = "";
|
|
5018
5018
|
break;
|
|
5019
5019
|
}
|
|
5020
|
-
var twochars =
|
|
5021
|
-
var threechars =
|
|
5022
|
-
var fourchars =
|
|
5020
|
+
var twochars = path15.substring(0, 2);
|
|
5021
|
+
var threechars = path15.substring(0, 3);
|
|
5022
|
+
var fourchars = path15.substring(0, 4);
|
|
5023
5023
|
if (threechars === "../") {
|
|
5024
|
-
|
|
5024
|
+
path15 = path15.substring(3);
|
|
5025
5025
|
} else if (twochars === "./") {
|
|
5026
|
-
|
|
5026
|
+
path15 = path15.substring(2);
|
|
5027
5027
|
} else if (threechars === "/./") {
|
|
5028
|
-
|
|
5029
|
-
} else if (twochars === "/." &&
|
|
5030
|
-
|
|
5031
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5032
|
-
|
|
5028
|
+
path15 = "/" + path15.substring(3);
|
|
5029
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5030
|
+
path15 = "/";
|
|
5031
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5032
|
+
path15 = "/" + path15.substring(4);
|
|
5033
5033
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5034
5034
|
} else {
|
|
5035
|
-
var segment =
|
|
5035
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5036
5036
|
output += segment;
|
|
5037
|
-
|
|
5037
|
+
path15 = path15.substring(segment.length);
|
|
5038
5038
|
}
|
|
5039
5039
|
}
|
|
5040
5040
|
return output;
|
|
@@ -16598,6 +16598,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
16598
16598
|
envVar: z.string().min(1).optional(),
|
|
16599
16599
|
sanitize: z.boolean().optional()
|
|
16600
16600
|
}).optional(),
|
|
16601
|
+
exclude: z.array(z.string()).optional(),
|
|
16602
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
16601
16603
|
source: z.object({
|
|
16602
16604
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16603
16605
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -16728,6 +16730,8 @@ function createDefaultConfig(projectId) {
|
|
|
16728
16730
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16729
16731
|
sanitize: true
|
|
16730
16732
|
},
|
|
16733
|
+
exclude: [],
|
|
16734
|
+
respectRobotsTxt: true,
|
|
16731
16735
|
source: {
|
|
16732
16736
|
mode: "static-output",
|
|
16733
16737
|
staticOutputDir: "build",
|
|
@@ -16758,7 +16762,7 @@ function createDefaultConfig(projectId) {
|
|
|
16758
16762
|
},
|
|
16759
16763
|
embeddings: {
|
|
16760
16764
|
provider: "jina",
|
|
16761
|
-
model: "jina-embeddings-
|
|
16765
|
+
model: "jina-embeddings-v5-text-small",
|
|
16762
16766
|
apiKeyEnv: "JINA_API_KEY",
|
|
16763
16767
|
batchSize: 64,
|
|
16764
16768
|
concurrency: 4
|
|
@@ -16771,9 +16775,9 @@ function createDefaultConfig(projectId) {
|
|
|
16771
16775
|
}
|
|
16772
16776
|
},
|
|
16773
16777
|
rerank: {
|
|
16774
|
-
enabled:
|
|
16778
|
+
enabled: true,
|
|
16775
16779
|
topN: 20,
|
|
16776
|
-
model: "jina-reranker-
|
|
16780
|
+
model: "jina-reranker-v3"
|
|
16777
16781
|
},
|
|
16778
16782
|
ranking: {
|
|
16779
16783
|
enableIncomingLinkBoost: true,
|
|
@@ -16892,6 +16896,8 @@ ${issues}`
|
|
|
16892
16896
|
...defaults.scope,
|
|
16893
16897
|
...parsed.scope
|
|
16894
16898
|
},
|
|
16899
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16900
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16895
16901
|
source: {
|
|
16896
16902
|
...defaults.source,
|
|
16897
16903
|
...parsed.source,
|
|
@@ -17856,6 +17862,36 @@ async function createVectorStore(config, cwd) {
|
|
|
17856
17862
|
});
|
|
17857
17863
|
}
|
|
17858
17864
|
|
|
17865
|
+
// src/utils/pattern.ts
|
|
17866
|
+
function matchUrlPattern(url, pattern) {
|
|
17867
|
+
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
17868
|
+
const normalizedUrl = norm(url);
|
|
17869
|
+
const normalizedPattern = norm(pattern);
|
|
17870
|
+
if (normalizedPattern.endsWith("/**")) {
|
|
17871
|
+
const prefix = normalizedPattern.slice(0, -3);
|
|
17872
|
+
if (prefix === "") {
|
|
17873
|
+
return true;
|
|
17874
|
+
}
|
|
17875
|
+
return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/");
|
|
17876
|
+
}
|
|
17877
|
+
if (normalizedPattern.endsWith("/*")) {
|
|
17878
|
+
const prefix = normalizedPattern.slice(0, -2);
|
|
17879
|
+
if (prefix === "") {
|
|
17880
|
+
return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
|
|
17881
|
+
}
|
|
17882
|
+
if (!normalizedUrl.startsWith(prefix + "/")) return false;
|
|
17883
|
+
const rest = normalizedUrl.slice(prefix.length + 1);
|
|
17884
|
+
return rest.length > 0 && !rest.includes("/");
|
|
17885
|
+
}
|
|
17886
|
+
return normalizedUrl === normalizedPattern;
|
|
17887
|
+
}
|
|
17888
|
+
function matchUrlPatterns(url, patterns) {
|
|
17889
|
+
for (const pattern of patterns) {
|
|
17890
|
+
if (matchUrlPattern(url, pattern)) return true;
|
|
17891
|
+
}
|
|
17892
|
+
return false;
|
|
17893
|
+
}
|
|
17894
|
+
|
|
17859
17895
|
// src/search/ranking.ts
|
|
17860
17896
|
function nonNegativeOrZero(value) {
|
|
17861
17897
|
if (!Number.isFinite(value)) {
|
|
@@ -17884,21 +17920,11 @@ function rankHits(hits, config) {
|
|
|
17884
17920
|
});
|
|
17885
17921
|
}
|
|
17886
17922
|
function findPageWeight(url, pageWeights) {
|
|
17887
|
-
|
|
17888
|
-
const normalizedUrl = norm(url);
|
|
17889
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17890
|
-
if (norm(pattern) === normalizedUrl) {
|
|
17891
|
-
return weight;
|
|
17892
|
-
}
|
|
17893
|
-
}
|
|
17894
|
-
let bestPrefix = "";
|
|
17923
|
+
let bestPattern = "";
|
|
17895
17924
|
let bestWeight = 1;
|
|
17896
17925
|
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17897
|
-
|
|
17898
|
-
|
|
17899
|
-
const prefix = `${normalizedPattern}/`;
|
|
17900
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
17901
|
-
bestPrefix = prefix;
|
|
17926
|
+
if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
|
|
17927
|
+
bestPattern = pattern;
|
|
17902
17928
|
bestWeight = weight;
|
|
17903
17929
|
}
|
|
17904
17930
|
}
|
|
@@ -17956,7 +17982,8 @@ var requestSchema = z.object({
|
|
|
17956
17982
|
pathPrefix: z.string().optional(),
|
|
17957
17983
|
tags: z.array(z.string()).optional(),
|
|
17958
17984
|
rerank: z.boolean().optional(),
|
|
17959
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
17985
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
17986
|
+
stream: z.boolean().optional()
|
|
17960
17987
|
});
|
|
17961
17988
|
var SearchEngine = class _SearchEngine {
|
|
17962
17989
|
cwd;
|
|
@@ -18029,7 +18056,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
18029
18056
|
rerankMs = hrTimeMs(rerankStart);
|
|
18030
18057
|
usedRerank = true;
|
|
18031
18058
|
}
|
|
18032
|
-
|
|
18059
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
18060
|
+
return {
|
|
18061
|
+
q: input.q,
|
|
18062
|
+
scope: resolvedScope.scopeName,
|
|
18063
|
+
results,
|
|
18064
|
+
meta: {
|
|
18065
|
+
timingsMs: {
|
|
18066
|
+
embed: Math.round(embedMs),
|
|
18067
|
+
vector: Math.round(vectorMs),
|
|
18068
|
+
rerank: Math.round(rerankMs),
|
|
18069
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18070
|
+
},
|
|
18071
|
+
usedRerank,
|
|
18072
|
+
modelId: this.config.embeddings.model
|
|
18073
|
+
}
|
|
18074
|
+
};
|
|
18075
|
+
}
|
|
18076
|
+
async *searchStreaming(request) {
|
|
18077
|
+
const parsed = requestSchema.safeParse(request);
|
|
18078
|
+
if (!parsed.success) {
|
|
18079
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
18080
|
+
}
|
|
18081
|
+
const input = parsed.data;
|
|
18082
|
+
const wantsRerank = Boolean(input.rerank);
|
|
18083
|
+
if (!wantsRerank) {
|
|
18084
|
+
const response = await this.search(request);
|
|
18085
|
+
yield { phase: "initial", data: response };
|
|
18086
|
+
return;
|
|
18087
|
+
}
|
|
18088
|
+
const totalStart = process.hrtime.bigint();
|
|
18089
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
18090
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
18091
|
+
const topK = input.topK ?? 10;
|
|
18092
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
18093
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
18094
|
+
const embedStart = process.hrtime.bigint();
|
|
18095
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
18096
|
+
const queryVector = queryEmbeddings[0];
|
|
18097
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
18098
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
18099
|
+
}
|
|
18100
|
+
const embedMs = hrTimeMs(embedStart);
|
|
18101
|
+
const vectorStart = process.hrtime.bigint();
|
|
18102
|
+
const hits = await this.vectorStore.query(
|
|
18103
|
+
queryVector,
|
|
18104
|
+
{
|
|
18105
|
+
topK: candidateK,
|
|
18106
|
+
pathPrefix: input.pathPrefix,
|
|
18107
|
+
tags: input.tags
|
|
18108
|
+
},
|
|
18109
|
+
resolvedScope
|
|
18110
|
+
);
|
|
18111
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
18112
|
+
const ranked = rankHits(hits, this.config);
|
|
18113
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
18114
|
+
yield {
|
|
18115
|
+
phase: "initial",
|
|
18116
|
+
data: {
|
|
18117
|
+
q: input.q,
|
|
18118
|
+
scope: resolvedScope.scopeName,
|
|
18119
|
+
results: initialResults,
|
|
18120
|
+
meta: {
|
|
18121
|
+
timingsMs: {
|
|
18122
|
+
embed: Math.round(embedMs),
|
|
18123
|
+
vector: Math.round(vectorMs),
|
|
18124
|
+
rerank: 0,
|
|
18125
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18126
|
+
},
|
|
18127
|
+
usedRerank: false,
|
|
18128
|
+
modelId: this.config.embeddings.model
|
|
18129
|
+
}
|
|
18130
|
+
}
|
|
18131
|
+
};
|
|
18132
|
+
const rerankStart = process.hrtime.bigint();
|
|
18133
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
18134
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
18135
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
18136
|
+
yield {
|
|
18137
|
+
phase: "reranked",
|
|
18138
|
+
data: {
|
|
18139
|
+
q: input.q,
|
|
18140
|
+
scope: resolvedScope.scopeName,
|
|
18141
|
+
results: rerankedResults,
|
|
18142
|
+
meta: {
|
|
18143
|
+
timingsMs: {
|
|
18144
|
+
embed: Math.round(embedMs),
|
|
18145
|
+
vector: Math.round(vectorMs),
|
|
18146
|
+
rerank: Math.round(rerankMs),
|
|
18147
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18148
|
+
},
|
|
18149
|
+
usedRerank: true,
|
|
18150
|
+
modelId: this.config.embeddings.model
|
|
18151
|
+
}
|
|
18152
|
+
}
|
|
18153
|
+
};
|
|
18154
|
+
}
|
|
18155
|
+
buildResults(ordered, topK, groupByPage) {
|
|
18033
18156
|
const minScore = this.config.ranking.minScore;
|
|
18034
18157
|
if (groupByPage) {
|
|
18035
18158
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -18037,10 +18160,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
18037
18160
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18038
18161
|
}
|
|
18039
18162
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
18040
|
-
|
|
18163
|
+
return pages.slice(0, topK).map((page) => {
|
|
18041
18164
|
const bestScore = page.bestChunk.finalScore;
|
|
18042
|
-
const
|
|
18043
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18165
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18166
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
18044
18167
|
return {
|
|
18045
18168
|
url: page.url,
|
|
18046
18169
|
title: page.title,
|
|
@@ -18057,10 +18180,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
18057
18180
|
};
|
|
18058
18181
|
});
|
|
18059
18182
|
} else {
|
|
18183
|
+
let filtered = ordered;
|
|
18060
18184
|
if (minScore > 0) {
|
|
18061
|
-
|
|
18185
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18062
18186
|
}
|
|
18063
|
-
|
|
18187
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
18064
18188
|
url: hit.metadata.url,
|
|
18065
18189
|
title: hit.metadata.title,
|
|
18066
18190
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -18069,21 +18193,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
18069
18193
|
routeFile: hit.metadata.routeFile
|
|
18070
18194
|
}));
|
|
18071
18195
|
}
|
|
18072
|
-
return {
|
|
18073
|
-
q: input.q,
|
|
18074
|
-
scope: resolvedScope.scopeName,
|
|
18075
|
-
results,
|
|
18076
|
-
meta: {
|
|
18077
|
-
timingsMs: {
|
|
18078
|
-
embed: Math.round(embedMs),
|
|
18079
|
-
vector: Math.round(vectorMs),
|
|
18080
|
-
rerank: Math.round(rerankMs),
|
|
18081
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
18082
|
-
},
|
|
18083
|
-
usedRerank,
|
|
18084
|
-
modelId: this.config.embeddings.model
|
|
18085
|
-
}
|
|
18086
|
-
};
|
|
18087
18196
|
}
|
|
18088
18197
|
async getPage(pathOrUrl, scope) {
|
|
18089
18198
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -18358,7 +18467,44 @@ function searchsocketHandle(options = {}) {
|
|
|
18358
18467
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
18359
18468
|
}
|
|
18360
18469
|
const engine = await getEngine();
|
|
18361
|
-
const
|
|
18470
|
+
const searchRequest = body;
|
|
18471
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
18472
|
+
const encoder = new TextEncoder();
|
|
18473
|
+
const stream = new ReadableStream({
|
|
18474
|
+
async start(controller) {
|
|
18475
|
+
try {
|
|
18476
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
18477
|
+
const line = JSON.stringify(event2) + "\n";
|
|
18478
|
+
controller.enqueue(encoder.encode(line));
|
|
18479
|
+
}
|
|
18480
|
+
} catch (streamError) {
|
|
18481
|
+
const errorEvent = {
|
|
18482
|
+
phase: "error",
|
|
18483
|
+
data: {
|
|
18484
|
+
error: {
|
|
18485
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
18486
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
18487
|
+
}
|
|
18488
|
+
}
|
|
18489
|
+
};
|
|
18490
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
18491
|
+
} finally {
|
|
18492
|
+
controller.close();
|
|
18493
|
+
}
|
|
18494
|
+
}
|
|
18495
|
+
});
|
|
18496
|
+
return withCors(
|
|
18497
|
+
new Response(stream, {
|
|
18498
|
+
status: 200,
|
|
18499
|
+
headers: {
|
|
18500
|
+
"content-type": "application/x-ndjson"
|
|
18501
|
+
}
|
|
18502
|
+
}),
|
|
18503
|
+
event.request,
|
|
18504
|
+
config
|
|
18505
|
+
);
|
|
18506
|
+
}
|
|
18507
|
+
const result = await engine.search(searchRequest);
|
|
18362
18508
|
return withCors(
|
|
18363
18509
|
new Response(JSON.stringify(result), {
|
|
18364
18510
|
status: 200,
|
|
@@ -19587,6 +19733,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19587
19733
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19588
19734
|
return null;
|
|
19589
19735
|
}
|
|
19736
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19737
|
+
let weight;
|
|
19738
|
+
if (weightRaw !== void 0) {
|
|
19739
|
+
const parsed = Number(weightRaw);
|
|
19740
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19741
|
+
weight = parsed;
|
|
19742
|
+
}
|
|
19743
|
+
}
|
|
19744
|
+
if (weight === 0) {
|
|
19745
|
+
return null;
|
|
19746
|
+
}
|
|
19590
19747
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19591
19748
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19592
19749
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19642,7 +19799,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19642
19799
|
noindex: false,
|
|
19643
19800
|
tags,
|
|
19644
19801
|
description,
|
|
19645
|
-
keywords
|
|
19802
|
+
keywords,
|
|
19803
|
+
weight
|
|
19646
19804
|
};
|
|
19647
19805
|
}
|
|
19648
19806
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19655,6 +19813,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19655
19813
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19656
19814
|
return null;
|
|
19657
19815
|
}
|
|
19816
|
+
let mdWeight;
|
|
19817
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19818
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19819
|
+
mdWeight = rawWeight;
|
|
19820
|
+
}
|
|
19821
|
+
if (mdWeight === 0) {
|
|
19822
|
+
return null;
|
|
19823
|
+
}
|
|
19658
19824
|
const content = parsed.content;
|
|
19659
19825
|
const normalized = normalizeMarkdown(content);
|
|
19660
19826
|
if (!normalizeText(normalized)) {
|
|
@@ -19677,7 +19843,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19677
19843
|
noindex: false,
|
|
19678
19844
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19679
19845
|
description: fmDescription,
|
|
19680
|
-
keywords: fmKeywords
|
|
19846
|
+
keywords: fmKeywords,
|
|
19847
|
+
weight: mdWeight
|
|
19681
19848
|
};
|
|
19682
19849
|
}
|
|
19683
19850
|
function yamlString(value) {
|
|
@@ -19946,15 +20113,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19946
20113
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19947
20114
|
}
|
|
19948
20115
|
function isExcluded(url, patterns) {
|
|
19949
|
-
|
|
19950
|
-
if (pattern.endsWith("/*")) {
|
|
19951
|
-
const prefix = pattern.slice(0, -1);
|
|
19952
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19953
|
-
} else if (url === pattern) {
|
|
19954
|
-
return true;
|
|
19955
|
-
}
|
|
19956
|
-
}
|
|
19957
|
-
return false;
|
|
20116
|
+
return matchUrlPatterns(url, patterns);
|
|
19958
20117
|
}
|
|
19959
20118
|
function findFreePort() {
|
|
19960
20119
|
return new Promise((resolve, reject) => {
|
|
@@ -20370,12 +20529,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20370
20529
|
}
|
|
20371
20530
|
return pages;
|
|
20372
20531
|
}
|
|
20532
|
+
function parseRobotsTxt(content, userAgent = "Searchsocket") {
|
|
20533
|
+
const lines = content.split(/\r?\n/);
|
|
20534
|
+
const agentGroups = /* @__PURE__ */ new Map();
|
|
20535
|
+
let currentAgents = [];
|
|
20536
|
+
for (const rawLine of lines) {
|
|
20537
|
+
const line = rawLine.replace(/#.*$/, "").trim();
|
|
20538
|
+
if (!line) continue;
|
|
20539
|
+
const colonIdx = line.indexOf(":");
|
|
20540
|
+
if (colonIdx === -1) continue;
|
|
20541
|
+
const directive = line.slice(0, colonIdx).trim().toLowerCase();
|
|
20542
|
+
const value = line.slice(colonIdx + 1).trim();
|
|
20543
|
+
if (directive === "user-agent") {
|
|
20544
|
+
const agentName = value.toLowerCase();
|
|
20545
|
+
currentAgents.push(agentName);
|
|
20546
|
+
if (!agentGroups.has(agentName)) {
|
|
20547
|
+
agentGroups.set(agentName, { disallow: [], allow: [] });
|
|
20548
|
+
}
|
|
20549
|
+
} else if (directive === "disallow" && value && currentAgents.length > 0) {
|
|
20550
|
+
for (const agent of currentAgents) {
|
|
20551
|
+
agentGroups.get(agent).disallow.push(value);
|
|
20552
|
+
}
|
|
20553
|
+
} else if (directive === "allow" && value && currentAgents.length > 0) {
|
|
20554
|
+
for (const agent of currentAgents) {
|
|
20555
|
+
agentGroups.get(agent).allow.push(value);
|
|
20556
|
+
}
|
|
20557
|
+
} else if (directive !== "disallow" && directive !== "allow") {
|
|
20558
|
+
currentAgents = [];
|
|
20559
|
+
}
|
|
20560
|
+
}
|
|
20561
|
+
const specific = agentGroups.get(userAgent.toLowerCase());
|
|
20562
|
+
if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
|
|
20563
|
+
return specific;
|
|
20564
|
+
}
|
|
20565
|
+
return agentGroups.get("*") ?? { disallow: [], allow: [] };
|
|
20566
|
+
}
|
|
20567
|
+
function isBlockedByRobots(urlPath, rules3) {
|
|
20568
|
+
let longestDisallow = "";
|
|
20569
|
+
for (const pattern of rules3.disallow) {
|
|
20570
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestDisallow.length) {
|
|
20571
|
+
longestDisallow = pattern;
|
|
20572
|
+
}
|
|
20573
|
+
}
|
|
20574
|
+
if (!longestDisallow) return false;
|
|
20575
|
+
let longestAllow = "";
|
|
20576
|
+
for (const pattern of rules3.allow) {
|
|
20577
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestAllow.length) {
|
|
20578
|
+
longestAllow = pattern;
|
|
20579
|
+
}
|
|
20580
|
+
}
|
|
20581
|
+
return longestAllow.length < longestDisallow.length;
|
|
20582
|
+
}
|
|
20583
|
+
async function loadRobotsTxtFromDir(dir) {
|
|
20584
|
+
try {
|
|
20585
|
+
const content = await fs4.readFile(path.join(dir, "robots.txt"), "utf8");
|
|
20586
|
+
return parseRobotsTxt(content);
|
|
20587
|
+
} catch {
|
|
20588
|
+
return null;
|
|
20589
|
+
}
|
|
20590
|
+
}
|
|
20591
|
+
async function fetchRobotsTxt(baseUrl) {
|
|
20592
|
+
try {
|
|
20593
|
+
const url = new URL("/robots.txt", baseUrl).href;
|
|
20594
|
+
const response = await fetch(url);
|
|
20595
|
+
if (!response.ok) return null;
|
|
20596
|
+
const content = await response.text();
|
|
20597
|
+
return parseRobotsTxt(content);
|
|
20598
|
+
} catch {
|
|
20599
|
+
return null;
|
|
20600
|
+
}
|
|
20601
|
+
}
|
|
20373
20602
|
|
|
20374
20603
|
// src/indexing/pipeline.ts
|
|
20375
20604
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20376
|
-
"jina-embeddings-v3": 2e-5
|
|
20605
|
+
"jina-embeddings-v3": 2e-5,
|
|
20606
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
20377
20607
|
};
|
|
20378
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20608
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
20379
20609
|
var IndexPipeline = class _IndexPipeline {
|
|
20380
20610
|
cwd;
|
|
20381
20611
|
config;
|
|
@@ -20453,6 +20683,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20453
20683
|
}
|
|
20454
20684
|
stageEnd("source", sourceStart);
|
|
20455
20685
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20686
|
+
const filterStart = stageStart();
|
|
20687
|
+
let filteredSourcePages = sourcePages;
|
|
20688
|
+
if (this.config.exclude.length > 0) {
|
|
20689
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20690
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20691
|
+
const url = normalizeUrlPath(p.url);
|
|
20692
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20693
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20694
|
+
return false;
|
|
20695
|
+
}
|
|
20696
|
+
return true;
|
|
20697
|
+
});
|
|
20698
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20699
|
+
if (excludedCount > 0) {
|
|
20700
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20701
|
+
}
|
|
20702
|
+
}
|
|
20703
|
+
if (this.config.respectRobotsTxt) {
|
|
20704
|
+
let robotsRules = null;
|
|
20705
|
+
if (sourceMode === "static-output") {
|
|
20706
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20707
|
+
path.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20708
|
+
);
|
|
20709
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20710
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20711
|
+
path.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20712
|
+
);
|
|
20713
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20714
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20715
|
+
}
|
|
20716
|
+
if (robotsRules) {
|
|
20717
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20718
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20719
|
+
const url = normalizeUrlPath(p.url);
|
|
20720
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20721
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20722
|
+
return false;
|
|
20723
|
+
}
|
|
20724
|
+
return true;
|
|
20725
|
+
});
|
|
20726
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20727
|
+
if (robotsExcluded > 0) {
|
|
20728
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20729
|
+
}
|
|
20730
|
+
}
|
|
20731
|
+
}
|
|
20732
|
+
stageEnd("filter", filterStart);
|
|
20456
20733
|
const routeStart = stageStart();
|
|
20457
20734
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20458
20735
|
stageEnd("route_map", routeStart);
|
|
@@ -20460,7 +20737,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20460
20737
|
const extractStart = stageStart();
|
|
20461
20738
|
this.logger.info("Extracting content...");
|
|
20462
20739
|
const extractedPages = [];
|
|
20463
|
-
for (const sourcePage of
|
|
20740
|
+
for (const sourcePage of filteredSourcePages) {
|
|
20464
20741
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
20465
20742
|
if (!extracted) {
|
|
20466
20743
|
this.logger.warn(
|
|
@@ -20486,16 +20763,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20486
20763
|
seenUrls.add(page.url);
|
|
20487
20764
|
uniquePages.push(page);
|
|
20488
20765
|
}
|
|
20766
|
+
const indexablePages = [];
|
|
20767
|
+
for (const page of uniquePages) {
|
|
20768
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20769
|
+
if (effectiveWeight === 0) {
|
|
20770
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20771
|
+
continue;
|
|
20772
|
+
}
|
|
20773
|
+
indexablePages.push(page);
|
|
20774
|
+
}
|
|
20775
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20776
|
+
if (zeroWeightCount > 0) {
|
|
20777
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20778
|
+
}
|
|
20489
20779
|
stageEnd("extract", extractStart);
|
|
20490
|
-
const skippedPages =
|
|
20491
|
-
this.logger.info(`Extracted ${
|
|
20780
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20781
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20492
20782
|
const linkStart = stageStart();
|
|
20493
|
-
const pageSet = new Set(
|
|
20783
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20494
20784
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20495
|
-
for (const page of
|
|
20785
|
+
for (const page of indexablePages) {
|
|
20496
20786
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20497
20787
|
}
|
|
20498
|
-
for (const page of
|
|
20788
|
+
for (const page of indexablePages) {
|
|
20499
20789
|
for (const outgoing of page.outgoingLinks) {
|
|
20500
20790
|
if (!pageSet.has(outgoing)) {
|
|
20501
20791
|
continue;
|
|
@@ -20519,7 +20809,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20519
20809
|
});
|
|
20520
20810
|
}
|
|
20521
20811
|
}
|
|
20522
|
-
for (const page of
|
|
20812
|
+
for (const page of indexablePages) {
|
|
20523
20813
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20524
20814
|
if (routeMatch.routeResolution === "best-effort") {
|
|
20525
20815
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20805,7 +21095,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20805
21095
|
});
|
|
20806
21096
|
const stats = await pipeline.run({
|
|
20807
21097
|
changedOnly: options.changedOnly ?? true,
|
|
20808
|
-
force: options.force ?? false,
|
|
21098
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20809
21099
|
dryRun: options.dryRun ?? false,
|
|
20810
21100
|
scopeOverride: options.scope,
|
|
20811
21101
|
verbose: options.verbose
|