searchsocket 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +456 -187
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +590 -169
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +590 -170
- package/dist/sveltekit.cjs +380 -82
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +380 -82
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/sveltekit.d.ts
CHANGED
package/dist/sveltekit.js
CHANGED
|
@@ -5009,32 +5009,32 @@ var require_URL = __commonJS({
|
|
|
5009
5009
|
else
|
|
5010
5010
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5011
5011
|
}
|
|
5012
|
-
function remove_dot_segments(
|
|
5013
|
-
if (!
|
|
5012
|
+
function remove_dot_segments(path15) {
|
|
5013
|
+
if (!path15) return path15;
|
|
5014
5014
|
var output = "";
|
|
5015
|
-
while (
|
|
5016
|
-
if (
|
|
5017
|
-
|
|
5015
|
+
while (path15.length > 0) {
|
|
5016
|
+
if (path15 === "." || path15 === "..") {
|
|
5017
|
+
path15 = "";
|
|
5018
5018
|
break;
|
|
5019
5019
|
}
|
|
5020
|
-
var twochars =
|
|
5021
|
-
var threechars =
|
|
5022
|
-
var fourchars =
|
|
5020
|
+
var twochars = path15.substring(0, 2);
|
|
5021
|
+
var threechars = path15.substring(0, 3);
|
|
5022
|
+
var fourchars = path15.substring(0, 4);
|
|
5023
5023
|
if (threechars === "../") {
|
|
5024
|
-
|
|
5024
|
+
path15 = path15.substring(3);
|
|
5025
5025
|
} else if (twochars === "./") {
|
|
5026
|
-
|
|
5026
|
+
path15 = path15.substring(2);
|
|
5027
5027
|
} else if (threechars === "/./") {
|
|
5028
|
-
|
|
5029
|
-
} else if (twochars === "/." &&
|
|
5030
|
-
|
|
5031
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5032
|
-
|
|
5028
|
+
path15 = "/" + path15.substring(3);
|
|
5029
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5030
|
+
path15 = "/";
|
|
5031
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5032
|
+
path15 = "/" + path15.substring(4);
|
|
5033
5033
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5034
5034
|
} else {
|
|
5035
|
-
var segment =
|
|
5035
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5036
5036
|
output += segment;
|
|
5037
|
-
|
|
5037
|
+
path15 = path15.substring(segment.length);
|
|
5038
5038
|
}
|
|
5039
5039
|
}
|
|
5040
5040
|
return output;
|
|
@@ -16598,6 +16598,8 @@ var searchSocketConfigSchema = z.object({
|
|
|
16598
16598
|
envVar: z.string().min(1).optional(),
|
|
16599
16599
|
sanitize: z.boolean().optional()
|
|
16600
16600
|
}).optional(),
|
|
16601
|
+
exclude: z.array(z.string()).optional(),
|
|
16602
|
+
respectRobotsTxt: z.boolean().optional(),
|
|
16601
16603
|
source: z.object({
|
|
16602
16604
|
mode: z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16603
16605
|
staticOutputDir: z.string().min(1).optional(),
|
|
@@ -16728,6 +16730,8 @@ function createDefaultConfig(projectId) {
|
|
|
16728
16730
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16729
16731
|
sanitize: true
|
|
16730
16732
|
},
|
|
16733
|
+
exclude: [],
|
|
16734
|
+
respectRobotsTxt: true,
|
|
16731
16735
|
source: {
|
|
16732
16736
|
mode: "static-output",
|
|
16733
16737
|
staticOutputDir: "build",
|
|
@@ -16758,7 +16762,7 @@ function createDefaultConfig(projectId) {
|
|
|
16758
16762
|
},
|
|
16759
16763
|
embeddings: {
|
|
16760
16764
|
provider: "jina",
|
|
16761
|
-
model: "jina-embeddings-
|
|
16765
|
+
model: "jina-embeddings-v5-text-small",
|
|
16762
16766
|
apiKeyEnv: "JINA_API_KEY",
|
|
16763
16767
|
batchSize: 64,
|
|
16764
16768
|
concurrency: 4
|
|
@@ -16771,9 +16775,9 @@ function createDefaultConfig(projectId) {
|
|
|
16771
16775
|
}
|
|
16772
16776
|
},
|
|
16773
16777
|
rerank: {
|
|
16774
|
-
enabled:
|
|
16778
|
+
enabled: true,
|
|
16775
16779
|
topN: 20,
|
|
16776
|
-
model: "jina-reranker-
|
|
16780
|
+
model: "jina-reranker-v3"
|
|
16777
16781
|
},
|
|
16778
16782
|
ranking: {
|
|
16779
16783
|
enableIncomingLinkBoost: true,
|
|
@@ -16892,6 +16896,8 @@ ${issues}`
|
|
|
16892
16896
|
...defaults.scope,
|
|
16893
16897
|
...parsed.scope
|
|
16894
16898
|
},
|
|
16899
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16900
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16895
16901
|
source: {
|
|
16896
16902
|
...defaults.source,
|
|
16897
16903
|
...parsed.source,
|
|
@@ -17242,7 +17248,7 @@ var JinaReranker = class {
|
|
|
17242
17248
|
constructor(options) {
|
|
17243
17249
|
this.apiKey = options.apiKey;
|
|
17244
17250
|
this.model = options.model;
|
|
17245
|
-
this.maxRetries = options.maxRetries ??
|
|
17251
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17246
17252
|
}
|
|
17247
17253
|
async rerank(query, candidates, topN) {
|
|
17248
17254
|
if (candidates.length === 0) {
|
|
@@ -17252,7 +17258,8 @@ var JinaReranker = class {
|
|
|
17252
17258
|
model: this.model,
|
|
17253
17259
|
query,
|
|
17254
17260
|
documents: candidates.map((candidate) => candidate.text),
|
|
17255
|
-
top_n: topN ?? candidates.length
|
|
17261
|
+
top_n: topN ?? candidates.length,
|
|
17262
|
+
return_documents: false
|
|
17256
17263
|
};
|
|
17257
17264
|
let attempt = 0;
|
|
17258
17265
|
while (attempt <= this.maxRetries) {
|
|
@@ -17855,6 +17862,36 @@ async function createVectorStore(config, cwd) {
|
|
|
17855
17862
|
});
|
|
17856
17863
|
}
|
|
17857
17864
|
|
|
17865
|
+
// src/utils/pattern.ts
|
|
17866
|
+
/**
 * Test a URL path against a single pattern.
 *
 * Supported pattern forms (a trailing slash on either argument is ignored,
 * except for the bare root "/"):
 *   - "/a/b"   exact match
 *   - "/a/*"   exactly one extra path segment below "/a"
 *   - "/a/**"  "/a" itself or anything nested below it
 *
 * @param {string} url - URL path to test.
 * @param {string} pattern - Pattern to test against.
 * @returns {boolean} Whether the URL matches the pattern.
 */
function matchUrlPattern(url, pattern) {
  const stripTrailingSlash = (value) => {
    if (value === "/" || !value.endsWith("/")) {
      return value;
    }
    return value.slice(0, -1);
  };
  const target = stripTrailingSlash(url);
  const glob = stripTrailingSlash(pattern);

  // Deep wildcard: the base itself and every descendant match.
  if (glob.endsWith("/**")) {
    const base = glob.slice(0, -3);
    return base === "" || target === base || target.startsWith(base + "/");
  }

  // No wildcard suffix: plain equality after normalization.
  if (!glob.endsWith("/*")) {
    return target === glob;
  }

  // Single-level wildcard: exactly one non-empty segment below the base.
  const base = glob.slice(0, -2);
  if (base === "") {
    return !(target === "/" || target.slice(1).includes("/"));
  }
  if (!target.startsWith(base + "/")) {
    return false;
  }
  const child = target.slice(base.length + 1);
  return child.length > 0 && !child.includes("/");
}
|
|
17888
|
+
/**
 * Report whether a URL path matches at least one of the given patterns.
 *
 * @param {string} url - URL path to test.
 * @param {Iterable<string>} patterns - Patterns, each evaluated by matchUrlPattern.
 * @returns {boolean} True as soon as one pattern matches; false when none do.
 */
function matchUrlPatterns(url, patterns) {
  let matched = false;
  for (const candidate of patterns) {
    matched = matchUrlPattern(url, candidate);
    if (matched) {
      break;
    }
  }
  return matched;
}
|
|
17894
|
+
|
|
17858
17895
|
// src/search/ranking.ts
|
|
17859
17896
|
function nonNegativeOrZero(value) {
|
|
17860
17897
|
if (!Number.isFinite(value)) {
|
|
@@ -17883,21 +17920,11 @@ function rankHits(hits, config) {
|
|
|
17883
17920
|
});
|
|
17884
17921
|
}
|
|
17885
17922
|
function findPageWeight(url, pageWeights) {
|
|
17886
|
-
|
|
17887
|
-
const normalizedUrl = norm(url);
|
|
17888
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17889
|
-
if (norm(pattern) === normalizedUrl) {
|
|
17890
|
-
return weight;
|
|
17891
|
-
}
|
|
17892
|
-
}
|
|
17893
|
-
let bestPrefix = "";
|
|
17923
|
+
let bestPattern = "";
|
|
17894
17924
|
let bestWeight = 1;
|
|
17895
17925
|
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17896
|
-
|
|
17897
|
-
|
|
17898
|
-
const prefix = `${normalizedPattern}/`;
|
|
17899
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
17900
|
-
bestPrefix = prefix;
|
|
17926
|
+
if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
|
|
17927
|
+
bestPattern = pattern;
|
|
17901
17928
|
bestWeight = weight;
|
|
17902
17929
|
}
|
|
17903
17930
|
}
|
|
@@ -17955,7 +17982,8 @@ var requestSchema = z.object({
|
|
|
17955
17982
|
pathPrefix: z.string().optional(),
|
|
17956
17983
|
tags: z.array(z.string()).optional(),
|
|
17957
17984
|
rerank: z.boolean().optional(),
|
|
17958
|
-
groupBy: z.enum(["page", "chunk"]).optional()
|
|
17985
|
+
groupBy: z.enum(["page", "chunk"]).optional(),
|
|
17986
|
+
stream: z.boolean().optional()
|
|
17959
17987
|
});
|
|
17960
17988
|
var SearchEngine = class _SearchEngine {
|
|
17961
17989
|
cwd;
|
|
@@ -18028,7 +18056,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
18028
18056
|
rerankMs = hrTimeMs(rerankStart);
|
|
18029
18057
|
usedRerank = true;
|
|
18030
18058
|
}
|
|
18031
|
-
|
|
18059
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
18060
|
+
return {
|
|
18061
|
+
q: input.q,
|
|
18062
|
+
scope: resolvedScope.scopeName,
|
|
18063
|
+
results,
|
|
18064
|
+
meta: {
|
|
18065
|
+
timingsMs: {
|
|
18066
|
+
embed: Math.round(embedMs),
|
|
18067
|
+
vector: Math.round(vectorMs),
|
|
18068
|
+
rerank: Math.round(rerankMs),
|
|
18069
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18070
|
+
},
|
|
18071
|
+
usedRerank,
|
|
18072
|
+
modelId: this.config.embeddings.model
|
|
18073
|
+
}
|
|
18074
|
+
};
|
|
18075
|
+
}
|
|
18076
|
+
async *searchStreaming(request) {
|
|
18077
|
+
const parsed = requestSchema.safeParse(request);
|
|
18078
|
+
if (!parsed.success) {
|
|
18079
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
18080
|
+
}
|
|
18081
|
+
const input = parsed.data;
|
|
18082
|
+
const wantsRerank = Boolean(input.rerank);
|
|
18083
|
+
if (!wantsRerank) {
|
|
18084
|
+
const response = await this.search(request);
|
|
18085
|
+
yield { phase: "initial", data: response };
|
|
18086
|
+
return;
|
|
18087
|
+
}
|
|
18088
|
+
const totalStart = process.hrtime.bigint();
|
|
18089
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
18090
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
18091
|
+
const topK = input.topK ?? 10;
|
|
18092
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
18093
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
18094
|
+
const embedStart = process.hrtime.bigint();
|
|
18095
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
18096
|
+
const queryVector = queryEmbeddings[0];
|
|
18097
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
18098
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
18099
|
+
}
|
|
18100
|
+
const embedMs = hrTimeMs(embedStart);
|
|
18101
|
+
const vectorStart = process.hrtime.bigint();
|
|
18102
|
+
const hits = await this.vectorStore.query(
|
|
18103
|
+
queryVector,
|
|
18104
|
+
{
|
|
18105
|
+
topK: candidateK,
|
|
18106
|
+
pathPrefix: input.pathPrefix,
|
|
18107
|
+
tags: input.tags
|
|
18108
|
+
},
|
|
18109
|
+
resolvedScope
|
|
18110
|
+
);
|
|
18111
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
18112
|
+
const ranked = rankHits(hits, this.config);
|
|
18113
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
18114
|
+
yield {
|
|
18115
|
+
phase: "initial",
|
|
18116
|
+
data: {
|
|
18117
|
+
q: input.q,
|
|
18118
|
+
scope: resolvedScope.scopeName,
|
|
18119
|
+
results: initialResults,
|
|
18120
|
+
meta: {
|
|
18121
|
+
timingsMs: {
|
|
18122
|
+
embed: Math.round(embedMs),
|
|
18123
|
+
vector: Math.round(vectorMs),
|
|
18124
|
+
rerank: 0,
|
|
18125
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18126
|
+
},
|
|
18127
|
+
usedRerank: false,
|
|
18128
|
+
modelId: this.config.embeddings.model
|
|
18129
|
+
}
|
|
18130
|
+
}
|
|
18131
|
+
};
|
|
18132
|
+
const rerankStart = process.hrtime.bigint();
|
|
18133
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
18134
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
18135
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
18136
|
+
yield {
|
|
18137
|
+
phase: "reranked",
|
|
18138
|
+
data: {
|
|
18139
|
+
q: input.q,
|
|
18140
|
+
scope: resolvedScope.scopeName,
|
|
18141
|
+
results: rerankedResults,
|
|
18142
|
+
meta: {
|
|
18143
|
+
timingsMs: {
|
|
18144
|
+
embed: Math.round(embedMs),
|
|
18145
|
+
vector: Math.round(vectorMs),
|
|
18146
|
+
rerank: Math.round(rerankMs),
|
|
18147
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18148
|
+
},
|
|
18149
|
+
usedRerank: true,
|
|
18150
|
+
modelId: this.config.embeddings.model
|
|
18151
|
+
}
|
|
18152
|
+
}
|
|
18153
|
+
};
|
|
18154
|
+
}
|
|
18155
|
+
buildResults(ordered, topK, groupByPage) {
|
|
18032
18156
|
const minScore = this.config.ranking.minScore;
|
|
18033
18157
|
if (groupByPage) {
|
|
18034
18158
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -18036,10 +18160,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
18036
18160
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18037
18161
|
}
|
|
18038
18162
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
18039
|
-
|
|
18163
|
+
return pages.slice(0, topK).map((page) => {
|
|
18040
18164
|
const bestScore = page.bestChunk.finalScore;
|
|
18041
|
-
const
|
|
18042
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18165
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18166
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
18043
18167
|
return {
|
|
18044
18168
|
url: page.url,
|
|
18045
18169
|
title: page.title,
|
|
@@ -18056,10 +18180,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
18056
18180
|
};
|
|
18057
18181
|
});
|
|
18058
18182
|
} else {
|
|
18183
|
+
let filtered = ordered;
|
|
18059
18184
|
if (minScore > 0) {
|
|
18060
|
-
|
|
18185
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18061
18186
|
}
|
|
18062
|
-
|
|
18187
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
18063
18188
|
url: hit.metadata.url,
|
|
18064
18189
|
title: hit.metadata.title,
|
|
18065
18190
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -18068,21 +18193,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
18068
18193
|
routeFile: hit.metadata.routeFile
|
|
18069
18194
|
}));
|
|
18070
18195
|
}
|
|
18071
|
-
return {
|
|
18072
|
-
q: input.q,
|
|
18073
|
-
scope: resolvedScope.scopeName,
|
|
18074
|
-
results,
|
|
18075
|
-
meta: {
|
|
18076
|
-
timingsMs: {
|
|
18077
|
-
embed: Math.round(embedMs),
|
|
18078
|
-
vector: Math.round(vectorMs),
|
|
18079
|
-
rerank: Math.round(rerankMs),
|
|
18080
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
18081
|
-
},
|
|
18082
|
-
usedRerank,
|
|
18083
|
-
modelId: this.config.embeddings.model
|
|
18084
|
-
}
|
|
18085
|
-
};
|
|
18086
18196
|
}
|
|
18087
18197
|
async getPage(pathOrUrl, scope) {
|
|
18088
18198
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -18154,6 +18264,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
18154
18264
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18155
18265
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18156
18266
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18267
|
+
const MAX_DOC_CHARS = 2e3;
|
|
18157
18268
|
const pageCandidates = [];
|
|
18158
18269
|
for (const [url, chunks] of pageGroups) {
|
|
18159
18270
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -18173,12 +18284,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
18173
18284
|
}
|
|
18174
18285
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18175
18286
|
parts.push(body);
|
|
18176
|
-
|
|
18287
|
+
let text = parts.join("\n\n");
|
|
18288
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
18289
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
18290
|
+
}
|
|
18291
|
+
pageCandidates.push({ id: url, text });
|
|
18177
18292
|
}
|
|
18293
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
18294
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
18178
18295
|
const reranked = await this.reranker.rerank(
|
|
18179
18296
|
query,
|
|
18180
|
-
|
|
18181
|
-
|
|
18297
|
+
cappedCandidates,
|
|
18298
|
+
maxCandidates
|
|
18182
18299
|
);
|
|
18183
18300
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18184
18301
|
return ranked.map((entry) => {
|
|
@@ -18350,7 +18467,44 @@ function searchsocketHandle(options = {}) {
|
|
|
18350
18467
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
18351
18468
|
}
|
|
18352
18469
|
const engine = await getEngine();
|
|
18353
|
-
const
|
|
18470
|
+
const searchRequest = body;
|
|
18471
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
18472
|
+
const encoder = new TextEncoder();
|
|
18473
|
+
const stream = new ReadableStream({
|
|
18474
|
+
async start(controller) {
|
|
18475
|
+
try {
|
|
18476
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
18477
|
+
const line = JSON.stringify(event2) + "\n";
|
|
18478
|
+
controller.enqueue(encoder.encode(line));
|
|
18479
|
+
}
|
|
18480
|
+
} catch (streamError) {
|
|
18481
|
+
const errorEvent = {
|
|
18482
|
+
phase: "error",
|
|
18483
|
+
data: {
|
|
18484
|
+
error: {
|
|
18485
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
18486
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
18487
|
+
}
|
|
18488
|
+
}
|
|
18489
|
+
};
|
|
18490
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
18491
|
+
} finally {
|
|
18492
|
+
controller.close();
|
|
18493
|
+
}
|
|
18494
|
+
}
|
|
18495
|
+
});
|
|
18496
|
+
return withCors(
|
|
18497
|
+
new Response(stream, {
|
|
18498
|
+
status: 200,
|
|
18499
|
+
headers: {
|
|
18500
|
+
"content-type": "application/x-ndjson"
|
|
18501
|
+
}
|
|
18502
|
+
}),
|
|
18503
|
+
event.request,
|
|
18504
|
+
config
|
|
18505
|
+
);
|
|
18506
|
+
}
|
|
18507
|
+
const result = await engine.search(searchRequest);
|
|
18354
18508
|
return withCors(
|
|
18355
18509
|
new Response(JSON.stringify(result), {
|
|
18356
18510
|
status: 200,
|
|
@@ -19579,6 +19733,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19579
19733
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19580
19734
|
return null;
|
|
19581
19735
|
}
|
|
19736
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19737
|
+
let weight;
|
|
19738
|
+
if (weightRaw !== void 0) {
|
|
19739
|
+
const parsed = Number(weightRaw);
|
|
19740
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19741
|
+
weight = parsed;
|
|
19742
|
+
}
|
|
19743
|
+
}
|
|
19744
|
+
if (weight === 0) {
|
|
19745
|
+
return null;
|
|
19746
|
+
}
|
|
19582
19747
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19583
19748
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19584
19749
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19634,7 +19799,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19634
19799
|
noindex: false,
|
|
19635
19800
|
tags,
|
|
19636
19801
|
description,
|
|
19637
|
-
keywords
|
|
19802
|
+
keywords,
|
|
19803
|
+
weight
|
|
19638
19804
|
};
|
|
19639
19805
|
}
|
|
19640
19806
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19647,6 +19813,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19647
19813
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19648
19814
|
return null;
|
|
19649
19815
|
}
|
|
19816
|
+
let mdWeight;
|
|
19817
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19818
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19819
|
+
mdWeight = rawWeight;
|
|
19820
|
+
}
|
|
19821
|
+
if (mdWeight === 0) {
|
|
19822
|
+
return null;
|
|
19823
|
+
}
|
|
19650
19824
|
const content = parsed.content;
|
|
19651
19825
|
const normalized = normalizeMarkdown(content);
|
|
19652
19826
|
if (!normalizeText(normalized)) {
|
|
@@ -19669,7 +19843,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19669
19843
|
noindex: false,
|
|
19670
19844
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19671
19845
|
description: fmDescription,
|
|
19672
|
-
keywords: fmKeywords
|
|
19846
|
+
keywords: fmKeywords,
|
|
19847
|
+
weight: mdWeight
|
|
19673
19848
|
};
|
|
19674
19849
|
}
|
|
19675
19850
|
function yamlString(value) {
|
|
@@ -19938,15 +20113,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19938
20113
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19939
20114
|
}
|
|
19940
20115
|
function isExcluded(url, patterns) {
|
|
19941
|
-
|
|
19942
|
-
if (pattern.endsWith("/*")) {
|
|
19943
|
-
const prefix = pattern.slice(0, -1);
|
|
19944
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19945
|
-
} else if (url === pattern) {
|
|
19946
|
-
return true;
|
|
19947
|
-
}
|
|
19948
|
-
}
|
|
19949
|
-
return false;
|
|
20116
|
+
return matchUrlPatterns(url, patterns);
|
|
19950
20117
|
}
|
|
19951
20118
|
function findFreePort() {
|
|
19952
20119
|
return new Promise((resolve, reject) => {
|
|
@@ -20362,12 +20529,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20362
20529
|
}
|
|
20363
20530
|
return pages;
|
|
20364
20531
|
}
|
|
20532
|
+
/**
 * Parse robots.txt content and return the rule set that applies to a crawler.
 *
 * Grouping rules implemented here:
 *   - One or more consecutive `User-agent` lines open a group; the `Allow` /
 *     `Disallow` lines that follow belong to every agent of that group.
 *   - A `User-agent` line that appears after rule lines starts a NEW group,
 *     so rules of a later group never leak into agents of an earlier one.
 *   - Unrelated directives (e.g. `Sitemap`, `Crawl-delay`) are ignored and do
 *     not terminate the current group.
 *   - `#` comments and lines without a colon are skipped; rule lines with an
 *     empty value contribute no pattern.
 *
 * @param {string} content - Raw robots.txt text.
 * @param {string} [userAgent="Searchsocket"] - Agent name to look up
 *   (compared case-insensitively, exact match).
 * @returns {{disallow: string[], allow: string[]}} Rules of the group naming
 *   `userAgent` if that group contains at least one rule line; otherwise the
 *   `*` group; otherwise empty rule lists.
 */
function parseRobotsTxt(content, userAgent = "Searchsocket") {
  const agentGroups = new Map();
  // Agents whose group contained at least one Allow/Disallow line, even an
  // empty one ("Disallow:" explicitly means "nothing is disallowed").
  const agentsWithRules = new Set();
  let currentAgents = [];
  let groupHasRules = false;

  for (const rawLine of content.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const colonIdx = line.indexOf(":");
    if (colonIdx === -1) continue;
    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    if (directive === "user-agent") {
      // A user-agent line following rule lines begins a fresh group.
      if (groupHasRules) {
        currentAgents = [];
        groupHasRules = false;
      }
      const agentName = value.toLowerCase();
      if (!currentAgents.includes(agentName)) {
        currentAgents.push(agentName);
      }
      if (!agentGroups.has(agentName)) {
        agentGroups.set(agentName, { disallow: [], allow: [] });
      }
    } else if (directive === "disallow" || directive === "allow") {
      groupHasRules = true;
      for (const agent of currentAgents) {
        agentsWithRules.add(agent);
        if (value) {
          agentGroups.get(agent)[directive].push(value);
        }
      }
    }
    // Any other directive is deliberately ignored so it cannot break a group.
  }

  const agentKey = userAgent.toLowerCase();
  const specific = agentGroups.get(agentKey);
  if (specific && agentsWithRules.has(agentKey)) {
    return specific;
  }
  return agentGroups.get("*") ?? { disallow: [], allow: [] };
}
|
|
20567
|
+
/**
 * Decide whether a URL path is blocked by a parsed robots.txt rule set.
 *
 * The longest matching Disallow pattern is compared with the longest matching
 * Allow pattern; the path is blocked only when a Disallow pattern matches and
 * is strictly longer than every matching Allow pattern (ties favour Allow).
 * Patterns are prefix matches and may use `*` (any character sequence) and a
 * trailing `$` (end of path).
 *
 * @param {string} urlPath - URL path to check.
 * @param {{disallow: string[], allow: string[]}} rules3 - Parsed rule set.
 * @returns {boolean} True when the path must not be indexed.
 */
function isBlockedByRobots(urlPath, rules3) {
  const longestMatchLength = (patterns) => {
    let longest = 0;
    for (const pattern of patterns) {
      if (pattern.length > longest && matchesRobotsPattern(urlPath, pattern)) {
        longest = pattern.length;
      }
    }
    return longest;
  };
  const disallowLength = longestMatchLength(rules3.disallow);
  if (disallowLength === 0) return false;
  return longestMatchLength(rules3.allow) < disallowLength;
}

/**
 * Match a path against one robots.txt pattern without building a RegExp:
 * the pattern is split on `*` and the literal pieces are located left to
 * right, which is sufficient for globs whose only wildcard is `*`.
 */
function matchesRobotsPattern(urlPath, pattern) {
  const anchored = pattern.endsWith("$");
  const pieces = (anchored ? pattern.slice(0, -1) : pattern).split("*");
  // The first literal piece must be a prefix of the path.
  if (!urlPath.startsWith(pieces[0])) return false;
  let cursor = pieces[0].length;
  if (pieces.length === 1) {
    return !anchored || cursor === urlPath.length;
  }
  for (let i = 1; i < pieces.length; i += 1) {
    const piece = pieces[i];
    if (anchored && i === pieces.length - 1) {
      // The final piece of an anchored pattern must sit at the very end.
      return urlPath.length - piece.length >= cursor && urlPath.endsWith(piece);
    }
    const found = urlPath.indexOf(piece, cursor);
    if (found === -1) return false;
    cursor = found + piece.length;
  }
  return true;
}
|
|
20583
|
+
/**
 * Read `robots.txt` from a directory and parse it.
 *
 * @param {string} dir - Directory expected to contain a robots.txt file.
 * @returns {Promise<object|null>} The parsed rules, or null when the file
 *   cannot be read or parsed (any error is treated as "no robots.txt").
 */
async function loadRobotsTxtFromDir(dir) {
  let rules = null;
  try {
    const robotsPath = path.join(dir, "robots.txt");
    const raw = await fs4.readFile(robotsPath, "utf8");
    rules = parseRobotsTxt(raw);
  } catch {
    // Best effort: fall through and report that no rules are available.
  }
  return rules;
}
|
|
20591
|
+
/**
 * Download `/robots.txt` relative to a base URL and parse it.
 *
 * @param {string} baseUrl - Base URL of the site being crawled.
 * @returns {Promise<object|null>} The parsed rules, or null when the request
 *   fails, the response status is not OK, or anything else throws.
 */
async function fetchRobotsTxt(baseUrl) {
  let rules = null;
  try {
    const robotsUrl = new URL("/robots.txt", baseUrl);
    const res = await fetch(robotsUrl.href);
    if (res.ok) {
      rules = parseRobotsTxt(await res.text());
    }
  } catch {
    // Best effort: fall through and report that no rules are available.
  }
  return rules;
}
|
|
20365
20602
|
|
|
20366
20603
|
// src/indexing/pipeline.ts
|
|
20367
20604
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20368
|
-
"jina-embeddings-v3": 2e-5
|
|
20605
|
+
"jina-embeddings-v3": 2e-5,
|
|
20606
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
20369
20607
|
};
|
|
20370
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20608
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
20371
20609
|
var IndexPipeline = class _IndexPipeline {
|
|
20372
20610
|
cwd;
|
|
20373
20611
|
config;
|
|
@@ -20445,6 +20683,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20445
20683
|
}
|
|
20446
20684
|
stageEnd("source", sourceStart);
|
|
20447
20685
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20686
|
+
const filterStart = stageStart();
|
|
20687
|
+
let filteredSourcePages = sourcePages;
|
|
20688
|
+
if (this.config.exclude.length > 0) {
|
|
20689
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20690
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20691
|
+
const url = normalizeUrlPath(p.url);
|
|
20692
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20693
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20694
|
+
return false;
|
|
20695
|
+
}
|
|
20696
|
+
return true;
|
|
20697
|
+
});
|
|
20698
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20699
|
+
if (excludedCount > 0) {
|
|
20700
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20701
|
+
}
|
|
20702
|
+
}
|
|
20703
|
+
if (this.config.respectRobotsTxt) {
|
|
20704
|
+
let robotsRules = null;
|
|
20705
|
+
if (sourceMode === "static-output") {
|
|
20706
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20707
|
+
path.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20708
|
+
);
|
|
20709
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20710
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20711
|
+
path.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20712
|
+
);
|
|
20713
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20714
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20715
|
+
}
|
|
20716
|
+
if (robotsRules) {
|
|
20717
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20718
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20719
|
+
const url = normalizeUrlPath(p.url);
|
|
20720
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20721
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20722
|
+
return false;
|
|
20723
|
+
}
|
|
20724
|
+
return true;
|
|
20725
|
+
});
|
|
20726
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20727
|
+
if (robotsExcluded > 0) {
|
|
20728
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20729
|
+
}
|
|
20730
|
+
}
|
|
20731
|
+
}
|
|
20732
|
+
stageEnd("filter", filterStart);
|
|
20448
20733
|
const routeStart = stageStart();
|
|
20449
20734
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20450
20735
|
stageEnd("route_map", routeStart);
|
|
@@ -20452,7 +20737,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20452
20737
|
const extractStart = stageStart();
|
|
20453
20738
|
this.logger.info("Extracting content...");
|
|
20454
20739
|
const extractedPages = [];
|
|
20455
|
-
for (const sourcePage of
|
|
20740
|
+
for (const sourcePage of filteredSourcePages) {
|
|
20456
20741
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
20457
20742
|
if (!extracted) {
|
|
20458
20743
|
this.logger.warn(
|
|
@@ -20478,16 +20763,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20478
20763
|
seenUrls.add(page.url);
|
|
20479
20764
|
uniquePages.push(page);
|
|
20480
20765
|
}
|
|
20766
|
+
const indexablePages = [];
|
|
20767
|
+
for (const page of uniquePages) {
|
|
20768
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20769
|
+
if (effectiveWeight === 0) {
|
|
20770
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20771
|
+
continue;
|
|
20772
|
+
}
|
|
20773
|
+
indexablePages.push(page);
|
|
20774
|
+
}
|
|
20775
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20776
|
+
if (zeroWeightCount > 0) {
|
|
20777
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20778
|
+
}
|
|
20481
20779
|
stageEnd("extract", extractStart);
|
|
20482
|
-
const skippedPages =
|
|
20483
|
-
this.logger.info(`Extracted ${
|
|
20780
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20781
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20484
20782
|
const linkStart = stageStart();
|
|
20485
|
-
const pageSet = new Set(
|
|
20783
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20486
20784
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20487
|
-
for (const page of
|
|
20785
|
+
for (const page of indexablePages) {
|
|
20488
20786
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20489
20787
|
}
|
|
20490
|
-
for (const page of
|
|
20788
|
+
for (const page of indexablePages) {
|
|
20491
20789
|
for (const outgoing of page.outgoingLinks) {
|
|
20492
20790
|
if (!pageSet.has(outgoing)) {
|
|
20493
20791
|
continue;
|
|
@@ -20511,7 +20809,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20511
20809
|
});
|
|
20512
20810
|
}
|
|
20513
20811
|
}
|
|
20514
|
-
for (const page of
|
|
20812
|
+
for (const page of indexablePages) {
|
|
20515
20813
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20516
20814
|
if (routeMatch.routeResolution === "best-effort") {
|
|
20517
20815
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20797,7 +21095,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20797
21095
|
});
|
|
20798
21096
|
const stats = await pipeline.run({
|
|
20799
21097
|
changedOnly: options.changedOnly ?? true,
|
|
20800
|
-
force: options.force ?? false,
|
|
21098
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20801
21099
|
dryRun: options.dryRun ?? false,
|
|
20802
21100
|
scopeOverride: options.scope,
|
|
20803
21101
|
verbose: options.verbose
|