searchsocket 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -8
- package/dist/cli.js +456 -187
- package/dist/client.cjs +121 -0
- package/dist/client.d.cts +17 -2
- package/dist/client.d.ts +17 -2
- package/dist/client.js +121 -1
- package/dist/index.cjs +590 -169
- package/dist/index.d.cts +6 -4
- package/dist/index.d.ts +6 -4
- package/dist/index.js +590 -170
- package/dist/sveltekit.cjs +380 -82
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +380 -82
- package/dist/{types-BrG6XTUU.d.cts → types-z2dw3H6E.d.cts} +37 -1
- package/dist/{types-BrG6XTUU.d.ts → types-z2dw3H6E.d.ts} +37 -1
- package/package.json +1 -1
package/dist/sveltekit.cjs
CHANGED
|
@@ -5021,32 +5021,32 @@ var require_URL = __commonJS({
|
|
|
5021
5021
|
else
|
|
5022
5022
|
return basepath.substring(0, lastslash + 1) + refpath;
|
|
5023
5023
|
}
|
|
5024
|
-
function remove_dot_segments(
|
|
5025
|
-
if (!
|
|
5024
|
+
function remove_dot_segments(path15) {
|
|
5025
|
+
if (!path15) return path15;
|
|
5026
5026
|
var output = "";
|
|
5027
|
-
while (
|
|
5028
|
-
if (
|
|
5029
|
-
|
|
5027
|
+
while (path15.length > 0) {
|
|
5028
|
+
if (path15 === "." || path15 === "..") {
|
|
5029
|
+
path15 = "";
|
|
5030
5030
|
break;
|
|
5031
5031
|
}
|
|
5032
|
-
var twochars =
|
|
5033
|
-
var threechars =
|
|
5034
|
-
var fourchars =
|
|
5032
|
+
var twochars = path15.substring(0, 2);
|
|
5033
|
+
var threechars = path15.substring(0, 3);
|
|
5034
|
+
var fourchars = path15.substring(0, 4);
|
|
5035
5035
|
if (threechars === "../") {
|
|
5036
|
-
|
|
5036
|
+
path15 = path15.substring(3);
|
|
5037
5037
|
} else if (twochars === "./") {
|
|
5038
|
-
|
|
5038
|
+
path15 = path15.substring(2);
|
|
5039
5039
|
} else if (threechars === "/./") {
|
|
5040
|
-
|
|
5041
|
-
} else if (twochars === "/." &&
|
|
5042
|
-
|
|
5043
|
-
} else if (fourchars === "/../" || threechars === "/.." &&
|
|
5044
|
-
|
|
5040
|
+
path15 = "/" + path15.substring(3);
|
|
5041
|
+
} else if (twochars === "/." && path15.length === 2) {
|
|
5042
|
+
path15 = "/";
|
|
5043
|
+
} else if (fourchars === "/../" || threechars === "/.." && path15.length === 3) {
|
|
5044
|
+
path15 = "/" + path15.substring(4);
|
|
5045
5045
|
output = output.replace(/\/?[^\/]*$/, "");
|
|
5046
5046
|
} else {
|
|
5047
|
-
var segment =
|
|
5047
|
+
var segment = path15.match(/(\/?([^\/]*))/)[0];
|
|
5048
5048
|
output += segment;
|
|
5049
|
-
|
|
5049
|
+
path15 = path15.substring(segment.length);
|
|
5050
5050
|
}
|
|
5051
5051
|
}
|
|
5052
5052
|
return output;
|
|
@@ -16610,6 +16610,8 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16610
16610
|
envVar: zod.z.string().min(1).optional(),
|
|
16611
16611
|
sanitize: zod.z.boolean().optional()
|
|
16612
16612
|
}).optional(),
|
|
16613
|
+
exclude: zod.z.array(zod.z.string()).optional(),
|
|
16614
|
+
respectRobotsTxt: zod.z.boolean().optional(),
|
|
16613
16615
|
source: zod.z.object({
|
|
16614
16616
|
mode: zod.z.enum(["static-output", "crawl", "content-files", "build"]).optional(),
|
|
16615
16617
|
staticOutputDir: zod.z.string().min(1).optional(),
|
|
@@ -16740,6 +16742,8 @@ function createDefaultConfig(projectId) {
|
|
|
16740
16742
|
envVar: "SEARCHSOCKET_SCOPE",
|
|
16741
16743
|
sanitize: true
|
|
16742
16744
|
},
|
|
16745
|
+
exclude: [],
|
|
16746
|
+
respectRobotsTxt: true,
|
|
16743
16747
|
source: {
|
|
16744
16748
|
mode: "static-output",
|
|
16745
16749
|
staticOutputDir: "build",
|
|
@@ -16770,7 +16774,7 @@ function createDefaultConfig(projectId) {
|
|
|
16770
16774
|
},
|
|
16771
16775
|
embeddings: {
|
|
16772
16776
|
provider: "jina",
|
|
16773
|
-
model: "jina-embeddings-
|
|
16777
|
+
model: "jina-embeddings-v5-text-small",
|
|
16774
16778
|
apiKeyEnv: "JINA_API_KEY",
|
|
16775
16779
|
batchSize: 64,
|
|
16776
16780
|
concurrency: 4
|
|
@@ -16783,9 +16787,9 @@ function createDefaultConfig(projectId) {
|
|
|
16783
16787
|
}
|
|
16784
16788
|
},
|
|
16785
16789
|
rerank: {
|
|
16786
|
-
enabled:
|
|
16790
|
+
enabled: true,
|
|
16787
16791
|
topN: 20,
|
|
16788
|
-
model: "jina-reranker-
|
|
16792
|
+
model: "jina-reranker-v3"
|
|
16789
16793
|
},
|
|
16790
16794
|
ranking: {
|
|
16791
16795
|
enableIncomingLinkBoost: true,
|
|
@@ -16904,6 +16908,8 @@ ${issues}`
|
|
|
16904
16908
|
...defaults.scope,
|
|
16905
16909
|
...parsed.scope
|
|
16906
16910
|
},
|
|
16911
|
+
exclude: parsed.exclude ?? defaults.exclude,
|
|
16912
|
+
respectRobotsTxt: parsed.respectRobotsTxt ?? defaults.respectRobotsTxt,
|
|
16907
16913
|
source: {
|
|
16908
16914
|
...defaults.source,
|
|
16909
16915
|
...parsed.source,
|
|
@@ -17254,7 +17260,7 @@ var JinaReranker = class {
|
|
|
17254
17260
|
constructor(options) {
|
|
17255
17261
|
this.apiKey = options.apiKey;
|
|
17256
17262
|
this.model = options.model;
|
|
17257
|
-
this.maxRetries = options.maxRetries ??
|
|
17263
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17258
17264
|
}
|
|
17259
17265
|
async rerank(query, candidates, topN) {
|
|
17260
17266
|
if (candidates.length === 0) {
|
|
@@ -17264,7 +17270,8 @@ var JinaReranker = class {
|
|
|
17264
17270
|
model: this.model,
|
|
17265
17271
|
query,
|
|
17266
17272
|
documents: candidates.map((candidate) => candidate.text),
|
|
17267
|
-
top_n: topN ?? candidates.length
|
|
17273
|
+
top_n: topN ?? candidates.length,
|
|
17274
|
+
return_documents: false
|
|
17268
17275
|
};
|
|
17269
17276
|
let attempt = 0;
|
|
17270
17277
|
while (attempt <= this.maxRetries) {
|
|
@@ -17867,6 +17874,36 @@ async function createVectorStore(config, cwd) {
|
|
|
17867
17874
|
});
|
|
17868
17875
|
}
|
|
17869
17876
|
|
|
17877
|
+
// src/utils/pattern.ts
|
|
17878
|
+
function matchUrlPattern(url, pattern) {
|
|
17879
|
+
const norm = (p) => p !== "/" && p.endsWith("/") ? p.slice(0, -1) : p;
|
|
17880
|
+
const normalizedUrl = norm(url);
|
|
17881
|
+
const normalizedPattern = norm(pattern);
|
|
17882
|
+
if (normalizedPattern.endsWith("/**")) {
|
|
17883
|
+
const prefix = normalizedPattern.slice(0, -3);
|
|
17884
|
+
if (prefix === "") {
|
|
17885
|
+
return true;
|
|
17886
|
+
}
|
|
17887
|
+
return normalizedUrl === prefix || normalizedUrl.startsWith(prefix + "/");
|
|
17888
|
+
}
|
|
17889
|
+
if (normalizedPattern.endsWith("/*")) {
|
|
17890
|
+
const prefix = normalizedPattern.slice(0, -2);
|
|
17891
|
+
if (prefix === "") {
|
|
17892
|
+
return normalizedUrl !== "/" && !normalizedUrl.slice(1).includes("/");
|
|
17893
|
+
}
|
|
17894
|
+
if (!normalizedUrl.startsWith(prefix + "/")) return false;
|
|
17895
|
+
const rest = normalizedUrl.slice(prefix.length + 1);
|
|
17896
|
+
return rest.length > 0 && !rest.includes("/");
|
|
17897
|
+
}
|
|
17898
|
+
return normalizedUrl === normalizedPattern;
|
|
17899
|
+
}
|
|
17900
|
+
function matchUrlPatterns(url, patterns) {
|
|
17901
|
+
for (const pattern of patterns) {
|
|
17902
|
+
if (matchUrlPattern(url, pattern)) return true;
|
|
17903
|
+
}
|
|
17904
|
+
return false;
|
|
17905
|
+
}
|
|
17906
|
+
|
|
17870
17907
|
// src/search/ranking.ts
|
|
17871
17908
|
function nonNegativeOrZero(value) {
|
|
17872
17909
|
if (!Number.isFinite(value)) {
|
|
@@ -17895,21 +17932,11 @@ function rankHits(hits, config) {
|
|
|
17895
17932
|
});
|
|
17896
17933
|
}
|
|
17897
17934
|
function findPageWeight(url, pageWeights) {
|
|
17898
|
-
|
|
17899
|
-
const normalizedUrl = norm(url);
|
|
17900
|
-
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17901
|
-
if (norm(pattern) === normalizedUrl) {
|
|
17902
|
-
return weight;
|
|
17903
|
-
}
|
|
17904
|
-
}
|
|
17905
|
-
let bestPrefix = "";
|
|
17935
|
+
let bestPattern = "";
|
|
17906
17936
|
let bestWeight = 1;
|
|
17907
17937
|
for (const [pattern, weight] of Object.entries(pageWeights)) {
|
|
17908
|
-
|
|
17909
|
-
|
|
17910
|
-
const prefix = `${normalizedPattern}/`;
|
|
17911
|
-
if (normalizedUrl.startsWith(prefix) && prefix.length > bestPrefix.length) {
|
|
17912
|
-
bestPrefix = prefix;
|
|
17938
|
+
if (matchUrlPattern(url, pattern) && pattern.length > bestPattern.length) {
|
|
17939
|
+
bestPattern = pattern;
|
|
17913
17940
|
bestWeight = weight;
|
|
17914
17941
|
}
|
|
17915
17942
|
}
|
|
@@ -17967,7 +17994,8 @@ var requestSchema = zod.z.object({
|
|
|
17967
17994
|
pathPrefix: zod.z.string().optional(),
|
|
17968
17995
|
tags: zod.z.array(zod.z.string()).optional(),
|
|
17969
17996
|
rerank: zod.z.boolean().optional(),
|
|
17970
|
-
groupBy: zod.z.enum(["page", "chunk"]).optional()
|
|
17997
|
+
groupBy: zod.z.enum(["page", "chunk"]).optional(),
|
|
17998
|
+
stream: zod.z.boolean().optional()
|
|
17971
17999
|
});
|
|
17972
18000
|
var SearchEngine = class _SearchEngine {
|
|
17973
18001
|
cwd;
|
|
@@ -18040,7 +18068,103 @@ var SearchEngine = class _SearchEngine {
|
|
|
18040
18068
|
rerankMs = hrTimeMs(rerankStart);
|
|
18041
18069
|
usedRerank = true;
|
|
18042
18070
|
}
|
|
18043
|
-
|
|
18071
|
+
const results = this.buildResults(ordered, topK, groupByPage);
|
|
18072
|
+
return {
|
|
18073
|
+
q: input.q,
|
|
18074
|
+
scope: resolvedScope.scopeName,
|
|
18075
|
+
results,
|
|
18076
|
+
meta: {
|
|
18077
|
+
timingsMs: {
|
|
18078
|
+
embed: Math.round(embedMs),
|
|
18079
|
+
vector: Math.round(vectorMs),
|
|
18080
|
+
rerank: Math.round(rerankMs),
|
|
18081
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18082
|
+
},
|
|
18083
|
+
usedRerank,
|
|
18084
|
+
modelId: this.config.embeddings.model
|
|
18085
|
+
}
|
|
18086
|
+
};
|
|
18087
|
+
}
|
|
18088
|
+
async *searchStreaming(request) {
|
|
18089
|
+
const parsed = requestSchema.safeParse(request);
|
|
18090
|
+
if (!parsed.success) {
|
|
18091
|
+
throw new SearchSocketError("INVALID_REQUEST", parsed.error.issues[0]?.message ?? "Invalid request", 400);
|
|
18092
|
+
}
|
|
18093
|
+
const input = parsed.data;
|
|
18094
|
+
const wantsRerank = Boolean(input.rerank);
|
|
18095
|
+
if (!wantsRerank) {
|
|
18096
|
+
const response = await this.search(request);
|
|
18097
|
+
yield { phase: "initial", data: response };
|
|
18098
|
+
return;
|
|
18099
|
+
}
|
|
18100
|
+
const totalStart = process.hrtime.bigint();
|
|
18101
|
+
const resolvedScope = resolveScope(this.config, input.scope);
|
|
18102
|
+
await this.assertModelCompatibility(resolvedScope);
|
|
18103
|
+
const topK = input.topK ?? 10;
|
|
18104
|
+
const groupByPage = (input.groupBy ?? "page") === "page";
|
|
18105
|
+
const candidateK = groupByPage ? Math.max(topK * 10, 50) : Math.max(50, topK);
|
|
18106
|
+
const embedStart = process.hrtime.bigint();
|
|
18107
|
+
const queryEmbeddings = await this.embeddings.embedTexts([input.q], this.config.embeddings.model, "retrieval.query");
|
|
18108
|
+
const queryVector = queryEmbeddings[0];
|
|
18109
|
+
if (!queryVector || queryVector.length === 0 || queryVector.some((value) => !Number.isFinite(value))) {
|
|
18110
|
+
throw new SearchSocketError("VECTOR_BACKEND_UNAVAILABLE", "Unable to create query embedding.");
|
|
18111
|
+
}
|
|
18112
|
+
const embedMs = hrTimeMs(embedStart);
|
|
18113
|
+
const vectorStart = process.hrtime.bigint();
|
|
18114
|
+
const hits = await this.vectorStore.query(
|
|
18115
|
+
queryVector,
|
|
18116
|
+
{
|
|
18117
|
+
topK: candidateK,
|
|
18118
|
+
pathPrefix: input.pathPrefix,
|
|
18119
|
+
tags: input.tags
|
|
18120
|
+
},
|
|
18121
|
+
resolvedScope
|
|
18122
|
+
);
|
|
18123
|
+
const vectorMs = hrTimeMs(vectorStart);
|
|
18124
|
+
const ranked = rankHits(hits, this.config);
|
|
18125
|
+
const initialResults = this.buildResults(ranked, topK, groupByPage);
|
|
18126
|
+
yield {
|
|
18127
|
+
phase: "initial",
|
|
18128
|
+
data: {
|
|
18129
|
+
q: input.q,
|
|
18130
|
+
scope: resolvedScope.scopeName,
|
|
18131
|
+
results: initialResults,
|
|
18132
|
+
meta: {
|
|
18133
|
+
timingsMs: {
|
|
18134
|
+
embed: Math.round(embedMs),
|
|
18135
|
+
vector: Math.round(vectorMs),
|
|
18136
|
+
rerank: 0,
|
|
18137
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18138
|
+
},
|
|
18139
|
+
usedRerank: false,
|
|
18140
|
+
modelId: this.config.embeddings.model
|
|
18141
|
+
}
|
|
18142
|
+
}
|
|
18143
|
+
};
|
|
18144
|
+
const rerankStart = process.hrtime.bigint();
|
|
18145
|
+
const reranked = await this.rerankHits(input.q, ranked, topK);
|
|
18146
|
+
const rerankMs = hrTimeMs(rerankStart);
|
|
18147
|
+
const rerankedResults = this.buildResults(reranked, topK, groupByPage);
|
|
18148
|
+
yield {
|
|
18149
|
+
phase: "reranked",
|
|
18150
|
+
data: {
|
|
18151
|
+
q: input.q,
|
|
18152
|
+
scope: resolvedScope.scopeName,
|
|
18153
|
+
results: rerankedResults,
|
|
18154
|
+
meta: {
|
|
18155
|
+
timingsMs: {
|
|
18156
|
+
embed: Math.round(embedMs),
|
|
18157
|
+
vector: Math.round(vectorMs),
|
|
18158
|
+
rerank: Math.round(rerankMs),
|
|
18159
|
+
total: Math.round(hrTimeMs(totalStart))
|
|
18160
|
+
},
|
|
18161
|
+
usedRerank: true,
|
|
18162
|
+
modelId: this.config.embeddings.model
|
|
18163
|
+
}
|
|
18164
|
+
}
|
|
18165
|
+
};
|
|
18166
|
+
}
|
|
18167
|
+
buildResults(ordered, topK, groupByPage) {
|
|
18044
18168
|
const minScore = this.config.ranking.minScore;
|
|
18045
18169
|
if (groupByPage) {
|
|
18046
18170
|
let pages = aggregateByPage(ordered, this.config);
|
|
@@ -18048,10 +18172,10 @@ var SearchEngine = class _SearchEngine {
|
|
|
18048
18172
|
pages = pages.filter((p) => p.pageScore >= minScore);
|
|
18049
18173
|
}
|
|
18050
18174
|
const minRatio = this.config.ranking.minChunkScoreRatio;
|
|
18051
|
-
|
|
18175
|
+
return pages.slice(0, topK).map((page) => {
|
|
18052
18176
|
const bestScore = page.bestChunk.finalScore;
|
|
18053
|
-
const
|
|
18054
|
-
const meaningful = page.matchingChunks.filter((c) => c.finalScore >=
|
|
18177
|
+
const minChunkScore = Number.isFinite(bestScore) ? bestScore * minRatio : Number.NEGATIVE_INFINITY;
|
|
18178
|
+
const meaningful = page.matchingChunks.filter((c) => c.finalScore >= minChunkScore).slice(0, 5);
|
|
18055
18179
|
return {
|
|
18056
18180
|
url: page.url,
|
|
18057
18181
|
title: page.title,
|
|
@@ -18068,10 +18192,11 @@ var SearchEngine = class _SearchEngine {
|
|
|
18068
18192
|
};
|
|
18069
18193
|
});
|
|
18070
18194
|
} else {
|
|
18195
|
+
let filtered = ordered;
|
|
18071
18196
|
if (minScore > 0) {
|
|
18072
|
-
|
|
18197
|
+
filtered = ordered.filter((entry) => entry.finalScore >= minScore);
|
|
18073
18198
|
}
|
|
18074
|
-
|
|
18199
|
+
return filtered.slice(0, topK).map(({ hit, finalScore }) => ({
|
|
18075
18200
|
url: hit.metadata.url,
|
|
18076
18201
|
title: hit.metadata.title,
|
|
18077
18202
|
sectionTitle: hit.metadata.sectionTitle || void 0,
|
|
@@ -18080,21 +18205,6 @@ var SearchEngine = class _SearchEngine {
|
|
|
18080
18205
|
routeFile: hit.metadata.routeFile
|
|
18081
18206
|
}));
|
|
18082
18207
|
}
|
|
18083
|
-
return {
|
|
18084
|
-
q: input.q,
|
|
18085
|
-
scope: resolvedScope.scopeName,
|
|
18086
|
-
results,
|
|
18087
|
-
meta: {
|
|
18088
|
-
timingsMs: {
|
|
18089
|
-
embed: Math.round(embedMs),
|
|
18090
|
-
vector: Math.round(vectorMs),
|
|
18091
|
-
rerank: Math.round(rerankMs),
|
|
18092
|
-
total: Math.round(hrTimeMs(totalStart))
|
|
18093
|
-
},
|
|
18094
|
-
usedRerank,
|
|
18095
|
-
modelId: this.config.embeddings.model
|
|
18096
|
-
}
|
|
18097
|
-
};
|
|
18098
18208
|
}
|
|
18099
18209
|
async getPage(pathOrUrl, scope) {
|
|
18100
18210
|
const resolvedScope = resolveScope(this.config, scope);
|
|
@@ -18166,6 +18276,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
18166
18276
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18167
18277
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18168
18278
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18279
|
+
const MAX_DOC_CHARS = 2e3;
|
|
18169
18280
|
const pageCandidates = [];
|
|
18170
18281
|
for (const [url, chunks] of pageGroups) {
|
|
18171
18282
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -18185,12 +18296,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
18185
18296
|
}
|
|
18186
18297
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18187
18298
|
parts.push(body);
|
|
18188
|
-
|
|
18299
|
+
let text = parts.join("\n\n");
|
|
18300
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
18301
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
18302
|
+
}
|
|
18303
|
+
pageCandidates.push({ id: url, text });
|
|
18189
18304
|
}
|
|
18305
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
18306
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
18190
18307
|
const reranked = await this.reranker.rerank(
|
|
18191
18308
|
query,
|
|
18192
|
-
|
|
18193
|
-
|
|
18309
|
+
cappedCandidates,
|
|
18310
|
+
maxCandidates
|
|
18194
18311
|
);
|
|
18195
18312
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18196
18313
|
return ranked.map((entry) => {
|
|
@@ -18362,7 +18479,44 @@ function searchsocketHandle(options = {}) {
|
|
|
18362
18479
|
throw new SearchSocketError("INVALID_REQUEST", "Malformed JSON request body", 400);
|
|
18363
18480
|
}
|
|
18364
18481
|
const engine = await getEngine();
|
|
18365
|
-
const
|
|
18482
|
+
const searchRequest = body;
|
|
18483
|
+
if (searchRequest.stream && searchRequest.rerank) {
|
|
18484
|
+
const encoder = new TextEncoder();
|
|
18485
|
+
const stream = new ReadableStream({
|
|
18486
|
+
async start(controller) {
|
|
18487
|
+
try {
|
|
18488
|
+
for await (const event2 of engine.searchStreaming(searchRequest)) {
|
|
18489
|
+
const line = JSON.stringify(event2) + "\n";
|
|
18490
|
+
controller.enqueue(encoder.encode(line));
|
|
18491
|
+
}
|
|
18492
|
+
} catch (streamError) {
|
|
18493
|
+
const errorEvent = {
|
|
18494
|
+
phase: "error",
|
|
18495
|
+
data: {
|
|
18496
|
+
error: {
|
|
18497
|
+
code: streamError instanceof SearchSocketError ? streamError.code : "INTERNAL_ERROR",
|
|
18498
|
+
message: streamError instanceof Error ? streamError.message : "Unknown error"
|
|
18499
|
+
}
|
|
18500
|
+
}
|
|
18501
|
+
};
|
|
18502
|
+
controller.enqueue(encoder.encode(JSON.stringify(errorEvent) + "\n"));
|
|
18503
|
+
} finally {
|
|
18504
|
+
controller.close();
|
|
18505
|
+
}
|
|
18506
|
+
}
|
|
18507
|
+
});
|
|
18508
|
+
return withCors(
|
|
18509
|
+
new Response(stream, {
|
|
18510
|
+
status: 200,
|
|
18511
|
+
headers: {
|
|
18512
|
+
"content-type": "application/x-ndjson"
|
|
18513
|
+
}
|
|
18514
|
+
}),
|
|
18515
|
+
event.request,
|
|
18516
|
+
config
|
|
18517
|
+
);
|
|
18518
|
+
}
|
|
18519
|
+
const result = await engine.search(searchRequest);
|
|
18366
18520
|
return withCors(
|
|
18367
18521
|
new Response(JSON.stringify(result), {
|
|
18368
18522
|
status: 200,
|
|
@@ -19591,6 +19745,17 @@ function extractFromHtml(url, html, config) {
|
|
|
19591
19745
|
if ($(`[${config.extract.noindexAttr}]`).length > 0) {
|
|
19592
19746
|
return null;
|
|
19593
19747
|
}
|
|
19748
|
+
const weightRaw = $("meta[name='searchsocket-weight']").attr("content")?.trim();
|
|
19749
|
+
let weight;
|
|
19750
|
+
if (weightRaw !== void 0) {
|
|
19751
|
+
const parsed = Number(weightRaw);
|
|
19752
|
+
if (Number.isFinite(parsed) && parsed >= 0) {
|
|
19753
|
+
weight = parsed;
|
|
19754
|
+
}
|
|
19755
|
+
}
|
|
19756
|
+
if (weight === 0) {
|
|
19757
|
+
return null;
|
|
19758
|
+
}
|
|
19594
19759
|
const description = $("meta[name='description']").attr("content")?.trim() || $("meta[property='og:description']").attr("content")?.trim() || void 0;
|
|
19595
19760
|
const keywordsRaw = $("meta[name='keywords']").attr("content")?.trim();
|
|
19596
19761
|
const keywords = keywordsRaw ? keywordsRaw.split(",").map((k) => k.trim()).filter(Boolean) : void 0;
|
|
@@ -19646,7 +19811,8 @@ function extractFromHtml(url, html, config) {
|
|
|
19646
19811
|
noindex: false,
|
|
19647
19812
|
tags,
|
|
19648
19813
|
description,
|
|
19649
|
-
keywords
|
|
19814
|
+
keywords,
|
|
19815
|
+
weight
|
|
19650
19816
|
};
|
|
19651
19817
|
}
|
|
19652
19818
|
function extractFromMarkdown(url, markdown, title) {
|
|
@@ -19659,6 +19825,14 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19659
19825
|
if (frontmatter.noindex === true || searchsocketMeta?.noindex === true) {
|
|
19660
19826
|
return null;
|
|
19661
19827
|
}
|
|
19828
|
+
let mdWeight;
|
|
19829
|
+
const rawWeight = searchsocketMeta?.weight ?? frontmatter.searchsocketWeight;
|
|
19830
|
+
if (typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0) {
|
|
19831
|
+
mdWeight = rawWeight;
|
|
19832
|
+
}
|
|
19833
|
+
if (mdWeight === 0) {
|
|
19834
|
+
return null;
|
|
19835
|
+
}
|
|
19662
19836
|
const content = parsed.content;
|
|
19663
19837
|
const normalized = normalizeMarkdown(content);
|
|
19664
19838
|
if (!normalizeText(normalized)) {
|
|
@@ -19681,7 +19855,8 @@ function extractFromMarkdown(url, markdown, title) {
|
|
|
19681
19855
|
noindex: false,
|
|
19682
19856
|
tags: normalizeUrlPath(url).split("/").filter(Boolean).slice(0, 1),
|
|
19683
19857
|
description: fmDescription,
|
|
19684
|
-
keywords: fmKeywords
|
|
19858
|
+
keywords: fmKeywords,
|
|
19859
|
+
weight: mdWeight
|
|
19685
19860
|
};
|
|
19686
19861
|
}
|
|
19687
19862
|
function yamlString(value) {
|
|
@@ -19950,15 +20125,7 @@ function expandDynamicUrl(url, value) {
|
|
|
19950
20125
|
return url.replace(/\[\[?\.\.\.[^\]]+\]?\]|\[\[[^\]]+\]\]|\[[^\]]+\]/g, value);
|
|
19951
20126
|
}
|
|
19952
20127
|
function isExcluded(url, patterns) {
|
|
19953
|
-
|
|
19954
|
-
if (pattern.endsWith("/*")) {
|
|
19955
|
-
const prefix = pattern.slice(0, -1);
|
|
19956
|
-
if (url.startsWith(prefix) || url === prefix.slice(0, -1)) return true;
|
|
19957
|
-
} else if (url === pattern) {
|
|
19958
|
-
return true;
|
|
19959
|
-
}
|
|
19960
|
-
}
|
|
19961
|
-
return false;
|
|
20128
|
+
return matchUrlPatterns(url, patterns);
|
|
19962
20129
|
}
|
|
19963
20130
|
function findFreePort() {
|
|
19964
20131
|
return new Promise((resolve, reject) => {
|
|
@@ -20374,12 +20541,83 @@ async function loadStaticOutputPages(cwd, config, maxPages) {
|
|
|
20374
20541
|
}
|
|
20375
20542
|
return pages;
|
|
20376
20543
|
}
|
|
20544
|
+
function parseRobotsTxt(content, userAgent = "Searchsocket") {
|
|
20545
|
+
const lines = content.split(/\r?\n/);
|
|
20546
|
+
const agentGroups = /* @__PURE__ */ new Map();
|
|
20547
|
+
let currentAgents = [];
|
|
20548
|
+
for (const rawLine of lines) {
|
|
20549
|
+
const line = rawLine.replace(/#.*$/, "").trim();
|
|
20550
|
+
if (!line) continue;
|
|
20551
|
+
const colonIdx = line.indexOf(":");
|
|
20552
|
+
if (colonIdx === -1) continue;
|
|
20553
|
+
const directive = line.slice(0, colonIdx).trim().toLowerCase();
|
|
20554
|
+
const value = line.slice(colonIdx + 1).trim();
|
|
20555
|
+
if (directive === "user-agent") {
|
|
20556
|
+
const agentName = value.toLowerCase();
|
|
20557
|
+
currentAgents.push(agentName);
|
|
20558
|
+
if (!agentGroups.has(agentName)) {
|
|
20559
|
+
agentGroups.set(agentName, { disallow: [], allow: [] });
|
|
20560
|
+
}
|
|
20561
|
+
} else if (directive === "disallow" && value && currentAgents.length > 0) {
|
|
20562
|
+
for (const agent of currentAgents) {
|
|
20563
|
+
agentGroups.get(agent).disallow.push(value);
|
|
20564
|
+
}
|
|
20565
|
+
} else if (directive === "allow" && value && currentAgents.length > 0) {
|
|
20566
|
+
for (const agent of currentAgents) {
|
|
20567
|
+
agentGroups.get(agent).allow.push(value);
|
|
20568
|
+
}
|
|
20569
|
+
} else if (directive !== "disallow" && directive !== "allow") {
|
|
20570
|
+
currentAgents = [];
|
|
20571
|
+
}
|
|
20572
|
+
}
|
|
20573
|
+
const specific = agentGroups.get(userAgent.toLowerCase());
|
|
20574
|
+
if (specific && (specific.disallow.length > 0 || specific.allow.length > 0)) {
|
|
20575
|
+
return specific;
|
|
20576
|
+
}
|
|
20577
|
+
return agentGroups.get("*") ?? { disallow: [], allow: [] };
|
|
20578
|
+
}
|
|
20579
|
+
function isBlockedByRobots(urlPath, rules3) {
|
|
20580
|
+
let longestDisallow = "";
|
|
20581
|
+
for (const pattern of rules3.disallow) {
|
|
20582
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestDisallow.length) {
|
|
20583
|
+
longestDisallow = pattern;
|
|
20584
|
+
}
|
|
20585
|
+
}
|
|
20586
|
+
if (!longestDisallow) return false;
|
|
20587
|
+
let longestAllow = "";
|
|
20588
|
+
for (const pattern of rules3.allow) {
|
|
20589
|
+
if (urlPath.startsWith(pattern) && pattern.length > longestAllow.length) {
|
|
20590
|
+
longestAllow = pattern;
|
|
20591
|
+
}
|
|
20592
|
+
}
|
|
20593
|
+
return longestAllow.length < longestDisallow.length;
|
|
20594
|
+
}
|
|
20595
|
+
async function loadRobotsTxtFromDir(dir) {
|
|
20596
|
+
try {
|
|
20597
|
+
const content = await fs4__default.default.readFile(path__default.default.join(dir, "robots.txt"), "utf8");
|
|
20598
|
+
return parseRobotsTxt(content);
|
|
20599
|
+
} catch {
|
|
20600
|
+
return null;
|
|
20601
|
+
}
|
|
20602
|
+
}
|
|
20603
|
+
async function fetchRobotsTxt(baseUrl) {
|
|
20604
|
+
try {
|
|
20605
|
+
const url = new URL("/robots.txt", baseUrl).href;
|
|
20606
|
+
const response = await fetch(url);
|
|
20607
|
+
if (!response.ok) return null;
|
|
20608
|
+
const content = await response.text();
|
|
20609
|
+
return parseRobotsTxt(content);
|
|
20610
|
+
} catch {
|
|
20611
|
+
return null;
|
|
20612
|
+
}
|
|
20613
|
+
}
|
|
20377
20614
|
|
|
20378
20615
|
// src/indexing/pipeline.ts
|
|
20379
20616
|
var EMBEDDING_PRICE_PER_1K_TOKENS_USD = {
|
|
20380
|
-
"jina-embeddings-v3": 2e-5
|
|
20617
|
+
"jina-embeddings-v3": 2e-5,
|
|
20618
|
+
"jina-embeddings-v5-text-small": 5e-5
|
|
20381
20619
|
};
|
|
20382
|
-
var DEFAULT_EMBEDDING_PRICE_PER_1K =
|
|
20620
|
+
var DEFAULT_EMBEDDING_PRICE_PER_1K = 5e-5;
|
|
20383
20621
|
var IndexPipeline = class _IndexPipeline {
|
|
20384
20622
|
cwd;
|
|
20385
20623
|
config;
|
|
@@ -20457,6 +20695,53 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20457
20695
|
}
|
|
20458
20696
|
stageEnd("source", sourceStart);
|
|
20459
20697
|
this.logger.info(`Loaded ${sourcePages.length} page${sourcePages.length === 1 ? "" : "s"} (${stageTimingsMs["source"]}ms)`);
|
|
20698
|
+
const filterStart = stageStart();
|
|
20699
|
+
let filteredSourcePages = sourcePages;
|
|
20700
|
+
if (this.config.exclude.length > 0) {
|
|
20701
|
+
const beforeExclude = filteredSourcePages.length;
|
|
20702
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20703
|
+
const url = normalizeUrlPath(p.url);
|
|
20704
|
+
if (matchUrlPatterns(url, this.config.exclude)) {
|
|
20705
|
+
this.logger.debug(`Excluding ${url} (matched exclude pattern)`);
|
|
20706
|
+
return false;
|
|
20707
|
+
}
|
|
20708
|
+
return true;
|
|
20709
|
+
});
|
|
20710
|
+
const excludedCount = beforeExclude - filteredSourcePages.length;
|
|
20711
|
+
if (excludedCount > 0) {
|
|
20712
|
+
this.logger.info(`Excluded ${excludedCount} page${excludedCount === 1 ? "" : "s"} by config exclude patterns`);
|
|
20713
|
+
}
|
|
20714
|
+
}
|
|
20715
|
+
if (this.config.respectRobotsTxt) {
|
|
20716
|
+
let robotsRules = null;
|
|
20717
|
+
if (sourceMode === "static-output") {
|
|
20718
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20719
|
+
path__default.default.resolve(this.cwd, this.config.source.staticOutputDir)
|
|
20720
|
+
);
|
|
20721
|
+
} else if (sourceMode === "build" && this.config.source.build) {
|
|
20722
|
+
robotsRules = await loadRobotsTxtFromDir(
|
|
20723
|
+
path__default.default.resolve(this.cwd, this.config.source.build.outputDir)
|
|
20724
|
+
);
|
|
20725
|
+
} else if (sourceMode === "crawl" && this.config.source.crawl) {
|
|
20726
|
+
robotsRules = await fetchRobotsTxt(this.config.source.crawl.baseUrl);
|
|
20727
|
+
}
|
|
20728
|
+
if (robotsRules) {
|
|
20729
|
+
const beforeRobots = filteredSourcePages.length;
|
|
20730
|
+
filteredSourcePages = filteredSourcePages.filter((p) => {
|
|
20731
|
+
const url = normalizeUrlPath(p.url);
|
|
20732
|
+
if (isBlockedByRobots(url, robotsRules)) {
|
|
20733
|
+
this.logger.debug(`Excluding ${url} (blocked by robots.txt)`);
|
|
20734
|
+
return false;
|
|
20735
|
+
}
|
|
20736
|
+
return true;
|
|
20737
|
+
});
|
|
20738
|
+
const robotsExcluded = beforeRobots - filteredSourcePages.length;
|
|
20739
|
+
if (robotsExcluded > 0) {
|
|
20740
|
+
this.logger.info(`Excluded ${robotsExcluded} page${robotsExcluded === 1 ? "" : "s"} by robots.txt`);
|
|
20741
|
+
}
|
|
20742
|
+
}
|
|
20743
|
+
}
|
|
20744
|
+
stageEnd("filter", filterStart);
|
|
20460
20745
|
const routeStart = stageStart();
|
|
20461
20746
|
const routePatterns = await buildRoutePatterns(this.cwd);
|
|
20462
20747
|
stageEnd("route_map", routeStart);
|
|
@@ -20464,7 +20749,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20464
20749
|
const extractStart = stageStart();
|
|
20465
20750
|
this.logger.info("Extracting content...");
|
|
20466
20751
|
const extractedPages = [];
|
|
20467
|
-
for (const sourcePage of
|
|
20752
|
+
for (const sourcePage of filteredSourcePages) {
|
|
20468
20753
|
const extracted = sourcePage.html ? extractFromHtml(sourcePage.url, sourcePage.html, this.config) : extractFromMarkdown(sourcePage.url, sourcePage.markdown ?? "", sourcePage.title);
|
|
20469
20754
|
if (!extracted) {
|
|
20470
20755
|
this.logger.warn(
|
|
@@ -20490,16 +20775,29 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20490
20775
|
seenUrls.add(page.url);
|
|
20491
20776
|
uniquePages.push(page);
|
|
20492
20777
|
}
|
|
20778
|
+
const indexablePages = [];
|
|
20779
|
+
for (const page of uniquePages) {
|
|
20780
|
+
const effectiveWeight = page.weight ?? findPageWeight(page.url, this.config.ranking.pageWeights);
|
|
20781
|
+
if (effectiveWeight === 0) {
|
|
20782
|
+
this.logger.debug(`Excluding ${page.url} (zero weight)`);
|
|
20783
|
+
continue;
|
|
20784
|
+
}
|
|
20785
|
+
indexablePages.push(page);
|
|
20786
|
+
}
|
|
20787
|
+
const zeroWeightCount = uniquePages.length - indexablePages.length;
|
|
20788
|
+
if (zeroWeightCount > 0) {
|
|
20789
|
+
this.logger.info(`Excluded ${zeroWeightCount} page${zeroWeightCount === 1 ? "" : "s"} with zero weight`);
|
|
20790
|
+
}
|
|
20493
20791
|
stageEnd("extract", extractStart);
|
|
20494
|
-
const skippedPages =
|
|
20495
|
-
this.logger.info(`Extracted ${
|
|
20792
|
+
const skippedPages = filteredSourcePages.length - indexablePages.length;
|
|
20793
|
+
this.logger.info(`Extracted ${indexablePages.length} page${indexablePages.length === 1 ? "" : "s"}${skippedPages > 0 ? ` (${skippedPages} skipped)` : ""} (${stageTimingsMs["extract"]}ms)`);
|
|
20496
20794
|
const linkStart = stageStart();
|
|
20497
|
-
const pageSet = new Set(
|
|
20795
|
+
const pageSet = new Set(indexablePages.map((page) => normalizeUrlPath(page.url)));
|
|
20498
20796
|
const incomingLinkCount = /* @__PURE__ */ new Map();
|
|
20499
|
-
for (const page of
|
|
20797
|
+
for (const page of indexablePages) {
|
|
20500
20798
|
incomingLinkCount.set(page.url, incomingLinkCount.get(page.url) ?? 0);
|
|
20501
20799
|
}
|
|
20502
|
-
for (const page of
|
|
20800
|
+
for (const page of indexablePages) {
|
|
20503
20801
|
for (const outgoing of page.outgoingLinks) {
|
|
20504
20802
|
if (!pageSet.has(outgoing)) {
|
|
20505
20803
|
continue;
|
|
@@ -20523,7 +20821,7 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20523
20821
|
});
|
|
20524
20822
|
}
|
|
20525
20823
|
}
|
|
20526
|
-
for (const page of
|
|
20824
|
+
for (const page of indexablePages) {
|
|
20527
20825
|
const routeMatch = precomputedRoutes.get(normalizeUrlPath(page.url)) ?? mapUrlToRoute(page.url, routePatterns);
|
|
20528
20826
|
if (routeMatch.routeResolution === "best-effort") {
|
|
20529
20827
|
if (this.config.source.strictRouteMapping) {
|
|
@@ -20809,7 +21107,7 @@ function searchsocketVitePlugin(options = {}) {
|
|
|
20809
21107
|
});
|
|
20810
21108
|
const stats = await pipeline.run({
|
|
20811
21109
|
changedOnly: options.changedOnly ?? true,
|
|
20812
|
-
force: options.force ?? false,
|
|
21110
|
+
force: (options.force ?? false) || /^(1|true|yes)$/i.test(process.env.SEARCHSOCKET_FORCE_REINDEX ?? ""),
|
|
20813
21111
|
dryRun: options.dryRun ?? false,
|
|
20814
21112
|
scopeOverride: options.scope,
|
|
20815
21113
|
verbose: options.verbose
|
package/dist/sveltekit.d.cts
CHANGED