searchsocket 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +17 -8
- package/dist/index.cjs +16 -7
- package/dist/index.js +16 -7
- package/dist/sveltekit.cjs +16 -7
- package/dist/sveltekit.js +16 -7
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.3.1",
|
|
15
|
+
version: "0.3.3",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -189,7 +189,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
189
189
|
ranking: z.object({
|
|
190
190
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
191
191
|
enableDepthBoost: z.boolean().optional(),
|
|
192
|
-
pageWeights: z.record(z.string(), z.number().
|
|
192
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
193
193
|
aggregationCap: z.number().int().positive().optional(),
|
|
194
194
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
195
195
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -1700,7 +1700,7 @@ function extractFromHtml(url, html, config) {
|
|
|
1700
1700
|
const $ = load(html);
|
|
1701
1701
|
const normalizedUrl = normalizeUrlPath(url);
|
|
1702
1702
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
1703
|
-
const title =
|
|
1703
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
1704
1704
|
if (config.extract.respectRobotsNoindex) {
|
|
1705
1705
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
1706
1706
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -2854,7 +2854,7 @@ var JinaReranker = class {
|
|
|
2854
2854
|
constructor(options) {
|
|
2855
2855
|
this.apiKey = options.apiKey;
|
|
2856
2856
|
this.model = options.model;
|
|
2857
|
-
this.maxRetries = options.maxRetries ??
|
|
2857
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
2858
2858
|
}
|
|
2859
2859
|
async rerank(query, candidates, topN) {
|
|
2860
2860
|
if (candidates.length === 0) {
|
|
@@ -2864,7 +2864,8 @@ var JinaReranker = class {
|
|
|
2864
2864
|
model: this.model,
|
|
2865
2865
|
query,
|
|
2866
2866
|
documents: candidates.map((candidate) => candidate.text),
|
|
2867
|
-
top_n: topN ?? candidates.length
|
|
2867
|
+
top_n: topN ?? candidates.length,
|
|
2868
|
+
return_documents: false
|
|
2868
2869
|
};
|
|
2869
2870
|
let attempt = 0;
|
|
2870
2871
|
while (attempt <= this.maxRetries) {
|
|
@@ -3010,6 +3011,7 @@ function aggregateByPage(ranked, config) {
|
|
|
3010
3011
|
}
|
|
3011
3012
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
3012
3013
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
3014
|
+
if (pageWeight === 0) continue;
|
|
3013
3015
|
if (pageWeight !== 1) {
|
|
3014
3016
|
pageScore *= pageWeight;
|
|
3015
3017
|
}
|
|
@@ -3235,6 +3237,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
3235
3237
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3236
3238
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3237
3239
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3240
|
+
const MAX_DOC_CHARS = 2e3;
|
|
3238
3241
|
const pageCandidates = [];
|
|
3239
3242
|
for (const [url, chunks] of pageGroups) {
|
|
3240
3243
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -3254,12 +3257,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
3254
3257
|
}
|
|
3255
3258
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3256
3259
|
parts.push(body);
|
|
3257
|
-
|
|
3260
|
+
let text = parts.join("\n\n");
|
|
3261
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
3262
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
3263
|
+
}
|
|
3264
|
+
pageCandidates.push({ id: url, text });
|
|
3258
3265
|
}
|
|
3266
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
3267
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
3259
3268
|
const reranked = await this.reranker.rerank(
|
|
3260
3269
|
query,
|
|
3261
|
-
|
|
3262
|
-
|
|
3270
|
+
cappedCandidates,
|
|
3271
|
+
maxCandidates
|
|
3263
3272
|
);
|
|
3264
3273
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
3265
3274
|
return ranked.map((entry) => {
|
package/dist/index.cjs
CHANGED
|
@@ -16688,7 +16688,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16688
16688
|
ranking: zod.z.object({
|
|
16689
16689
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16690
16690
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16691
|
-
pageWeights: zod.z.record(zod.z.string(), zod.z.number().
|
|
16691
|
+
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16692
16692
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16693
16693
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16694
16694
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
@@ -17273,7 +17273,7 @@ var JinaReranker = class {
|
|
|
17273
17273
|
constructor(options) {
|
|
17274
17274
|
this.apiKey = options.apiKey;
|
|
17275
17275
|
this.model = options.model;
|
|
17276
|
-
this.maxRetries = options.maxRetries ??
|
|
17276
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17277
17277
|
}
|
|
17278
17278
|
async rerank(query, candidates, topN) {
|
|
17279
17279
|
if (candidates.length === 0) {
|
|
@@ -17283,7 +17283,8 @@ var JinaReranker = class {
|
|
|
17283
17283
|
model: this.model,
|
|
17284
17284
|
query,
|
|
17285
17285
|
documents: candidates.map((candidate) => candidate.text),
|
|
17286
|
-
top_n: topN ?? candidates.length
|
|
17286
|
+
top_n: topN ?? candidates.length,
|
|
17287
|
+
return_documents: false
|
|
17287
17288
|
};
|
|
17288
17289
|
let attempt = 0;
|
|
17289
17290
|
while (attempt <= this.maxRetries) {
|
|
@@ -19038,7 +19039,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19038
19039
|
const $ = cheerio.load(html);
|
|
19039
19040
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19040
19041
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19041
|
-
const title =
|
|
19042
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19042
19043
|
if (config.extract.respectRobotsNoindex) {
|
|
19043
19044
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19044
19045
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20279,6 +20280,7 @@ function aggregateByPage(ranked, config) {
|
|
|
20279
20280
|
}
|
|
20280
20281
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20281
20282
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20283
|
+
if (pageWeight === 0) continue;
|
|
20282
20284
|
if (pageWeight !== 1) {
|
|
20283
20285
|
pageScore *= pageWeight;
|
|
20284
20286
|
}
|
|
@@ -20504,6 +20506,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20504
20506
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20505
20507
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20506
20508
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20509
|
+
const MAX_DOC_CHARS = 2e3;
|
|
20507
20510
|
const pageCandidates = [];
|
|
20508
20511
|
for (const [url, chunks] of pageGroups) {
|
|
20509
20512
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -20523,12 +20526,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
20523
20526
|
}
|
|
20524
20527
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20525
20528
|
parts.push(body);
|
|
20526
|
-
|
|
20529
|
+
let text = parts.join("\n\n");
|
|
20530
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
20531
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
20532
|
+
}
|
|
20533
|
+
pageCandidates.push({ id: url, text });
|
|
20527
20534
|
}
|
|
20535
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20536
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20528
20537
|
const reranked = await this.reranker.rerank(
|
|
20529
20538
|
query,
|
|
20530
|
-
|
|
20531
|
-
|
|
20539
|
+
cappedCandidates,
|
|
20540
|
+
maxCandidates
|
|
20532
20541
|
);
|
|
20533
20542
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20534
20543
|
return ranked.map((entry) => {
|
package/dist/index.js
CHANGED
|
@@ -16676,7 +16676,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16676
16676
|
ranking: z.object({
|
|
16677
16677
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16678
16678
|
enableDepthBoost: z.boolean().optional(),
|
|
16679
|
-
pageWeights: z.record(z.string(), z.number().
|
|
16679
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16680
16680
|
aggregationCap: z.number().int().positive().optional(),
|
|
16681
16681
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16682
16682
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -17261,7 +17261,7 @@ var JinaReranker = class {
|
|
|
17261
17261
|
constructor(options) {
|
|
17262
17262
|
this.apiKey = options.apiKey;
|
|
17263
17263
|
this.model = options.model;
|
|
17264
|
-
this.maxRetries = options.maxRetries ??
|
|
17264
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17265
17265
|
}
|
|
17266
17266
|
async rerank(query, candidates, topN) {
|
|
17267
17267
|
if (candidates.length === 0) {
|
|
@@ -17271,7 +17271,8 @@ var JinaReranker = class {
|
|
|
17271
17271
|
model: this.model,
|
|
17272
17272
|
query,
|
|
17273
17273
|
documents: candidates.map((candidate) => candidate.text),
|
|
17274
|
-
top_n: topN ?? candidates.length
|
|
17274
|
+
top_n: topN ?? candidates.length,
|
|
17275
|
+
return_documents: false
|
|
17275
17276
|
};
|
|
17276
17277
|
let attempt = 0;
|
|
17277
17278
|
while (attempt <= this.maxRetries) {
|
|
@@ -19026,7 +19027,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19026
19027
|
const $ = load(html);
|
|
19027
19028
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19028
19029
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19029
|
-
const title =
|
|
19030
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19030
19031
|
if (config.extract.respectRobotsNoindex) {
|
|
19031
19032
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19032
19033
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20267,6 +20268,7 @@ function aggregateByPage(ranked, config) {
|
|
|
20267
20268
|
}
|
|
20268
20269
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20269
20270
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20271
|
+
if (pageWeight === 0) continue;
|
|
20270
20272
|
if (pageWeight !== 1) {
|
|
20271
20273
|
pageScore *= pageWeight;
|
|
20272
20274
|
}
|
|
@@ -20492,6 +20494,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
20492
20494
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20493
20495
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20494
20496
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20497
|
+
const MAX_DOC_CHARS = 2e3;
|
|
20495
20498
|
const pageCandidates = [];
|
|
20496
20499
|
for (const [url, chunks] of pageGroups) {
|
|
20497
20500
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -20511,12 +20514,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
20511
20514
|
}
|
|
20512
20515
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20513
20516
|
parts.push(body);
|
|
20514
|
-
|
|
20517
|
+
let text = parts.join("\n\n");
|
|
20518
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
20519
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
20520
|
+
}
|
|
20521
|
+
pageCandidates.push({ id: url, text });
|
|
20515
20522
|
}
|
|
20523
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
20524
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
20516
20525
|
const reranked = await this.reranker.rerank(
|
|
20517
20526
|
query,
|
|
20518
|
-
|
|
20519
|
-
|
|
20527
|
+
cappedCandidates,
|
|
20528
|
+
maxCandidates
|
|
20520
20529
|
);
|
|
20521
20530
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
20522
20531
|
return ranked.map((entry) => {
|
package/dist/sveltekit.cjs
CHANGED
|
@@ -16684,7 +16684,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16684
16684
|
ranking: zod.z.object({
|
|
16685
16685
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16686
16686
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16687
|
-
pageWeights: zod.z.record(zod.z.string(), zod.z.number().
|
|
16687
|
+
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16688
16688
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16689
16689
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16690
16690
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
@@ -17254,7 +17254,7 @@ var JinaReranker = class {
|
|
|
17254
17254
|
constructor(options) {
|
|
17255
17255
|
this.apiKey = options.apiKey;
|
|
17256
17256
|
this.model = options.model;
|
|
17257
|
-
this.maxRetries = options.maxRetries ??
|
|
17257
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17258
17258
|
}
|
|
17259
17259
|
async rerank(query, candidates, topN) {
|
|
17260
17260
|
if (candidates.length === 0) {
|
|
@@ -17264,7 +17264,8 @@ var JinaReranker = class {
|
|
|
17264
17264
|
model: this.model,
|
|
17265
17265
|
query,
|
|
17266
17266
|
documents: candidates.map((candidate) => candidate.text),
|
|
17267
|
-
top_n: topN ?? candidates.length
|
|
17267
|
+
top_n: topN ?? candidates.length,
|
|
17268
|
+
return_documents: false
|
|
17268
17269
|
};
|
|
17269
17270
|
let attempt = 0;
|
|
17270
17271
|
while (attempt <= this.maxRetries) {
|
|
@@ -17940,6 +17941,7 @@ function aggregateByPage(ranked, config) {
|
|
|
17940
17941
|
}
|
|
17941
17942
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
17942
17943
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
17944
|
+
if (pageWeight === 0) continue;
|
|
17943
17945
|
if (pageWeight !== 1) {
|
|
17944
17946
|
pageScore *= pageWeight;
|
|
17945
17947
|
}
|
|
@@ -18165,6 +18167,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
18165
18167
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18166
18168
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18167
18169
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18170
|
+
const MAX_DOC_CHARS = 2e3;
|
|
18168
18171
|
const pageCandidates = [];
|
|
18169
18172
|
for (const [url, chunks] of pageGroups) {
|
|
18170
18173
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -18184,12 +18187,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
18184
18187
|
}
|
|
18185
18188
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18186
18189
|
parts.push(body);
|
|
18187
|
-
|
|
18190
|
+
let text = parts.join("\n\n");
|
|
18191
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
18192
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
18193
|
+
}
|
|
18194
|
+
pageCandidates.push({ id: url, text });
|
|
18188
18195
|
}
|
|
18196
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
18197
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
18189
18198
|
const reranked = await this.reranker.rerank(
|
|
18190
18199
|
query,
|
|
18191
|
-
|
|
18192
|
-
|
|
18200
|
+
cappedCandidates,
|
|
18201
|
+
maxCandidates
|
|
18193
18202
|
);
|
|
18194
18203
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18195
18204
|
return ranked.map((entry) => {
|
|
@@ -19580,7 +19589,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19580
19589
|
const $ = cheerio.load(html);
|
|
19581
19590
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19582
19591
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19583
|
-
const title =
|
|
19592
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19584
19593
|
if (config.extract.respectRobotsNoindex) {
|
|
19585
19594
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19586
19595
|
if (/\bnoindex\b/i.test(robots)) {
|
package/dist/sveltekit.js
CHANGED
|
@@ -16672,7 +16672,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16672
16672
|
ranking: z.object({
|
|
16673
16673
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16674
16674
|
enableDepthBoost: z.boolean().optional(),
|
|
16675
|
-
pageWeights: z.record(z.string(), z.number().
|
|
16675
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16676
16676
|
aggregationCap: z.number().int().positive().optional(),
|
|
16677
16677
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16678
16678
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -17242,7 +17242,7 @@ var JinaReranker = class {
|
|
|
17242
17242
|
constructor(options) {
|
|
17243
17243
|
this.apiKey = options.apiKey;
|
|
17244
17244
|
this.model = options.model;
|
|
17245
|
-
this.maxRetries = options.maxRetries ??
|
|
17245
|
+
this.maxRetries = options.maxRetries ?? 2;
|
|
17246
17246
|
}
|
|
17247
17247
|
async rerank(query, candidates, topN) {
|
|
17248
17248
|
if (candidates.length === 0) {
|
|
@@ -17252,7 +17252,8 @@ var JinaReranker = class {
|
|
|
17252
17252
|
model: this.model,
|
|
17253
17253
|
query,
|
|
17254
17254
|
documents: candidates.map((candidate) => candidate.text),
|
|
17255
|
-
top_n: topN ?? candidates.length
|
|
17255
|
+
top_n: topN ?? candidates.length,
|
|
17256
|
+
return_documents: false
|
|
17256
17257
|
};
|
|
17257
17258
|
let attempt = 0;
|
|
17258
17259
|
while (attempt <= this.maxRetries) {
|
|
@@ -17928,6 +17929,7 @@ function aggregateByPage(ranked, config) {
|
|
|
17928
17929
|
}
|
|
17929
17930
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
17930
17931
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
17932
|
+
if (pageWeight === 0) continue;
|
|
17931
17933
|
if (pageWeight !== 1) {
|
|
17932
17934
|
pageScore *= pageWeight;
|
|
17933
17935
|
}
|
|
@@ -18153,6 +18155,7 @@ var SearchEngine = class _SearchEngine {
|
|
|
18153
18155
|
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18154
18156
|
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18155
18157
|
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18158
|
+
const MAX_DOC_CHARS = 2e3;
|
|
18156
18159
|
const pageCandidates = [];
|
|
18157
18160
|
for (const [url, chunks] of pageGroups) {
|
|
18158
18161
|
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
@@ -18172,12 +18175,18 @@ var SearchEngine = class _SearchEngine {
|
|
|
18172
18175
|
}
|
|
18173
18176
|
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18174
18177
|
parts.push(body);
|
|
18175
|
-
|
|
18178
|
+
let text = parts.join("\n\n");
|
|
18179
|
+
if (text.length > MAX_DOC_CHARS) {
|
|
18180
|
+
text = text.slice(0, MAX_DOC_CHARS);
|
|
18181
|
+
}
|
|
18182
|
+
pageCandidates.push({ id: url, text });
|
|
18176
18183
|
}
|
|
18184
|
+
const maxCandidates = Math.max(topK, this.config.rerank.topN);
|
|
18185
|
+
const cappedCandidates = pageCandidates.slice(0, maxCandidates);
|
|
18177
18186
|
const reranked = await this.reranker.rerank(
|
|
18178
18187
|
query,
|
|
18179
|
-
|
|
18180
|
-
|
|
18188
|
+
cappedCandidates,
|
|
18189
|
+
maxCandidates
|
|
18181
18190
|
);
|
|
18182
18191
|
const scoreByUrl = new Map(reranked.map((e) => [e.id, e.score]));
|
|
18183
18192
|
return ranked.map((entry) => {
|
|
@@ -19568,7 +19577,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19568
19577
|
const $ = load(html);
|
|
19569
19578
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19570
19579
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19571
|
-
const title =
|
|
19580
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19572
19581
|
if (config.extract.respectRobotsNoindex) {
|
|
19573
19582
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19574
19583
|
if (/\bnoindex\b/i.test(robots)) {
|