searchsocket 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +50 -30
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +49 -29
- package/dist/index.d.cts +2 -2
- package/dist/index.d.ts +2 -2
- package/dist/index.js +49 -29
- package/dist/sveltekit.cjs +49 -29
- package/dist/sveltekit.d.cts +1 -1
- package/dist/sveltekit.d.ts +1 -1
- package/dist/sveltekit.js +49 -29
- package/dist/{types-DAXk6A3Y.d.cts → types-BrG6XTUU.d.cts} +4 -0
- package/dist/{types-DAXk6A3Y.d.ts → types-BrG6XTUU.d.ts} +4 -0
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -12,7 +12,7 @@ import { Command } from "commander";
|
|
|
12
12
|
// package.json
|
|
13
13
|
var package_default = {
|
|
14
14
|
name: "searchsocket",
|
|
15
|
-
version: "0.3.0",
|
|
15
|
+
version: "0.3.2",
|
|
16
16
|
description: "Semantic site search and MCP retrieval for SvelteKit static sites",
|
|
17
17
|
license: "MIT",
|
|
18
18
|
author: "Greg Priday <greg@siteorigin.com>",
|
|
@@ -189,7 +189,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
189
189
|
ranking: z.object({
|
|
190
190
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
191
191
|
enableDepthBoost: z.boolean().optional(),
|
|
192
|
-
pageWeights: z.record(z.string(), z.number().
|
|
192
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
193
193
|
aggregationCap: z.number().int().positive().optional(),
|
|
194
194
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
195
195
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -913,23 +913,12 @@ var TursoVectorStore = class {
|
|
|
913
913
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
914
914
|
route_file TEXT NOT NULL DEFAULT '',
|
|
915
915
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
916
|
+
description TEXT NOT NULL DEFAULT '',
|
|
917
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
916
918
|
embedding F32_BLOB(${dim})
|
|
917
919
|
)`,
|
|
918
920
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
919
921
|
]);
|
|
920
|
-
const chunkMigrationCols = [
|
|
921
|
-
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
922
|
-
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
923
|
-
];
|
|
924
|
-
for (const col of chunkMigrationCols) {
|
|
925
|
-
try {
|
|
926
|
-
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
927
|
-
} catch (error) {
|
|
928
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
929
|
-
throw error;
|
|
930
|
-
}
|
|
931
|
-
}
|
|
932
|
-
}
|
|
933
922
|
this.chunksReady = true;
|
|
934
923
|
}
|
|
935
924
|
async ensurePages() {
|
|
@@ -1007,8 +996,8 @@ var TursoVectorStore = class {
|
|
|
1007
996
|
sql: `INSERT OR REPLACE INTO chunks
|
|
1008
997
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
1009
998
|
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
1010
|
-
incoming_links, route_file, tags, embedding)
|
|
1011
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
999
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
1000
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
1012
1001
|
args: [
|
|
1013
1002
|
r.id,
|
|
1014
1003
|
r.metadata.projectId,
|
|
@@ -1027,6 +1016,8 @@ var TursoVectorStore = class {
|
|
|
1027
1016
|
r.metadata.incomingLinks,
|
|
1028
1017
|
r.metadata.routeFile,
|
|
1029
1018
|
JSON.stringify(r.metadata.tags),
|
|
1019
|
+
r.metadata.description ?? "",
|
|
1020
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
1030
1021
|
JSON.stringify(r.vector)
|
|
1031
1022
|
]
|
|
1032
1023
|
}));
|
|
@@ -1042,6 +1033,7 @@ var TursoVectorStore = class {
|
|
|
1042
1033
|
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
1043
1034
|
c.ordinal, c.content_hash,
|
|
1044
1035
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
1036
|
+
c.description, c.keywords,
|
|
1045
1037
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
1046
1038
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
1047
1039
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -1072,6 +1064,12 @@ var TursoVectorStore = class {
|
|
|
1072
1064
|
}
|
|
1073
1065
|
const distance = row.distance;
|
|
1074
1066
|
const score = 1 - distance;
|
|
1067
|
+
const description = row.description || void 0;
|
|
1068
|
+
const keywords = (() => {
|
|
1069
|
+
const raw = row.keywords || "[]";
|
|
1070
|
+
const parsed = JSON.parse(raw);
|
|
1071
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
1072
|
+
})();
|
|
1075
1073
|
hits.push({
|
|
1076
1074
|
id: row.id,
|
|
1077
1075
|
score,
|
|
@@ -1091,7 +1089,9 @@ var TursoVectorStore = class {
|
|
|
1091
1089
|
depth: row.depth,
|
|
1092
1090
|
incomingLinks: row.incoming_links,
|
|
1093
1091
|
routeFile: row.route_file,
|
|
1094
|
-
tags
|
|
1092
|
+
tags,
|
|
1093
|
+
description,
|
|
1094
|
+
keywords
|
|
1095
1095
|
}
|
|
1096
1096
|
});
|
|
1097
1097
|
}
|
|
@@ -1636,7 +1636,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
1636
1636
|
incomingLinks: page.incomingLinks,
|
|
1637
1637
|
routeFile: page.routeFile,
|
|
1638
1638
|
tags: page.tags,
|
|
1639
|
-
contentHash: ""
|
|
1639
|
+
contentHash: "",
|
|
1640
|
+
description: page.description,
|
|
1641
|
+
keywords: page.keywords
|
|
1640
1642
|
};
|
|
1641
1643
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
1642
1644
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -1663,7 +1665,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
1663
1665
|
incomingLinks: page.incomingLinks,
|
|
1664
1666
|
routeFile: page.routeFile,
|
|
1665
1667
|
tags: page.tags,
|
|
1666
|
-
contentHash: ""
|
|
1668
|
+
contentHash: "",
|
|
1669
|
+
description: page.description,
|
|
1670
|
+
keywords: page.keywords
|
|
1667
1671
|
};
|
|
1668
1672
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
1669
1673
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -1696,7 +1700,7 @@ function extractFromHtml(url, html, config) {
|
|
|
1696
1700
|
const $ = load(html);
|
|
1697
1701
|
const normalizedUrl = normalizeUrlPath(url);
|
|
1698
1702
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
1699
|
-
const title =
|
|
1703
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
1700
1704
|
if (config.extract.respectRobotsNoindex) {
|
|
1701
1705
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
1702
1706
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -2774,7 +2778,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
2774
2778
|
depth: chunk.depth,
|
|
2775
2779
|
incomingLinks: chunk.incomingLinks,
|
|
2776
2780
|
routeFile: chunk.routeFile,
|
|
2777
|
-
tags: chunk.tags
|
|
2781
|
+
tags: chunk.tags,
|
|
2782
|
+
description: chunk.description,
|
|
2783
|
+
keywords: chunk.keywords
|
|
2778
2784
|
}
|
|
2779
2785
|
});
|
|
2780
2786
|
}
|
|
@@ -3004,6 +3010,7 @@ function aggregateByPage(ranked, config) {
|
|
|
3004
3010
|
}
|
|
3005
3011
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
3006
3012
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
3013
|
+
if (pageWeight === 0) continue;
|
|
3007
3014
|
if (pageWeight !== 1) {
|
|
3008
3015
|
pageScore *= pageWeight;
|
|
3009
3016
|
}
|
|
@@ -3226,16 +3233,29 @@ var SearchEngine = class _SearchEngine {
|
|
|
3226
3233
|
if (group) group.push(entry);
|
|
3227
3234
|
else pageGroups.set(url, [entry]);
|
|
3228
3235
|
}
|
|
3236
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
3237
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
3238
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
3229
3239
|
const pageCandidates = [];
|
|
3230
3240
|
for (const [url, chunks] of pageGroups) {
|
|
3231
|
-
const
|
|
3232
|
-
|
|
3233
|
-
);
|
|
3234
|
-
const
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
|
|
3238
|
-
|
|
3241
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
3242
|
+
const bestScore = byScore[0].finalScore;
|
|
3243
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
3244
|
+
const selected = byScore.filter(
|
|
3245
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
3246
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
3247
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
3248
|
+
const first = selected[0].hit.metadata;
|
|
3249
|
+
const parts = [first.title];
|
|
3250
|
+
if (first.description) {
|
|
3251
|
+
parts.push(first.description);
|
|
3252
|
+
}
|
|
3253
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
3254
|
+
parts.push(first.keywords.join(", "));
|
|
3255
|
+
}
|
|
3256
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
3257
|
+
parts.push(body);
|
|
3258
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
3239
3259
|
}
|
|
3240
3260
|
const reranked = await this.reranker.rerank(
|
|
3241
3261
|
query,
|
package/dist/client.d.cts
CHANGED
package/dist/client.d.ts
CHANGED
package/dist/index.cjs
CHANGED
|
@@ -16688,7 +16688,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16688
16688
|
ranking: zod.z.object({
|
|
16689
16689
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16690
16690
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16691
|
-
pageWeights: zod.z.record(zod.z.string(), zod.z.number().
|
|
16691
|
+
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16692
16692
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16693
16693
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16694
16694
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
@@ -17435,23 +17435,12 @@ var TursoVectorStore = class {
|
|
|
17435
17435
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17436
17436
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17437
17437
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17438
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17439
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17438
17440
|
embedding F32_BLOB(${dim})
|
|
17439
17441
|
)`,
|
|
17440
17442
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17441
17443
|
]);
|
|
17442
|
-
const chunkMigrationCols = [
|
|
17443
|
-
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17444
|
-
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17445
|
-
];
|
|
17446
|
-
for (const col of chunkMigrationCols) {
|
|
17447
|
-
try {
|
|
17448
|
-
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17449
|
-
} catch (error) {
|
|
17450
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17451
|
-
throw error;
|
|
17452
|
-
}
|
|
17453
|
-
}
|
|
17454
|
-
}
|
|
17455
17444
|
this.chunksReady = true;
|
|
17456
17445
|
}
|
|
17457
17446
|
async ensurePages() {
|
|
@@ -17529,8 +17518,8 @@ var TursoVectorStore = class {
|
|
|
17529
17518
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17530
17519
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17531
17520
|
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17532
|
-
incoming_links, route_file, tags, embedding)
|
|
17533
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17521
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17522
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17534
17523
|
args: [
|
|
17535
17524
|
r.id,
|
|
17536
17525
|
r.metadata.projectId,
|
|
@@ -17549,6 +17538,8 @@ var TursoVectorStore = class {
|
|
|
17549
17538
|
r.metadata.incomingLinks,
|
|
17550
17539
|
r.metadata.routeFile,
|
|
17551
17540
|
JSON.stringify(r.metadata.tags),
|
|
17541
|
+
r.metadata.description ?? "",
|
|
17542
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17552
17543
|
JSON.stringify(r.vector)
|
|
17553
17544
|
]
|
|
17554
17545
|
}));
|
|
@@ -17564,6 +17555,7 @@ var TursoVectorStore = class {
|
|
|
17564
17555
|
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17565
17556
|
c.ordinal, c.content_hash,
|
|
17566
17557
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17558
|
+
c.description, c.keywords,
|
|
17567
17559
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17568
17560
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17569
17561
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17594,6 +17586,12 @@ var TursoVectorStore = class {
|
|
|
17594
17586
|
}
|
|
17595
17587
|
const distance = row.distance;
|
|
17596
17588
|
const score = 1 - distance;
|
|
17589
|
+
const description = row.description || void 0;
|
|
17590
|
+
const keywords = (() => {
|
|
17591
|
+
const raw = row.keywords || "[]";
|
|
17592
|
+
const parsed = JSON.parse(raw);
|
|
17593
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17594
|
+
})();
|
|
17597
17595
|
hits.push({
|
|
17598
17596
|
id: row.id,
|
|
17599
17597
|
score,
|
|
@@ -17613,7 +17611,9 @@ var TursoVectorStore = class {
|
|
|
17613
17611
|
depth: row.depth,
|
|
17614
17612
|
incomingLinks: row.incoming_links,
|
|
17615
17613
|
routeFile: row.route_file,
|
|
17616
|
-
tags
|
|
17614
|
+
tags,
|
|
17615
|
+
description,
|
|
17616
|
+
keywords
|
|
17617
17617
|
}
|
|
17618
17618
|
});
|
|
17619
17619
|
}
|
|
@@ -18152,7 +18152,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18152
18152
|
incomingLinks: page.incomingLinks,
|
|
18153
18153
|
routeFile: page.routeFile,
|
|
18154
18154
|
tags: page.tags,
|
|
18155
|
-
contentHash: ""
|
|
18155
|
+
contentHash: "",
|
|
18156
|
+
description: page.description,
|
|
18157
|
+
keywords: page.keywords
|
|
18156
18158
|
};
|
|
18157
18159
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18158
18160
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18179,7 +18181,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18179
18181
|
incomingLinks: page.incomingLinks,
|
|
18180
18182
|
routeFile: page.routeFile,
|
|
18181
18183
|
tags: page.tags,
|
|
18182
|
-
contentHash: ""
|
|
18184
|
+
contentHash: "",
|
|
18185
|
+
description: page.description,
|
|
18186
|
+
keywords: page.keywords
|
|
18183
18187
|
};
|
|
18184
18188
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18185
18189
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19034,7 +19038,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19034
19038
|
const $ = cheerio.load(html);
|
|
19035
19039
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19036
19040
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19037
|
-
const title =
|
|
19041
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19038
19042
|
if (config.extract.respectRobotsNoindex) {
|
|
19039
19043
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19040
19044
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20150,7 +20154,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20150
20154
|
depth: chunk.depth,
|
|
20151
20155
|
incomingLinks: chunk.incomingLinks,
|
|
20152
20156
|
routeFile: chunk.routeFile,
|
|
20153
|
-
tags: chunk.tags
|
|
20157
|
+
tags: chunk.tags,
|
|
20158
|
+
description: chunk.description,
|
|
20159
|
+
keywords: chunk.keywords
|
|
20154
20160
|
}
|
|
20155
20161
|
});
|
|
20156
20162
|
}
|
|
@@ -20273,6 +20279,7 @@ function aggregateByPage(ranked, config) {
|
|
|
20273
20279
|
}
|
|
20274
20280
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20275
20281
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20282
|
+
if (pageWeight === 0) continue;
|
|
20276
20283
|
if (pageWeight !== 1) {
|
|
20277
20284
|
pageScore *= pageWeight;
|
|
20278
20285
|
}
|
|
@@ -20495,16 +20502,29 @@ var SearchEngine = class _SearchEngine {
|
|
|
20495
20502
|
if (group) group.push(entry);
|
|
20496
20503
|
else pageGroups.set(url, [entry]);
|
|
20497
20504
|
}
|
|
20505
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20506
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20507
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20498
20508
|
const pageCandidates = [];
|
|
20499
20509
|
for (const [url, chunks] of pageGroups) {
|
|
20500
|
-
const
|
|
20501
|
-
|
|
20502
|
-
);
|
|
20503
|
-
const
|
|
20504
|
-
|
|
20505
|
-
|
|
20506
|
-
|
|
20507
|
-
|
|
20510
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20511
|
+
const bestScore = byScore[0].finalScore;
|
|
20512
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20513
|
+
const selected = byScore.filter(
|
|
20514
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20515
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20516
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20517
|
+
const first = selected[0].hit.metadata;
|
|
20518
|
+
const parts = [first.title];
|
|
20519
|
+
if (first.description) {
|
|
20520
|
+
parts.push(first.description);
|
|
20521
|
+
}
|
|
20522
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
20523
|
+
parts.push(first.keywords.join(", "));
|
|
20524
|
+
}
|
|
20525
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20526
|
+
parts.push(body);
|
|
20527
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
20508
20528
|
}
|
|
20509
20529
|
const reranked = await this.reranker.rerank(
|
|
20510
20530
|
query,
|
package/dist/index.d.cts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig, c as Scope, E as EmbeddingsProvider, d as Reranker, e as RerankCandidate, V as VectorStore, I as IndexOptions, f as IndexStats, S as SearchRequest, a as SearchResponse } from './types-DAXk6A3Y.cjs';
|
|
2
|
-
export { C as Chunk, Q as QueryOpts, g as VectorHit, h as VectorRecord } from './types-DAXk6A3Y.cjs';
|
|
1
|
+
import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig, c as Scope, E as EmbeddingsProvider, d as Reranker, e as RerankCandidate, V as VectorStore, I as IndexOptions, f as IndexStats, S as SearchRequest, a as SearchResponse } from './types-BrG6XTUU.cjs';
|
|
2
|
+
export { C as Chunk, Q as QueryOpts, g as VectorHit, h as VectorRecord } from './types-BrG6XTUU.cjs';
|
|
3
3
|
export { searchsocketHandle, searchsocketVitePlugin } from './sveltekit.cjs';
|
|
4
4
|
export { createSearchClient } from './client.cjs';
|
|
5
5
|
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig, c as Scope, E as EmbeddingsProvider, d as Reranker, e as RerankCandidate, V as VectorStore, I as IndexOptions, f as IndexStats, S as SearchRequest, a as SearchResponse } from './types-DAXk6A3Y.js';
|
|
2
|
-
export { C as Chunk, Q as QueryOpts, g as VectorHit, h as VectorRecord } from './types-DAXk6A3Y.js';
|
|
1
|
+
import { R as ResolvedSearchSocketConfig, b as SearchSocketConfig, c as Scope, E as EmbeddingsProvider, d as Reranker, e as RerankCandidate, V as VectorStore, I as IndexOptions, f as IndexStats, S as SearchRequest, a as SearchResponse } from './types-BrG6XTUU.js';
|
|
2
|
+
export { C as Chunk, Q as QueryOpts, g as VectorHit, h as VectorRecord } from './types-BrG6XTUU.js';
|
|
3
3
|
export { searchsocketHandle, searchsocketVitePlugin } from './sveltekit.js';
|
|
4
4
|
export { createSearchClient } from './client.js';
|
|
5
5
|
|
package/dist/index.js
CHANGED
|
@@ -16676,7 +16676,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16676
16676
|
ranking: z.object({
|
|
16677
16677
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16678
16678
|
enableDepthBoost: z.boolean().optional(),
|
|
16679
|
-
pageWeights: z.record(z.string(), z.number().
|
|
16679
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16680
16680
|
aggregationCap: z.number().int().positive().optional(),
|
|
16681
16681
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16682
16682
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -17423,23 +17423,12 @@ var TursoVectorStore = class {
|
|
|
17423
17423
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17424
17424
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17425
17425
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17426
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17427
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17426
17428
|
embedding F32_BLOB(${dim})
|
|
17427
17429
|
)`,
|
|
17428
17430
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17429
17431
|
]);
|
|
17430
|
-
const chunkMigrationCols = [
|
|
17431
|
-
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17432
|
-
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17433
|
-
];
|
|
17434
|
-
for (const col of chunkMigrationCols) {
|
|
17435
|
-
try {
|
|
17436
|
-
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17437
|
-
} catch (error) {
|
|
17438
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17439
|
-
throw error;
|
|
17440
|
-
}
|
|
17441
|
-
}
|
|
17442
|
-
}
|
|
17443
17432
|
this.chunksReady = true;
|
|
17444
17433
|
}
|
|
17445
17434
|
async ensurePages() {
|
|
@@ -17517,8 +17506,8 @@ var TursoVectorStore = class {
|
|
|
17517
17506
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17518
17507
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17519
17508
|
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17520
|
-
incoming_links, route_file, tags, embedding)
|
|
17521
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17509
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17510
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17522
17511
|
args: [
|
|
17523
17512
|
r.id,
|
|
17524
17513
|
r.metadata.projectId,
|
|
@@ -17537,6 +17526,8 @@ var TursoVectorStore = class {
|
|
|
17537
17526
|
r.metadata.incomingLinks,
|
|
17538
17527
|
r.metadata.routeFile,
|
|
17539
17528
|
JSON.stringify(r.metadata.tags),
|
|
17529
|
+
r.metadata.description ?? "",
|
|
17530
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17540
17531
|
JSON.stringify(r.vector)
|
|
17541
17532
|
]
|
|
17542
17533
|
}));
|
|
@@ -17552,6 +17543,7 @@ var TursoVectorStore = class {
|
|
|
17552
17543
|
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17553
17544
|
c.ordinal, c.content_hash,
|
|
17554
17545
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17546
|
+
c.description, c.keywords,
|
|
17555
17547
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17556
17548
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17557
17549
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17582,6 +17574,12 @@ var TursoVectorStore = class {
|
|
|
17582
17574
|
}
|
|
17583
17575
|
const distance = row.distance;
|
|
17584
17576
|
const score = 1 - distance;
|
|
17577
|
+
const description = row.description || void 0;
|
|
17578
|
+
const keywords = (() => {
|
|
17579
|
+
const raw = row.keywords || "[]";
|
|
17580
|
+
const parsed = JSON.parse(raw);
|
|
17581
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17582
|
+
})();
|
|
17585
17583
|
hits.push({
|
|
17586
17584
|
id: row.id,
|
|
17587
17585
|
score,
|
|
@@ -17601,7 +17599,9 @@ var TursoVectorStore = class {
|
|
|
17601
17599
|
depth: row.depth,
|
|
17602
17600
|
incomingLinks: row.incoming_links,
|
|
17603
17601
|
routeFile: row.route_file,
|
|
17604
|
-
tags
|
|
17602
|
+
tags,
|
|
17603
|
+
description,
|
|
17604
|
+
keywords
|
|
17605
17605
|
}
|
|
17606
17606
|
});
|
|
17607
17607
|
}
|
|
@@ -18140,7 +18140,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18140
18140
|
incomingLinks: page.incomingLinks,
|
|
18141
18141
|
routeFile: page.routeFile,
|
|
18142
18142
|
tags: page.tags,
|
|
18143
|
-
contentHash: ""
|
|
18143
|
+
contentHash: "",
|
|
18144
|
+
description: page.description,
|
|
18145
|
+
keywords: page.keywords
|
|
18144
18146
|
};
|
|
18145
18147
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18146
18148
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18167,7 +18169,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18167
18169
|
incomingLinks: page.incomingLinks,
|
|
18168
18170
|
routeFile: page.routeFile,
|
|
18169
18171
|
tags: page.tags,
|
|
18170
|
-
contentHash: ""
|
|
18172
|
+
contentHash: "",
|
|
18173
|
+
description: page.description,
|
|
18174
|
+
keywords: page.keywords
|
|
18171
18175
|
};
|
|
18172
18176
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18173
18177
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19022,7 +19026,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19022
19026
|
const $ = load(html);
|
|
19023
19027
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19024
19028
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19025
|
-
const title =
|
|
19029
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19026
19030
|
if (config.extract.respectRobotsNoindex) {
|
|
19027
19031
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19028
19032
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20138,7 +20142,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20138
20142
|
depth: chunk.depth,
|
|
20139
20143
|
incomingLinks: chunk.incomingLinks,
|
|
20140
20144
|
routeFile: chunk.routeFile,
|
|
20141
|
-
tags: chunk.tags
|
|
20145
|
+
tags: chunk.tags,
|
|
20146
|
+
description: chunk.description,
|
|
20147
|
+
keywords: chunk.keywords
|
|
20142
20148
|
}
|
|
20143
20149
|
});
|
|
20144
20150
|
}
|
|
@@ -20261,6 +20267,7 @@ function aggregateByPage(ranked, config) {
|
|
|
20261
20267
|
}
|
|
20262
20268
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
20263
20269
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
20270
|
+
if (pageWeight === 0) continue;
|
|
20264
20271
|
if (pageWeight !== 1) {
|
|
20265
20272
|
pageScore *= pageWeight;
|
|
20266
20273
|
}
|
|
@@ -20483,16 +20490,29 @@ var SearchEngine = class _SearchEngine {
|
|
|
20483
20490
|
if (group) group.push(entry);
|
|
20484
20491
|
else pageGroups.set(url, [entry]);
|
|
20485
20492
|
}
|
|
20493
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
20494
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
20495
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
20486
20496
|
const pageCandidates = [];
|
|
20487
20497
|
for (const [url, chunks] of pageGroups) {
|
|
20488
|
-
const
|
|
20489
|
-
|
|
20490
|
-
);
|
|
20491
|
-
const
|
|
20492
|
-
|
|
20493
|
-
|
|
20494
|
-
|
|
20495
|
-
|
|
20498
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
20499
|
+
const bestScore = byScore[0].finalScore;
|
|
20500
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
20501
|
+
const selected = byScore.filter(
|
|
20502
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
20503
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
20504
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
20505
|
+
const first = selected[0].hit.metadata;
|
|
20506
|
+
const parts = [first.title];
|
|
20507
|
+
if (first.description) {
|
|
20508
|
+
parts.push(first.description);
|
|
20509
|
+
}
|
|
20510
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
20511
|
+
parts.push(first.keywords.join(", "));
|
|
20512
|
+
}
|
|
20513
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
20514
|
+
parts.push(body);
|
|
20515
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
20496
20516
|
}
|
|
20497
20517
|
const reranked = await this.reranker.rerank(
|
|
20498
20518
|
query,
|
package/dist/sveltekit.cjs
CHANGED
|
@@ -16684,7 +16684,7 @@ var searchSocketConfigSchema = zod.z.object({
|
|
|
16684
16684
|
ranking: zod.z.object({
|
|
16685
16685
|
enableIncomingLinkBoost: zod.z.boolean().optional(),
|
|
16686
16686
|
enableDepthBoost: zod.z.boolean().optional(),
|
|
16687
|
-
pageWeights: zod.z.record(zod.z.string(), zod.z.number().
|
|
16687
|
+
pageWeights: zod.z.record(zod.z.string(), zod.z.number().min(0)).optional(),
|
|
16688
16688
|
aggregationCap: zod.z.number().int().positive().optional(),
|
|
16689
16689
|
aggregationDecay: zod.z.number().min(0).max(1).optional(),
|
|
16690
16690
|
minChunkScoreRatio: zod.z.number().min(0).max(1).optional(),
|
|
@@ -17468,23 +17468,12 @@ var TursoVectorStore = class {
|
|
|
17468
17468
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17469
17469
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17470
17470
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17471
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17472
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17471
17473
|
embedding F32_BLOB(${dim})
|
|
17472
17474
|
)`,
|
|
17473
17475
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17474
17476
|
]);
|
|
17475
|
-
const chunkMigrationCols = [
|
|
17476
|
-
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17477
|
-
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17478
|
-
];
|
|
17479
|
-
for (const col of chunkMigrationCols) {
|
|
17480
|
-
try {
|
|
17481
|
-
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17482
|
-
} catch (error) {
|
|
17483
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17484
|
-
throw error;
|
|
17485
|
-
}
|
|
17486
|
-
}
|
|
17487
|
-
}
|
|
17488
17477
|
this.chunksReady = true;
|
|
17489
17478
|
}
|
|
17490
17479
|
async ensurePages() {
|
|
@@ -17562,8 +17551,8 @@ var TursoVectorStore = class {
|
|
|
17562
17551
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17563
17552
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17564
17553
|
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17565
|
-
incoming_links, route_file, tags, embedding)
|
|
17566
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17554
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17555
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17567
17556
|
args: [
|
|
17568
17557
|
r.id,
|
|
17569
17558
|
r.metadata.projectId,
|
|
@@ -17582,6 +17571,8 @@ var TursoVectorStore = class {
|
|
|
17582
17571
|
r.metadata.incomingLinks,
|
|
17583
17572
|
r.metadata.routeFile,
|
|
17584
17573
|
JSON.stringify(r.metadata.tags),
|
|
17574
|
+
r.metadata.description ?? "",
|
|
17575
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17585
17576
|
JSON.stringify(r.vector)
|
|
17586
17577
|
]
|
|
17587
17578
|
}));
|
|
@@ -17597,6 +17588,7 @@ var TursoVectorStore = class {
|
|
|
17597
17588
|
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17598
17589
|
c.ordinal, c.content_hash,
|
|
17599
17590
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17591
|
+
c.description, c.keywords,
|
|
17600
17592
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17601
17593
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17602
17594
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17627,6 +17619,12 @@ var TursoVectorStore = class {
|
|
|
17627
17619
|
}
|
|
17628
17620
|
const distance = row.distance;
|
|
17629
17621
|
const score = 1 - distance;
|
|
17622
|
+
const description = row.description || void 0;
|
|
17623
|
+
const keywords = (() => {
|
|
17624
|
+
const raw = row.keywords || "[]";
|
|
17625
|
+
const parsed = JSON.parse(raw);
|
|
17626
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17627
|
+
})();
|
|
17630
17628
|
hits.push({
|
|
17631
17629
|
id: row.id,
|
|
17632
17630
|
score,
|
|
@@ -17646,7 +17644,9 @@ var TursoVectorStore = class {
|
|
|
17646
17644
|
depth: row.depth,
|
|
17647
17645
|
incomingLinks: row.incoming_links,
|
|
17648
17646
|
routeFile: row.route_file,
|
|
17649
|
-
tags
|
|
17647
|
+
tags,
|
|
17648
|
+
description,
|
|
17649
|
+
keywords
|
|
17650
17650
|
}
|
|
17651
17651
|
});
|
|
17652
17652
|
}
|
|
@@ -17940,6 +17940,7 @@ function aggregateByPage(ranked, config) {
|
|
|
17940
17940
|
}
|
|
17941
17941
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
17942
17942
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
17943
|
+
if (pageWeight === 0) continue;
|
|
17943
17944
|
if (pageWeight !== 1) {
|
|
17944
17945
|
pageScore *= pageWeight;
|
|
17945
17946
|
}
|
|
@@ -18162,16 +18163,29 @@ var SearchEngine = class _SearchEngine {
|
|
|
18162
18163
|
if (group) group.push(entry);
|
|
18163
18164
|
else pageGroups.set(url, [entry]);
|
|
18164
18165
|
}
|
|
18166
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18167
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18168
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18165
18169
|
const pageCandidates = [];
|
|
18166
18170
|
for (const [url, chunks] of pageGroups) {
|
|
18167
|
-
const
|
|
18168
|
-
|
|
18169
|
-
);
|
|
18170
|
-
const
|
|
18171
|
-
|
|
18172
|
-
|
|
18173
|
-
|
|
18174
|
-
|
|
18171
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
18172
|
+
const bestScore = byScore[0].finalScore;
|
|
18173
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
18174
|
+
const selected = byScore.filter(
|
|
18175
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
18176
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
18177
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
18178
|
+
const first = selected[0].hit.metadata;
|
|
18179
|
+
const parts = [first.title];
|
|
18180
|
+
if (first.description) {
|
|
18181
|
+
parts.push(first.description);
|
|
18182
|
+
}
|
|
18183
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
18184
|
+
parts.push(first.keywords.join(", "));
|
|
18185
|
+
}
|
|
18186
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18187
|
+
parts.push(body);
|
|
18188
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
18175
18189
|
}
|
|
18176
18190
|
const reranked = await this.reranker.rerank(
|
|
18177
18191
|
query,
|
|
@@ -18681,7 +18695,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18681
18695
|
incomingLinks: page.incomingLinks,
|
|
18682
18696
|
routeFile: page.routeFile,
|
|
18683
18697
|
tags: page.tags,
|
|
18684
|
-
contentHash: ""
|
|
18698
|
+
contentHash: "",
|
|
18699
|
+
description: page.description,
|
|
18700
|
+
keywords: page.keywords
|
|
18685
18701
|
};
|
|
18686
18702
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18687
18703
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18708,7 +18724,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18708
18724
|
incomingLinks: page.incomingLinks,
|
|
18709
18725
|
routeFile: page.routeFile,
|
|
18710
18726
|
tags: page.tags,
|
|
18711
|
-
contentHash: ""
|
|
18727
|
+
contentHash: "",
|
|
18728
|
+
description: page.description,
|
|
18729
|
+
keywords: page.keywords
|
|
18712
18730
|
};
|
|
18713
18731
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18714
18732
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19563,7 +19581,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19563
19581
|
const $ = cheerio.load(html);
|
|
19564
19582
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19565
19583
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19566
|
-
const title =
|
|
19584
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19567
19585
|
if (config.extract.respectRobotsNoindex) {
|
|
19568
19586
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19569
19587
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20671,7 +20689,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20671
20689
|
depth: chunk.depth,
|
|
20672
20690
|
incomingLinks: chunk.incomingLinks,
|
|
20673
20691
|
routeFile: chunk.routeFile,
|
|
20674
|
-
tags: chunk.tags
|
|
20692
|
+
tags: chunk.tags,
|
|
20693
|
+
description: chunk.description,
|
|
20694
|
+
keywords: chunk.keywords
|
|
20675
20695
|
}
|
|
20676
20696
|
});
|
|
20677
20697
|
}
|
package/dist/sveltekit.d.cts
CHANGED
package/dist/sveltekit.d.ts
CHANGED
package/dist/sveltekit.js
CHANGED
|
@@ -16672,7 +16672,7 @@ var searchSocketConfigSchema = z.object({
|
|
|
16672
16672
|
ranking: z.object({
|
|
16673
16673
|
enableIncomingLinkBoost: z.boolean().optional(),
|
|
16674
16674
|
enableDepthBoost: z.boolean().optional(),
|
|
16675
|
-
pageWeights: z.record(z.string(), z.number().
|
|
16675
|
+
pageWeights: z.record(z.string(), z.number().min(0)).optional(),
|
|
16676
16676
|
aggregationCap: z.number().int().positive().optional(),
|
|
16677
16677
|
aggregationDecay: z.number().min(0).max(1).optional(),
|
|
16678
16678
|
minChunkScoreRatio: z.number().min(0).max(1).optional(),
|
|
@@ -17456,23 +17456,12 @@ var TursoVectorStore = class {
|
|
|
17456
17456
|
incoming_links INTEGER NOT NULL DEFAULT 0,
|
|
17457
17457
|
route_file TEXT NOT NULL DEFAULT '',
|
|
17458
17458
|
tags TEXT NOT NULL DEFAULT '[]',
|
|
17459
|
+
description TEXT NOT NULL DEFAULT '',
|
|
17460
|
+
keywords TEXT NOT NULL DEFAULT '[]',
|
|
17459
17461
|
embedding F32_BLOB(${dim})
|
|
17460
17462
|
)`,
|
|
17461
17463
|
`CREATE INDEX IF NOT EXISTS idx ON chunks (libsql_vector_idx(embedding, 'metric=cosine'))`
|
|
17462
17464
|
]);
|
|
17463
|
-
const chunkMigrationCols = [
|
|
17464
|
-
{ name: "chunk_text", def: "TEXT NOT NULL DEFAULT ''" },
|
|
17465
|
-
{ name: "ordinal", def: "INTEGER NOT NULL DEFAULT 0" }
|
|
17466
|
-
];
|
|
17467
|
-
for (const col of chunkMigrationCols) {
|
|
17468
|
-
try {
|
|
17469
|
-
await this.client.execute(`ALTER TABLE chunks ADD COLUMN ${col.name} ${col.def}`);
|
|
17470
|
-
} catch (error) {
|
|
17471
|
-
if (error instanceof Error && !error.message.includes("duplicate column")) {
|
|
17472
|
-
throw error;
|
|
17473
|
-
}
|
|
17474
|
-
}
|
|
17475
|
-
}
|
|
17476
17465
|
this.chunksReady = true;
|
|
17477
17466
|
}
|
|
17478
17467
|
async ensurePages() {
|
|
@@ -17550,8 +17539,8 @@ var TursoVectorStore = class {
|
|
|
17550
17539
|
sql: `INSERT OR REPLACE INTO chunks
|
|
17551
17540
|
(id, project_id, scope_name, url, path, title, section_title,
|
|
17552
17541
|
heading_path, snippet, chunk_text, ordinal, content_hash, model_id, depth,
|
|
17553
|
-
incoming_links, route_file, tags, embedding)
|
|
17554
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17542
|
+
incoming_links, route_file, tags, description, keywords, embedding)
|
|
17543
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, vector(?))`,
|
|
17555
17544
|
args: [
|
|
17556
17545
|
r.id,
|
|
17557
17546
|
r.metadata.projectId,
|
|
@@ -17570,6 +17559,8 @@ var TursoVectorStore = class {
|
|
|
17570
17559
|
r.metadata.incomingLinks,
|
|
17571
17560
|
r.metadata.routeFile,
|
|
17572
17561
|
JSON.stringify(r.metadata.tags),
|
|
17562
|
+
r.metadata.description ?? "",
|
|
17563
|
+
JSON.stringify(r.metadata.keywords ?? []),
|
|
17573
17564
|
JSON.stringify(r.vector)
|
|
17574
17565
|
]
|
|
17575
17566
|
}));
|
|
@@ -17585,6 +17576,7 @@ var TursoVectorStore = class {
|
|
|
17585
17576
|
c.section_title, c.heading_path, c.snippet, c.chunk_text,
|
|
17586
17577
|
c.ordinal, c.content_hash,
|
|
17587
17578
|
c.model_id, c.depth, c.incoming_links, c.route_file, c.tags,
|
|
17579
|
+
c.description, c.keywords,
|
|
17588
17580
|
vector_distance_cos(c.embedding, vector(?)) AS distance
|
|
17589
17581
|
FROM vector_top_k('idx', vector(?), ?) AS v
|
|
17590
17582
|
JOIN chunks AS c ON c.rowid = v.id`,
|
|
@@ -17615,6 +17607,12 @@ var TursoVectorStore = class {
|
|
|
17615
17607
|
}
|
|
17616
17608
|
const distance = row.distance;
|
|
17617
17609
|
const score = 1 - distance;
|
|
17610
|
+
const description = row.description || void 0;
|
|
17611
|
+
const keywords = (() => {
|
|
17612
|
+
const raw = row.keywords || "[]";
|
|
17613
|
+
const parsed = JSON.parse(raw);
|
|
17614
|
+
return parsed.length > 0 ? parsed : void 0;
|
|
17615
|
+
})();
|
|
17618
17616
|
hits.push({
|
|
17619
17617
|
id: row.id,
|
|
17620
17618
|
score,
|
|
@@ -17634,7 +17632,9 @@ var TursoVectorStore = class {
|
|
|
17634
17632
|
depth: row.depth,
|
|
17635
17633
|
incomingLinks: row.incoming_links,
|
|
17636
17634
|
routeFile: row.route_file,
|
|
17637
|
-
tags
|
|
17635
|
+
tags,
|
|
17636
|
+
description,
|
|
17637
|
+
keywords
|
|
17638
17638
|
}
|
|
17639
17639
|
});
|
|
17640
17640
|
}
|
|
@@ -17928,6 +17928,7 @@ function aggregateByPage(ranked, config) {
|
|
|
17928
17928
|
}
|
|
17929
17929
|
let pageScore = maxScore + aggregationBonus * config.ranking.weights.aggregation;
|
|
17930
17930
|
const pageWeight = findPageWeight(url, config.ranking.pageWeights);
|
|
17931
|
+
if (pageWeight === 0) continue;
|
|
17931
17932
|
if (pageWeight !== 1) {
|
|
17932
17933
|
pageScore *= pageWeight;
|
|
17933
17934
|
}
|
|
@@ -18150,16 +18151,29 @@ var SearchEngine = class _SearchEngine {
|
|
|
18150
18151
|
if (group) group.push(entry);
|
|
18151
18152
|
else pageGroups.set(url, [entry]);
|
|
18152
18153
|
}
|
|
18154
|
+
const MAX_CHUNKS_PER_PAGE = 5;
|
|
18155
|
+
const MIN_CHUNKS_PER_PAGE = 1;
|
|
18156
|
+
const MIN_CHUNK_SCORE_RATIO = 0.5;
|
|
18153
18157
|
const pageCandidates = [];
|
|
18154
18158
|
for (const [url, chunks] of pageGroups) {
|
|
18155
|
-
const
|
|
18156
|
-
|
|
18157
|
-
);
|
|
18158
|
-
const
|
|
18159
|
-
|
|
18160
|
-
|
|
18161
|
-
|
|
18162
|
-
|
|
18159
|
+
const byScore = [...chunks].sort((a, b) => b.finalScore - a.finalScore);
|
|
18160
|
+
const bestScore = byScore[0].finalScore;
|
|
18161
|
+
const scoreFloor = Number.isFinite(bestScore) ? bestScore * MIN_CHUNK_SCORE_RATIO : Number.NEGATIVE_INFINITY;
|
|
18162
|
+
const selected = byScore.filter(
|
|
18163
|
+
(c, i) => i < MIN_CHUNKS_PER_PAGE || c.finalScore >= scoreFloor
|
|
18164
|
+
).slice(0, MAX_CHUNKS_PER_PAGE);
|
|
18165
|
+
selected.sort((a, b) => (a.hit.metadata.ordinal ?? 0) - (b.hit.metadata.ordinal ?? 0));
|
|
18166
|
+
const first = selected[0].hit.metadata;
|
|
18167
|
+
const parts = [first.title];
|
|
18168
|
+
if (first.description) {
|
|
18169
|
+
parts.push(first.description);
|
|
18170
|
+
}
|
|
18171
|
+
if (first.keywords && first.keywords.length > 0) {
|
|
18172
|
+
parts.push(first.keywords.join(", "));
|
|
18173
|
+
}
|
|
18174
|
+
const body = selected.map((c) => c.hit.metadata.chunkText || c.hit.metadata.snippet).join("\n\n");
|
|
18175
|
+
parts.push(body);
|
|
18176
|
+
pageCandidates.push({ id: url, text: parts.join("\n\n") });
|
|
18163
18177
|
}
|
|
18164
18178
|
const reranked = await this.reranker.rerank(
|
|
18165
18179
|
query,
|
|
@@ -18669,7 +18683,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18669
18683
|
incomingLinks: page.incomingLinks,
|
|
18670
18684
|
routeFile: page.routeFile,
|
|
18671
18685
|
tags: page.tags,
|
|
18672
|
-
contentHash: ""
|
|
18686
|
+
contentHash: "",
|
|
18687
|
+
description: page.description,
|
|
18688
|
+
keywords: page.keywords
|
|
18673
18689
|
};
|
|
18674
18690
|
const embeddingText = buildEmbeddingText(summaryChunk, config.chunking.prependTitle);
|
|
18675
18691
|
summaryChunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -18696,7 +18712,9 @@ function chunkMirrorPage(page, config, scope) {
|
|
|
18696
18712
|
incomingLinks: page.incomingLinks,
|
|
18697
18713
|
routeFile: page.routeFile,
|
|
18698
18714
|
tags: page.tags,
|
|
18699
|
-
contentHash: ""
|
|
18715
|
+
contentHash: "",
|
|
18716
|
+
description: page.description,
|
|
18717
|
+
keywords: page.keywords
|
|
18700
18718
|
};
|
|
18701
18719
|
const embeddingText = buildEmbeddingText(chunk, config.chunking.prependTitle);
|
|
18702
18720
|
chunk.contentHash = sha256(normalizeText(embeddingText));
|
|
@@ -19551,7 +19569,7 @@ function extractFromHtml(url, html, config) {
|
|
|
19551
19569
|
const $ = load(html);
|
|
19552
19570
|
const normalizedUrl = normalizeUrlPath(url);
|
|
19553
19571
|
const pageBaseUrl = new URL(`https://searchsocket.local${normalizedUrl}`);
|
|
19554
|
-
const title =
|
|
19572
|
+
const title = $("meta[property='og:title']").attr("content")?.trim() || normalizeText($(`${config.extract.mainSelector} h1`).first().text() || "") || $("meta[name='twitter:title']").attr("content")?.trim() || normalizeText($("title").first().text() || "") || normalizedUrl;
|
|
19555
19573
|
if (config.extract.respectRobotsNoindex) {
|
|
19556
19574
|
const robots = $("meta[name='robots']").attr("content") ?? "";
|
|
19557
19575
|
if (/\bnoindex\b/i.test(robots)) {
|
|
@@ -20659,7 +20677,9 @@ var IndexPipeline = class _IndexPipeline {
|
|
|
20659
20677
|
depth: chunk.depth,
|
|
20660
20678
|
incomingLinks: chunk.incomingLinks,
|
|
20661
20679
|
routeFile: chunk.routeFile,
|
|
20662
|
-
tags: chunk.tags
|
|
20680
|
+
tags: chunk.tags,
|
|
20681
|
+
description: chunk.description,
|
|
20682
|
+
keywords: chunk.keywords
|
|
20663
20683
|
}
|
|
20664
20684
|
});
|
|
20665
20685
|
}
|
|
@@ -261,6 +261,8 @@ interface Chunk {
|
|
|
261
261
|
routeFile: string;
|
|
262
262
|
tags: string[];
|
|
263
263
|
contentHash: string;
|
|
264
|
+
description?: string;
|
|
265
|
+
keywords?: string[];
|
|
264
266
|
}
|
|
265
267
|
interface VectorRecord {
|
|
266
268
|
id: string;
|
|
@@ -282,6 +284,8 @@ interface VectorRecord {
|
|
|
282
284
|
incomingLinks: number;
|
|
283
285
|
routeFile: string;
|
|
284
286
|
tags: string[];
|
|
287
|
+
description?: string;
|
|
288
|
+
keywords?: string[];
|
|
285
289
|
};
|
|
286
290
|
}
|
|
287
291
|
interface QueryOpts {
|
|
@@ -261,6 +261,8 @@ interface Chunk {
|
|
|
261
261
|
routeFile: string;
|
|
262
262
|
tags: string[];
|
|
263
263
|
contentHash: string;
|
|
264
|
+
description?: string;
|
|
265
|
+
keywords?: string[];
|
|
264
266
|
}
|
|
265
267
|
interface VectorRecord {
|
|
266
268
|
id: string;
|
|
@@ -282,6 +284,8 @@ interface VectorRecord {
|
|
|
282
284
|
incomingLinks: number;
|
|
283
285
|
routeFile: string;
|
|
284
286
|
tags: string[];
|
|
287
|
+
description?: string;
|
|
288
|
+
keywords?: string[];
|
|
285
289
|
};
|
|
286
290
|
}
|
|
287
291
|
interface QueryOpts {
|