searchsocket 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,10 @@
1
+ type Awaitable<T> = T | Promise<T>;
1
2
  type ScopeMode = "fixed" | "git" | "env";
2
3
  type SourceMode = "static-output" | "crawl" | "content-files" | "build";
3
- type EmbeddingProvider = "jina";
4
+ interface OutgoingLink {
5
+ url: string;
6
+ anchorText: string;
7
+ }
4
8
  interface SearchSocketConfig {
5
9
  project?: {
6
10
  id?: string;
@@ -44,6 +48,7 @@ interface SearchSocketConfig {
44
48
  dropSelectors?: string[];
45
49
  ignoreAttr?: string;
46
50
  noindexAttr?: string;
51
+ imageDescAttr?: string;
47
52
  respectRobotsNoindex?: boolean;
48
53
  };
49
54
  transform?: {
@@ -60,44 +65,47 @@ interface SearchSocketConfig {
60
65
  dontSplitInside?: Array<"code" | "table" | "blockquote">;
61
66
  prependTitle?: boolean;
62
67
  pageSummaryChunk?: boolean;
68
+ weightHeadings?: boolean;
63
69
  };
64
- embeddings?: {
65
- provider?: EmbeddingProvider;
66
- model?: string;
67
- apiKey?: string;
68
- apiKeyEnv?: string;
69
- batchSize?: number;
70
- concurrency?: number;
71
- pricePer1kTokens?: number;
72
- };
73
- vector?: {
74
- dimension?: number;
75
- turso?: {
76
- url?: string;
77
- authToken?: string;
78
- urlEnv?: string;
79
- authTokenEnv?: string;
80
- localPath?: string;
70
+ upstash?: {
71
+ url?: string;
72
+ token?: string;
73
+ urlEnv?: string;
74
+ tokenEnv?: string;
75
+ namespaces?: {
76
+ pages?: string;
77
+ chunks?: string;
81
78
  };
82
79
  };
83
- rerank?: {
84
- enabled?: boolean;
85
- topN?: number;
80
+ embedding?: {
86
81
  model?: string;
82
+ dimensions?: number;
83
+ taskType?: string;
84
+ batchSize?: number;
85
+ };
86
+ search?: {
87
+ dualSearch?: boolean;
88
+ pageSearchWeight?: number;
87
89
  };
88
90
  ranking?: {
89
91
  enableIncomingLinkBoost?: boolean;
90
92
  enableDepthBoost?: boolean;
93
+ enableFreshnessBoost?: boolean;
94
+ freshnessDecayRate?: number;
95
+ enableAnchorTextBoost?: boolean;
91
96
  pageWeights?: Record<string, number>;
92
97
  aggregationCap?: number;
93
98
  aggregationDecay?: number;
94
99
  minChunkScoreRatio?: number;
95
- minScore?: number;
100
+ minScoreRatio?: number;
101
+ scoreGapThreshold?: number;
96
102
  weights?: {
97
103
  incomingLinks?: number;
98
104
  depth?: number;
99
- rerank?: number;
100
105
  aggregation?: number;
106
+ titleMatch?: number;
107
+ freshness?: number;
108
+ anchorText?: number;
101
109
  };
102
110
  };
103
111
  api?: {
@@ -112,15 +120,30 @@ interface SearchSocketConfig {
112
120
  };
113
121
  mcp?: {
114
122
  enable?: boolean;
123
+ access?: "public" | "private";
115
124
  transport?: "stdio" | "http";
116
125
  http?: {
117
126
  port?: number;
118
127
  path?: string;
128
+ apiKey?: string;
129
+ apiKeyEnv?: string;
119
130
  };
131
+ handle?: {
132
+ path?: string;
133
+ apiKey?: string;
134
+ enableJsonResponse?: boolean;
135
+ };
136
+ };
137
+ llmsTxt?: {
138
+ enable?: boolean;
139
+ outputPath?: string;
140
+ title?: string;
141
+ description?: string;
142
+ generateFull?: boolean;
143
+ serveMarkdownVariants?: boolean;
120
144
  };
121
145
  state?: {
122
146
  dir?: string;
123
- writeMirror?: boolean;
124
147
  };
125
148
  }
126
149
  interface ResolvedSearchSocketConfig {
@@ -166,6 +189,7 @@ interface ResolvedSearchSocketConfig {
166
189
  dropSelectors: string[];
167
190
  ignoreAttr: string;
168
191
  noindexAttr: string;
192
+ imageDescAttr: string;
169
193
  respectRobotsNoindex: boolean;
170
194
  };
171
195
  transform: {
@@ -182,44 +206,47 @@ interface ResolvedSearchSocketConfig {
182
206
  dontSplitInside: Array<"code" | "table" | "blockquote">;
183
207
  prependTitle: boolean;
184
208
  pageSummaryChunk: boolean;
209
+ weightHeadings: boolean;
185
210
  };
186
- embeddings: {
187
- provider: EmbeddingProvider;
188
- model: string;
189
- apiKey?: string;
190
- apiKeyEnv: string;
191
- batchSize: number;
192
- concurrency: number;
193
- pricePer1kTokens?: number;
194
- };
195
- vector: {
196
- dimension?: number;
197
- turso: {
198
- url?: string;
199
- authToken?: string;
200
- urlEnv: string;
201
- authTokenEnv: string;
202
- localPath: string;
211
+ upstash: {
212
+ url?: string;
213
+ token?: string;
214
+ urlEnv: string;
215
+ tokenEnv: string;
216
+ namespaces: {
217
+ pages: string;
218
+ chunks: string;
203
219
  };
204
220
  };
205
- rerank: {
206
- enabled: boolean;
207
- topN: number;
221
+ embedding: {
208
222
  model: string;
223
+ dimensions: number;
224
+ taskType: string;
225
+ batchSize: number;
226
+ };
227
+ search: {
228
+ dualSearch: boolean;
229
+ pageSearchWeight: number;
209
230
  };
210
231
  ranking: {
211
232
  enableIncomingLinkBoost: boolean;
212
233
  enableDepthBoost: boolean;
234
+ enableFreshnessBoost: boolean;
235
+ freshnessDecayRate: number;
236
+ enableAnchorTextBoost: boolean;
213
237
  pageWeights: Record<string, number>;
214
238
  aggregationCap: number;
215
239
  aggregationDecay: number;
216
240
  minChunkScoreRatio: number;
217
- minScore: number;
241
+ minScoreRatio: number;
242
+ scoreGapThreshold: number;
218
243
  weights: {
219
244
  incomingLinks: number;
220
245
  depth: number;
221
- rerank: number;
222
246
  aggregation: number;
247
+ titleMatch: number;
248
+ freshness: number;
249
+ anchorText: number;
223
250
  };
224
251
  };
225
252
  api: {
@@ -234,15 +261,30 @@ interface ResolvedSearchSocketConfig {
234
261
  };
235
262
  mcp: {
236
263
  enable: boolean;
264
+ access: "public" | "private";
237
265
  transport: "stdio" | "http";
238
266
  http: {
239
267
  port: number;
240
268
  path: string;
269
+ apiKey?: string;
270
+ apiKeyEnv?: string;
271
+ };
272
+ handle: {
273
+ path: string;
274
+ apiKey?: string;
275
+ enableJsonResponse: boolean;
241
276
  };
242
277
  };
278
+ llmsTxt: {
279
+ enable: boolean;
280
+ outputPath: string;
281
+ title?: string;
282
+ description?: string;
283
+ generateFull: boolean;
284
+ serveMarkdownVariants: boolean;
285
+ };
243
286
  state: {
244
287
  dir: string;
245
- writeMirror: boolean;
246
288
  };
247
289
  }
248
290
  interface Scope {
@@ -250,6 +292,19 @@ interface Scope {
250
292
  scopeName: string;
251
293
  scopeId: string;
252
294
  }
295
+ interface ExtractedPage {
296
+ url: string;
297
+ title: string;
298
+ markdown: string;
299
+ outgoingLinks: OutgoingLink[];
300
+ noindex: boolean;
301
+ tags: string[];
302
+ description?: string;
303
+ keywords?: string[];
304
+ weight?: number;
305
+ publishedAt?: number;
306
+ meta?: Record<string, string | number | boolean | string[]>;
307
+ }
253
308
  interface Chunk {
254
309
  chunkKey: string;
255
310
  ordinal: number;
@@ -257,6 +312,7 @@ interface Chunk {
257
312
  path: string;
258
313
  title: string;
259
314
  sectionTitle?: string;
315
+ headingLevel?: number;
260
316
  headingPath: string[];
261
317
  chunkText: string;
262
318
  snippet: string;
@@ -267,10 +323,13 @@ interface Chunk {
267
323
  contentHash: string;
268
324
  description?: string;
269
325
  keywords?: string[];
326
+ publishedAt?: number;
327
+ incomingAnchorText?: string;
328
+ meta?: Record<string, string | number | boolean | string[]>;
270
329
  }
271
- interface VectorRecord {
330
+ interface VectorHit {
272
331
  id: string;
273
- vector: number[];
332
+ score: number;
274
333
  metadata: {
275
334
  projectId: string;
276
335
  scopeName: string;
@@ -283,25 +342,19 @@ interface VectorRecord {
283
342
  chunkText: string;
284
343
  ordinal: number;
285
344
  contentHash: string;
286
- modelId: string;
287
345
  depth: number;
288
346
  incomingLinks: number;
289
347
  routeFile: string;
290
348
  tags: string[];
349
+ type?: "chunk" | "page" | "image";
291
350
  description?: string;
292
351
  keywords?: string[];
352
+ incomingAnchorText?: string;
353
+ imageSrc?: string;
354
+ imageAlt?: string;
355
+ publishedAt?: number;
293
356
  };
294
357
  }
295
- interface QueryOpts {
296
- topK: number;
297
- pathPrefix?: string;
298
- tags?: string[];
299
- }
300
- interface VectorHit {
301
- id: string;
302
- score: number;
303
- metadata: VectorRecord["metadata"];
304
- }
305
358
  interface PageRecord {
306
359
  url: string;
307
360
  title: string;
@@ -312,51 +365,54 @@ interface PageRecord {
312
365
  routeResolution: "exact" | "best-effort";
313
366
  incomingLinks: number;
314
367
  outgoingLinks: number;
368
+ outgoingLinkUrls?: string[];
315
369
  depth: number;
316
370
  tags: string[];
317
371
  indexedAt: string;
372
+ summary?: string;
373
+ description?: string;
374
+ keywords?: string[];
375
+ contentHash?: string;
376
+ publishedAt?: number;
377
+ meta?: Record<string, string | number | boolean | string[]>;
378
+ }
379
+ interface PageHit {
380
+ id: string;
381
+ score: number;
382
+ title: string;
383
+ url: string;
384
+ description: string;
385
+ tags: string[];
386
+ depth: number;
387
+ incomingLinks: number;
388
+ routeFile: string;
389
+ publishedAt?: number;
318
390
  }
319
391
  interface ScopeInfo {
320
392
  projectId: string;
321
393
  scopeName: string;
322
- modelId: string;
323
394
  lastIndexedAt: string;
324
- vectorCount?: number;
325
- lastEstimateTokens?: number;
326
- lastEstimateCostUSD?: number;
327
- lastEstimateChangedChunks?: number;
328
- }
329
- interface VectorStore {
330
- upsert(records: VectorRecord[], scope: Scope): Promise<void>;
331
- query(queryVector: number[], opts: QueryOpts, scope: Scope): Promise<VectorHit[]>;
332
- deleteByIds(ids: string[], scope: Scope): Promise<void>;
333
- deleteScope(scope: Scope): Promise<void>;
334
- listScopes(scopeProjectId: string): Promise<ScopeInfo[]>;
335
- recordScope(info: ScopeInfo): Promise<void>;
336
- health(): Promise<{
337
- ok: boolean;
338
- details?: string;
339
- }>;
340
- getContentHashes(scope: Scope): Promise<Map<string, string>>;
341
- upsertPages(pages: PageRecord[], scope: Scope): Promise<void>;
342
- getPage(url: string, scope: Scope): Promise<PageRecord | null>;
343
- deletePages(scope: Scope): Promise<void>;
344
- getScopeModelId(scope: Scope): Promise<string | null>;
345
- dropAllTables(): Promise<void>;
395
+ documentCount?: number;
346
396
  }
347
- interface EmbeddingsProvider {
348
- embedTexts(texts: string[], modelId: string, task?: string): Promise<number[][]>;
349
- estimateTokens(text: string): number;
350
- }
351
- interface RerankCandidate {
352
- id: string;
353
- text: string;
354
- }
355
- interface Reranker {
356
- rerank(query: string, candidates: RerankCandidate[], topN?: number): Promise<Array<{
357
- id: string;
358
- score: number;
359
- }>>;
397
+ interface RankingOverrides {
398
+ ranking?: {
399
+ enableIncomingLinkBoost?: boolean;
400
+ enableDepthBoost?: boolean;
401
+ aggregationCap?: number;
402
+ aggregationDecay?: number;
403
+ minChunkScoreRatio?: number;
404
+ minScoreRatio?: number;
405
+ scoreGapThreshold?: number;
406
+ weights?: {
407
+ incomingLinks?: number;
408
+ depth?: number;
409
+ aggregation?: number;
410
+ titleMatch?: number;
411
+ };
412
+ };
413
+ search?: {
414
+ pageSearchWeight?: number;
415
+ };
360
416
  }
361
417
  interface SearchRequest {
362
418
  q: string;
@@ -364,13 +420,24 @@ interface SearchRequest {
364
420
  scope?: string;
365
421
  pathPrefix?: string;
366
422
  tags?: string[];
367
- rerank?: boolean;
423
+ filters?: Record<string, string | number | boolean>;
368
424
  groupBy?: "page" | "chunk";
369
- stream?: boolean;
425
+ maxSubResults?: number;
426
+ debug?: boolean;
427
+ rankingOverrides?: RankingOverrides;
428
+ }
429
+ interface ScoreBreakdown {
430
+ baseScore: number;
431
+ incomingLinkBoost: number;
432
+ depthBoost: number;
433
+ titleMatchBoost: number;
434
+ freshnessBoost: number;
435
+ anchorTextMatchBoost: number;
370
436
  }
371
437
  interface SearchResultChunk {
372
438
  sectionTitle?: string;
373
439
  snippet: string;
440
+ chunkText?: string;
374
441
  headingPath: string[];
375
442
  score: number;
376
443
  }
@@ -379,9 +446,11 @@ interface SearchResult {
379
446
  title: string;
380
447
  sectionTitle?: string;
381
448
  snippet: string;
449
+ chunkText?: string;
382
450
  score: number;
383
451
  routeFile: string;
384
452
  chunks?: SearchResultChunk[];
453
+ breakdown?: ScoreBreakdown;
385
454
  }
386
455
  interface SearchResponse {
387
456
  q: string;
@@ -389,27 +458,37 @@ interface SearchResponse {
389
458
  results: SearchResult[];
390
459
  meta: {
391
460
  timingsMs: {
392
- embed: number;
393
- vector: number;
394
- rerank: number;
461
+ search: number;
395
462
  total: number;
396
463
  };
397
- usedRerank: boolean;
398
- modelId: string;
399
464
  };
400
465
  }
401
466
  interface IndexStats {
402
467
  pagesProcessed: number;
468
+ pagesChanged: number;
469
+ pagesDeleted: number;
403
470
  chunksTotal: number;
404
471
  chunksChanged: number;
405
- newEmbeddings: number;
472
+ documentsUpserted: number;
406
473
  deletes: number;
407
- estimatedTokens: number;
408
- estimatedCostUSD: number;
409
474
  routeExact: number;
410
475
  routeBestEffort: number;
411
476
  stageTimingsMs: Record<string, number>;
412
477
  }
478
+ interface IndexingHooks {
479
+ transformPage?: (page: ExtractedPage) => Awaitable<ExtractedPage | null>;
480
+ transformChunk?: (chunk: Chunk) => Awaitable<Chunk | null>;
481
+ beforeIndex?: (chunks: Chunk[]) => Awaitable<Chunk[]>;
482
+ afterIndex?: (stats: IndexStats) => Awaitable<void>;
483
+ }
484
+ interface CustomRecord {
485
+ url: string;
486
+ title: string;
487
+ content: string;
488
+ metadata?: Record<string, string>;
489
+ tags?: string[];
490
+ weight?: number;
491
+ }
413
492
  interface IndexOptions {
414
493
  scopeOverride?: string;
415
494
  changedOnly?: boolean;
@@ -419,37 +498,34 @@ interface IndexOptions {
419
498
  maxPages?: number;
420
499
  maxChunks?: number;
421
500
  verbose?: boolean;
501
+ customRecords?: CustomRecord[];
422
502
  }
423
- interface StreamSearchEvent {
424
- phase: "initial" | "reranked";
425
- data: SearchResponse;
503
+ interface SiteTreeNode {
504
+ url: string;
505
+ title: string;
506
+ depth: number;
507
+ routeFile: string;
508
+ isIndexed: boolean;
509
+ childCount: number;
510
+ children: SiteTreeNode[];
426
511
  }
427
- interface StreamSearchErrorEvent {
428
- phase: "error";
429
- data: {
430
- error: {
431
- code: string;
432
- message: string;
433
- };
434
- };
512
+ interface SiteStructureResult {
513
+ root: SiteTreeNode;
514
+ totalPages: number;
515
+ truncated: boolean;
435
516
  }
436
- type StreamEvent = StreamSearchEvent | StreamSearchErrorEvent;
437
- interface MergeSearchOptions {
438
- /**
439
- * If any single result moved more than this many positions, adopt
440
- * the reranked order. The reranker is semantic — if it strongly
441
- * disagrees with vector similarity on even one result, trust it.
442
- * @default 3
443
- */
444
- maxDisplacement?: number;
517
+ type RelationshipType = "outgoing_link" | "incoming_link" | "sibling" | "semantic";
518
+ interface RelatedPage {
519
+ url: string;
520
+ title: string;
521
+ score: number;
522
+ relationshipType: RelationshipType;
523
+ routeFile: string;
445
524
  }
446
- interface MergeSearchResult {
447
- response: SearchResponse;
448
- usedRerankedOrder: boolean;
449
- displacements: Array<{
450
- url: string;
451
- displacement: number;
452
- }>;
525
+ interface RelatedPagesResult {
526
+ sourceUrl: string;
527
+ scope: string;
528
+ relatedPages: RelatedPage[];
453
529
  }
454
530
 
455
- export type { Chunk as C, EmbeddingsProvider as E, IndexOptions as I, MergeSearchOptions as M, QueryOpts as Q, ResolvedSearchSocketConfig as R, SearchResponse as S, VectorStore as V, MergeSearchResult as a, SearchRequest as b, StreamSearchEvent as c, SearchSocketConfig as d, Scope as e, Reranker as f, RerankCandidate as g, IndexStats as h, StreamEvent as i, StreamSearchErrorEvent as j, VectorHit as k, VectorRecord as l };
531
+ export type { Awaitable as A, Chunk as C, IndexingHooks as I, PageHit as P, ResolvedSearchSocketConfig as R, SearchRequest as S, VectorHit as V, SearchResponse as a, SearchResult as b, SearchSocketConfig as c, Scope as d, ScopeInfo as e, PageRecord as f, IndexOptions as g, IndexStats as h, SiteStructureResult as i, RelatedPagesResult as j, CustomRecord as k };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "searchsocket",
3
- "version": "0.4.0",
3
+ "version": "0.6.0",
4
4
  "description": "Semantic site search and MCP retrieval for SvelteKit static sites",
5
5
  "license": "MIT",
6
6
  "author": "Greg Priday <greg@siteorigin.com>",
@@ -26,6 +26,7 @@
26
26
  "files": [
27
27
  "dist",
28
28
  "!dist/**/*.map",
29
+ "src/svelte",
29
30
  "README.md"
30
31
  ],
31
32
  "bin": {
@@ -46,6 +47,24 @@
46
47
  "types": "./dist/client.d.ts",
47
48
  "import": "./dist/client.js",
48
49
  "require": "./dist/client.cjs"
50
+ },
51
+ "./scroll": {
52
+ "types": "./dist/scroll.d.ts",
53
+ "import": "./dist/scroll.js",
54
+ "require": "./dist/scroll.cjs"
55
+ },
56
+ "./svelte": {
57
+ "types": "./src/svelte/index.svelte.ts",
58
+ "svelte": "./src/svelte/index.svelte.ts",
59
+ "default": "./src/svelte/index.svelte.ts"
60
+ }
61
+ },
62
+ "peerDependencies": {
63
+ "svelte": "^5.0.0"
64
+ },
65
+ "peerDependenciesMeta": {
66
+ "svelte": {
67
+ "optional": true
49
68
  }
50
69
  },
51
70
  "scripts": {
@@ -53,15 +72,17 @@
53
72
  "clean": "rm -rf dist",
54
73
  "typecheck": "tsc --noEmit",
55
74
  "test": "vitest run",
56
- "test:watch": "vitest"
75
+ "test:watch": "vitest",
76
+ "test:quality": "SEARCHSOCKET_QUALITY_TESTS=1 vitest run tests/quality.test.ts"
57
77
  },
58
78
  "engines": {
59
79
  "node": ">=20"
60
80
  },
61
81
  "packageManager": "pnpm@10.29.2",
62
82
  "dependencies": {
63
- "@libsql/client": "^0.17.0",
83
+ "@clack/prompts": "^1.2.0",
64
84
  "@modelcontextprotocol/sdk": "^1.26.0",
85
+ "@upstash/vector": "^1.2.3",
65
86
  "cheerio": "^1.2.0",
66
87
  "chokidar": "^5.0.0",
67
88
  "commander": "^14.0.3",
@@ -70,15 +91,19 @@
70
91
  "fast-glob": "^3.3.3",
71
92
  "gray-matter": "^4.0.3",
72
93
  "jiti": "^2.6.1",
94
+ "magicast": "^0.5.2",
73
95
  "p-limit": "^7.3.0",
74
96
  "turndown": "^7.2.2",
75
97
  "turndown-plugin-gfm": "^1.0.2",
76
98
  "zod": "^4.3.6"
77
99
  },
78
100
  "devDependencies": {
101
+ "@sveltejs/vite-plugin-svelte": "^6.2.4",
79
102
  "@types/express": "^5.0.6",
80
103
  "@types/node": "^25.2.2",
81
104
  "@types/turndown": "^5.0.6",
105
+ "jsdom": "^28.1.0",
106
+ "svelte": "^5.55.1",
82
107
  "tsup": "^8.5.1",
83
108
  "typescript": "^5.9.3",
84
109
  "vitest": "^4.0.18"
@@ -0,0 +1,35 @@
1
+ <script lang="ts">
2
+ import { serializeMetaValue, validateMetaKey } from "../utils/structured-meta";
3
+ import type { MetaValue } from "../utils/structured-meta";
4
+
5
+ interface Props {
6
+ weight?: number;
7
+ noindex?: boolean;
8
+ tags?: string[];
9
+ meta?: Record<string, MetaValue>;
10
+ }
11
+
12
+ let { weight, noindex, tags, meta }: Props = $props();
13
+
14
+ const metaEntries = $derived(
15
+ meta
16
+ ? Object.entries(meta).filter(([key]) => validateMetaKey(key))
17
+ : []
18
+ );
19
+ </script>
20
+
21
+ <svelte:head>
22
+ {#if weight !== undefined}
23
+ <meta name="searchsocket-weight" content={String(weight)} />
24
+ {/if}
25
+ {#if noindex}
26
+ <meta name="searchsocket:noindex" content="true" />
27
+ {/if}
28
+ {#if tags && tags.length > 0}
29
+ <meta name="searchsocket:tags" content={tags.join(",")} data-type="string[]" />
30
+ {/if}
31
+ {#each metaEntries as [key, value]}
32
+ {@const serialized = serializeMetaValue(value)}
33
+ <meta name={`searchsocket:${key}`} content={serialized.content} data-type={serialized.dataType} />
34
+ {/each}
35
+ </svelte:head>