searchsocket 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +120 -42
- package/dist/cli.js +348 -111
- package/dist/client.d.cts +1 -1
- package/dist/client.d.ts +1 -1
- package/dist/index.cjs +367 -104
- package/dist/index.d.cts +20 -3
- package/dist/index.d.ts +20 -3
- package/dist/index.js +365 -103
- package/dist/sveltekit.cjs +350 -104
- package/dist/sveltekit.d.cts +8 -2
- package/dist/sveltekit.d.ts +8 -2
- package/dist/sveltekit.js +349 -102
- package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
- package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
- package/package.json +3 -3
- package/dist/cli.js.map +0 -1
- package/dist/client.cjs.map +0 -1
- package/dist/client.js.map +0 -1
- package/dist/index.cjs.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/sveltekit.cjs.map +0 -1
- package/dist/sveltekit.js.map +0 -1
package/README.md
CHANGED
|
@@ -6,9 +6,9 @@ Semantic site search and MCP retrieval for SvelteKit content projects.
|
|
|
6
6
|
|
|
7
7
|
## Features
|
|
8
8
|
|
|
9
|
-
- **Embeddings**:
|
|
9
|
+
- **Embeddings**: Jina AI `jina-embeddings-v3` with task-specific LoRA adapters (configurable)
|
|
10
10
|
- **Vector Backend**: Turso/libSQL with vector search (local file DB for development, remote for production)
|
|
11
|
-
- **Rerank**: Optional Jina reranker
|
|
11
|
+
- **Rerank**: Optional Jina reranker — same API key, one boolean to enable
|
|
12
12
|
- **Page Aggregation**: Group results by page with score-weighted chunk decay
|
|
13
13
|
- **Meta Extraction**: Automatically extracts `<meta name="description">` and `<meta name="keywords">` for improved relevance
|
|
14
14
|
- **SvelteKit Integrations**:
|
|
@@ -48,7 +48,7 @@ Minimal config (`searchsocket.config.ts`):
|
|
|
48
48
|
|
|
49
49
|
```ts
|
|
50
50
|
export default {
|
|
51
|
-
embeddings: { apiKeyEnv: "
|
|
51
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" }
|
|
52
52
|
};
|
|
53
53
|
```
|
|
54
54
|
|
|
@@ -74,12 +74,12 @@ The CLI automatically loads `.env` from the working directory on startup, so you
|
|
|
74
74
|
|
|
75
75
|
Development (`.env`):
|
|
76
76
|
```bash
|
|
77
|
-
|
|
77
|
+
JINA_API_KEY=jina_...
|
|
78
78
|
```
|
|
79
79
|
|
|
80
80
|
Production (add these for remote Turso):
|
|
81
81
|
```bash
|
|
82
|
-
|
|
82
|
+
JINA_API_KEY=jina_...
|
|
83
83
|
TURSO_DATABASE_URL=libsql://your-db.turso.io
|
|
84
84
|
TURSO_AUTH_TOKEN=eyJ...
|
|
85
85
|
```
|
|
@@ -101,7 +101,7 @@ The indexing pipeline:
|
|
|
101
101
|
- Chunks text with semantic heading boundaries
|
|
102
102
|
- Prepends page title to each chunk for embedding context
|
|
103
103
|
- Generates a synthetic summary chunk per page for identity matching
|
|
104
|
-
- Generates embeddings via
|
|
104
|
+
- Generates embeddings via Jina AI (with task-specific LoRA adapters for indexing vs search)
|
|
105
105
|
- Stores vectors in Turso/libSQL with cosine similarity index
|
|
106
106
|
|
|
107
107
|
### 6. Query
|
|
@@ -163,7 +163,7 @@ pnpm searchsocket search --q "getting started" --top-k 5 --path-prefix /docs
|
|
|
163
163
|
"meta": {
|
|
164
164
|
"timingsMs": { "embed": 120, "vector": 15, "rerank": 0, "total": 135 },
|
|
165
165
|
"usedRerank": false,
|
|
166
|
-
"modelId": "
|
|
166
|
+
"modelId": "jina-embeddings-v3"
|
|
167
167
|
}
|
|
168
168
|
}
|
|
169
169
|
```
|
|
@@ -203,7 +203,11 @@ export default {
|
|
|
203
203
|
paramValues: { // values for dynamic routes
|
|
204
204
|
"/blog/[slug]": ["hello-world", "getting-started"],
|
|
205
205
|
"/docs/[category]/[page]": ["guides/quickstart", "api/search"]
|
|
206
|
-
}
|
|
206
|
+
},
|
|
207
|
+
discover: true, // crawl internal links to find pages (default: false)
|
|
208
|
+
seedUrls: ["/"], // starting URLs for discovery
|
|
209
|
+
maxPages: 200, // max pages to discover (default: 200)
|
|
210
|
+
maxDepth: 5 // max link depth from seed URLs (default: 5)
|
|
207
211
|
}
|
|
208
212
|
}
|
|
209
213
|
};
|
|
@@ -221,6 +225,8 @@ Best for: CI/CD pipelines. Enables `vite build && searchsocket index` with zero
|
|
|
221
225
|
|
|
222
226
|
**Dynamic routes**: Each key in `paramValues` maps to a route ID (e.g., `/blog/[slug]`) or its URL equivalent. Each value in the array replaces all `[param]` segments in the URL. Routes with layout groups like `/(app)/blog/[slug]` also match the URL key `/blog/[slug]`.
|
|
223
227
|
|
|
228
|
+
**Link discovery**: Enable `discover: true` to automatically find pages by crawling internal links from `seedUrls`. This is useful when dynamic routes have many parameter values that are impractical to enumerate. The crawler respects `maxPages` and `maxDepth` limits and only follows links within the same origin.
|
|
229
|
+
|
|
224
230
|
### `crawl`
|
|
225
231
|
|
|
226
232
|
Fetches pages from a running HTTP server.
|
|
@@ -325,30 +331,102 @@ For production, switch to **Turso's hosted service**:
|
|
|
325
331
|
|
|
326
332
|
3. **Index normally** — SearchSocket auto-detects the remote URL and uses it.
|
|
327
333
|
|
|
334
|
+
### Direct Credential Passing
|
|
335
|
+
|
|
336
|
+
Instead of environment variables, you can pass credentials directly in the config. This is useful for serverless deployments or multi-tenant setups:
|
|
337
|
+
|
|
338
|
+
```ts
|
|
339
|
+
export default {
|
|
340
|
+
embeddings: {
|
|
341
|
+
apiKey: "jina_..." // direct API key (takes precedence over apiKeyEnv)
|
|
342
|
+
},
|
|
343
|
+
vector: {
|
|
344
|
+
turso: {
|
|
345
|
+
url: "libsql://my-db.turso.io", // direct URL
|
|
346
|
+
authToken: "eyJhbGc..." // direct auth token
|
|
347
|
+
}
|
|
348
|
+
}
|
|
349
|
+
};
|
|
350
|
+
```
|
|
351
|
+
|
|
352
|
+
Direct values take precedence over environment variable lookups (`apiKeyEnv`, `urlEnv`, `authTokenEnv`).
|
|
353
|
+
|
|
354
|
+
### Dimension Mismatch Auto-Recovery
|
|
355
|
+
|
|
356
|
+
When switching embedding models (e.g., from a 1536-dim model to Jina's 1024-dim), the vector dimension changes. SearchSocket automatically detects this and recreates the chunks table with the new dimension — no manual schema changes are needed. You still need to run a full re-index (`--force`) after switching models.
|
|
357
|
+
|
|
328
358
|
### Why Turso?
|
|
329
359
|
|
|
330
|
-
- **Single backend** —
|
|
360
|
+
- **Single backend** — one unified Turso/libSQL store for vectors, metadata, and state
|
|
331
361
|
- **Local-first development** — zero external dependencies for local dev
|
|
332
362
|
- **Production-ready** — same codebase scales to remote hosted DB
|
|
333
363
|
- **Cost-effective** — Turso free tier includes 9GB storage, 500M row reads/month
|
|
334
364
|
- **Vector search native** — `F32_BLOB` vectors, cosine similarity index, `vector_top_k` ANN queries
|
|
335
365
|
|
|
336
|
-
##
|
|
366
|
+
## Serverless Deployment (Vercel, Netlify, etc.)
|
|
337
367
|
|
|
338
|
-
SearchSocket
|
|
368
|
+
SearchSocket works on serverless platforms with a few adjustments:
|
|
369
|
+
|
|
370
|
+
### Requirements
|
|
371
|
+
|
|
372
|
+
1. **Remote Turso database** — local SQLite is not available in serverless (no persistent filesystem). Set `TURSO_DATABASE_URL` and `TURSO_AUTH_TOKEN` as platform environment variables.
|
|
373
|
+
|
|
374
|
+
2. **Inline config via `rawConfig`** — the default config loader uses `jiti` to import `searchsocket.config.ts` from disk at runtime, which does not work in serverless bundles. Use `rawConfig` to pass config inline:
|
|
375
|
+
|
|
376
|
+
```ts
|
|
377
|
+
// hooks.server.ts (Vercel / Netlify)
|
|
378
|
+
import { searchsocketHandle } from "searchsocket/sveltekit";
|
|
379
|
+
|
|
380
|
+
export const handle = searchsocketHandle({
|
|
381
|
+
rawConfig: {
|
|
382
|
+
project: { id: "my-docs-site" },
|
|
383
|
+
source: { mode: "static-output" },
|
|
384
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" },
|
|
385
|
+
}
|
|
386
|
+
});
|
|
387
|
+
```
|
|
388
|
+
|
|
389
|
+
3. **Environment variables** — set these on your platform dashboard:
|
|
390
|
+
- `JINA_API_KEY`
|
|
391
|
+
- `TURSO_DATABASE_URL`
|
|
392
|
+
- `TURSO_AUTH_TOKEN`
|
|
393
|
+
|
|
394
|
+
### Rate Limiting
|
|
395
|
+
|
|
396
|
+
The built-in `InMemoryRateLimiter` auto-disables on serverless platforms, because its in-memory state resets on every cold start. Use your platform's WAF or edge rate-limiting instead.
|
|
397
|
+
|
|
398
|
+
### What Only Applies to Indexing
|
|
399
|
+
|
|
400
|
+
The following features are only used during `searchsocket index` (CLI), not the search handler:
|
|
401
|
+
- `ensureStateDirs` — creates `.searchsocket/` state directories
|
|
402
|
+
- Markdown mirror — writes `.searchsocket/mirror/` files
|
|
403
|
+
- Local SQLite fallback — only needed when `TURSO_DATABASE_URL` is not set
|
|
404
|
+
|
|
405
|
+
### Adapter Guidance
|
|
406
|
+
|
|
407
|
+
| Platform | Adapter | Notes |
|
|
408
|
+
|----------|---------|-------|
|
|
409
|
+
| Vercel | `adapter-auto` (default) | Serverless — use `rawConfig` + remote Turso |
|
|
410
|
+
| Netlify | `adapter-netlify` | Serverless — same as Vercel |
|
|
411
|
+
| VPS / Docker | `adapter-node` | Long-lived process — no limitations, local SQLite works |
|
|
412
|
+
|
|
413
|
+
## Embeddings: Jina AI
|
|
414
|
+
|
|
415
|
+
SearchSocket uses **Jina AI's embedding models** to convert text into semantic vectors. A single `JINA_API_KEY` powers both embeddings and optional reranking.
|
|
339
416
|
|
|
340
417
|
### Default Model
|
|
341
418
|
|
|
342
|
-
- **Model**: `
|
|
343
|
-
- **Dimensions**:
|
|
344
|
-
- **Cost**: ~$0.00002 per 1K tokens (
|
|
419
|
+
- **Model**: `jina-embeddings-v3`
|
|
420
|
+
- **Dimensions**: 1024 (default)
|
|
421
|
+
- **Cost**: ~$0.00002 per 1K tokens (generous 10M token free tier)
|
|
422
|
+
- **Task adapters**: Uses `retrieval.passage` for indexing, `retrieval.query` for search queries (LoRA task-specific adapters for better retrieval quality)
|
|
345
423
|
|
|
346
424
|
### How It Works
|
|
347
425
|
|
|
348
426
|
1. **Chunking**: Text is split into semantic chunks (default 2200 chars, 200 overlap)
|
|
349
427
|
2. **Title Prepend**: Page title is prepended to each chunk for better context (`chunking.prependTitle`, default: true)
|
|
350
428
|
3. **Summary Chunk**: A synthetic identity chunk is generated per page with title, URL, and first paragraph (`chunking.pageSummaryChunk`, default: true)
|
|
351
|
-
4. **Embedding**: Each chunk is sent to
|
|
429
|
+
4. **Embedding**: Each chunk is sent to Jina's embedding API with the `retrieval.passage` task adapter
|
|
352
430
|
5. **Batching**: Requests batched (64 texts per request) for efficiency
|
|
353
431
|
6. **Storage**: Vectors stored in Turso with metadata (URL, title, tags, depth, etc.)
|
|
354
432
|
|
|
@@ -369,17 +447,14 @@ estimated tokens: 32,400
|
|
|
369
447
|
estimated cost (USD): $0.000648
|
|
370
448
|
```
|
|
371
449
|
|
|
372
|
-
###
|
|
450
|
+
### Reranking
|
|
451
|
+
|
|
452
|
+
Since embeddings and reranking share the same Jina API key, enabling reranking is one boolean:
|
|
373
453
|
|
|
374
|
-
Override in config:
|
|
375
454
|
```ts
|
|
376
455
|
export default {
|
|
377
|
-
embeddings: {
|
|
378
|
-
|
|
379
|
-
model: "text-embedding-3-large", // 3072 dims, higher quality
|
|
380
|
-
apiKeyEnv: "OPENAI_API_KEY",
|
|
381
|
-
pricePer1kTokens: 0.00013
|
|
382
|
-
}
|
|
456
|
+
embeddings: { apiKeyEnv: "JINA_API_KEY" },
|
|
457
|
+
rerank: { enabled: true }
|
|
383
458
|
};
|
|
384
459
|
```
|
|
385
460
|
|
|
@@ -400,6 +475,7 @@ Configure aggregation behavior:
|
|
|
400
475
|
```ts
|
|
401
476
|
export default {
|
|
402
477
|
ranking: {
|
|
478
|
+
minScore: 0, // minimum absolute score to include in results (default: 0, disabled)
|
|
403
479
|
aggregationCap: 5, // max chunks contributing to page score (default: 5)
|
|
404
480
|
aggregationDecay: 0.5, // decay factor for additional chunks (default: 0.5)
|
|
405
481
|
minChunkScoreRatio: 0.5, // threshold for sub-chunks in results (default: 0.5)
|
|
@@ -420,6 +496,8 @@ export default {
|
|
|
420
496
|
|
|
421
497
|
`pageWeights` supports exact URL matches and prefix matching. A weight of `1.15` on `"/docs"` boosts all pages under `/docs/` by 15%. Use gentle values (1.05-1.2x) since they compound with aggregation.
|
|
422
498
|
|
|
499
|
+
`minScore` filters out low-relevance results before they reach the client. Set to a value like `0.3` to remove noise. In page mode, pages below the threshold are dropped; in chunk mode, individual chunks are filtered. Default is `0` (disabled).
|
|
500
|
+
|
|
423
501
|
### Chunk Mode
|
|
424
502
|
|
|
425
503
|
Use `groupBy: "chunk"` for flat per-chunk results without page aggregation:
|
|
@@ -534,7 +612,7 @@ pnpm searchsocket status
|
|
|
534
612
|
# Output:
|
|
535
613
|
# project: my-site
|
|
536
614
|
# resolved scope: main
|
|
537
|
-
# embedding model:
|
|
615
|
+
# embedding model: jina-embeddings-v3
|
|
538
616
|
# vector backend: turso/libsql (local (.searchsocket/vectors.db))
|
|
539
617
|
# vector health: ok
|
|
540
618
|
# last indexed (main): 2025-02-23T10:30:00Z
|
|
@@ -597,7 +675,7 @@ pnpm searchsocket doctor
|
|
|
597
675
|
|
|
598
676
|
# Output:
|
|
599
677
|
# PASS config parse
|
|
600
|
-
# PASS env
|
|
678
|
+
# PASS env JINA_API_KEY
|
|
601
679
|
# PASS turso/libsql (local file: .searchsocket/vectors.db)
|
|
602
680
|
# PASS source: build manifest
|
|
603
681
|
# PASS source: vite binary
|
|
@@ -699,8 +777,8 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
|
|
|
699
777
|
|
|
700
778
|
### Required
|
|
701
779
|
|
|
702
|
-
**
|
|
703
|
-
- `
|
|
780
|
+
**Jina AI:**
|
|
781
|
+
- `JINA_API_KEY` — Jina AI API key for embeddings and reranking
|
|
704
782
|
|
|
705
783
|
### Optional (Turso)
|
|
706
784
|
|
|
@@ -710,11 +788,6 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
|
|
|
710
788
|
|
|
711
789
|
If not set, uses local file DB at `.searchsocket/vectors.db`.
|
|
712
790
|
|
|
713
|
-
### Optional (Rerank)
|
|
714
|
-
|
|
715
|
-
**Jina:**
|
|
716
|
-
- `JINA_API_KEY` — Jina reranker API key (if using `rerank: { provider: "jina" }`)
|
|
717
|
-
|
|
718
791
|
### Optional (Scope/Build)
|
|
719
792
|
|
|
720
793
|
- `SEARCHSOCKET_SCOPE` — Override scope (when `scope.mode: "env"`)
|
|
@@ -750,7 +823,11 @@ export default {
|
|
|
750
823
|
exclude: ["/api/*"],
|
|
751
824
|
paramValues: {
|
|
752
825
|
"/blog/[slug]": ["hello-world", "getting-started"]
|
|
753
|
-
}
|
|
826
|
+
},
|
|
827
|
+
discover: false,
|
|
828
|
+
seedUrls: ["/"],
|
|
829
|
+
maxPages: 200,
|
|
830
|
+
maxDepth: 5
|
|
754
831
|
},
|
|
755
832
|
|
|
756
833
|
// Crawl mode (alternative)
|
|
@@ -787,16 +864,19 @@ export default {
|
|
|
787
864
|
},
|
|
788
865
|
|
|
789
866
|
embeddings: {
|
|
790
|
-
provider: "
|
|
791
|
-
model: "
|
|
792
|
-
|
|
867
|
+
provider: "jina",
|
|
868
|
+
model: "jina-embeddings-v3",
|
|
869
|
+
apiKey: "jina_...", // direct API key (or use apiKeyEnv)
|
|
870
|
+
apiKeyEnv: "JINA_API_KEY",
|
|
793
871
|
batchSize: 64,
|
|
794
872
|
concurrency: 4
|
|
795
873
|
},
|
|
796
874
|
|
|
797
875
|
vector: {
|
|
798
|
-
dimension:
|
|
876
|
+
dimension: 1024, // optional, inferred from first embedding
|
|
799
877
|
turso: {
|
|
878
|
+
url: "libsql://my-db.turso.io", // direct URL (or use urlEnv)
|
|
879
|
+
authToken: "eyJhbGc...", // direct token (or use authTokenEnv)
|
|
800
880
|
urlEnv: "TURSO_DATABASE_URL",
|
|
801
881
|
authTokenEnv: "TURSO_AUTH_TOKEN",
|
|
802
882
|
localPath: ".searchsocket/vectors.db"
|
|
@@ -804,12 +884,9 @@ export default {
|
|
|
804
884
|
},
|
|
805
885
|
|
|
806
886
|
rerank: {
|
|
807
|
-
|
|
887
|
+
enabled: true,
|
|
808
888
|
topN: 20,
|
|
809
|
-
|
|
810
|
-
apiKeyEnv: "JINA_API_KEY",
|
|
811
|
-
model: "jina-reranker-v2-base-multilingual"
|
|
812
|
-
}
|
|
889
|
+
model: "jina-reranker-v2-base-multilingual"
|
|
813
890
|
},
|
|
814
891
|
|
|
815
892
|
ranking: {
|
|
@@ -819,6 +896,7 @@ export default {
|
|
|
819
896
|
"/": 1.1,
|
|
820
897
|
"/docs": 1.15
|
|
821
898
|
},
|
|
899
|
+
minScore: 0,
|
|
822
900
|
aggregationCap: 5,
|
|
823
901
|
aggregationDecay: 0.5,
|
|
824
902
|
minChunkScoreRatio: 0.5,
|