searchsocket 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -6,9 +6,9 @@ Semantic site search and MCP retrieval for SvelteKit content projects.
6
6
 
7
7
  ## Features
8
8
 
9
- - **Embeddings**: OpenAI `text-embedding-3-small` (configurable)
9
+ - **Embeddings**: Jina AI `jina-embeddings-v3` with task-specific LoRA adapters (configurable)
10
10
  - **Vector Backend**: Turso/libSQL with vector search (local file DB for development, remote for production)
11
- - **Rerank**: Optional Jina reranker for improved relevance
11
+ - **Rerank**: Optional Jina reranker — same API key, one boolean to enable
12
12
  - **Page Aggregation**: Group results by page with score-weighted chunk decay
13
13
  - **Meta Extraction**: Automatically extracts `<meta name="description">` and `<meta name="keywords">` for improved relevance
14
14
  - **SvelteKit Integrations**:
@@ -48,7 +48,7 @@ Minimal config (`searchsocket.config.ts`):
48
48
 
49
49
  ```ts
50
50
  export default {
51
- embeddings: { apiKeyEnv: "OPENAI_API_KEY" }
51
+ embeddings: { apiKeyEnv: "JINA_API_KEY" }
52
52
  };
53
53
  ```
54
54
 
@@ -74,12 +74,12 @@ The CLI automatically loads `.env` from the working directory on startup, so you
74
74
 
75
75
  Development (`.env`):
76
76
  ```bash
77
- OPENAI_API_KEY=sk-...
77
+ JINA_API_KEY=jina_...
78
78
  ```
79
79
 
80
80
  Production (add these for remote Turso):
81
81
  ```bash
82
- OPENAI_API_KEY=sk-...
82
+ JINA_API_KEY=jina_...
83
83
  TURSO_DATABASE_URL=libsql://your-db.turso.io
84
84
  TURSO_AUTH_TOKEN=eyJ...
85
85
  ```
@@ -101,7 +101,7 @@ The indexing pipeline:
101
101
  - Chunks text with semantic heading boundaries
102
102
  - Prepends page title to each chunk for embedding context
103
103
  - Generates a synthetic summary chunk per page for identity matching
104
- - Generates embeddings via OpenAI
104
+ - Generates embeddings via Jina AI (with task-specific LoRA adapters for indexing vs search)
105
105
  - Stores vectors in Turso/libSQL with cosine similarity index
106
106
 
107
107
  ### 6. Query
@@ -163,7 +163,7 @@ pnpm searchsocket search --q "getting started" --top-k 5 --path-prefix /docs
163
163
  "meta": {
164
164
  "timingsMs": { "embed": 120, "vector": 15, "rerank": 0, "total": 135 },
165
165
  "usedRerank": false,
166
- "modelId": "text-embedding-3-small"
166
+ "modelId": "jina-embeddings-v3"
167
167
  }
168
168
  }
169
169
  ```
@@ -203,7 +203,11 @@ export default {
203
203
  paramValues: { // values for dynamic routes
204
204
  "/blog/[slug]": ["hello-world", "getting-started"],
205
205
  "/docs/[category]/[page]": ["guides/quickstart", "api/search"]
206
- }
206
+ },
207
+ discover: true, // crawl internal links to find pages (default: false)
208
+ seedUrls: ["/"], // starting URLs for discovery
209
+ maxPages: 200, // max pages to discover (default: 200)
210
+ maxDepth: 5 // max link depth from seed URLs (default: 5)
207
211
  }
208
212
  }
209
213
  };
@@ -221,6 +225,8 @@ Best for: CI/CD pipelines. Enables `vite build && searchsocket index` with zero
221
225
 
222
226
  **Dynamic routes**: Each key in `paramValues` maps to a route ID (e.g., `/blog/[slug]`) or its URL equivalent. Each value in the array replaces all `[param]` segments in the URL. Routes with layout groups like `/(app)/blog/[slug]` also match the URL key `/blog/[slug]`.
223
227
 
228
+ **Link discovery**: Enable `discover: true` to automatically find pages by crawling internal links from `seedUrls`. This is useful when dynamic routes have many parameter values that are impractical to enumerate. The crawler respects `maxPages` and `maxDepth` limits and only follows links within the same origin.
229
+
224
230
  ### `crawl`
225
231
 
226
232
  Fetches pages from a running HTTP server.
@@ -325,30 +331,102 @@ For production, switch to **Turso's hosted service**:
325
331
 
326
332
  3. **Index normally** — SearchSocket auto-detects the remote URL and uses it.
327
333
 
334
+ ### Direct Credential Passing
335
+
336
+ Instead of environment variables, you can pass credentials directly in the config. This is useful for serverless deployments or multi-tenant setups:
337
+
338
+ ```ts
339
+ export default {
340
+ embeddings: {
341
+ apiKey: "jina_..." // direct API key (takes precedence over apiKeyEnv)
342
+ },
343
+ vector: {
344
+ turso: {
345
+ url: "libsql://my-db.turso.io", // direct URL
346
+ authToken: "eyJhbGc..." // direct auth token
347
+ }
348
+ }
349
+ };
350
+ ```
351
+
352
+ Direct values take precedence over environment variable lookups (`apiKeyEnv`, `urlEnv`, `authTokenEnv`).
353
+
354
+ ### Dimension Mismatch Auto-Recovery
355
+
356
+ When switching embedding models (e.g., from a 1536-dim model to Jina's 1024-dim), the vector dimension changes. SearchSocket automatically detects this and recreates the chunks table with the new dimension — no manual schema migration needed. You still need to run a full re-index (`--force`) after switching models.
357
+
328
358
  ### Why Turso?
329
359
 
330
- - **Single backend** — no more choosing between Pinecone, Milvus, or local JSON
360
+ - **Single backend** — one unified Turso/libSQL store for vectors, metadata, and state
331
361
  - **Local-first development** — zero external dependencies for local dev
332
362
  - **Production-ready** — same codebase scales to remote hosted DB
333
363
  - **Cost-effective** — Turso free tier includes 9GB storage, 500M row reads/month
334
364
  - **Vector search native** — `F32_BLOB` vectors, cosine similarity index, `vector_top_k` ANN queries
335
365
 
336
- ## Embeddings: OpenAI
366
+ ## Serverless Deployment (Vercel, Netlify, etc.)
337
367
 
338
- SearchSocket uses **OpenAI's embedding models** to convert text into semantic vectors.
368
+ SearchSocket works on serverless platforms with a few adjustments:
369
+
370
+ ### Requirements
371
+
372
+ 1. **Remote Turso database** — local SQLite is not available in serverless (no persistent filesystem). Set `TURSO_DATABASE_URL` and `TURSO_AUTH_TOKEN` as platform environment variables.
373
+
374
+ 2. **Inline config via `rawConfig`** — the default config loader uses `jiti` to import `searchsocket.config.ts` from disk, which isn't bundled in serverless. Use `rawConfig` to pass config inline:
375
+
376
+ ```ts
377
+ // hooks.server.ts (Vercel / Netlify)
378
+ import { searchsocketHandle } from "searchsocket/sveltekit";
379
+
380
+ export const handle = searchsocketHandle({
381
+ rawConfig: {
382
+ project: { id: "my-docs-site" },
383
+ source: { mode: "static-output" },
384
+ embeddings: { apiKeyEnv: "JINA_API_KEY" },
385
+ }
386
+ });
387
+ ```
388
+
389
+ 3. **Environment variables** — set these on your platform dashboard:
390
+ - `JINA_API_KEY`
391
+ - `TURSO_DATABASE_URL`
392
+ - `TURSO_AUTH_TOKEN`
393
+
394
+ ### Rate Limiting
395
+
396
+ The built-in `InMemoryRateLimiter` auto-disables on serverless platforms (it resets on every cold start). Use your platform's WAF or edge rate-limiting instead.
397
+
398
+ ### What Only Applies to Indexing
399
+
400
+ The following features are used only during `searchsocket index` (CLI), not by the search handler:
401
+ - `ensureStateDirs` — creates `.searchsocket/` state directories
402
+ - Markdown mirror — writes `.searchsocket/mirror/` files
403
+ - Local SQLite fallback — only needed when `TURSO_DATABASE_URL` is not set
404
+
405
+ ### Adapter Guidance
406
+
407
+ | Platform | Adapter | Notes |
408
+ |----------|---------|-------|
409
+ | Vercel | `adapter-auto` (default) | Serverless — use `rawConfig` + remote Turso |
410
+ | Netlify | `adapter-netlify` | Serverless — same as Vercel |
411
+ | VPS / Docker | `adapter-node` | Long-lived process — no limitations, local SQLite works |
412
+
413
+ ## Embeddings: Jina AI
414
+
415
+ SearchSocket uses **Jina AI's embedding models** to convert text into semantic vectors. A single `JINA_API_KEY` powers both embeddings and optional reranking.
339
416
 
340
417
  ### Default Model
341
418
 
342
- - **Model**: `text-embedding-3-small`
343
- - **Dimensions**: 1536
344
- - **Cost**: ~$0.00002 per 1K tokens (~4K chars)
419
+ - **Model**: `jina-embeddings-v3`
420
+ - **Dimensions**: 1024 (default)
421
+ - **Cost**: ~$0.00002 per 1K tokens (generous 10M token free tier)
422
+ - **Task adapters**: Uses `retrieval.passage` for indexing, `retrieval.query` for search queries (LoRA task-specific adapters for better retrieval quality)
345
423
 
346
424
  ### How It Works
347
425
 
348
426
  1. **Chunking**: Text is split into semantic chunks (default 2200 chars, 200 overlap)
349
427
  2. **Title Prepend**: Page title is prepended to each chunk for better context (`chunking.prependTitle`, default: true)
350
428
  3. **Summary Chunk**: A synthetic identity chunk is generated per page with title, URL, and first paragraph (`chunking.pageSummaryChunk`, default: true)
351
- 4. **Embedding**: Each chunk is sent to OpenAI's embedding API
429
+ 4. **Embedding**: Each chunk is sent to Jina's embedding API with the `retrieval.passage` task adapter
352
430
  5. **Batching**: Requests batched (64 texts per request) for efficiency
353
431
  6. **Storage**: Vectors stored in Turso with metadata (URL, title, tags, depth, etc.)
354
432
 
@@ -369,17 +447,14 @@ estimated tokens: 32,400
369
447
  estimated cost (USD): $0.000648
370
448
  ```
371
449
 
372
- ### Custom Model
450
+ ### Reranking
451
+
452
+ Since embeddings and reranking share the same Jina API key, enabling reranking is one boolean:
373
453
 
374
- Override in config:
375
454
  ```ts
376
455
  export default {
377
- embeddings: {
378
- provider: "openai",
379
- model: "text-embedding-3-large", // 3072 dims, higher quality
380
- apiKeyEnv: "OPENAI_API_KEY",
381
- pricePer1kTokens: 0.00013
382
- }
456
+ embeddings: { apiKeyEnv: "JINA_API_KEY" },
457
+ rerank: { enabled: true }
383
458
  };
384
459
  ```
385
460
 
@@ -400,6 +475,7 @@ Configure aggregation behavior:
400
475
  ```ts
401
476
  export default {
402
477
  ranking: {
478
+ minScore: 0, // minimum absolute score to include in results (default: 0, disabled)
403
479
  aggregationCap: 5, // max chunks contributing to page score (default: 5)
404
480
  aggregationDecay: 0.5, // decay factor for additional chunks (default: 0.5)
405
481
  minChunkScoreRatio: 0.5, // threshold for sub-chunks in results (default: 0.5)
@@ -420,6 +496,8 @@ export default {
420
496
 
421
497
  `pageWeights` supports exact URL matches and prefix matching. A weight of `1.15` on `"/docs"` boosts all pages under `/docs/` by 15%. Use gentle values (1.05-1.2x) since they compound with aggregation.
422
498
 
499
+ `minScore` filters out low-relevance results before they reach the client. Set to a value like `0.3` to remove noise. In page mode, pages below the threshold are dropped; in chunk mode, individual chunks are filtered. Default is `0` (disabled).
500
+
423
501
  ### Chunk Mode
424
502
 
425
503
  Use `groupBy: "chunk"` for flat per-chunk results without page aggregation:
@@ -534,7 +612,7 @@ pnpm searchsocket status
534
612
  # Output:
535
613
  # project: my-site
536
614
  # resolved scope: main
537
- # embedding model: text-embedding-3-small
615
+ # embedding model: jina-embeddings-v3
538
616
  # vector backend: turso/libsql (local (.searchsocket/vectors.db))
539
617
  # vector health: ok
540
618
  # last indexed (main): 2025-02-23T10:30:00Z
@@ -597,7 +675,7 @@ pnpm searchsocket doctor
597
675
 
598
676
  # Output:
599
677
  # PASS config parse
600
- # PASS env OPENAI_API_KEY
678
+ # PASS env JINA_API_KEY
601
679
  # PASS turso/libsql (local file: .searchsocket/vectors.db)
602
680
  # PASS source: build manifest
603
681
  # PASS source: vite binary
@@ -699,8 +777,8 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
699
777
 
700
778
  ### Required
701
779
 
702
- **OpenAI:**
703
- - `OPENAI_API_KEY` — OpenAI API key for embeddings
780
+ **Jina AI:**
781
+ - `JINA_API_KEY` — Jina AI API key for embeddings and reranking
704
782
 
705
783
  ### Optional (Turso)
706
784
 
@@ -710,11 +788,6 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
710
788
 
711
789
  If not set, uses local file DB at `.searchsocket/vectors.db`.
712
790
 
713
- ### Optional (Rerank)
714
-
715
- **Jina:**
716
- - `JINA_API_KEY` — Jina reranker API key (if using `rerank: { provider: "jina" }`)
717
-
718
791
  ### Optional (Scope/Build)
719
792
 
720
793
  - `SEARCHSOCKET_SCOPE` — Override scope (when `scope.mode: "env"`)
@@ -750,7 +823,11 @@ export default {
750
823
  exclude: ["/api/*"],
751
824
  paramValues: {
752
825
  "/blog/[slug]": ["hello-world", "getting-started"]
753
- }
826
+ },
827
+ discover: false,
828
+ seedUrls: ["/"],
829
+ maxPages: 200,
830
+ maxDepth: 5
754
831
  },
755
832
 
756
833
  // Crawl mode (alternative)
@@ -787,16 +864,19 @@ export default {
787
864
  },
788
865
 
789
866
  embeddings: {
790
- provider: "openai",
791
- model: "text-embedding-3-small",
792
- apiKeyEnv: "OPENAI_API_KEY",
867
+ provider: "jina",
868
+ model: "jina-embeddings-v3",
869
+ apiKey: "jina_...", // direct API key (or use apiKeyEnv)
870
+ apiKeyEnv: "JINA_API_KEY",
793
871
  batchSize: 64,
794
872
  concurrency: 4
795
873
  },
796
874
 
797
875
  vector: {
798
- dimension: 1536, // optional, inferred from first embedding
876
+ dimension: 1024, // optional, inferred from first embedding
799
877
  turso: {
878
+ url: "libsql://my-db.turso.io", // direct URL (or use urlEnv)
879
+ authToken: "eyJhbGc...", // direct token (or use authTokenEnv)
800
880
  urlEnv: "TURSO_DATABASE_URL",
801
881
  authTokenEnv: "TURSO_AUTH_TOKEN",
802
882
  localPath: ".searchsocket/vectors.db"
@@ -804,12 +884,9 @@ export default {
804
884
  },
805
885
 
806
886
  rerank: {
807
- provider: "jina", // "none" | "jina"
887
+ enabled: true,
808
888
  topN: 20,
809
- jina: {
810
- apiKeyEnv: "JINA_API_KEY",
811
- model: "jina-reranker-v2-base-multilingual"
812
- }
889
+ model: "jina-reranker-v2-base-multilingual"
813
890
  },
814
891
 
815
892
  ranking: {
@@ -819,6 +896,7 @@ export default {
819
896
  "/": 1.1,
820
897
  "/docs": 1.15
821
898
  },
899
+ minScore: 0,
822
900
  aggregationCap: 5,
823
901
  aggregationDecay: 0.5,
824
902
  minChunkScoreRatio: 0.5,