npm - searchsocket - Versions diffs - 0.2.0 → 0.3.0 - Mend

searchsocket 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +120 -42
package/dist/cli.js +348 -111
package/dist/client.d.cts +1 -1
package/dist/client.d.ts +1 -1
package/dist/index.cjs +367 -104
package/dist/index.d.cts +20 -3
package/dist/index.d.ts +20 -3
package/dist/index.js +365 -103
package/dist/sveltekit.cjs +350 -104
package/dist/sveltekit.d.cts +8 -2
package/dist/sveltekit.d.ts +8 -2
package/dist/sveltekit.js +349 -102
package/dist/{types-D1K46vwd.d.cts → types-DAXk6A3Y.d.cts} +25 -13
package/dist/{types-D1K46vwd.d.ts → types-DAXk6A3Y.d.ts} +25 -13
package/package.json +3 -3
package/dist/cli.js.map +0 -1
package/dist/client.cjs.map +0 -1
package/dist/client.js.map +0 -1
package/dist/index.cjs.map +0 -1
package/dist/index.js.map +0 -1
package/dist/sveltekit.cjs.map +0 -1
package/dist/sveltekit.js.map +0 -1

package/README.md CHANGED Viewed

@@ -6,9 +6,9 @@ Semantic site search and MCP retrieval for SvelteKit content projects.
 ## Features
-- **Embeddings**: OpenAI `text-embedding-3-small` (configurable)
+- **Embeddings**: Jina AI `jina-embeddings-v3` with task-specific LoRA adapters (configurable)
 - **Vector Backend**: Turso/libSQL with vector search (local file DB for development, remote for production)
-- **Rerank**: Optional Jina reranker for improved relevance
+- **Rerank**: Optional Jina reranker — same API key, one boolean to enable
 - **Page Aggregation**: Group results by page with score-weighted chunk decay
 - **Meta Extraction**: Automatically extracts `<meta name="description">` and `<meta name="keywords">` for improved relevance
 - **SvelteKit Integrations**:
@@ -48,7 +48,7 @@ Minimal config (`searchsocket.config.ts`):
 ```ts
 export default {
-  embeddings: { apiKeyEnv: "OPENAI_API_KEY" }
+  embeddings: { apiKeyEnv: "JINA_API_KEY" }
 };
 ```
@@ -74,12 +74,12 @@ The CLI automatically loads `.env` from the working directory on startup, so you
 Development (`.env`):
 ```bash
-OPENAI_API_KEY=sk-...
+JINA_API_KEY=jina_...
 ```
 Production (add these for remote Turso):
 ```bash
-OPENAI_API_KEY=sk-...
+JINA_API_KEY=jina_...
 TURSO_DATABASE_URL=libsql://your-db.turso.io
 TURSO_AUTH_TOKEN=eyJ...
 ```
@@ -101,7 +101,7 @@ The indexing pipeline:
 - Chunks text with semantic heading boundaries
 - Prepends page title to each chunk for embedding context
 - Generates a synthetic summary chunk per page for identity matching
-- Generates embeddings via OpenAI
+- Generates embeddings via Jina AI (with task-specific LoRA adapters for indexing vs search)
 - Stores vectors in Turso/libSQL with cosine similarity index
 ### 6. Query
@@ -163,7 +163,7 @@ pnpm searchsocket search --q "getting started" --top-k 5 --path-prefix /docs
   "meta": {
     "timingsMs": { "embed": 120, "vector": 15, "rerank": 0, "total": 135 },
     "usedRerank": false,
-    "modelId": "text-embedding-3-small"
+    "modelId": "jina-embeddings-v3"
   }
 }
 ```
@@ -203,7 +203,11 @@ export default {
       paramValues: {                     // values for dynamic routes
         "/blog/[slug]": ["hello-world", "getting-started"],
         "/docs/[category]/[page]": ["guides/quickstart", "api/search"]
-      }
+      },
+      discover: true,                    // crawl internal links to find pages (default: false)
+      seedUrls: ["/"],                   // starting URLs for discovery
+      maxPages: 200,                     // max pages to discover (default: 200)
+      maxDepth: 5                        // max link depth from seed URLs (default: 5)
     }
   }
 };
@@ -221,6 +225,8 @@ Best for: CI/CD pipelines. Enables `vite build && searchsocket index` with zero
 **Dynamic routes**: Each key in `paramValues` maps to a route ID (e.g., `/blog/[slug]`) or its URL equivalent. Each value in the array replaces all `[param]` segments in the URL. Routes with layout groups like `/(app)/blog/[slug]` also match the URL key `/blog/[slug]`.
+**Link discovery**: Set `discover: true` to automatically find pages by crawling internal links, starting from `seedUrls`. This is useful when dynamic routes have so many parameter values that enumerating them in `paramValues` is impractical. The crawler respects the `maxPages` and `maxDepth` limits and only follows links within the same origin.
 ### `crawl`
 Fetches pages from a running HTTP server.
@@ -325,30 +331,102 @@ For production, switch to **Turso's hosted service**:
 3. **Index normally** — SearchSocket auto-detects the remote URL and uses it.
+### Direct Credential Passing
+Instead of environment variables, you can pass credentials directly in the config. This is useful for serverless deployments or multi-tenant setups:
+```ts
+export default {
+  embeddings: {
+    apiKey: "jina_..."  // direct API key (takes precedence over apiKeyEnv)
+  },
+  vector: {
+    turso: {
+      url: "libsql://my-db.turso.io",       // direct URL
+      authToken: "eyJhbGc..."               // direct auth token
+    }
+  }
+};
+```
+Direct values take precedence over environment variable lookups (`apiKeyEnv`, `urlEnv`, `authTokenEnv`).
+### Dimension Mismatch Auto-Recovery
+When switching embedding models (e.g., from a 1536-dim model to Jina's 1024-dim), the vector dimension changes. SearchSocket automatically detects this and recreates the chunks table with the new dimension — you do not need to migrate the table manually. However, a full re-index (`--force`) is still required after switching models.
 ### Why Turso?
-- **Single backend** — no more choosing between Pinecone, Milvus, or local JSON
+- **Single backend** — one unified Turso/libSQL store for vectors, metadata, and state
 - **Local-first development** — zero external dependencies for local dev
 - **Production-ready** — same codebase scales to remote hosted DB
 - **Cost-effective** — Turso free tier includes 9GB storage, 500M row reads/month
 - **Vector search native** — `F32_BLOB` vectors, cosine similarity index, `vector_top_k` ANN queries
-## Embeddings: OpenAI
+## Serverless Deployment (Vercel, Netlify, etc.)
-SearchSocket uses **OpenAI's embedding models** to convert text into semantic vectors.
+SearchSocket works on serverless platforms with a few adjustments:
+### Requirements
+1. **Remote Turso database** — local SQLite is not available in serverless (no persistent filesystem). Set `TURSO_DATABASE_URL` and `TURSO_AUTH_TOKEN` as platform environment variables.
+2. **Inline config via `rawConfig`** — the default config loader uses `jiti` to import `searchsocket.config.ts` from disk, which isn't bundled in serverless. Use `rawConfig` to pass config inline:
+```ts
+// hooks.server.ts (Vercel / Netlify)
+import { searchsocketHandle } from "searchsocket/sveltekit";
+export const handle = searchsocketHandle({
+  rawConfig: {
+    project: { id: "my-docs-site" },
+    source: { mode: "static-output" },
+    embeddings: { apiKeyEnv: "JINA_API_KEY" },
+  }
+});
+```
+3. **Environment variables** — set these on your platform dashboard:
+   - `JINA_API_KEY`
+   - `TURSO_DATABASE_URL`
+   - `TURSO_AUTH_TOKEN`
+### Rate Limiting
+The built-in `InMemoryRateLimiter` is automatically disabled on serverless platforms because its state resets on every cold start. Use your platform's WAF or edge rate-limiting instead.
+### What Only Applies to Indexing
+The following features are used only during `searchsocket index` (CLI), not by the search handler:
+- `ensureStateDirs` — creates `.searchsocket/` state directories
+- Markdown mirror — writes `.searchsocket/mirror/` files
+- Local SQLite fallback — only needed when `TURSO_DATABASE_URL` is not set
+### Adapter Guidance
+| Platform | Adapter | Notes |
+|----------|---------|-------|
+| Vercel | `adapter-auto` (default) | Serverless — use `rawConfig` + remote Turso |
+| Netlify | `adapter-netlify` | Serverless — same as Vercel |
+| VPS / Docker | `adapter-node` | Long-lived process — no limitations, local SQLite works |
+## Embeddings: Jina AI
+SearchSocket uses **Jina AI's embedding models** to convert text into semantic vectors. A single `JINA_API_KEY` powers both embeddings and optional reranking.
 ### Default Model
-- **Model**: `text-embedding-3-small`
-- **Dimensions**: 1536
-- **Cost**: ~$0.00002 per 1K tokens (~4K chars)
+- **Model**: `jina-embeddings-v3`
+- **Dimensions**: 1024 (default)
+- **Cost**: ~$0.00002 per 1K tokens (generous 10M token free tier)
+- **Task adapters**: Uses `retrieval.passage` for indexing, `retrieval.query` for search queries (LoRA task-specific adapters for better retrieval quality)
 ### How It Works
 1. **Chunking**: Text is split into semantic chunks (default 2200 chars, 200 overlap)
 2. **Title Prepend**: Page title is prepended to each chunk for better context (`chunking.prependTitle`, default: true)
 3. **Summary Chunk**: A synthetic identity chunk is generated per page with title, URL, and first paragraph (`chunking.pageSummaryChunk`, default: true)
-4. **Embedding**: Each chunk is sent to OpenAI's embedding API
+4. **Embedding**: Each chunk is sent to Jina's embedding API with the `retrieval.passage` task adapter
 5. **Batching**: Requests batched (64 texts per request) for efficiency
 6. **Storage**: Vectors stored in Turso with metadata (URL, title, tags, depth, etc.)
@@ -369,17 +447,14 @@ estimated tokens: 32,400
 estimated cost (USD): $0.000648
 ```
-### Custom Model
+### Reranking
+Because embeddings and reranking share the same Jina API key, enabling reranking takes a single boolean:
-Override in config:
 ```ts
 export default {
-  embeddings: {
-    provider: "openai",
-    model: "text-embedding-3-large",  // 3072 dims, higher quality
-    apiKeyEnv: "OPENAI_API_KEY",
-    pricePer1kTokens: 0.00013
-  }
+  embeddings: { apiKeyEnv: "JINA_API_KEY" },
+  rerank: { enabled: true }
 };
 ```
@@ -400,6 +475,7 @@ Configure aggregation behavior:
 ```ts
 export default {
   ranking: {
+    minScore: 0,                // minimum absolute score to include in results (default: 0, disabled)
     aggregationCap: 5,          // max chunks contributing to page score (default: 5)
     aggregationDecay: 0.5,      // decay factor for additional chunks (default: 0.5)
     minChunkScoreRatio: 0.5,    // threshold for sub-chunks in results (default: 0.5)
@@ -420,6 +496,8 @@ export default {
 `pageWeights` supports exact URL matches and prefix matching. A weight of `1.15` on `"/docs"` boosts all pages under `/docs/` by 15%. Use gentle values (1.05-1.2x) since they compound with aggregation.
+`minScore` filters out low-relevance results before they reach the client. Set it to a value like `0.3` to remove noise. In page mode, pages scoring below the threshold are dropped; in chunk mode, individual chunks are filtered. The default is `0` (disabled).
 ### Chunk Mode
 Use `groupBy: "chunk"` for flat per-chunk results without page aggregation:
@@ -534,7 +612,7 @@ pnpm searchsocket status
 # Output:
 # project: my-site
 # resolved scope: main
-# embedding model: text-embedding-3-small
+# embedding model: jina-embeddings-v3
 # vector backend: turso/libsql (local (.searchsocket/vectors.db))
 # vector health: ok
 # last indexed (main): 2025-02-23T10:30:00Z
@@ -597,7 +675,7 @@ pnpm searchsocket doctor
 # Output:
 # PASS config parse
-# PASS env OPENAI_API_KEY
+# PASS env JINA_API_KEY
 # PASS turso/libsql (local file: .searchsocket/vectors.db)
 # PASS source: build manifest
 # PASS source: vite binary
@@ -699,8 +777,8 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
 ### Required
-**OpenAI:**
-- `OPENAI_API_KEY` — OpenAI API key for embeddings
+**Jina AI:**
+- `JINA_API_KEY` — Jina AI API key for embeddings and reranking
 ### Optional (Turso)
@@ -710,11 +788,6 @@ The CLI automatically loads `.env` from the working directory on startup. Existi
 If not set, uses local file DB at `.searchsocket/vectors.db`.
-### Optional (Rerank)
-**Jina:**
-- `JINA_API_KEY` — Jina reranker API key (if using `rerank: { provider: "jina" }`)
 ### Optional (Scope/Build)
 - `SEARCHSOCKET_SCOPE` — Override scope (when `scope.mode: "env"`)
@@ -750,7 +823,11 @@ export default {
       exclude: ["/api/*"],
       paramValues: {
         "/blog/[slug]": ["hello-world", "getting-started"]
-      }
+      },
+      discover: false,
+      seedUrls: ["/"],
+      maxPages: 200,
+      maxDepth: 5
     },
     // Crawl mode (alternative)
@@ -787,16 +864,19 @@ export default {
   },
   embeddings: {
-    provider: "openai",
-    model: "text-embedding-3-small",
-    apiKeyEnv: "OPENAI_API_KEY",
+    provider: "jina",
+    model: "jina-embeddings-v3",
+    apiKey: "jina_...",          // direct API key (or use apiKeyEnv)
+    apiKeyEnv: "JINA_API_KEY",
     batchSize: 64,
     concurrency: 4
   },
   vector: {
-    dimension: 1536,  // optional, inferred from first embedding
+    dimension: 1024,  // optional, inferred from first embedding
     turso: {
+      url: "libsql://my-db.turso.io",    // direct URL (or use urlEnv)
+      authToken: "eyJhbGc...",            // direct token (or use authTokenEnv)
       urlEnv: "TURSO_DATABASE_URL",
       authTokenEnv: "TURSO_AUTH_TOKEN",
       localPath: ".searchsocket/vectors.db"
@@ -804,12 +884,9 @@ export default {
   },
   rerank: {
-    provider: "jina",  // "none" | "jina"
+    enabled: true,
     topN: 20,
-    jina: {
-      apiKeyEnv: "JINA_API_KEY",
-      model: "jina-reranker-v2-base-multilingual"
-    }
+    model: "jina-reranker-v2-base-multilingual"
   },
   ranking: {
@@ -819,6 +896,7 @@ export default {
       "/": 1.1,
       "/docs": 1.15
     },
+    minScore: 0,
     aggregationCap: 5,
     aggregationDecay: 0.5,
     minChunkScoreRatio: 0.5,