npm - ctxpkg - Versions diffs - 0.0.1 - Mend

ctxpkg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

package/LICENSE +661 -0
package/README.md +282 -0
package/bin/cli.js +8 -0
package/bin/daemon.js +7 -0
package/package.json +70 -0
package/src/agent/AGENTS.md +249 -0
package/src/agent/agent.prompts.ts +66 -0
package/src/agent/agent.test-runner.schemas.ts +158 -0
package/src/agent/agent.test-runner.ts +436 -0
package/src/agent/agent.ts +371 -0
package/src/agent/agent.types.ts +94 -0
package/src/backend/AGENTS.md +112 -0
package/src/backend/backend.protocol.ts +95 -0
package/src/backend/backend.schemas.ts +123 -0
package/src/backend/backend.services.ts +151 -0
package/src/backend/backend.ts +111 -0
package/src/backend/backend.types.ts +34 -0
package/src/cli/AGENTS.md +213 -0
package/src/cli/cli.agent.ts +197 -0
package/src/cli/cli.chat.ts +369 -0
package/src/cli/cli.client.ts +55 -0
package/src/cli/cli.collections.ts +491 -0
package/src/cli/cli.config.ts +252 -0
package/src/cli/cli.daemon.ts +160 -0
package/src/cli/cli.documents.ts +413 -0
package/src/cli/cli.mcp.ts +177 -0
package/src/cli/cli.ts +28 -0
package/src/cli/cli.utils.ts +122 -0
package/src/client/AGENTS.md +135 -0
package/src/client/client.adapters.ts +279 -0
package/src/client/client.ts +86 -0
package/src/client/client.types.ts +17 -0
package/src/collections/AGENTS.md +185 -0
package/src/collections/collections.schemas.ts +195 -0
package/src/collections/collections.ts +1160 -0
package/src/config/config.ts +118 -0
package/src/daemon/AGENTS.md +168 -0
package/src/daemon/daemon.config.ts +23 -0
package/src/daemon/daemon.manager.ts +215 -0
package/src/daemon/daemon.schemas.ts +22 -0
package/src/daemon/daemon.ts +205 -0
package/src/database/AGENTS.md +211 -0
package/src/database/database.ts +64 -0
package/src/database/migrations/migrations.001-init.ts +56 -0
package/src/database/migrations/migrations.002-fts5.ts +32 -0
package/src/database/migrations/migrations.ts +20 -0
package/src/database/migrations/migrations.types.ts +9 -0
package/src/documents/AGENTS.md +301 -0
package/src/documents/documents.schemas.ts +190 -0
package/src/documents/documents.ts +734 -0
package/src/embedder/embedder.ts +53 -0
package/src/exports.ts +0 -0
package/src/mcp/AGENTS.md +264 -0
package/src/mcp/mcp.ts +105 -0
package/src/tools/AGENTS.md +228 -0
package/src/tools/agent/agent.ts +45 -0
package/src/tools/documents/documents.ts +401 -0
package/src/tools/tools.langchain.ts +37 -0
package/src/tools/tools.mcp.ts +46 -0
package/src/tools/tools.types.ts +35 -0
package/src/utils/utils.services.ts +46 -0

package/src/documents/AGENTS.md ADDED Viewed

@@ -0,0 +1,301 @@
+# Documents — Agent Guidelines
+This document describes the documents module architecture for AI agents working on this codebase.
+## Overview
+The documents module handles document storage, chunking, embedding, and semantic search. It's the core indexing engine that makes context searchable. Documents are split into chunks, embedded as vectors, and stored in SQLite with sqlite-vec for vector similarity search. The module uses hybrid search combining vector similarity with FTS5 keyword matching for improved retrieval quality.
+## File Structure
+| File | Purpose |
+|------|---------|
+| `documents.ts` | `DocumentsService` — document CRUD, chunking, embedding, hybrid search |
+| `documents.schemas.ts` | Zod schemas for documents, search options, results |
+## Architecture
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                   DocumentsService                              │
+├─────────────────────────────────────────────────────────────────┤
+│                                                                 │
+│  updateDocument()                                               │
+│       │                                                         │
+│       ▼                                                         │
+│  ┌─────────────┐    ┌─────────────┐    ┌─────────────┐          │
+│  │  Hash Check │───▶│   Chunker   │───▶│  Context    │          │
+│  │ (skip if    │    │ (1000 char, │    │  Prepend    │          │
+│  │  unchanged) │    │ 200 overlap)│    │  (title,    │          │
+│  └─────────────┘    └─────────────┘    │  section)   │          │
+│                                        └──────┬──────┘          │
+│                                               │                 │
+│                                               ▼                 │
+│                                        ┌─────────────┐          │
+│                                        │  Embedder   │          │
+│                                        │ (document   │          │
+│                                        │  embedding) │          │
+│                                        └──────┬──────┘          │
+│                                               │                 │
+│                                               ▼                 │
+│  ┌──────────────────────────────────────────────────────────┐   │
+│  │                    Database                              │   │
+│  │  ┌────────────────────┐  ┌────────────────────────────┐  │   │
+│  │  │ reference_documents│  │ reference_document_chunks  │  │   │
+│  │  │ (collection, id,   │  │ (collection, document,     │  │   │
+│  │  │  content, hash)    │  │  content, embedding)       │  │   │
+│  │  └────────────────────┘  └────────────────────────────┘  │   │
+│  │                          ┌────────────────────────────┐  │   │
+│  │                          │reference_documentchunks_fts│  │   │
+│  │                          │ (FTS5 for keyword search)  │  │   │
+│  │                          └────────────────────────────┘  │   │
+│  └──────────────────────────────────────────────────────────┘   │
+│                                                                 │
+│  search()                                                       │
+│       │                                                         │
+│       ▼                                                         │
+│  ┌─────────────┐    ┌───────────────────────────────────────┐   │
+│  │  Embedder   │───▶│  Hybrid Search                        │   │
+│  │ (query      │    │  ┌─────────────┐  ┌─────────────────┐ │   │
+│  │  embedding  │    │  │Vector Search│  │ FTS5 Keyword    │ │   │
+│  │  with       │    │  │(cosine dist)│  │ Search          │ │   │
+│  │  instruction│    │  └──────┬──────┘  └────────┬────────┘ │   │
+│  │  prefix)    │    │         │                  │          │   │
+│  └─────────────┘    │         └────────┬─────────┘          │   │
+│                     │                  ▼                    │   │
+│                     │         ┌─────────────────┐           │   │
+│                     │         │ RRF Merge       │           │   │
+│                     │         │ (reciprocal     │           │   │
+│                     │         │  rank fusion)   │           │   │
+│                     │         └────────┬────────┘           │   │
+│                     │                  ▼                    │   │
+│                     │         ┌─────────────────┐           │   │
+│                     │         │ Re-rank         │           │   │
+│                     │         │ (optional)      │           │   │
+│                     │         └─────────────────┘           │   │
+│                     └───────────────────────────────────────┘   │
+└─────────────────────────────────────────────────────────────────┘
+```
+## Data Model
+### Documents
+```typescript
+type ReferenceDocument = {
+  collection: string;  // Collection ID (e.g., "pkg:https://example.com/manifest.json")
+  id: string;          // Document ID within collection (e.g., "intro.md")
+  content: string;     // Full document content
+};
+```
+Documents are stored with a SHA-256 hash of content for change detection.
+### Chunks
+Documents are split into chunks using LangChain's `RecursiveCharacterTextSplitter` with markdown awareness:
+- **Chunk size**: 1000 characters (for better semantic context)
+- **Chunk overlap**: 200 characters (to preserve context at boundaries)
+- **Context prepending**: Each chunk is embedded with document title and section heading for improved retrieval
+Each chunk record contains:
+- `id`: UUID
+- `collection`: Parent collection
+- `document`: Parent document ID
+- `content`: Original chunk text (without context prefix)
+- `embedding`: Vector embedding (JSON-encoded float array, 1024 dimensions)
+Chunks are also indexed in an FTS5 table for keyword search.
+### Search Options
+```typescript
+type SearchChunksOptions = {
+  query: string;           // Search query
+  collections?: string[];  // Filter by collection(s)
+  limit?: number;          // Max results (default: 10)
+  maxDistance?: number;    // Filter out poor matches (0-2 for cosine)
+  hybridSearch?: boolean;  // Combine vector + keyword search (default: true)
+  rerank?: boolean;        // Re-rank with secondary model (default: false)
+};
+```
+### Search Results
+```typescript
+type SearchChunkItem = {
+  id: string;          // Chunk ID
+  document: string;    // Document ID
+  collection: string;  // Collection ID
+  content: string;     // Chunk content
+  distance: number;    // Cosine distance (lower = more similar)
+  score?: number;      // Combined score after hybrid/rerank (higher = better)
+};
+```
+## Document Lifecycle
+### Insert/Update
+1. Compute SHA-256 hash of content
+2. Check if document exists with same hash → skip if unchanged
+3. If updating, delete existing chunks (vector + FTS)
+4. Insert/update document record
+5. Split content into chunks (1000 chars, 200 overlap)
+6. Extract document title and section headings
+7. Prepend context to each chunk for embedding
+8. Generate embeddings using document embedding method
+9. Insert chunk records with embeddings
+10. Insert into FTS5 table for keyword search
+### Delete
+Deletes the document record and all associated chunks from both vector and FTS tables (in a transaction).
+## Key Operations
+### `updateDocument(doc)`
+Upserts a document, re-chunking and re-embedding only if content changed. Uses contextualized embeddings for better retrieval.
+### `search({ query, collections?, limit?, maxDistance?, hybridSearch?, rerank? })`
+1. Embed the query with instruction prefix ("Represent this sentence for searching relevant passages: ")
+2. **Vector search**: Query chunks using `vec_distance_cosine()`
+3. **Keyword search** (if hybridSearch=true): Query FTS5 table
+4. **Merge results** using Reciprocal Rank Fusion (RRF)
+5. **Re-rank** (if rerank=true): Use secondary model for precision
+6. Filter by maxDistance threshold
+7. Return top N results sorted by score/distance
+### `getDocumentIds(collection)`
+Returns all document IDs and hashes in a collection — used by sync logic to compute diffs.
+### `deleteDocuments(collection, ids)`
+Batch delete multiple documents and their chunks from all tables.
+### `listDocuments({ collection, limit, offset })`
+List all documents in a collection with pagination. Returns document IDs, titles (extracted from first `# heading`), and sizes.
+### `getOutline({ collection, document, maxDepth })`
+Get the heading structure of a document. Parses markdown headings up to `maxDepth` (default: 3). Returns title and array of outline items with level, text, and line number.
+### `getSection({ collection, document, section, includeSubsections })`
+Get a specific section of a document by heading. Matches section heading case-insensitively. Returns section content, heading level, and line range. If `includeSubsections` is true (default), includes nested subsections.
+### `searchBatch({ queries, limit, maxDistance, hybridSearch })`
+Execute multiple search queries in a single call. Useful for researching multiple concepts efficiently. Limited to 10 queries per batch.
+### `findRelated({ collection, document, chunk, limit, sameDocument })`
+Find content semantically related to a document or chunk:
+- If `chunk` is provided: embed that text and find similar
+- If no `chunk`: compute centroid of document's chunk embeddings
+- If `sameDocument` is false (default): exclude chunks from source document
+## Dependencies
+- **DatabaseService**: SQLite with sqlite-vec extension
+- **EmbedderService**: Generates vector embeddings with instruction-based methods:
+  - `createDocumentEmbeddings()` — for indexing documents
+  - `createQueryEmbedding()` — for search queries (with instruction prefix)
+## Search Quality Features
+### Instruction-Based Embeddings
+Using different embedding strategies for documents and queries improves retrieval:
+```typescript
+// Documents: embedded without an instruction prefix
+await embedder.createDocumentEmbeddings(chunks);
+// Queries: embedded with instruction prefix
+await embedder.createQueryEmbedding(query);
+// → "Represent this sentence for searching relevant passages: {query}"
+```
+### Context Prepending
+Each chunk is embedded with document context for better semantic understanding:
+```
+Document: {title}
+Section: {nearest heading}
+{chunk content}
+```
+The original content (without prefix) is stored for display.
+### Hybrid Search with RRF
+Combines vector similarity and keyword matching:
+```typescript
+// Vector results ranked by cosine distance
+// Keyword results ranked by FTS5 BM25 score
+// Merged using Reciprocal Rank Fusion:
+const rrfScore = 1 / (k + rank);  // k=60
+```
+This catches both semantic matches and exact keyword matches.
+### Re-ranking
+Optional second-pass ranking using a lightweight model (`all-MiniLM-L6-v2`) for higher precision on top candidates.
+## Key Patterns
+### Change Detection
+Content hashing avoids re-processing unchanged documents:
+```typescript
+const hash = createHash('sha256').update(content).digest('hex');
+if (current && current.hash === hash) {
+  return; // Skip - content unchanged
+}
+```
+### Transaction Safety
+Document and chunk operations use transactions to maintain consistency across all tables:
+```typescript
+await database.transaction(async (trx) => {
+  await trx(tableNames.referenceDocumentChunks).delete()...
+  await trx(tableNames.referenceDocumentChunksFts).delete()...
+  await trx(tableNames.referenceDocuments).update()...
+  await trx(tableNames.referenceDocumentChunks).insert()...
+  await trx(tableNames.referenceDocumentChunksFts).insert()...
+});
+```
+### Vector Search
+sqlite-vec provides `vec_distance_cosine()` for cosine distance:
+```typescript
+database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)])
+```
+Lower distance = more semantically similar (0 = identical, 2 = opposite).
+## Migration Notes
+When upgrading from the previous implementation:
+1. Run database migrations to create the FTS5 table
+2. Re-index existing collections to benefit from:
+   - Larger chunks with overlap
+   - Context-prepended embeddings
+   - FTS5 keyword indexing

package/src/documents/documents.schemas.ts ADDED Viewed

@@ -0,0 +1,190 @@
+import { z } from 'zod';
+const referenceDocumentSchema = z.object({  // Shape of a document stored in a collection
+  collection: z.string(),  // Collection ID the document belongs to
+  id: z.string(),  // Document ID within the collection
+  content: z.string(),  // Full document content
+});
+type ReferenceDocument = z.infer<typeof referenceDocumentSchema>;  // Static type inferred from referenceDocumentSchema
+const searchChunksOptionsSchema = z.object({  // Options accepted by a chunk search
+  query: z.string(),  // Search query text
+  collections: z.array(z.string()).optional(),  // Restrict the search to these collection IDs; omitted = no filter given
+  limit: z.number().optional(),  // Max results; no default is set here (AGENTS.md documents 10 — presumably applied by the service, confirm)
+  /**
+   * Maximum distance threshold for results (0-2 for cosine, lower is better).
+   * Results with distance greater than this will be filtered out.
+   */
+  maxDistance: z.number().optional(),
+  /**
+   * Enable hybrid search combining vector similarity with keyword matching.
+   * Uses Reciprocal Rank Fusion (RRF) to merge results.
+   * @default true
+   */
+  hybridSearch: z.boolean().optional(),  // NOTE: the documented default is not encoded in this schema
+  /**
+   * Enable re-ranking of results using cross-encoder model.
+   * Slower but more accurate. Only re-ranks top candidates.
+   * @default false
+   */
+  rerank: z.boolean().optional(),  // NOTE: the documented default is not encoded in this schema
+});
+type SearchChunksOptions = z.infer<typeof searchChunksOptionsSchema>;  // Static type inferred from searchChunksOptionsSchema
+const searchChunkItemSchema = z.object({  // A single chunk returned from a search
+  id: z.string(),  // Chunk ID
+  document: z.string(),  // ID of the document the chunk belongs to
+  collection: z.string(),  // ID of the collection the chunk belongs to
+  content: z.string(),  // Chunk content
+  distance: z.number(),  // Cosine distance (lower = more similar)
+  /** Combined score after hybrid search fusion (higher is better) */
+  score: z.number().optional(),
+});
+type SearchChunkItem = z.infer<typeof searchChunkItemSchema>;  // Static type inferred from searchChunkItemSchema
+// === New schemas for MCP tools v2 ===
+// List documents params and result
+const listDocumentsParamsSchema = z.object({  // Parameters for listing the documents of a collection with pagination
+  collection: z.string(),  // Collection ID to list
+  limit: z.number().optional().default(100),  // Page size; falls back to 100 when omitted
+  offset: z.number().optional().default(0),  // Pagination offset; falls back to 0 when omitted
+});
+type ListDocumentsParams = z.infer<typeof listDocumentsParamsSchema>;  // Output type (after defaults are applied)
+const documentInfoSchema = z.object({  // Summary entry describing one document in a listing
+  id: z.string(),  // Document ID
+  title: z.string(),  // Document title (per AGENTS.md, extracted from the first `# heading`)
+  size: z.number(),  // Document size; unit is not shown here — presumably content length, confirm in documents.ts
+});
+type DocumentInfo = z.infer<typeof documentInfoSchema>;  // Static type inferred from documentInfoSchema
+const listDocumentsResultSchema = z.object({  // Result of a paginated document listing
+  documents: z.array(documentInfoSchema),  // Entries for the requested page
+  total: z.number(),  // presumably the total number of documents in the collection — confirm in documents.ts
+  hasMore: z.boolean(),  // presumably true when further pages exist — confirm in documents.ts
+});
+type ListDocumentsResult = z.infer<typeof listDocumentsResultSchema>;  // Static type inferred from listDocumentsResultSchema
+// Get outline params and result
+const getOutlineParamsSchema = z.object({  // Parameters for fetching the heading outline of a document
+  collection: z.string(),  // Collection ID containing the document
+  document: z.string(),  // Document ID whose outline is requested
+  maxDepth: z.number().optional().default(3),  // Deepest heading level to include; falls back to 3 when omitted
+});
+type GetOutlineParams = z.infer<typeof getOutlineParamsSchema>;  // Output type (after defaults are applied)
+const outlineItemSchema = z.object({  // One heading entry in a document outline
+  level: z.number(),  // Heading level
+  text: z.string(),  // Heading text
+  line: z.number(),  // Line number of the heading (0- vs 1-based not shown here — confirm in documents.ts)
+});
+type OutlineItem = z.infer<typeof outlineItemSchema>;  // Static type inferred from outlineItemSchema
+const outlineResultSchema = z.object({  // Result of an outline request: document title plus its headings
+  title: z.string(),  // Document title
+  outline: z.array(outlineItemSchema),  // Heading entries, each with level, text and line
+});
+type OutlineResult = z.infer<typeof outlineResultSchema>;  // Static type inferred from outlineResultSchema
+// Get section params and result
+const getSectionParamsSchema = z.object({  // Parameters for fetching one section of a document by heading
+  collection: z.string(),  // Collection ID containing the document
+  document: z.string(),  // Document ID to read from
+  section: z.string(),  // Heading of the section to fetch (AGENTS.md: matched case-insensitively)
+  includeSubsections: z.boolean().optional().default(true),  // Include nested subsections; falls back to true when omitted
+});
+type GetSectionParams = z.infer<typeof getSectionParamsSchema>;  // Output type (after defaults are applied)
+const sectionResultSchema = z.object({  // Result of a section request
+  section: z.string(),  // Section heading
+  level: z.number(),  // Heading level of the section
+  content: z.string(),  // Section content
+  startLine: z.number(),  // First line of the section's line range
+  endLine: z.number(),  // Last line of the section's line range (inclusive/exclusive not shown here — confirm)
+});
+type SectionResult = z.infer<typeof sectionResultSchema>;  // Static type inferred from sectionResultSchema
+// Find related params
+const findRelatedParamsSchema = z.object({  // Parameters for finding content related to a document or chunk
+  collection: z.string(),  // Collection ID of the source document
+  document: z.string(),  // Source document ID
+  chunk: z.string().optional(),  // Optional text to compare against; AGENTS.md: when absent, the document's chunk-embedding centroid is used
+  limit: z.number().optional().default(5),  // Max results; falls back to 5 when omitted
+  sameDocument: z.boolean().optional().default(false),  // Falls back to false; AGENTS.md: false excludes chunks from the source document
+});
+type FindRelatedParams = z.infer<typeof findRelatedParamsSchema>;  // Output type (after defaults are applied)
+// Search batch params and result
+const searchBatchQuerySchema = z.object({  // One query entry inside a batch search request
+  query: z.string(),  // Search query text
+  collections: z.array(z.string()).optional(),  // Optional collection filter for this query
+});
+const searchBatchParamsSchema = z.object({  // Parameters for running several searches in one call
+  queries: z.array(searchBatchQuerySchema).min(1).max(10),  // Between 1 and 10 queries per batch
+  limit: z.number().optional().default(5),  // Max results, falls back to 5 when omitted (presumably per query — confirm in documents.ts)
+  maxDistance: z.number().optional(),  // Optional distance threshold; no default set here
+  hybridSearch: z.boolean().optional().default(true),  // Hybrid (vector + keyword) search; falls back to true when omitted
+});
+type SearchBatchParams = z.infer<typeof searchBatchParamsSchema>;  // Output type (after defaults are applied)
+const searchBatchResultItemSchema = z.object({  // Results for a single query of a batch search
+  query: z.string(),  // The query these results belong to
+  results: z.array(searchChunkItemSchema),  // Matching chunks for that query
+});
+const searchBatchResultSchema = z.object({  // Overall result of a batch search
+  results: z.array(searchBatchResultItemSchema),  // One entry per query, each carrying its own result list
+});
+type SearchBatchResult = z.infer<typeof searchBatchResultSchema>;  // Static type inferred from searchBatchResultSchema
+export type {  // Type-only exports: static types inferred from the zod schemas in this file
+  ReferenceDocument,
+  SearchChunksOptions,
+  SearchChunkItem,
+  ListDocumentsParams,
+  DocumentInfo,
+  ListDocumentsResult,
+  GetOutlineParams,
+  OutlineItem,
+  OutlineResult,
+  GetSectionParams,
+  SectionResult,
+  FindRelatedParams,
+  SearchBatchParams,
+  SearchBatchResult,
+};
+export {  // Runtime exports: the zod schema objects themselves
+  referenceDocumentSchema,
+  searchChunksOptionsSchema,
+  searchChunkItemSchema,
+  listDocumentsParamsSchema,
+  documentInfoSchema,
+  listDocumentsResultSchema,
+  getOutlineParamsSchema,
+  outlineItemSchema,
+  outlineResultSchema,
+  getSectionParamsSchema,
+  sectionResultSchema,
+  findRelatedParamsSchema,
+  searchBatchQuerySchema,  // exported without a matching inferred type alias
+  searchBatchParamsSchema,
+  searchBatchResultItemSchema,  // exported without a matching inferred type alias
+  searchBatchResultSchema,
+};