ctxpkg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/LICENSE +661 -0
  2. package/README.md +282 -0
  3. package/bin/cli.js +8 -0
  4. package/bin/daemon.js +7 -0
  5. package/package.json +70 -0
  6. package/src/agent/AGENTS.md +249 -0
  7. package/src/agent/agent.prompts.ts +66 -0
  8. package/src/agent/agent.test-runner.schemas.ts +158 -0
  9. package/src/agent/agent.test-runner.ts +436 -0
  10. package/src/agent/agent.ts +371 -0
  11. package/src/agent/agent.types.ts +94 -0
  12. package/src/backend/AGENTS.md +112 -0
  13. package/src/backend/backend.protocol.ts +95 -0
  14. package/src/backend/backend.schemas.ts +123 -0
  15. package/src/backend/backend.services.ts +151 -0
  16. package/src/backend/backend.ts +111 -0
  17. package/src/backend/backend.types.ts +34 -0
  18. package/src/cli/AGENTS.md +213 -0
  19. package/src/cli/cli.agent.ts +197 -0
  20. package/src/cli/cli.chat.ts +369 -0
  21. package/src/cli/cli.client.ts +55 -0
  22. package/src/cli/cli.collections.ts +491 -0
  23. package/src/cli/cli.config.ts +252 -0
  24. package/src/cli/cli.daemon.ts +160 -0
  25. package/src/cli/cli.documents.ts +413 -0
  26. package/src/cli/cli.mcp.ts +177 -0
  27. package/src/cli/cli.ts +28 -0
  28. package/src/cli/cli.utils.ts +122 -0
  29. package/src/client/AGENTS.md +135 -0
  30. package/src/client/client.adapters.ts +279 -0
  31. package/src/client/client.ts +86 -0
  32. package/src/client/client.types.ts +17 -0
  33. package/src/collections/AGENTS.md +185 -0
  34. package/src/collections/collections.schemas.ts +195 -0
  35. package/src/collections/collections.ts +1160 -0
  36. package/src/config/config.ts +118 -0
  37. package/src/daemon/AGENTS.md +168 -0
  38. package/src/daemon/daemon.config.ts +23 -0
  39. package/src/daemon/daemon.manager.ts +215 -0
  40. package/src/daemon/daemon.schemas.ts +22 -0
  41. package/src/daemon/daemon.ts +205 -0
  42. package/src/database/AGENTS.md +211 -0
  43. package/src/database/database.ts +64 -0
  44. package/src/database/migrations/migrations.001-init.ts +56 -0
  45. package/src/database/migrations/migrations.002-fts5.ts +32 -0
  46. package/src/database/migrations/migrations.ts +20 -0
  47. package/src/database/migrations/migrations.types.ts +9 -0
  48. package/src/documents/AGENTS.md +301 -0
  49. package/src/documents/documents.schemas.ts +190 -0
  50. package/src/documents/documents.ts +734 -0
  51. package/src/embedder/embedder.ts +53 -0
  52. package/src/exports.ts +0 -0
  53. package/src/mcp/AGENTS.md +264 -0
  54. package/src/mcp/mcp.ts +105 -0
  55. package/src/tools/AGENTS.md +228 -0
  56. package/src/tools/agent/agent.ts +45 -0
  57. package/src/tools/documents/documents.ts +401 -0
  58. package/src/tools/tools.langchain.ts +37 -0
  59. package/src/tools/tools.mcp.ts +46 -0
  60. package/src/tools/tools.types.ts +35 -0
  61. package/src/utils/utils.services.ts +46 -0
@@ -0,0 +1,301 @@
1
+ # Documents — Agent Guidelines
2
+
3
+ This document describes the documents module architecture for AI agents working on this codebase.
4
+
5
+ ## Overview
6
+
7
+ The documents module handles document storage, chunking, embedding, and semantic search. It's the core indexing engine that makes context searchable. Documents are split into chunks, embedded as vectors, and stored in SQLite with sqlite-vec for vector similarity search. The module uses hybrid search combining vector similarity with FTS5 keyword matching for improved retrieval quality.
8
+
9
+ ## File Structure
10
+
11
+ | File | Purpose |
12
+ |------|---------|
13
+ | `documents.ts` | `DocumentsService` — document CRUD, chunking, embedding, hybrid search |
14
+ | `documents.schemas.ts` | Zod schemas for documents, search options, results |
15
+
16
+ ## Architecture
17
+
18
+ ```
19
+ ┌─────────────────────────────────────────────────────────────────┐
20
+ │ DocumentsService │
21
+ ├─────────────────────────────────────────────────────────────────┤
22
+ │ │
23
+ │ updateDocument() │
24
+ │ │ │
25
+ │ ▼ │
26
+ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
27
+ │ │ Hash Check │───▶│ Chunker │───▶│ Context │ │
28
+ │ │ (skip if │ │ (1000 char, │ │ Prepend │ │
29
+ │ │ unchanged) │ │ 200 overlap)│ │ (title, │ │
30
+ │ └─────────────┘ └─────────────┘ │ section) │ │
31
+ │ └──────┬──────┘ │
32
+ │ │ │
33
+ │ ▼ │
34
+ │ ┌─────────────┐ │
35
+ │ │ Embedder │ │
36
+ │ │ (document │ │
37
+ │ │ embedding) │ │
38
+ │ └──────┬──────┘ │
39
+ │ │ │
40
+ │ ▼ │
41
+ │ ┌──────────────────────────────────────────────────────────┐ │
42
+ │ │ Database │ │
43
+ │ │ ┌────────────────────┐ ┌────────────────────────────┐ │ │
44
+ │ │ │ reference_documents│ │ reference_document_chunks │ │ │
45
+ │ │ │ (collection, id, │ │ (collection, document, │ │ │
46
+ │ │ │ content, hash) │ │ content, embedding) │ │ │
47
+ │ │ └────────────────────┘ └────────────────────────────┘ │ │
48
+ │ │ ┌────────────────────────────┐ │ │
49
+ │ │ │reference_documentchunks_fts│ │ │
50
+ │ │ │ (FTS5 for keyword search) │ │ │
51
+ │ │ └────────────────────────────┘ │ │
52
+ │ └──────────────────────────────────────────────────────────┘ │
53
+ │ │
54
+ │ search() │
55
+ │ │ │
56
+ │ ▼ │
57
+ │ ┌─────────────┐ ┌───────────────────────────────────────┐ │
58
+ │ │ Embedder │───▶│ Hybrid Search │ │
59
+ │ │ (query │ │ ┌─────────────┐ ┌─────────────────┐ │ │
60
+ │ │ embedding │ │ │Vector Search│ │ FTS5 Keyword │ │ │
61
+ │ │ with │ │ │(cosine dist)│ │ Search │ │ │
62
+ │ │ instruction│ │ └──────┬──────┘ └────────┬────────┘ │ │
63
+ │ │ prefix) │ │ │ │ │ │
64
+ │ └─────────────┘ │ └────────┬─────────┘ │ │
65
+ │ │ ▼ │ │
66
+ │ │ ┌─────────────────┐ │ │
67
+ │ │ │ RRF Merge │ │ │
68
+ │ │ │ (reciprocal │ │ │
69
+ │ │ │ rank fusion) │ │ │
70
+ │ │ └────────┬────────┘ │ │
71
+ │ │ ▼ │ │
72
+ │ │ ┌─────────────────┐ │ │
73
+ │ │ │ Re-rank │ │ │
74
+ │ │ │ (optional) │ │ │
75
+ │ │ └─────────────────┘ │ │
76
+ │ └───────────────────────────────────────┘ │
77
+ └─────────────────────────────────────────────────────────────────┘
78
+ ```
79
+
80
+ ## Data Model
81
+
82
+ ### Documents
83
+
84
+ ```typescript
85
+ type ReferenceDocument = {
86
+ collection: string; // Collection ID (e.g., "pkg:https://example.com/manifest.json")
87
+ id: string; // Document ID within collection (e.g., "intro.md")
88
+ content: string; // Full document content
89
+ };
90
+ ```
91
+
92
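+ Each stored document record also carries a SHA-256 hash of its content, used for change detection. The hash is computed during insert/update and is not part of the `ReferenceDocument` type above.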
93
+
94
+ ### Chunks
95
+
96
+ Documents are split into chunks using LangChain's `RecursiveCharacterTextSplitter` with markdown awareness:
97
+
98
+ - **Chunk size**: 1000 characters (for better semantic context)
99
+ - **Chunk overlap**: 200 characters (to preserve context at boundaries)
100
+ - **Context prepending**: Each chunk is embedded with document title and section heading for improved retrieval
101
+
102
+ Each chunk record contains:
103
+
104
+ - `id`: UUID
105
+ - `collection`: Parent collection
106
+ - `document`: Parent document ID
107
+ - `content`: Original chunk text (without context prefix)
108
+ - `embedding`: Vector embedding (JSON-encoded float array, 1024 dimensions)
109
+
110
+ Chunks are also indexed in an FTS5 table for keyword search.
111
+
112
+ ### Search Options
113
+
114
+ ```typescript
115
+ type SearchChunksOptions = {
116
+ query: string; // Search query
117
+ collections?: string[]; // Filter by collection(s)
118
+ limit?: number; // Max results (default: 10)
119
+ maxDistance?: number; // Filter out poor matches (0-2 for cosine)
120
+ hybridSearch?: boolean; // Combine vector + keyword search (default: true)
121
+ rerank?: boolean; // Re-rank with secondary model (default: false)
122
+ };
123
+ ```
124
+
125
+ ### Search Results
126
+
127
+ ```typescript
128
+ type SearchChunkItem = {
129
+ id: string; // Chunk ID
130
+ document: string; // Document ID
131
+ collection: string; // Collection ID
132
+ content: string; // Chunk content
133
+ distance: number; // Cosine distance (lower = more similar)
134
+ score?: number; // Combined score after hybrid/rerank (higher = better)
135
+ };
136
+ ```
137
+
138
+ ## Document Lifecycle
139
+
140
+ ### Insert/Update
141
+
142
+ 1. Compute SHA-256 hash of content
143
+ 2. Check if document exists with same hash → skip if unchanged
144
+ 3. If updating, delete existing chunks (vector + FTS)
145
+ 4. Insert/update document record
146
+ 5. Split content into chunks (1000 chars, 200 overlap)
147
+ 6. Extract document title and section headings
148
+ 7. Prepend context to each chunk for embedding
149
+ 8. Generate embeddings using document embedding method
150
+ 9. Insert chunk records with embeddings
151
+ 10. Insert into FTS5 table for keyword search
152
+
153
+ ### Delete
154
+
155
+ Deletes the document record and all associated chunks from both vector and FTS tables (in a transaction).
156
+
157
+ ## Key Operations
158
+
159
+ ### `updateDocument(doc)`
160
+
161
+ Upserts a document, re-chunking and re-embedding only if content changed. Uses contextualized embeddings for better retrieval.
162
+
163
+ ### `search({ query, collections?, limit, maxDistance?, hybridSearch?, rerank? })`
164
+
165
+ 1. Embed the query with instruction prefix ("Represent this sentence for searching relevant passages: ")
166
+ 2. **Vector search**: Query chunks using `vec_distance_cosine()`
167
+ 3. **Keyword search** (if hybridSearch=true): Query FTS5 table
168
+ 4. **Merge results** using Reciprocal Rank Fusion (RRF)
169
+ 5. **Re-rank** (if rerank=true): Use secondary model for precision
170
+ 6. Filter by maxDistance threshold
171
+ 7. Return top N results sorted by score/distance
172
+
173
+ ### `getDocumentIds(collection)`
174
+
175
+ Returns all document IDs and hashes in a collection — used by sync logic to compute diffs.
176
+
177
+ ### `deleteDocuments(collection, ids)`
178
+
179
+ Batch delete multiple documents and their chunks from all tables.
180
+
181
+ ### `listDocuments({ collection, limit, offset })`
182
+
183
+ Lists the documents in a collection with pagination (`limit` defaults to 100, `offset` to 0). Returns each document's ID, title (extracted from the first `# heading`), and size.
184
+
185
+ ### `getOutline({ collection, document, maxDepth })`
186
+
187
+ Get the heading structure of a document. Parses markdown headings up to `maxDepth` (default: 3). Returns title and array of outline items with level, text, and line number.
188
+
189
+ ### `getSection({ collection, document, section, includeSubsections })`
190
+
191
+ Get a specific section of a document by heading. Matches section heading case-insensitively. Returns section content, heading level, and line range. If `includeSubsections` is true (default), includes nested subsections.
192
+
193
+ ### `searchBatch({ queries, limit, maxDistance, hybridSearch })`
194
+
195
+ Executes multiple search queries in a single call, which is useful for researching several concepts efficiently. Each batch must contain between 1 and 10 queries.
196
+
197
+ ### `findRelated({ collection, document, chunk, limit, sameDocument })`
198
+
199
+ Find content semantically related to a document or chunk:
200
+ - If `chunk` is provided: embed that text and find chunks similar to it
201
+ - If no `chunk` is provided: compute the centroid of the document's chunk embeddings and find chunks similar to it
202
+ - If `sameDocument` is false (default): exclude chunks that belong to the source document
203
+
204
+ ## Dependencies
205
+
206
+ - **DatabaseService**: SQLite with sqlite-vec extension
207
+ - **EmbedderService**: Generates vector embeddings with instruction-based methods:
208
+ - `createDocumentEmbeddings()` — for indexing documents
209
+ - `createQueryEmbedding()` — for search queries (with instruction prefix)
210
+
211
+ ## Search Quality Features
212
+
213
+ ### Instruction-Based Embeddings
214
+
215
+ Using different embedding strategies for documents and queries improves retrieval:
216
+
217
+ ```typescript
218
+ // Documents: embedded as-is (no instruction prefix is added by the embedder)
219
+ await embedder.createDocumentEmbeddings(chunks);
220
+
221
+ // Queries: embedded with instruction prefix
222
+ await embedder.createQueryEmbedding(query);
223
+ // → "Represent this sentence for searching relevant passages: {query}"
224
+ ```
225
+
226
+ ### Context Prepending
227
+
228
+ Each chunk is embedded with document context for better semantic understanding:
229
+
230
+ ```
231
+ Document: {title}
232
+ Section: {nearest heading}
233
+
234
+ {chunk content}
235
+ ```
236
+
237
+ The original content (without prefix) is stored for display.
238
+
239
+ ### Hybrid Search with RRF
240
+
241
+ Combines vector similarity and keyword matching:
242
+
243
+ ```typescript
244
+ // Vector results ranked by cosine distance
245
+ // Keyword results ranked by FTS5 BM25 score
246
+ // Merged using Reciprocal Rank Fusion:
247
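+ const rrfScore = 1 / (k + rank); // k=60; contribution from one ranked list — under RRF a chunk's fused score is the sum of this term over every list it appears in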
248
+ ```
249
+
250
+ This catches both semantic matches and exact keyword matches.
251
+
252
+ ### Re-ranking
253
+
254
+ Optional second-pass ranking using a lightweight model (`all-MiniLM-L6-v2`) for higher precision on top candidates.
255
+
256
+ ## Key Patterns
257
+
258
+ ### Change Detection
259
+
260
+ Content hashing avoids re-processing unchanged documents:
261
+
262
+ ```typescript
263
+ const hash = createHash('sha256').update(content).digest('hex');
264
+ if (current && current.hash === hash) {
265
+ return; // Skip - content unchanged
266
+ }
267
+ ```
268
+
269
+ ### Transaction Safety
270
+
271
+ Document and chunk operations use transactions to maintain consistency across all tables:
272
+
273
+ ```typescript
274
+ await database.transaction(async (trx) => {
275
+ await trx(tableNames.referenceDocumentChunks).delete()...
276
+ await trx(tableNames.referenceDocumentChunksFts).delete()...
277
+ await trx(tableNames.referenceDocuments).update()...
278
+ await trx(tableNames.referenceDocumentChunks).insert()...
279
+ await trx(tableNames.referenceDocumentChunksFts).insert()...
280
+ });
281
+ ```
282
+
283
+ ### Vector Search
284
+
285
+ sqlite-vec provides `vec_distance_cosine()` for cosine distance:
286
+
287
+ ```typescript
288
+ database.raw('vec_distance_cosine(?, embedding) as distance', [JSON.stringify(queryEmbedding)])
289
+ ```
290
+
291
+ Lower distance = more semantically similar (0 = identical, 2 = opposite).
292
+
293
+ ## Migration Notes
294
+
295
+ When upgrading from the previous implementation:
296
+
297
+ 1. Run database migrations to create the FTS5 table
298
+ 2. Re-index existing collections to benefit from:
299
+ - Larger chunks with overlap
300
+ - Context-prepended embeddings
301
+ - FTS5 keyword indexing
@@ -0,0 +1,190 @@
1
+ import { z } from 'zod';
2
+
3
+ const referenceDocumentSchema = z.object({ // Zod schema for one document: owning collection, document ID, and full text (all required strings)
4
+ collection: z.string(), // collection ID the document belongs to
5
+ id: z.string(), // document ID within that collection
6
+ content: z.string(), // full document content
7
+ });
8
+
9
+ type ReferenceDocument = z.infer<typeof referenceDocumentSchema>; // static type inferred from the schema above
10
+
11
+ const searchChunksOptionsSchema = z.object({ // options for a chunk search; only `query` is required, every other field may be omitted
12
+ query: z.string(), // search query text
13
+ collections: z.array(z.string()).optional(), // optional filter by collection ID(s)
14
+ limit: z.number().optional(), // max results; no default is set by this schema (AGENTS.md documents 10 — presumably applied by the consumer, confirm)
15
+ /**
16
+ * Maximum distance threshold for results (0-2 for cosine, lower is better).
17
+ * Results with distance greater than this will be filtered out.
18
+ */
19
+ maxDistance: z.number().optional(),
20
+ /**
21
+ * Enable hybrid search combining vector similarity with keyword matching.
22
+ * Uses Reciprocal Rank Fusion (RRF) to merge results.
23
+ * @default true
24
+ */
25
+ hybridSearch: z.boolean().optional(), // no .default() here: parsing leaves this undefined when omitted, so the documented default must be applied by the consumer
26
+ /**
27
+ * Enable re-ranking of results using cross-encoder model.
28
+ * Slower but more accurate. Only re-ranks top candidates.
29
+ * @default false
30
+ */
31
+ rerank: z.boolean().optional(), // no .default() here either. NOTE(review): AGENTS.md names `all-MiniLM-L6-v2` as the re-rank model — confirm that matches "cross-encoder" above
32
+ });
33
+
34
+ type SearchChunksOptions = z.infer<typeof searchChunksOptionsSchema>; // static type inferred from the schema above
35
+
36
+ const searchChunkItemSchema = z.object({ // one search hit: a chunk plus where it came from and how well it matched
37
+ id: z.string(), // chunk ID
38
+ document: z.string(), // ID of the document the chunk belongs to
39
+ collection: z.string(), // ID of the collection the chunk belongs to
40
+ content: z.string(), // chunk content
41
+ distance: z.number(), // cosine distance to the query (lower = more similar)
42
+ /** Combined score after hybrid search fusion (higher is better) */
43
+ score: z.number().optional(), // optional: may be absent from a result item
44
+ });
45
+
46
+ type SearchChunkItem = z.infer<typeof searchChunkItemSchema>; // static type inferred from the schema above
47
+
48
+ // === New schemas for MCP tools v2 ===
49
+
50
+ // List documents params and result
51
+ const listDocumentsParamsSchema = z.object({ // parameters for listing the documents of one collection with pagination
52
+ collection: z.string(), // collection ID to list
53
+ limit: z.number().optional().default(100), // page size; 100 when omitted. NOTE(review): z.number() does not restrict this to a positive integer
54
+ offset: z.number().optional().default(0), // pagination offset; 0 when omitted. NOTE(review): not restricted to a non-negative integer
55
+ });
56
+
57
+ type ListDocumentsParams = z.infer<typeof listDocumentsParamsSchema>; // static (parsed-output) type inferred from the schema above
58
+
59
+ const documentInfoSchema = z.object({ // summary entry for one document in a listing
60
+ id: z.string(), // document ID
61
+ title: z.string(), // document title (AGENTS.md: extracted from the first `# heading`)
62
+ size: z.number(), // document size — unit (characters vs bytes) is not visible here; TODO confirm
63
+ });
64
+
65
+ type DocumentInfo = z.infer<typeof documentInfoSchema>; // static type inferred from the schema above
66
+
67
+ const listDocumentsResultSchema = z.object({ // result of a paginated document listing
68
+ documents: z.array(documentInfoSchema), // the returned document summaries
69
+ total: z.number(), // presumably the total document count in the collection, not just this page — confirm
70
+ hasMore: z.boolean(), // presumably true when more documents exist beyond this page — confirm
71
+ });
72
+
73
+ type ListDocumentsResult = z.infer<typeof listDocumentsResultSchema>; // static type inferred from the schema above
74
+
75
+ // Get outline params and result
76
+ const getOutlineParamsSchema = z.object({ // parameters for fetching the heading outline of one document
77
+ collection: z.string(), // collection ID containing the document
78
+ document: z.string(), // document ID
79
+ maxDepth: z.number().optional().default(3), // deepest heading level to include; 3 when omitted
80
+ });
81
+
82
+ type GetOutlineParams = z.infer<typeof getOutlineParamsSchema>; // static (parsed-output) type inferred from the schema above
83
+
84
+ const outlineItemSchema = z.object({ // one heading entry in a document outline
85
+ level: z.number(), // heading level
86
+ text: z.string(), // heading text
87
+ line: z.number(), // line number of the heading (0- vs 1-based is not visible here; TODO confirm)
88
+ });
89
+
90
+ type OutlineItem = z.infer<typeof outlineItemSchema>; // static type inferred from the schema above
91
+
92
+ const outlineResultSchema = z.object({ // outline of one document: its title plus its headings
93
+ title: z.string(), // document title
94
+ outline: z.array(outlineItemSchema), // heading entries, each shaped by outlineItemSchema
95
+ });
96
+
97
+ type OutlineResult = z.infer<typeof outlineResultSchema>; // static type inferred from the schema above
98
+
99
+ // Get section params and result
100
+ const getSectionParamsSchema = z.object({ // parameters for fetching one section of a document by its heading
101
+ collection: z.string(), // collection ID containing the document
102
+ document: z.string(), // document ID
103
+ section: z.string(), // heading of the wanted section (AGENTS.md: matched case-insensitively)
104
+ includeSubsections: z.boolean().optional().default(true), // true when omitted; AGENTS.md: when true, nested subsections are included
105
+ });
106
+
107
+ type GetSectionParams = z.infer<typeof getSectionParamsSchema>; // static (parsed-output) type inferred from the schema above
108
+
109
+ const sectionResultSchema = z.object({ // a section extracted from a document: heading, level, content, and line range
110
+ section: z.string(), // presumably the matched heading text — confirm
111
+ level: z.number(), // heading level of the section
112
+ content: z.string(), // section content
113
+ startLine: z.number(), // first line of the section's line range (base/inclusivity not visible here)
114
+ endLine: z.number(), // last line of the section's line range (base/inclusivity not visible here)
115
+ });
116
+
117
+ type SectionResult = z.infer<typeof sectionResultSchema>; // static type inferred from the schema above
118
+
119
+ // Find related params
120
+ const findRelatedParamsSchema = z.object({ // parameters for finding content semantically related to a document or to a piece of text
121
+ collection: z.string(), // collection ID of the source document
122
+ document: z.string(), // source document ID
123
+ chunk: z.string().optional(), // optional text to match against; AGENTS.md: when omitted, the centroid of the document's chunk embeddings is used
124
+ limit: z.number().optional().default(5), // max results; 5 when omitted
125
+ sameDocument: z.boolean().optional().default(false), // false when omitted; AGENTS.md: when false, chunks from the source document are excluded
126
+ });
127
+
128
+ type FindRelatedParams = z.infer<typeof findRelatedParamsSchema>; // static (parsed-output) type inferred from the schema above
129
+
130
+ // Search batch params and result
131
+ const searchBatchQuerySchema = z.object({ // one query inside a batch search; no type alias is declared for it in this file
132
+ query: z.string(), // search query text
133
+ collections: z.array(z.string()).optional(), // optional per-query filter by collection ID(s)
134
+ });
135
+
136
+ const searchBatchParamsSchema = z.object({ // parameters for running several searches in one call; unlike searchChunksOptionsSchema there is no `rerank` field
137
+ queries: z.array(searchBatchQuerySchema).min(1).max(10), // between 1 and 10 queries per batch
138
+ limit: z.number().optional().default(5), // max results, 5 when omitted — presumably applied per query; confirm
139
+ maxDistance: z.number().optional(), // optional distance threshold; no default
140
+ hybridSearch: z.boolean().optional().default(true), // true when omitted (here the default IS applied by the schema)
141
+ });
142
+
143
+ type SearchBatchParams = z.infer<typeof searchBatchParamsSchema>; // static (parsed-output) type inferred from the schema above
144
+
145
+ const searchBatchResultItemSchema = z.object({ // results for one query of a batch; no type alias is declared for it in this file
146
+ query: z.string(), // the query text these results belong to
147
+ results: z.array(searchChunkItemSchema), // hits for that query, each shaped by searchChunkItemSchema
148
+ });
149
+
150
+ const searchBatchResultSchema = z.object({ // overall batch-search result
151
+ results: z.array(searchBatchResultItemSchema), // one entry per query, each shaped by searchBatchResultItemSchema
152
+ });
153
+
154
+ type SearchBatchResult = z.infer<typeof searchBatchResultSchema>; // static type inferred from the schema above
155
+
156
+ export type { // type-only exports: the z.infer aliases declared in this file
157
+ ReferenceDocument,
158
+ SearchChunksOptions,
159
+ SearchChunkItem,
160
+ ListDocumentsParams,
161
+ DocumentInfo,
162
+ ListDocumentsResult,
163
+ GetOutlineParams,
164
+ OutlineItem,
165
+ OutlineResult,
166
+ GetSectionParams,
167
+ SectionResult,
168
+ FindRelatedParams,
169
+ SearchBatchParams,
170
+ SearchBatchResult,
171
+ };
172
+
173
+ export { // runtime exports: every Zod schema object declared in this file
174
+ referenceDocumentSchema,
175
+ searchChunksOptionsSchema,
176
+ searchChunkItemSchema,
177
+ listDocumentsParamsSchema,
178
+ documentInfoSchema,
179
+ listDocumentsResultSchema,
180
+ getOutlineParamsSchema,
181
+ outlineItemSchema,
182
+ outlineResultSchema,
183
+ getSectionParamsSchema,
184
+ sectionResultSchema,
185
+ findRelatedParamsSchema,
186
+ searchBatchQuerySchema, // exported as a schema only (no matching type export)
187
+ searchBatchParamsSchema,
188
+ searchBatchResultItemSchema, // exported as a schema only (no matching type export)
189
+ searchBatchResultSchema,
190
+ };