@grackle-ai/knowledge-core 0.70.0 → 0.70.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/README.md +995 -0
  2. package/package.json +1 -1
package/README.md ADDED
@@ -0,0 +1,995 @@
1
+ # @grackle-ai/knowledge-core
2
+
3
+ A domain-agnostic knowledge graph SDK built on [Neo4j](https://neo4j.com/). Provides pluggable text embedding, content chunking, an ingestion pipeline, node/edge CRUD, semantic vector search, and multi-hop graph traversal -- everything needed to build a knowledge graph with semantic search capabilities.
4
+
5
+ Part of the [Grackle](https://github.com/nick-pape/grackle) platform, where it powers the shared knowledge layer that lets AI agents build and query a persistent understanding of codebases, decisions, and context. However, `@grackle-ai/knowledge-core` is a standalone library with no dependency on the rest of Grackle -- you can use it in any Node.js application that needs a knowledge graph with vector search.
6
+
7
+ ## Table of Contents
8
+
9
+ - [Installation](#installation)
10
+ - [Prerequisites](#prerequisites)
11
+ - [Quick Start](#quick-start)
12
+ - [Architecture Overview](#architecture-overview)
13
+ - [Key Concepts](#key-concepts)
14
+ - [Node Kinds](#node-kinds)
15
+ - [Edge Types](#edge-types)
16
+ - [Workspace Scoping](#workspace-scoping)
17
+ - [API Reference](#api-reference)
18
+ - [Connection Management](#connection-management)
19
+ - [Schema Initialization](#schema-initialization)
20
+ - [Embedders](#embedders)
21
+ - [Chunkers](#chunkers)
22
+ - [Ingestion Pipeline](#ingestion-pipeline)
23
+ - [Node Operations](#node-operations)
24
+ - [Edge Operations](#edge-operations)
25
+ - [Semantic Search](#semantic-search)
26
+ - [Graph Expansion](#graph-expansion)
27
+ - [Listing Recent Nodes](#listing-recent-nodes)
28
+ - [Type Guards](#type-guards)
29
+ - [Logging](#logging)
30
+ - [Configuration](#configuration)
31
+ - [Environment Variables](#environment-variables)
32
+ - [Constants](#constants)
33
+ - [Custom Embedders and Chunkers](#custom-embedders-and-chunkers)
34
+ - [License](#license)
35
+
36
+ ## Installation
37
+
38
+ ```bash
39
+ npm install @grackle-ai/knowledge-core
40
+ ```
41
+
42
+ Requires **Node.js >= 22**.
43
+
44
+ ## Prerequisites
45
+
46
+ ### Neo4j Database
47
+
48
+ This library requires a running [Neo4j](https://neo4j.com/) instance (version 5.x) with vector index support. The simplest way to run one locally is with Docker:
49
+
50
+ ```bash
51
+ docker run -d \
52
+ --name grackle-neo4j \
53
+ -p 7474:7474 \
54
+ -p 7687:7687 \
55
+ -e NEO4J_AUTH=neo4j/grackle-dev \
56
+ -v neo4j-data:/data \
57
+ neo4j:5.25.1-community
58
+ ```
59
+
60
+ This starts Neo4j with:
61
+
62
+ | Port | Protocol | Purpose |
63
+ |------|----------|---------|
64
+ | 7474 | HTTP | Neo4j Browser UI (visit `http://localhost:7474`) |
65
+ | 7687 | Bolt | Driver protocol (what this library connects to) |
66
+
67
+ The default credentials used by the library in development are `neo4j` / `grackle-dev`. In production, you **must** set the password via the `GRACKLE_NEO4J_PASSWORD` environment variable -- the library refuses to use the default password when `NODE_ENV=production`.
68
+
69
+ #### Docker Compose
70
+
71
+ If you are using Docker Compose, here is a service definition:
72
+
73
+ ```yaml
74
+ services:
75
+ neo4j:
76
+ image: neo4j:5.25.1-community
77
+ ports:
78
+ - "7474:7474"
79
+ - "7687:7687"
80
+ environment:
81
+ - NEO4J_AUTH=neo4j/grackle-dev
82
+ volumes:
83
+ - neo4j-data:/data
84
+ healthcheck:
85
+ test: ["CMD", "neo4j", "status"]
86
+ interval: 10s
87
+ timeout: 5s
88
+ retries: 5
89
+ start_period: 30s
90
+
91
+ volumes:
92
+ neo4j-data:
93
+ ```
94
+
95
+ #### Verifying Neo4j is Ready
96
+
97
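+ After starting Neo4j, wait for it to become healthy before connecting. In Docker Compose, the health check above reports when the container is ready; dependent services can wait for it with `depends_on` and `condition: service_healthy`. For standalone Docker, you can poll: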
98
+
99
+ ```bash
100
+ # Wait until Neo4j responds to a status check
101
+ until docker exec grackle-neo4j neo4j status 2>/dev/null; do sleep 2; done
102
+ ```
103
+
104
+ ## Quick Start
105
+
106
+ ```typescript
107
+ import {
108
+ openNeo4j,
109
+ closeNeo4j,
110
+ initSchema,
111
+ createLocalEmbedder,
112
+ createPassThroughChunker,
113
+ ingest,
114
+ createNativeNode,
115
+ knowledgeSearch,
116
+ NATIVE_CATEGORY,
117
+ } from "@grackle-ai/knowledge-core";
118
+
119
+ // 1. Connect to Neo4j
120
+ await openNeo4j({ url: "bolt://127.0.0.1:7687" });
121
+
122
+ // 2. Initialize schema (idempotent -- safe to call on every startup)
123
+ await initSchema();
124
+
125
+ // 3. Create an embedder and chunker
126
+ const embedder = createLocalEmbedder();
127
+ const chunker = createPassThroughChunker();
128
+
129
+ // 4. Ingest some content (chunk + embed in one step)
130
+ const [embedded] = await ingest(
131
+ "WebSocket reconnection uses exponential backoff with jitter",
132
+ chunker,
133
+ embedder,
134
+ );
135
+
136
+ // 5. Store the node in the graph
137
+ const nodeId = await createNativeNode({
138
+ category: NATIVE_CATEGORY.INSIGHT,
139
+ title: "WebSocket reconnection strategy",
140
+ content: "WebSocket reconnection uses exponential backoff with jitter",
141
+ tags: ["websocket", "networking"],
142
+ embedding: embedded.vector,
143
+ workspaceId: "",
144
+ });
145
+
146
+ // 6. Search the graph semantically
147
+ const results = await knowledgeSearch("how does reconnection work?", embedder, {
148
+ limit: 5,
149
+ minScore: 0.5,
150
+ });
151
+
152
+ for (const result of results) {
153
+ console.log(`${result.score.toFixed(3)} - ${result.node.id}`);
154
+ }
155
+
156
+ // 7. Clean up
157
+ await closeNeo4j();
158
+ ```
159
+
160
+ ## Architecture Overview
161
+
162
+ ```mermaid
163
+ graph TD
164
+ subgraph "Ingestion"
165
+ Content["Raw Content"] --> Chunker["Chunker"]
166
+ Chunker --> Chunks["Chunks"]
167
+ Chunks --> Embedder["Embedder"]
168
+ Embedder --> EmbeddedChunks["Embedded Chunks"]
169
+ end
170
+
171
+ subgraph "Storage (Neo4j)"
172
+ EmbeddedChunks --> NodeStore["Node Store"]
173
+ NodeStore --> KG["Knowledge Graph"]
174
+ EdgeStore["Edge Store"] --> KG
175
+ end
176
+
177
+ subgraph "Retrieval"
178
+ Query["Search Query"] --> QueryEmbed["Embedder"]
179
+ QueryEmbed --> VectorSearch["Vector Search"]
180
+ VectorSearch --> KG
181
+ KG --> Results["Search Results"]
182
+ Results --> Expand["Graph Expansion"]
183
+ Expand --> KG
184
+ end
185
+ ```
186
+
187
+ The library is organized into three layers:
188
+
189
+ 1. **Ingestion** -- Content is split into chunks by a `Chunker`, then each chunk is converted to a vector embedding by an `Embedder`. The `ingest()` pipeline composes these two steps.
190
+
191
+ 2. **Storage** -- Nodes and edges are persisted in Neo4j. Nodes carry their embedding vectors, enabling vector similarity search via Neo4j's built-in vector index. Two node kinds exist: **reference nodes** (pointers to external entities) and **native nodes** (self-contained content).
192
+
193
+ 3. **Retrieval** -- Semantic search embeds a query string and runs k-NN vector search against the Neo4j vector index. Results include similarity scores and immediate edges. Graph expansion traverses relationships from result nodes to discover connected context within N hops.
194
+
195
+ ## Key Concepts
196
+
197
+ ### Node Kinds
198
+
199
+ Every node in the knowledge graph is one of two kinds, forming a discriminated union:
200
+
201
+ #### Reference Nodes
202
+
203
+ A **reference node** points to an entity that lives elsewhere (e.g., a task in a relational database, a session log, a finding). It does not duplicate content -- it stores a `sourceType` and `sourceId` that identify the external entity, plus a `label` for display and an `embedding` for search.
204
+
205
+ ```typescript
206
+ import { NODE_KIND, REFERENCE_SOURCE } from "@grackle-ai/knowledge-core";
207
+
208
+ // Built-in reference source types (you can use any string):
209
+ REFERENCE_SOURCE.TASK // "task"
210
+ REFERENCE_SOURCE.SESSION // "session"
211
+ REFERENCE_SOURCE.FINDING // "finding"
212
+ REFERENCE_SOURCE.WORKSPACE // "workspace"
213
+
214
+ // Custom source types are fully supported:
215
+ const sourceType: string = "ado-work-item"; // any string works
216
+ ```
217
+
218
+ **`ReferenceNode` properties:**
219
+
220
+ | Property | Type | Description |
221
+ |----------|------|-------------|
222
+ | `id` | `string` | UUID, auto-generated |
223
+ | `kind` | `"reference"` | Discriminator |
224
+ | `sourceType` | `string` | Entity type (e.g., `"task"`, `"finding"`, or any custom string) |
225
+ | `sourceId` | `string` | ID of the entity in the external system |
226
+ | `label` | `string` | Human-readable label (e.g., task title) |
227
+ | `embedding` | `number[]` | Dense vector for similarity search |
228
+ | `workspaceId` | `string` | Workspace scope (empty string = global) |
229
+ | `createdAt` | `string` | ISO 8601 timestamp, auto-generated |
230
+ | `updatedAt` | `string` | ISO 8601 timestamp, auto-maintained |
231
+
232
+ #### Native Nodes
233
+
234
+ A **native node** owns its content directly. It exists only in the knowledge graph and is suitable for insights, decisions, concepts, code snippets, or any freeform knowledge.
235
+
236
+ ```typescript
237
+ import { NATIVE_CATEGORY } from "@grackle-ai/knowledge-core";
238
+
239
+ // Built-in native categories (you can use any string):
240
+ NATIVE_CATEGORY.DECISION // "decision"
241
+ NATIVE_CATEGORY.INSIGHT // "insight"
242
+ NATIVE_CATEGORY.CONCEPT // "concept"
243
+ NATIVE_CATEGORY.SNIPPET // "snippet"
244
+
245
+ // Custom categories are fully supported:
246
+ const category: string = "research-note";
247
+ ```
248
+
249
+ **`NativeNode` properties:**
250
+
251
+ | Property | Type | Description |
252
+ |----------|------|-------------|
253
+ | `id` | `string` | UUID, auto-generated |
254
+ | `kind` | `"native"` | Discriminator |
255
+ | `category` | `string` | Subcategory (e.g., `"decision"`, `"insight"`, or any custom string) |
256
+ | `title` | `string` | Title or summary |
257
+ | `content` | `string` | Full content owned by this node |
258
+ | `tags` | `string[]` | Free-form tags for categorization |
259
+ | `embedding` | `number[]` | Dense vector for similarity search |
260
+ | `workspaceId` | `string` | Workspace scope (empty string = global) |
261
+ | `createdAt` | `string` | ISO 8601 timestamp, auto-generated |
262
+ | `updatedAt` | `string` | ISO 8601 timestamp, auto-maintained |
263
+
264
+ ### Edge Types
265
+
266
+ Edges are directional typed relationships between nodes. Five relationship types are supported:
267
+
268
+ | Edge Type | Value | Semantics |
269
+ |-----------|-------|-----------|
270
+ | `EDGE_TYPE.RELATES_TO` | `"RELATES_TO"` | General association |
271
+ | `EDGE_TYPE.DEPENDS_ON` | `"DEPENDS_ON"` | Dependency relationship |
272
+ | `EDGE_TYPE.DERIVED_FROM` | `"DERIVED_FROM"` | One node was derived from another |
273
+ | `EDGE_TYPE.MENTIONS` | `"MENTIONS"` | One node references another |
274
+ | `EDGE_TYPE.PART_OF` | `"PART_OF"` | Containment / composition |
275
+
276
+ Edges can carry optional JSON metadata (e.g., a confidence score or a context snippet):
277
+
278
+ ```typescript
279
+ import { createEdge, EDGE_TYPE } from "@grackle-ai/knowledge-core";
280
+
281
+ await createEdge(nodeA, nodeB, EDGE_TYPE.DEPENDS_ON, {
282
+ confidence: 0.92,
283
+ reason: "Identified in code review",
284
+ });
285
+ ```
286
+
287
+ **`KnowledgeEdge` properties:**
288
+
289
+ | Property | Type | Description |
290
+ |----------|------|-------------|
291
+ | `fromId` | `string` | Source node ID |
292
+ | `toId` | `string` | Target node ID |
293
+ | `type` | `EdgeType` | Relationship type |
294
+ | `metadata` | `Record<string, unknown> \| undefined` | Optional JSON metadata |
295
+ | `createdAt` | `string` | ISO 8601 timestamp |
296
+
297
+ ### Workspace Scoping
298
+
299
+ Every node has a `workspaceId` field. Setting it to a non-empty string scopes the node to that workspace. Setting it to an empty string (`""`) makes the node global. Search and listing operations accept an optional `workspaceId` filter to restrict results to a specific workspace.
300
+
301
+ ## API Reference
302
+
303
+ ### Connection Management
304
+
305
+ The library manages a **singleton Neo4j driver**. Call `openNeo4j()` once at application startup and `closeNeo4j()` on shutdown.
306
+
307
+ #### `openNeo4j(config?: Neo4jClientConfig): Promise<void>`
308
+
309
+ Open a connection to Neo4j. Reads configuration from environment variables first, then from the config parameter, then from built-in defaults. Idempotent -- returns silently if already connected.
310
+
311
+ ```typescript
312
+ interface Neo4jClientConfig {
313
+ /** Bolt URL (default: "bolt://127.0.0.1:7687") */
314
+ url?: string;
315
+ /** Username (default: "neo4j") */
316
+ username?: string;
317
+ /** Password (default: "grackle-dev"; REQUIRED in production) */
318
+ password?: string;
319
+ /** Database name (default: "neo4j") */
320
+ database?: string;
321
+ }
322
+ ```
323
+
324
+ Resolution order for each field: **environment variable > config parameter > default constant**.
325
+
326
+ | Field | Environment Variable | Default |
327
+ |-------|---------------------|---------|
328
+ | `url` | `GRACKLE_NEO4J_URL` | `bolt://127.0.0.1:7687` |
329
+ | `username` | `GRACKLE_NEO4J_USER` | `neo4j` |
330
+ | `password` | `GRACKLE_NEO4J_PASSWORD` | `grackle-dev` (dev only) |
331
+ | `database` | `GRACKLE_NEO4J_DATABASE` | `neo4j` |
332
+
333
+ **Production safety:** When `NODE_ENV=production`, the library throws if no explicit password is provided (via env var or config). The default development password is never used in production.
334
+
335
+ ```typescript
336
+ // Minimal (uses all defaults):
337
+ await openNeo4j();
338
+
339
+ // Explicit config:
340
+ await openNeo4j({
341
+ url: "bolt://my-neo4j-host:7687",
342
+ username: "admin",
343
+ password: "secure-password",
344
+ database: "knowledge",
345
+ });
346
+ ```
347
+
348
+ #### `closeNeo4j(): Promise<void>`
349
+
350
+ Gracefully close the Neo4j connection and release resources. Safe to call multiple times or when no connection is open. Waits for any in-flight initialization to settle before closing.
351
+
352
+ ```typescript
353
+ await closeNeo4j();
354
+ ```
355
+
356
+ #### `healthCheck(): Promise<boolean>`
357
+
358
+ Check Neo4j connectivity. Returns `true` if the connection is healthy, `false` otherwise (including when not initialized).
359
+
360
+ ```typescript
361
+ const isHealthy = await healthCheck();
362
+ ```
363
+
364
+ #### `getSession(): Session`
365
+
366
+ Get a Neo4j session for running queries. Sessions are lightweight and should be short-lived -- open one per logical unit of work and close it when done. Throws if `openNeo4j()` has not been called.
367
+
368
+ ```typescript
369
+ const session = getSession();
370
+ try {
371
+ const result = await session.run("MATCH (n) RETURN count(n) AS count");
372
+ console.log(result.records[0].get("count"));
373
+ } finally {
374
+ await session.close();
375
+ }
376
+ ```
377
+
378
+ #### `getDriver(): Driver`
379
+
380
+ Get the raw Neo4j `Driver` instance. Prefer `getSession()` for most use cases. Use the driver directly only for `driver.executeQuery()` one-shot queries. Throws if `openNeo4j()` has not been called.
381
+
382
+ ### Schema Initialization
383
+
384
+ #### `initSchema(): Promise<void>`
385
+
386
+ Initialize the Neo4j schema: constraints, property indexes, and the vector index. All statements use `IF NOT EXISTS` and are idempotent -- safe to call on every application startup. Must be called after `openNeo4j()`.
387
+
388
+ The following schema objects are created:
389
+
390
+ | Statement | Purpose |
391
+ |-----------|---------|
392
+ | `UNIQUE_NODE_ID` | Uniqueness constraint on `KnowledgeNode.id` |
393
+ | `INDEX_KIND` | Index on `kind` for efficient filtering by node type |
394
+ | `INDEX_WORKSPACE` | Index on `workspaceId` for scoped queries |
395
+ | `INDEX_SOURCE` | Composite index on `(sourceType, sourceId)` for reference lookups |
396
+ | `VECTOR_INDEX` | Vector index on `embedding` for cosine similarity search (1536 dimensions) |
397
+
398
+ ```typescript
399
+ await openNeo4j();
400
+ await initSchema();
401
+ ```
402
+
403
+ #### `SCHEMA_STATEMENTS: Record<string, string>`
404
+
405
+ The raw Cypher statements used by `initSchema()`, exported for inspection and testing.
406
+
407
+ ### Embedders
408
+
409
+ An embedder converts text into dense vector embeddings for similarity search. The library ships with a local ONNX-based embedder and defines a pluggable `Embedder` interface so you can bring your own (e.g., OpenAI, Cohere).
410
+
411
+ #### `Embedder` Interface
412
+
413
+ ```typescript
414
+ interface EmbeddingResult {
415
+ /** The input text that was embedded. */
416
+ text: string;
417
+ /** The embedding vector. */
418
+ vector: number[];
419
+ }
420
+
421
+ interface Embedder {
422
+ /** Embed a single text string. */
423
+ embed(text: string): Promise<EmbeddingResult>;
424
+ /** Embed multiple texts in batch. */
425
+ embedBatch(texts: string[]): Promise<EmbeddingResult[]>;
426
+ /** Dimensionality of vectors produced by this embedder. */
427
+ readonly dimensions: number;
428
+ }
429
+ ```
430
+
431
+ #### `createLocalEmbedder(options?: EmbedderOptions): Embedder`
432
+
433
+ Create a local embedder that runs ONNX inference via `@huggingface/transformers`. Downloads and caches a HuggingFace model on first use, then runs inference locally on CPU -- **no API keys or external inference services required** (network access is needed only for the initial model download).
434
+
435
+ The underlying pipeline is lazily initialized on the first call to `embed()` or `embedBatch()`, so construction is non-blocking.
436
+
437
+ ```typescript
438
+ interface EmbedderOptions {
439
+ /** HuggingFace model ID (default: "Xenova/all-MiniLM-L6-v2") */
440
+ modelId?: string;
441
+ /** Expected embedding dimensions (default: 384 for the default model) */
442
+ dimensions?: number;
443
+ }
444
+ ```
445
+
446
+ **Default model:** [`Xenova/all-MiniLM-L6-v2`](https://huggingface.co/Xenova/all-MiniLM-L6-v2) -- a small, fast, general-purpose English sentence embedding model that produces 384-dimensional vectors.
447
+
448
+ ```typescript
449
+ // Use the default model:
450
+ const embedder = createLocalEmbedder();
451
+ console.log(embedder.dimensions); // 384
452
+
453
+ // Use a custom model:
454
+ const customEmbedder = createLocalEmbedder({
455
+ modelId: "Xenova/bge-small-en-v1.5",
456
+ dimensions: 384,
457
+ });
458
+ ```
459
+
460
+ **Important:** The default local embedder produces 384-dimensional vectors, while the default Neo4j vector index schema created by `initSchema()` is configured for 1536 dimensions (see the `EMBEDDING_DIMENSIONS` constant in the library source). Because that constant is baked into the schema definition, downstream applications cannot realistically change it at runtime. Note that the Quick Start example above combines `initSchema()` with the default local embedder, so this mismatch applies to it as well. If you use the local embedder, you should either:
461
+ - Use an embedding model that produces 1536-dimensional vectors so it matches the default schema created by `initSchema()`, or
462
+ - Skip `initSchema()` and create a custom Neo4j vector index whose dimensions match your embedder (for example, 384 for the default local model).
463
+
464
+ ### Chunkers
465
+
466
+ A chunker splits content into smaller pieces suitable for embedding. The library ships with two chunkers and defines a pluggable `Chunker` interface.
467
+
468
+ #### `Chunker` Interface
469
+
470
+ ```typescript
471
+ interface Chunk {
472
+ /** The text content of this chunk. */
473
+ text: string;
474
+ /** Zero-based index within the source content. */
475
+ index: number;
476
+ /** Optional metadata carried through from the source. */
477
+ metadata?: Record<string, unknown>;
478
+ }
479
+
480
+ interface Chunker {
481
+ /** Split content into one or more chunks. */
482
+ chunk(content: string, metadata?: Record<string, unknown>): Chunk[];
483
+ }
484
+ ```
485
+
486
+ #### `createPassThroughChunker(): Chunker`
487
+
488
+ Create a chunker that returns the entire input as a single chunk unchanged. Suitable for short content (findings, decisions, insights) that does not need splitting.
489
+
490
+ ```typescript
491
+ const chunker = createPassThroughChunker();
492
+ const chunks = chunker.chunk("A short insight about architecture.");
493
+ // => [{ text: "A short insight about architecture.", index: 0 }]
494
+ ```
495
+
496
+ #### `createTranscriptChunker(options?: TranscriptChunkerOptions): Chunker`
497
+
498
+ Create a chunker that splits JSONL session transcripts by conversation turn. Designed for Grackle's session `stream.jsonl` logs, but usable with any JSONL content that follows the log entry format.
499
+
500
+ A "turn" starts at a `user_input` event and includes all subsequent events until the next `user_input`. Events before the first `user_input` are grouped into turn 0. Turns exceeding `maxChunkSize` are split into sub-chunks at line boundaries.
501
+
502
+ ```typescript
503
+ interface TranscriptChunkerOptions {
504
+ /** Max characters per chunk before splitting (default: 4000). */
505
+ maxChunkSize?: number;
506
+ /** Event types to skip (default: ["status", "signal", "usage", "system"]). */
507
+ skipEventTypes?: string[];
508
+ }
509
+ ```
510
+
511
+ Each chunk's `metadata` includes:
512
+
513
+ | Field | Type | Description |
514
+ |-------|------|-------------|
515
+ | `turnIndex` | `number` | Zero-based turn number |
516
+ | `timestampStart` | `string` | ISO 8601 timestamp of the first event in the turn |
517
+ | `timestampEnd` | `string` | ISO 8601 timestamp of the last event in the turn |
518
+ | `eventTypes` | `string[]` | Distinct event types present in the turn |
519
+
520
+ Events are rendered with human-readable labels: `user_input` becomes `"User:"`, `text` becomes `"Assistant:"`, `tool_use` becomes `"Tool:"`, `tool_result` becomes `"Result:"`, `error` becomes `"Error:"`, `finding` becomes `"Finding:"`, and `subtask_create` becomes `"Subtask:"`.
521
+
522
+ ```typescript
523
+ const chunker = createTranscriptChunker({ maxChunkSize: 2000 });
524
+
525
+ const jsonl = [
526
+ '{"session_id":"s1","type":"user_input","timestamp":"2026-01-01T10:00:00Z","content":"Fix the login bug"}',
527
+ '{"session_id":"s1","type":"text","timestamp":"2026-01-01T10:00:05Z","content":"I found the issue in auth.ts..."}',
528
+ '{"session_id":"s1","type":"user_input","timestamp":"2026-01-01T10:01:00Z","content":"Now write tests"}',
529
+ '{"session_id":"s1","type":"text","timestamp":"2026-01-01T10:01:10Z","content":"Here are the test cases..."}',
530
+ ].join("\n");
531
+
532
+ const chunks = chunker.chunk(jsonl, { sessionId: "s1" });
533
+ // => 2 chunks, one per conversation turn
534
+ ```
535
+
536
+ ### Ingestion Pipeline
537
+
538
+ #### `ingest(content, chunker, embedder, metadata?): Promise<EmbeddedChunk[]>`
539
+
540
+ Chunk content and embed each chunk in a single pipeline. This is the primary entry point for getting content into a form ready for storage.
541
+
542
+ ```typescript
543
+ interface EmbeddedChunk extends Chunk {
544
+ /** The embedding vector for this chunk's text. */
545
+ vector: number[];
546
+ }
547
+ ```
548
+
549
+ | Parameter | Type | Description |
550
+ |-----------|------|-------------|
551
+ | `content` | `string` | Raw text content to ingest |
552
+ | `chunker` | `Chunker` | Splits content into chunks |
553
+ | `embedder` | `Embedder` | Produces embedding vectors |
554
+ | `metadata` | `Record<string, unknown>` | Optional metadata passed to the chunker |
555
+
556
+ Returns an array of chunks, each augmented with its embedding vector. Returns an empty array if the chunker produces no chunks.
557
+
558
+ ```typescript
559
+ const embedder = createLocalEmbedder();
560
+ const chunker = createPassThroughChunker();
561
+
562
+ const embeddedChunks = await ingest(
563
+ "We decided to use Neo4j for the knowledge graph because of its native vector index support.",
564
+ chunker,
565
+ embedder,
566
+ { source: "architecture-decision" },
567
+ );
568
+
569
+ // Each chunk now has a .vector property ready for createNativeNode() or createReferenceNode()
570
+ for (const chunk of embeddedChunks) {
571
+ await createNativeNode({
572
+ category: "decision",
573
+ title: "Use Neo4j for knowledge graph",
574
+ content: chunk.text,
575
+ tags: ["architecture", "database"],
576
+ embedding: chunk.vector,
577
+ workspaceId: "my-workspace",
578
+ });
579
+ }
580
+ ```
581
+
582
+ ### Node Operations
583
+
584
+ All node operations use short-lived Neo4j sessions that are automatically closed after each operation.
585
+
586
+ #### `createReferenceNode(input: CreateReferenceNodeInput): Promise<string>`
587
+
588
+ Create a reference node. Generates a UUID and timestamps automatically. Returns the new node's ID.
589
+
590
+ ```typescript
591
+ interface CreateReferenceNodeInput {
592
+ sourceType: string; // e.g., "task", "finding", or any custom type
593
+ sourceId: string; // ID in the external system
594
+ label: string; // Human-readable label
595
+ embedding: number[]; // Vector embedding
596
+ workspaceId: string; // Workspace scope ("" = global)
597
+ }
598
+ ```
599
+
600
+ ```typescript
601
+ const nodeId = await createReferenceNode({
602
+ sourceType: REFERENCE_SOURCE.TASK,
603
+ sourceId: "task-42",
604
+ label: "Fix authentication middleware",
605
+ embedding: (await embedder.embed("Fix authentication middleware")).vector,
606
+ workspaceId: "workspace-1",
607
+ });
608
+ ```
609
+
610
+ #### `createNativeNode(input: CreateNativeNodeInput): Promise<string>`
611
+
612
+ Create a native node. Generates a UUID and timestamps automatically. Returns the new node's ID.
613
+
614
+ ```typescript
615
+ interface CreateNativeNodeInput {
616
+ category: string; // e.g., "decision", "insight", or any custom category
617
+ title: string; // Title or summary
618
+ content: string; // Full content
619
+ tags: string[]; // Free-form tags
620
+ embedding: number[]; // Vector embedding
621
+ workspaceId: string; // Workspace scope ("" = global)
622
+ }
623
+ ```
624
+
625
+ ```typescript
626
+ const nodeId = await createNativeNode({
627
+ category: NATIVE_CATEGORY.DECISION,
628
+ title: "Use Neo4j for knowledge graph",
629
+ content: "We chose Neo4j because of native vector index support and Cypher query language.",
630
+ tags: ["architecture", "database"],
631
+ embedding: (await embedder.embed("Use Neo4j for knowledge graph")).vector,
632
+ workspaceId: "",
633
+ });
634
+ ```
635
+
636
+ #### `getNode(id: string): Promise<NodeWithEdges | undefined>`
637
+
638
+ Get a node by ID, including all its edges (both incoming and outgoing). Returns `undefined` if the node does not exist.
639
+
640
+ ```typescript
641
+ interface NodeWithEdges {
642
+ node: KnowledgeNode; // The node (ReferenceNode | NativeNode)
643
+ edges: KnowledgeEdge[]; // All connected edges
644
+ }
645
+ ```
646
+
647
+ ```typescript
648
+ const result = await getNode("some-uuid");
649
+ if (result) {
650
+ console.log(result.node.kind); // "reference" or "native"
651
+ console.log(result.edges.length);
652
+ }
653
+ ```
654
+
655
+ #### `updateNode(id: string, updates: UpdateNodeInput): Promise<KnowledgeNode | undefined>`
656
+
657
+ Update a node's mutable properties. Cannot change `kind`, `id`, `createdAt`, or `workspaceId`. Automatically updates the `updatedAt` timestamp. Returns the updated node, or `undefined` if the node was not found.
658
+
659
+ ```typescript
660
+ // For reference nodes:
661
+ interface UpdateReferenceNodeInput {
662
+ label?: string;
663
+ sourceId?: string;
664
+ embedding?: number[];
665
+ }
666
+
667
+ // For native nodes:
668
+ interface UpdateNativeNodeInput {
669
+ title?: string;
670
+ content?: string;
671
+ tags?: string[];
672
+ embedding?: number[];
673
+ }
674
+ ```
675
+
676
+ ```typescript
677
+ const updated = await updateNode("some-uuid", {
678
+ title: "Updated title",
679
+ tags: ["new-tag"],
680
+ });
681
+ ```
682
+
683
+ #### `deleteNode(id: string): Promise<boolean>`
684
+
685
+ Delete a node and all its edges (`DETACH DELETE`). Returns `true` if a node was deleted, `false` if the node was not found.
686
+
687
+ ```typescript
688
+ const wasDeleted = await deleteNode("some-uuid");
689
+ ```
690
+
691
+ #### `recordToNode(properties: Record<string, unknown>): KnowledgeNode`
692
+
693
+ Convert raw Neo4j node properties to a typed `KnowledgeNode`. Handles the discriminated union based on the `kind` property. Useful when running custom Cypher queries.
694
+
695
+ #### `recordToEdge(raw: Record<string, unknown>): KnowledgeEdge`
696
+
697
+ Convert a raw edge object from Cypher results to a typed `KnowledgeEdge`. Parses JSON metadata. Useful when running custom Cypher queries.
698
+
699
+ ### Edge Operations
700
+
701
+ #### `createEdge(fromId, toId, type, metadata?): Promise<KnowledgeEdge>`
702
+
703
+ Create a directed, typed edge between two nodes.
704
+
705
+ | Parameter | Type | Description |
706
+ |-----------|------|-------------|
707
+ | `fromId` | `string` | Source node ID |
708
+ | `toId` | `string` | Target node ID |
709
+ | `type` | `EdgeType` | One of `RELATES_TO`, `DEPENDS_ON`, `DERIVED_FROM`, `MENTIONS`, `PART_OF` |
710
+ | `metadata` | `Record<string, unknown>` | Optional metadata stored as JSON on the relationship |
711
+
712
+ Throws if either node does not exist or if the edge type is invalid.
713
+
714
+ ```typescript
715
+ const edge = await createEdge(
716
+ nodeIdA,
717
+ nodeIdB,
718
+ EDGE_TYPE.DERIVED_FROM,
719
+ { confidence: 0.95 },
720
+ );
721
+ ```
722
+
723
+ #### `removeEdge(fromId, toId, type): Promise<boolean>`
724
+
725
+ Remove a directed edge between two nodes. Returns `true` if an edge was removed, `false` if no matching edge existed. Throws if the edge type is invalid.
726
+
727
+ ```typescript
728
+ const wasRemoved = await removeEdge(nodeIdA, nodeIdB, EDGE_TYPE.DERIVED_FROM);
729
+ ```
730
+
731
+ ### Semantic Search
732
+
733
+ #### `knowledgeSearch(query, embedder, options?): Promise<SearchResult[]>`
734
+
735
+ Search the knowledge graph by semantic similarity. Embeds the query text, runs k-NN vector search against Neo4j's vector index, and returns ranked results with similarity scores and immediate edges.
736
+
737
+ ```typescript
738
+ interface SearchOptions {
739
+ /** Max results to return (default: 10). */
740
+ limit?: number;
741
+ /** Filter to specific node kinds ("reference", "native", or both). */
742
+ nodeKinds?: NodeKind[];
743
+ /** Minimum similarity score threshold (default: 0). */
744
+ minScore?: number;
745
+ /** Filter to a specific workspace. */
746
+ workspaceId?: string;
747
+ }
748
+
749
+ interface SearchResult {
750
+ /** The matched node. */
751
+ node: KnowledgeNode;
752
+ /** Cosine similarity score (0-1). */
753
+ score: number;
754
+ /** Edges connected to this node. */
755
+ edges: KnowledgeEdge[];
756
+ }
757
+ ```
758
+
759
+ When filters (`nodeKinds`, `workspaceId`) are applied, the library automatically over-fetches candidates (3x the limit) to compensate for post-filtering. This makes it likely, but does not guarantee, that `limit` results remain after filtering.
760
+
761
+ ```typescript
762
+ // Basic search
763
+ const results = await knowledgeSearch("authentication middleware", embedder);
764
+
765
+ // Filtered search
766
+ const results = await knowledgeSearch("authentication", embedder, {
767
+ limit: 5,
768
+ minScore: 0.7,
769
+ nodeKinds: ["native"],
770
+ workspaceId: "workspace-1",
771
+ });
772
+
773
+ for (const { node, score, edges } of results) {
774
+ console.log(`[${score.toFixed(3)}] ${node.id}`);
775
+ if (node.kind === "native") {
776
+ console.log(` Title: ${node.title}`);
777
+ }
778
+ console.log(` Edges: ${edges.length}`);
779
+ }
780
+ ```
781
+
782
+ ### Graph Expansion
783
+
784
+ Graph expansion traverses relationships from a starting node to discover connected context. This is useful for enriching search results with related knowledge.
785
+
786
+ #### `expandNode(nodeId, options?): Promise<ExpansionResult>`
787
+
788
+ Expand a single node: return its neighbors within `depth` hops (default: 1).
789
+
790
+ ```typescript
791
+ interface ExpandOptions {
792
+ /** Max traversal depth (default: 1). */
793
+ depth?: number;
794
+ /** Only follow edges of these types. If empty/undefined, follow all. */
795
+ edgeTypes?: EdgeType[];
796
+ }
797
+
798
+ interface ExpansionResult {
799
+ /** All neighbor nodes found (deduplicated), excluding the start node. */
800
+ nodes: KnowledgeNode[];
801
+ /** All edges traversed. */
802
+ edges: KnowledgeEdge[];
803
+ }
804
+ ```
805
+
806
+ ```typescript
807
+ // Get immediate neighbors
808
+ const { nodes, edges } = await expandNode("some-uuid");
809
+
810
+ // Multi-hop expansion, following only specific edge types
811
+ const { nodes, edges } = await expandNode("some-uuid", {
812
+ depth: 3,
813
+ edgeTypes: [EDGE_TYPE.DEPENDS_ON, EDGE_TYPE.PART_OF],
814
+ });
815
+ ```
816
+
817
+ #### `expandResults(results, options?): Promise<ExpansionResult>`
818
+
819
+ Expand all nodes from search results and merge the subgraphs. Runs `expandNode()` for each search result and deduplicates nodes and edges. The original search result nodes are excluded from the neighbor list.
820
+
821
+ ```typescript
822
+ const searchResults = await knowledgeSearch("authentication", embedder);
823
+ const { nodes: neighbors, edges } = await expandResults(searchResults, { depth: 2 });
824
+
825
+ // neighbors contains nodes connected to the search results (but not the results themselves)
826
+ ```
827
+
828
+ ### Listing Recent Nodes
829
+
830
+ #### `listRecentNodes(limit?, workspaceId?): Promise<RecentNodesResult>`
831
+
832
+ List the most recently updated knowledge graph nodes. Returns nodes ordered by `updatedAt` descending, along with all edges between the returned nodes. Used by explorer/dashboard UIs.
833
+
834
+ ```typescript
835
+ interface RecentNodesResult {
836
+ /** The most recently updated nodes. */
837
+ nodes: KnowledgeNode[];
838
+ /** All edges between the returned nodes. */
839
+ edges: KnowledgeEdge[];
840
+ }
841
+ ```
842
+
843
+ | Parameter | Type | Default | Description |
844
+ |-----------|------|---------|-------------|
845
+ | `limit` | `number` | `20` | Maximum number of nodes to return |
846
+ | `workspaceId` | `string` | `undefined` | Optional workspace filter |
847
+
848
+ ```typescript
849
+ // Global recent nodes
850
+ const { nodes, edges } = await listRecentNodes(10);
851
+
852
+ // Workspace-scoped recent nodes
853
+ const { nodes, edges } = await listRecentNodes(20, "workspace-1");
854
+ ```
855
+
856
+ ### Type Guards
857
+
858
+ #### `isReferenceNode(node: KnowledgeNode): node is ReferenceNode`
859
+
860
+ Returns `true` if the node is a reference node. Narrows the type for TypeScript.
861
+
862
+ #### `isNativeNode(node: KnowledgeNode): node is NativeNode`
863
+
864
+ Returns `true` if the node is a native node. Narrows the type for TypeScript.
865
+
866
+ ```typescript
867
+ import { isReferenceNode, isNativeNode } from "@grackle-ai/knowledge-core";
868
+
869
+ const result = await getNode("some-uuid");
870
+ if (result) {
871
+ if (isReferenceNode(result.node)) {
872
+ console.log(result.node.sourceType, result.node.sourceId);
873
+ }
874
+ if (isNativeNode(result.node)) {
875
+ console.log(result.node.title, result.node.content);
876
+ }
877
+ }
878
+ ```
879
+
880
+ ### Logging
881
+
882
+ The library uses [pino](https://getpino.io/) for structured logging. The logger instance is exported if you need to adjust settings or pipe logs.
883
+
884
+ ```typescript
885
+ import { logger } from "@grackle-ai/knowledge-core";
886
+
887
+ // Logger name: "grackle-knowledge"
888
+ // Default level: "info" (override with LOG_LEVEL env var)
889
+ ```
890
+
891
+ ## Configuration
892
+
893
+ ### Environment Variables
894
+
895
+ | Variable | Purpose | Default |
896
+ |----------|---------|---------|
897
+ | `GRACKLE_NEO4J_URL` | Neo4j Bolt connection URL | `bolt://127.0.0.1:7687` |
898
+ | `GRACKLE_NEO4J_USER` | Neo4j username | `neo4j` |
899
+ | `GRACKLE_NEO4J_PASSWORD` | Neo4j password (**required** in production) | `grackle-dev` (non-production only) |
900
+ | `GRACKLE_NEO4J_DATABASE` | Neo4j database name | `neo4j` |
901
+ | `NODE_ENV` | When `"production"`, requires explicit password | -- |
902
+ | `LOG_LEVEL` | Pino log level (`trace`, `debug`, `info`, `warn`, `error`, `fatal`) | `info` |
903
+
904
+ ### Constants
905
+
906
+ These are exported for reference and customization. They cover the default connection settings, the Neo4j connection pool, the node label, and the vector index configuration.
907
+
908
+ | Constant | Value | Description |
909
+ |----------|-------|-------------|
910
+ | `DEFAULT_NEO4J_URL` | `"bolt://127.0.0.1:7687"` | Default Bolt URL |
911
+ | `DEFAULT_NEO4J_USER` | `"neo4j"` | Default username |
912
+ | `DEFAULT_NEO4J_DATABASE` | `"neo4j"` | Default database name |
913
+ | `NEO4J_MAX_POOL_SIZE` | `50` | Max connections in the Neo4j driver pool |
914
+ | `NEO4J_CONNECTION_ACQUISITION_TIMEOUT` | `30000` | Timeout (ms) for acquiring a pool connection |
915
+ | `NODE_LABEL` | `"KnowledgeNode"` | Neo4j label applied to all knowledge graph nodes |
916
+ | `VECTOR_INDEX_NAME` | `"knowledge_embedding_index"` | Name of the vector index |
917
+ | `EMBEDDING_DIMENSIONS` | `1536` | Dimensionality of the vector index |
918
+ | `VECTOR_SIMILARITY_FUNCTION` | `"cosine"` | Similarity function used by the vector index |
919
+
920
+ ## Custom Embedders and Chunkers
921
+
922
+ The `Embedder` and `Chunker` interfaces are intentionally minimal so you can plug in any implementation.
923
+
924
+ ### Custom Embedder Example (OpenAI)
925
+
926
+ ```typescript
927
+ import type { Embedder, EmbeddingResult } from "@grackle-ai/knowledge-core";
928
+
929
+ function createOpenAIEmbedder(apiKey: string): Embedder {
930
+ return {
931
+ dimensions: 1536,
932
+
933
+ async embed(text: string): Promise<EmbeddingResult> {
934
+ const response = await fetch("https://api.openai.com/v1/embeddings", {
935
+ method: "POST",
936
+ headers: {
937
+ Authorization: `Bearer ${apiKey}`,
938
+ "Content-Type": "application/json",
939
+ },
940
+ body: JSON.stringify({
941
+ model: "text-embedding-3-small",
942
+ input: text,
943
+ }),
944
+ });
945
+ const data = await response.json();
946
+ return { text, vector: data.data[0].embedding };
947
+ },
948
+
949
+ async embedBatch(texts: string[]): Promise<EmbeddingResult[]> {
950
+ const response = await fetch("https://api.openai.com/v1/embeddings", {
951
+ method: "POST",
952
+ headers: {
953
+ Authorization: `Bearer ${apiKey}`,
954
+ "Content-Type": "application/json",
955
+ },
956
+ body: JSON.stringify({
957
+ model: "text-embedding-3-small",
958
+ input: texts,
959
+ }),
960
+ });
961
+ const data = await response.json();
962
+ return data.data.map((d: { embedding: number[] }, i: number) => ({
963
+ text: texts[i],
964
+ vector: d.embedding,
965
+ }));
966
+ },
967
+ };
968
+ }
969
+ ```
970
+
971
+ ### Custom Chunker Example (Fixed-Size)
972
+
973
+ ```typescript
974
+ import type { Chunker, Chunk } from "@grackle-ai/knowledge-core";
975
+
976
+ function createFixedSizeChunker(maxChars: number = 1000): Chunker {
977
+ return {
978
+ chunk(content: string, metadata?: Record<string, unknown>): Chunk[] {
979
+ const chunks: Chunk[] = [];
980
+ for (let i = 0; i < content.length; i += maxChars) {
981
+ chunks.push({
982
+ text: content.slice(i, i + maxChars),
983
+ index: chunks.length,
984
+ metadata,
985
+ });
986
+ }
987
+ return chunks.length > 0 ? chunks : [{ text: content, index: 0, metadata }];
988
+ },
989
+ };
990
+ }
991
+ ```
992
+
993
+ ## License
994
+
995
+ MIT
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@grackle-ai/knowledge-core",
3
- "version": "0.70.0",
3
+ "version": "0.70.1",
4
4
  "description": "Generic knowledge graph SDK on top of Neo4j with vector search, graph traversal, and pluggable embedders/chunkers",
5
5
  "license": "MIT",
6
6
  "repository": {