@vainplex/openclaw-knowledge-engine 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/ARCHITECTURE.md DELETED
@@ -1,374 +0,0 @@
1
- # Architecture: @vainplex/openclaw-knowledge-engine
2
-
3
- ## 1. Overview and Scope
4
-
5
- `@vainplex/openclaw-knowledge-engine` is a TypeScript-based OpenClaw plugin for real-time and batch knowledge extraction from conversational data. It replaces a collection of legacy Python scripts with a unified, modern, and tightly integrated solution.
6
-
7
- The primary goal of this plugin is to identify, extract, and store key information (entities and facts) from user and agent messages. This knowledge is then made available for long-term memory, context enrichment, and improved agent performance. It operates directly within the OpenClaw event pipeline, eliminating the need for external NATS consumers and schedulers.
8
-
9
- ### 1.1. Core Features
10
-
11
- - **Hybrid Entity Extraction:** Combines high-speed, low-cost regex extraction with optional, high-fidelity LLM-based extraction.
12
- - **Structured Fact Store:** Manages a durable store of facts with metadata, relevance scoring, and a temporal decay mechanism.
13
- - **Seamless Integration:** Hooks directly into OpenClaw's lifecycle events (`message_received`, `message_sent`, `session_start`).
14
- - **Configurable & Maintainable:** All features are configurable via a JSON schema, and the TypeScript codebase ensures type safety and maintainability.
15
- - **Zero Runtime Dependencies:** Relies only on Node.js built-in APIs, mirroring the pattern of `@vainplex/openclaw-cortex`.
16
- - **Optional Embeddings:** Can integrate with ChromaDB for semantic search over extracted facts.
17
-
18
- ### 1.2. Out of Scope
19
-
20
- - **TypeDB Integration:** The legacy TypeDB dependency is explicitly removed and will not be supported.
21
- - **Direct NATS Consumption:** The plugin relies on OpenClaw hooks, not direct interaction with NATS streams.
22
- - **UI/Frontend:** This plugin is purely a backend data processing engine.
23
-
24
- ---
25
-
26
- ## 2. Module Breakdown
27
-
28
- The plugin will be structured similarly to `@vainplex/openclaw-cortex`, with a clear separation of concerns between modules. All source code will reside in the `src/` directory.
29
-
30
- | File | Responsibility |
31
- | --------------------- | -------------------------------------------------------------------------------------------------------------- |
32
- | `index.ts` | Plugin entry point. Registers hooks, commands, and performs initial configuration validation. |
33
- | `src/hooks.ts` | Main integration logic. Registers and orchestrates all OpenClaw hook handlers. Manages shared state. |
34
- | `src/types.ts` | Centralized TypeScript type definitions for configuration, entities, facts, and API interfaces. |
35
- | `src/config.ts` | Provides functions for resolving and validating the plugin's configuration from `openclaw.plugin.json`. |
36
- | `src/storage.ts` | Low-level file I/O utilities for reading/writing JSON files, ensuring atomic writes and handling debouncing. |
37
- | `src/entity-extractor.ts`| Implements the entity extraction pipeline. Contains the `EntityExtractor` class. |
38
- | `src/fact-store.ts` | Implements the fact storage and retrieval logic. Contains the `FactStore` class, including decay logic. |
39
- | `src/llm-enhancer.ts` | Handles communication with an external LLM (e.g., Ollama) for batched, deep extraction of entities and facts. |
40
- | `src/embeddings.ts` | Manages optional integration with ChromaDB, including batching and syncing embeddings. |
41
- | `src/maintenance.ts` | Encapsulates background tasks like fact decay and embeddings sync, triggered by an internal timer. |
42
- | `src/patterns.ts` | Stores default regex patterns for common entities (dates, names, locations, etc.). |
43
-
44
- ---
45
-
46
- ## 3. Type Definitions
47
-
48
- Located in `src/types.ts`.
49
-
50
- ```typescript
51
- // src/types.ts
52
-
53
- /**
54
- * The public API exposed by the OpenClaw host to the plugin.
55
- */
56
- export interface OpenClawPluginApi {
57
- pluginConfig: Record<string, unknown>;
58
- logger: {
59
- info: (msg: string) => void;
60
- warn: (msg: string) => void;
61
- error: (msg: string) => void;
62
- };
63
- on: (event: string, handler: (event: HookEvent, ctx: HookContext) => void, options?: { priority: number }) => void;
64
- }
65
-
66
- export interface HookEvent {
67
- content?: string;
68
- message?: string;
69
- text?: string;
70
- from?: string;
71
- sender?: string;
72
- role?: "user" | "assistant";
73
- [key: string]: unknown;
74
- }
75
-
76
- export interface HookContext {
77
- workspace: string; // Absolute path to the OpenClaw workspace
78
- }
79
-
80
- /**
81
- * Plugin configuration schema, validated from openclaw.plugin.json.
82
- */
83
- export interface KnowledgeConfig {
84
- enabled: boolean;
85
- workspace: string;
86
- extraction: {
87
- regex: {
88
- enabled: boolean;
89
- };
90
- llm: {
91
- enabled: boolean;
92
- model: string;
93
- endpoint: string;
94
- batchSize: number;
95
- cooldownMs: number;
96
- };
97
- };
98
- decay: {
99
- enabled: boolean;
100
- intervalHours: number;
101
- rate: number; // e.g., 0.05 for 5% decay per interval
102
- };
103
- embeddings: {
104
- enabled: boolean;
105
- endpoint: string;
106
- syncIntervalMinutes: number;
107
- collectionName: string;
108
- };
109
- storage: {
110
- maxEntities: number;
111
- maxFacts: number;
112
- writeDebounceMs: number;
113
- };
114
- }
115
-
116
- /**
117
- * Represents an extracted entity.
118
- */
119
- export interface Entity {
120
- id: string; // e.g., "person:claude"
121
- type: "person" | "location" | "organization" | "date" | "product" | "concept" | "unknown";
122
- value: string; // The canonical value, e.g., "Claude"
123
- mentions: string[]; // Different ways it was mentioned, e.g., ["claude", "Claude's"]
124
- count: number;
125
- importance: number; // 0.0 to 1.0
126
- lastSeen: string; // ISO 8601 timestamp
127
- source: ("regex" | "llm")[];
128
- }
129
-
130
- /**
131
- * Represents a structured fact.
132
- */
133
- export interface Fact {
134
- id: string; // UUID v4
135
- subject: string; // Entity ID
136
- predicate: string; // e.g., "is-a", "has-property", "works-at"
137
- object: string; // Entity ID or literal value
138
- relevance: number; // 0.0 to 1.0, subject to decay
139
- createdAt: string; // ISO 8601 timestamp
140
- lastAccessed: string; // ISO 8601 timestamp
141
- source: "ingested" | "extracted-regex" | "extracted-llm";
142
- }
143
-
144
- /**
145
- * Data structure for entities.json
146
- */
147
- export interface EntitiesData {
148
- updated: string;
149
- entities: Entity[];
150
- }
151
-
152
- /**
153
- * Data structure for facts.json
154
- */
155
- export interface FactsData {
156
- updated: string;
157
- facts: Fact[];
158
- }
159
- ```
160
-
161
- ---
162
-
163
- ## 4. Hook Integration Points
164
-
165
- The plugin will register handlers for the following OpenClaw core events:
166
-
167
- | Hook Event | Priority | Handler Logic |
168
- | ------------------ | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
169
- | `message_received` | 100 | - Triggers the real-time entity extraction pipeline. <br> - Extracts content and sender. <br> - Adds the message to the `LlmEnhancer` batch if LLM is enabled. |
170
- | `message_sent` | 100 | - Same as `message_received`. Ensures the agent's own messages are processed for knowledge. |
171
- | `session_start` | 200 | - Initializes the `Maintenance` service. <br> - Starts the internal timers for fact decay and embeddings sync. <br> - Ensures workspace directories exist. |
172
-
173
- ---
174
-
175
- ## 5. Entity Extraction Pipeline
176
-
177
- The extraction process runs on every message and is designed to be fast and efficient.
178
-
179
- ### 5.1. Regex Extraction
180
-
181
- - **Always On (if enabled):** Runs first on every message.
182
- - **Patterns:** A configurable set of regular expressions will be defined in `src/patterns.ts`. These will cover common entities like dates (`YYYY-MM-DD`), email addresses, URLs, and potentially user-defined patterns.
183
- - **Performance:** This step is extremely fast and has negligible overhead.
184
- - **Output:** Produces a preliminary list of potential entities.
185
-
186
- ### 5.2. LLM Enhancement (Batched)
187
-
188
- - **Optional:** Enabled via configuration.
189
- - **Batching:** The `LlmEnhancer` class collects messages up to `batchSize` or until `cooldownMs` has passed since the last message. This avoids overwhelming the LLM with single requests.
190
- - **Process:**
191
- 1. A batch of messages is formatted into a single prompt.
192
- 2. The prompt instructs the LLM to identify entities (person, location, etc.) and structured facts (triples like `Subject-Predicate-Object`).
193
- 3. The request is sent to the configured LLM endpoint (`extraction.llm.endpoint`).
194
- 4. The LLM's JSON response is parsed.
195
- - **Merging:** LLM-extracted entities are merged with the regex-based results. The `source` array on the `Entity` object is updated to reflect that it was identified by both methods. LLM results are generally given a higher initial `importance` score.
196
-
197
- ---
198
-
199
- ## 6. Fact Store Design
200
-
201
- The `FactStore` class manages the `facts.json` file, providing an in-memory cache and methods for interacting with facts.
202
-
203
- ### 6.1. Data Structure (`facts.json`)
204
-
205
- The file will contain a `FactsData` object:
206
-
207
- ```json
208
- {
209
- "updated": "2026-02-17T15:30:00Z",
210
- "facts": [
211
- {
212
- "id": "f0a4c1b0-9b1e-4b7b-8f3a-0e9c8d7b6a5a",
213
- "subject": "person:atlas",
214
- "predicate": "is-a",
215
- "object": "sub-agent",
216
- "relevance": 0.95,
217
- "createdAt": "2026-02-17T14:00:00Z",
218
- "lastAccessed": "2026-02-17T15:20:00Z",
219
- "source": "extracted-llm"
220
- }
221
- ]
222
- }
223
- ```
224
-
225
- ### 6.2. `FactStore` Class API
226
-
227
- ```typescript
228
- // src/fact-store.ts
229
- class FactStore {
230
- constructor(workspace: string, config: KnowledgeConfig['storage'], logger: Logger);
231
-
232
- // Load facts from facts.json into memory
233
- load(): Promise<void>;
234
-
235
- // Add a new fact or update an existing one
236
- addFact(fact: Omit<Fact, 'id' | 'createdAt' | 'lastAccessed'>): Fact;
237
-
238
- // Retrieve a fact by its ID
239
- getFact(id: string): Fact | undefined;
240
-
241
- // Query facts based on subject, predicate, or object
242
- query(query: { subject?: string; predicate?: string; object?: string }): Fact[];
243
-
244
- // Run the decay algorithm on all facts
245
- decayFacts(rate: number): { decayedCount: number };
246
-
247
- // Persist the in-memory store to disk (debounced)
248
- commit(): Promise<void>;
249
- }
250
- ```
251
-
252
- ### 6.3. Storage and Persistence
253
-
254
- - **Debounced Writes:** All modifications to the fact store will trigger a debounced `commit()` call. This ensures that rapid, successive writes (e.g., during a fast-paced conversation) are batched into a single file I/O operation. The debounce window is configured by `storage.writeDebounceMs`.
255
- - **Atomic Writes:** The `storage.ts` module will use a "write to temp file then rename" strategy to prevent data corruption if the application terminates mid-write.
256
-
257
- ---
258
-
259
- ## 7. Decay Algorithm
260
-
261
- The decay algorithm prevents the fact store from becoming cluttered with stale, irrelevant information. It is managed by the `Maintenance` service.
262
-
263
- - **Trigger:** Runs on a schedule defined by `decay.intervalHours`.
264
- - **Logic:** For each fact, the relevance score is reduced by the `decay.rate`.
265
- ```
266
- newRelevance = currentRelevance * (1 - decayRate)
267
- ```
268
- - **Floor:** Relevance will not decay below a certain floor (e.g., 0.1) to keep it in the system.
269
- - **Promotion:** When a fact is "accessed" (e.g., used to answer a question or mentioned again), its `relevance` score is boosted, and its `lastAccessed` timestamp is updated. A simple boost could be `newRelevance = currentRelevance + (1 - currentRelevance) * 0.5`, pushing it halfway to 1.0.
270
- - **Pruning:** Facts with a relevance score below a configurable threshold (e.g., 0.05) after decay might be pruned from the store entirely if `storage.maxFacts` is exceeded.
271
-
272
- ---
273
-
274
- ## 8. Embeddings Integration
275
-
276
- This feature allows for semantic querying of facts and is entirely optional.
277
-
278
- ### 8.1. `Embeddings` Service
279
-
280
- - **Trigger:** Runs on a schedule defined by `embeddings.syncIntervalMinutes`.
281
- - **Process:**
282
- 1. The service scans `facts.json` for any facts that have not yet been embedded.
283
- 2. It formats each fact into a natural language string, e.g., "Atlas is a sub-agent."
284
- 3. It sends a batch of these strings to a ChromaDB-compatible vector database via its HTTP API.
285
- 4. The fact's ID is stored as metadata alongside the vector in ChromaDB.
286
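- - **Configuration:** The `embeddings.endpoint` must be a valid URL to the ChromaDB `/api/v1/collections/{name}/add` endpoint. The literal `{name}` placeholder, if present, is replaced with the value of `embeddings.collectionName` when the request URL is built.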
287
- - **Decoupling:** The plugin does **not** query ChromaDB. Its only responsibility is to push embeddings. Other plugins or services would be responsible for leveraging the vector store for retrieval-augmented generation (RAG).
288
-
289
- ---
290
-
291
- ## 9. Config Schema
292
-
293
- The full `openclaw.plugin.json` schema for this plugin.
294
-
295
- ```json
296
- {
297
- "id": "@vainplex/openclaw-knowledge-engine",
298
- "config": {
299
- "enabled": true,
300
- "workspace": "~/.clawd/plugins/knowledge-engine",
301
- "extraction": {
302
- "regex": {
303
- "enabled": true
304
- },
305
- "llm": {
306
- "enabled": true,
307
- "model": "mistral:7b",
308
- "endpoint": "http://localhost:11434/api/generate",
309
- "batchSize": 10,
310
- "cooldownMs": 30000
311
- }
312
- },
313
- "decay": {
314
- "enabled": true,
315
- "intervalHours": 24,
316
- "rate": 0.02
317
- },
318
- "embeddings": {
319
- "enabled": false,
320
- "endpoint": "http://localhost:8000/api/v1/collections/facts/add",
321
- "collectionName": "openclaw-facts",
322
- "syncIntervalMinutes": 15
323
- },
324
- "storage": {
325
- "maxEntities": 5000,
326
- "maxFacts": 10000,
327
- "writeDebounceMs": 15000
328
- }
329
- }
330
- }
331
- ```
332
-
333
- ---
334
-
335
- ## 10. Test Strategy
336
-
337
- Testing will be comprehensive and follow the patterns of `@vainplex/openclaw-cortex`, using Node.js's built-in test runner.
338
-
339
- - **Unit Tests:** Each class (`EntityExtractor`, `FactStore`, `LlmEnhancer`, etc.) will have its own test file (e.g., `fact-store.test.ts`). Tests will use mock objects for dependencies like the logger and file system.
340
- - **Integration Tests:** `hooks.test.ts` will test the end-to-end flow by simulating OpenClaw hook events and asserting that the correct file system changes occur.
341
- - **Configuration Tests:** `config.test.ts` will verify that default values are applied correctly and that invalid configurations are handled gracefully.
342
- - **CI/CD:** Tests will be run automatically in a CI pipeline on every commit.
343
-
344
- ---
345
-
346
- ## 11. Migration Guide
347
-
348
- This section outlines the process for decommissioning the old Python scripts and migrating to the new plugin.
349
-
350
- 1. **Disable Old Services:** Stop and disable the `systemd` services and timers for `entity-extractor-stream.py`, `smart-extractor.py`, `knowledge-engine.py`, and `cortex-loops-stream.py`.
351
- ```bash
352
- systemctl stop entity-extractor-stream.service smart-extractor.timer knowledge-engine.service cortex-loops.timer
353
- systemctl disable entity-extractor-stream.service smart-extractor.timer knowledge-engine.service cortex-loops.timer
354
- ```
355
-
356
- 2. **Install the Plugin:** Install the `@vainplex/openclaw-knowledge-engine` plugin into OpenClaw according to standard procedures.
357
-
358
- 3. **Configure the Plugin:** Create a configuration file at `~/.clawd/plugins/openclaw-knowledge-engine.json` (or the equivalent path) using the schema from section 9. Ensure the `workspace` directory is set to the desired location.
359
-
360
- 4. **Data Migration (Optional):**
361
- - **Entities:** A one-time script (`./scripts/migrate-entities.js`) will be provided to convert the old `~/.cortex/knowledge/entities.json` format to the new `Entity` format defined in `src/types.ts`.
362
- - **Facts:** As the old `knowledge-engine.py` had a different structure and no durable fact store equivalent to `facts.json`, facts will not be migrated. The system will start with a fresh fact store.
363
- - **TypeDB:** No migration from TypeDB will be provided.
364
-
365
- 5. **Enable and Restart:** Enable the plugin in OpenClaw's main configuration and restart the OpenClaw instance. Monitor the logs for successful initialization.
366
-
367
- ---
368
-
369
- ## 12. Performance Requirements
370
-
371
- - **Message Hook Overhead:** The synchronous part of the message hook (regex extraction) must complete in under **5ms** on average to avoid delaying the message processing pipeline.
372
- - **LLM Latency:** LLM processing is asynchronous and batched, so it does not block the main thread. However, the total time to analyze a batch should be logged and monitored.
373
- - **Memory Usage:** The plugin's heap size should not exceed **100MB** under normal load, assuming the configured `maxEntities` and `maxFacts` limits.
374
- - **CPU Usage:** Background maintenance tasks (decay, embeddings sync) should be staggered and have low CPU impact, consuming less than 5% of a single core while running.
package/index.ts DELETED
@@ -1,38 +0,0 @@
1
- // index.ts
2
-
3
- import { resolveConfig } from './src/config.js';
4
- import { HookManager } from './src/hooks.js';
5
- import type { OpenClawPluginApi } from './src/types.js';
6
-
7
/**
 * Plugin entry point, invoked by the OpenClaw host while it loads plugins.
 *
 * Resolves the plugin configuration, bails out (with a log line) when the
 * configuration is invalid or the plugin is disabled, and otherwise registers
 * the hook handlers through a `HookManager`.
 *
 * @param api     Host API carrying the raw plugin config and the logger.
 * @param context Host context; only the OpenClaw workspace path is read.
 */
export default (api: OpenClawPluginApi, context: { workspace: string }): void => {
  const pluginConfig = api.pluginConfig;
  const logger = api.logger;

  // Step 1: turn the raw plugin config into a validated KnowledgeConfig.
  const config = resolveConfig(pluginConfig, logger, context.workspace);
  if (!config) {
    logger.error('Failed to initialize Knowledge Engine: Invalid configuration. The plugin will be disabled.');
    return;
  }
  if (!config.enabled) {
    logger.info('Knowledge Engine is disabled in the configuration.');
    return;
  }

  // Steps 2 and 3: build the hook manager and register every event hook.
  // Failures here are logged instead of propagated to the host.
  try {
    new HookManager(api, config).registerHooks();
    logger.info('Knowledge Engine plugin initialized successfully.');
  } catch (err) {
    logger.error('An unexpected error occurred during Knowledge Engine initialization.', err as Error);
  }
};
package/src/config.ts DELETED
@@ -1,180 +0,0 @@
1
- // src/config.ts
2
-
3
- import * as path from 'node:path';
4
- import { KnowledgeConfig, Logger } from './types.js';
5
-
6
/**
 * Baseline values for every configuration section except `workspace`.
 * User-supplied configuration is merged on top of this object.
 */
export const DEFAULT_CONFIG: Omit<KnowledgeConfig, 'workspace'> = {
  enabled: true,

  // Entity extraction: regex pass plus the batched LLM pass.
  extraction: {
    regex: {
      enabled: true,
    },
    llm: {
      enabled: true,
      model: 'mistral:7b',
      endpoint: 'http://localhost:11434/api/generate',
      batchSize: 10,
      cooldownMs: 30000,
    },
  },

  // Relevance decay applied to stored facts.
  decay: {
    enabled: true,
    intervalHours: 24,
    rate: 0.02,
  },

  // Vector-database sync; off unless explicitly enabled.
  embeddings: {
    enabled: false,
    endpoint: 'http://localhost:8000/api/v1/collections/facts/add',
    collectionName: 'openclaw-facts',
    syncIntervalMinutes: 15,
  },

  // Store size limits and write debouncing.
  storage: {
    maxEntities: 5000,
    maxFacts: 10000,
    writeDebounceMs: 15000,
  },
};
39
-
40
/**
 * Keys that are never copied from `source`. Assigning an own `__proto__` key
 * (as produced by `JSON.parse`) would replace the prototype of the merged
 * object, so these keys are dropped instead of merged.
 */
const UNSAFE_MERGE_KEYS: ReadonlySet<string> = new Set(['__proto__', 'constructor', 'prototype']);

/**
 * Recursively merges `source` over a shallow copy of `target`.
 *
 * - When both sides hold a plain object for a key, the two are merged recursively.
 * - Otherwise a defined source value replaces the target value (arrays and
 *   `null` replace wholesale).
 * - `undefined` source values are ignored, so they never erase a default.
 * - Neither argument is mutated.
 *
 * The return type is asserted to be `T`; values coming from `source` are not
 * checked against it, so callers must validate the merged result.
 *
 * @param target Base object providing defaults.
 * @param source Overrides to apply on top of `target`.
 * @returns A new object containing the merged values.
 */
function deepMerge<T extends Record<string, unknown>>(
  target: T,
  source: Record<string, unknown>
): T {
  const result = { ...target } as Record<string, unknown>;
  for (const key of Object.keys(source)) {
    if (UNSAFE_MERGE_KEYS.has(key)) continue;

    const srcVal = source[key];
    const tgtVal = result[key];
    if (isPlainObject(srcVal) && isPlainObject(tgtVal)) {
      result[key] = deepMerge(tgtVal, srcVal);
    } else if (srcVal !== undefined) {
      result[key] = srcVal;
    }
  }
  return result as T;
}

/** Returns true for any non-null, non-array value whose `typeof` is `'object'`. */
function isPlainObject(val: unknown): val is Record<string, unknown> {
  return typeof val === 'object' && val !== null && !Array.isArray(val);
}
64
-
65
/**
 * Layers the user configuration on top of `DEFAULT_CONFIG` and settles the
 * workspace path: a non-empty string supplied by the user wins, otherwise the
 * `knowledge-engine` directory inside the OpenClaw workspace is used.
 */
function mergeConfigDefaults(
  userConfig: Record<string, unknown>,
  openClawWorkspace: string
): KnowledgeConfig {
  const defaults = DEFAULT_CONFIG as unknown as Record<string, unknown>;
  const combined = deepMerge(defaults, userConfig);

  let workspace: string;
  if (typeof userConfig.workspace === 'string' && userConfig.workspace) {
    workspace = userConfig.workspace;
  } else {
    workspace = path.join(openClawWorkspace, 'knowledge-engine');
  }

  return { ...combined, workspace } as KnowledgeConfig;
}
79
-
80
/**
 * Expands a leading `~` in a workspace path to the current user's home directory.
 *
 * Only the forms `~`, `~/...` and `~\...` are expanded. Any other path is
 * returned unchanged, including `~name/...` (another user's home, which cannot
 * be resolved from the environment) and file names that merely begin with a tilde.
 *
 * @param ws       Workspace path as configured.
 * @param logger   Receives a warning when no home directory can be determined.
 * @param fallback Path returned when expansion is needed but `HOME` and
 *                 `USERPROFILE` are both unset or empty.
 * @returns The expanded path, the untouched input, or `fallback`.
 */
function resolveTilde(ws: string, logger: Logger, fallback: string): string {
  const refersToHome = ws === '~' || ws.startsWith('~/') || ws.startsWith('~\\');
  if (!refersToHome) return ws;

  const homeDir = process.env.HOME || process.env.USERPROFILE;
  if (homeDir) return path.join(homeDir, ws.slice(1));

  logger.warn('Could not resolve home directory for workspace path.');
  return fallback;
}
88
-
89
/**
 * Builds the effective plugin configuration and validates it.
 *
 * The user configuration is merged over the defaults, a leading `~` in the
 * workspace path is expanded, and the result is validated. Every validation
 * problem is logged as an error.
 *
 * @param userConfig The user-provided configuration from OpenClaw's pluginConfig.
 * @param logger A logger instance for logging warnings or errors.
 * @param openClawWorkspace The root workspace directory provided by OpenClaw.
 * @returns A fully resolved KnowledgeConfig, or null if validation fails.
 */
export function resolveConfig(
  userConfig: Record<string, unknown>,
  logger: Logger,
  openClawWorkspace: string
): KnowledgeConfig | null {
  const resolved = mergeConfigDefaults(userConfig, openClawWorkspace);
  resolved.workspace = resolveTilde(
    resolved.workspace,
    logger,
    path.join(openClawWorkspace, 'knowledge-engine')
  );

  const problems = validateConfig(resolved);
  if (problems.length === 0) {
    logger.info('Knowledge Engine configuration resolved successfully.');
    return resolved;
  }

  for (const problem of problems) {
    logger.error(`Invalid configuration: ${problem}`);
  }
  return null;
}
115
-
116
- // ── Validation ──────────────────────────────────────────────
117
-
118
/**
 * Runs every section validator and returns all error messages in section
 * order (root, extraction, decay, embeddings, storage). An empty array means
 * the configuration is valid.
 */
function validateConfig(config: KnowledgeConfig): string[] {
  const problems: string[] = [];
  problems.push(...validateRoot(config));
  problems.push(...validateExtraction(config.extraction));
  problems.push(...validateDecay(config.decay));
  problems.push(...validateEmbeddings(config.embeddings));
  problems.push(...validateStorage(config.storage));
  return problems;
}
127
-
128
/**
 * Validates the top-level fields: `enabled` must be a boolean and `workspace`
 * must be a string containing at least one non-whitespace character.
 */
function validateRoot(c: KnowledgeConfig): string[] {
  const problems: string[] = [];

  if (typeof c.enabled !== 'boolean') {
    problems.push('"enabled" must be a boolean.');
  }

  const hasWorkspace = typeof c.workspace === 'string' && c.workspace.trim() !== '';
  if (!hasWorkspace) {
    problems.push('"workspace" must be a non-empty string.');
  }

  return problems;
}
136
-
137
/**
 * Validates the `extraction` section. The LLM settings are only checked when
 * `extraction.llm.enabled` is truthy.
 *
 * `batchSize` must be a finite number of at least 1. The merged configuration
 * originates from untyped user input, so the runtime type is checked
 * explicitly: a bare `< 1` comparison lets `NaN` and non-numeric strings through.
 *
 * @returns Error messages for this section; empty when valid.
 */
function validateExtraction(ext: KnowledgeConfig['extraction']): string[] {
  const errs: string[] = [];
  if (!ext.llm.enabled) return errs;

  if (!isValidHttpUrl(ext.llm.endpoint)) {
    errs.push('"extraction.llm.endpoint" must be a valid HTTP/S URL.');
  }

  const batchSize: unknown = ext.llm.batchSize;
  if (typeof batchSize !== 'number' || !Number.isFinite(batchSize) || batchSize < 1) {
    errs.push('"extraction.llm.batchSize" must be at least 1.');
  }

  return errs;
}
149
-
150
/**
 * Validates the `decay` section.
 *
 * - `rate` must be a finite number in the inclusive range 0..1.
 * - `intervalHours` must be a finite number greater than 0.
 *
 * Runtime types are checked explicitly because the merged configuration comes
 * from untyped user input; plain range comparisons accept `NaN`, `null` and
 * non-numeric strings.
 *
 * @returns Error messages for this section; empty when valid.
 */
function validateDecay(d: KnowledgeConfig['decay']): string[] {
  const errs: string[] = [];
  const isFiniteNumber = (v: unknown): v is number =>
    typeof v === 'number' && Number.isFinite(v);

  const rate: unknown = d.rate;
  if (!isFiniteNumber(rate) || rate < 0 || rate > 1) {
    errs.push('"decay.rate" must be between 0 and 1.');
  }

  const intervalHours: unknown = d.intervalHours;
  if (!isFiniteNumber(intervalHours) || intervalHours <= 0) {
    errs.push('"decay.intervalHours" must be greater than 0.');
  }

  return errs;
}
156
-
157
/**
 * Validates the `embeddings` section: the endpoint is only required to be a
 * valid HTTP/S URL when embeddings are enabled.
 */
function validateEmbeddings(e: KnowledgeConfig['embeddings']): string[] {
  if (!e.enabled || isValidHttpUrl(e.endpoint)) {
    return [];
  }
  return ['"embeddings.endpoint" must be a valid HTTP/S URL.'];
}
164
-
165
/**
 * Validates the `storage` section.
 *
 * A missing (`undefined`/`null`) `writeDebounceMs` is accepted. When present
 * it must be a finite, non-negative number; the runtime type is checked
 * explicitly because `NaN` and non-numeric strings slip past a bare `< 0`
 * comparison.
 *
 * @returns Error messages for this section; empty when valid.
 */
function validateStorage(s: KnowledgeConfig['storage']): string[] {
  const errs: string[] = [];

  const debounce: unknown = s.writeDebounceMs;
  const isMissing = debounce === undefined || debounce === null;
  const isValid =
    typeof debounce === 'number' && Number.isFinite(debounce) && debounce >= 0;

  if (!isMissing && !isValid) {
    errs.push('"storage.writeDebounceMs" must be a non-negative number.');
  }

  return errs;
}
172
-
173
/**
 * Reports whether `str` parses as an absolute URL whose scheme is `http:` or
 * `https:`. Anything the `URL` constructor rejects yields `false`.
 */
function isValidHttpUrl(str: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(str);
  } catch {
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}
package/src/embeddings.ts DELETED
@@ -1,82 +0,0 @@
1
- // src/embeddings.ts
2
-
3
- import { Fact, KnowledgeConfig, Logger } from './types.js';
4
- import { httpPost } from './http-client.js';
5
-
6
/**
 * Request body posted to the ChromaDB "add" endpoint.
 * The three arrays are parallel: index `i` of each describes the same fact.
 */
interface ChromaPayload {
  /** Fact identifiers. */
  ids: string[];
  /** One sentence-like text rendering per fact. */
  documents: string[];
  /** Per-fact metadata; every value is a string. */
  metadatas: Array<Record<string, string>>;
}
12
-
13
/**
 * Optional integration with a ChromaDB-compatible vector database.
 *
 * The class only pushes facts to the configured endpoint; it never queries
 * the database. When `config.enabled` is false every sync is a no-op.
 */
export class Embeddings {
  private readonly config: KnowledgeConfig['embeddings'];
  private readonly logger: Logger;

  /**
   * @param config The `embeddings` section of the plugin configuration.
   * @param logger Receives progress and failure messages.
   */
  constructor(config: KnowledgeConfig['embeddings'], logger: Logger) {
    this.config = config;
    this.logger = logger;
  }

  /** Checks if the embeddings service is enabled. */
  public isEnabled(): boolean {
    return this.config.enabled;
  }

  /**
   * Syncs a batch of facts to the vector database in a single POST request.
   *
   * Never rejects because of a failed request: the failure is logged and `0`
   * is returned, so the batch is all-or-nothing.
   *
   * @param facts Facts to push; an empty array is a no-op.
   * @returns The number of successfully synced facts (`facts.length` or `0`).
   */
  public async sync(facts: Fact[]): Promise<number> {
    if (!this.isEnabled() || facts.length === 0) return 0;

    this.logger.info(`Starting embedding sync for ${facts.length} facts.`);
    const payload = this.constructChromaPayload(facts);
    const url = this.buildEndpointUrl();

    try {
      await httpPost(url, payload);
      this.logger.info(`Successfully synced ${facts.length} facts to ChromaDB.`);
      return facts.length;
    } catch (err) {
      this.logger.error('Failed to sync embeddings to ChromaDB.', err as Error);
      return 0;
    }
  }

  /**
   * Builds the request URL from the configured endpoint.
   *
   * - Every `{name}` placeholder is replaced with the URL-encoded collection
   *   name. `split`/`join` is used instead of `String.replace` so that `$`
   *   sequences in the name are inserted literally.
   * - Runs of two or more slashes are collapsed to one, except the `//`
   *   that directly follows the scheme's colon.
   */
  private buildEndpointUrl(): string {
    const collection = encodeURIComponent(this.config.collectionName);
    return this.config.endpoint
      .split('{name}')
      .join(collection)
      .replace(/([^:])\/{2,}/g, '$1/');
  }

  /**
   * Converts facts into the parallel-array payload expected by the endpoint.
   *
   * Each document is `"<subject> <predicate with dashes as spaces> <object>."`.
   * Metadata carries only string fields of the fact.
   * NOTE(review): the original comment called this the "ChromaDB v2" format
   * while the default endpoint targets `/api/v1/`; confirm the intended API version.
   */
  private constructChromaPayload(facts: Fact[]): ChromaPayload {
    const payload: ChromaPayload = { ids: [], documents: [], metadatas: [] };

    for (const fact of facts) {
      payload.ids.push(fact.id);
      payload.documents.push(
        `${fact.subject} ${fact.predicate.replace(/-/g, ' ')} ${fact.object}.`
      );
      payload.metadatas.push({
        subject: fact.subject,
        predicate: fact.predicate,
        object: fact.object,
        source: fact.source,
        createdAt: fact.createdAt,
      });
    }

    return payload;
  }
}