@hasna/knowledge 0.2.27 → 0.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +41 -0
  2. package/bin/open-knowledge-mcp.js +15 -7
  3. package/bin/open-knowledge.js +17 -17
  4. package/dist/agent.d.ts +35 -0
  5. package/dist/artifact-store.d.ts +63 -0
  6. package/dist/auth.d.ts +35 -0
  7. package/dist/embeddings.d.ts +77 -0
  8. package/dist/index.d.ts +20 -0
  9. package/dist/index.js +5709 -0
  10. package/dist/knowledge-db.d.ts +27 -0
  11. package/dist/manifest-ingest.d.ts +35 -0
  12. package/dist/outbox-consume.d.ts +25 -0
  13. package/dist/provenance.d.ts +50 -0
  14. package/dist/providers.d.ts +89 -0
  15. package/dist/reindex.d.ts +37 -0
  16. package/dist/remote-client.d.ts +108 -0
  17. package/dist/retrieval.d.ts +71 -0
  18. package/dist/safety.d.ts +70 -0
  19. package/dist/sdk.d.ts +72 -0
  20. package/dist/search.d.ts +65 -0
  21. package/dist/service.d.ts +117 -0
  22. package/dist/source-ingest.d.ts +18 -0
  23. package/dist/source-ref.d.ts +30 -0
  24. package/dist/source-resolver.d.ts +92 -0
  25. package/dist/storage-contract.d.ts +106 -0
  26. package/dist/web-search.d.ts +40 -0
  27. package/dist/wiki-compiler.d.ts +67 -0
  28. package/dist/wiki-layout.d.ts +23 -0
  29. package/dist/workspace.d.ts +111 -0
  30. package/package.json +15 -7
  31. package/src/agent.ts +0 -367
  32. package/src/artifact-store.ts +0 -184
  33. package/src/auth.ts +0 -123
  34. package/src/cli.ts +0 -1184
  35. package/src/embeddings.ts +0 -516
  36. package/src/knowledge-db.ts +0 -354
  37. package/src/manifest-ingest.ts +0 -515
  38. package/src/mcp-http.js +0 -110
  39. package/src/mcp.js +0 -1503
  40. package/src/outbox-consume.ts +0 -463
  41. package/src/provenance.ts +0 -93
  42. package/src/providers.ts +0 -308
  43. package/src/reindex.ts +0 -260
  44. package/src/remote-client.ts +0 -268
  45. package/src/retrieval.ts +0 -326
  46. package/src/safety.ts +0 -265
  47. package/src/schema.js +0 -25
  48. package/src/search.ts +0 -510
  49. package/src/service.ts +0 -443
  50. package/src/source-ingest.ts +0 -268
  51. package/src/source-ref.ts +0 -104
  52. package/src/source-resolver.ts +0 -436
  53. package/src/storage-contract.ts +0 -346
  54. package/src/store.ts +0 -113
  55. package/src/web-search.ts +0 -330
  56. package/src/wiki-compiler.ts +0 -711
  57. package/src/wiki-layout.ts +0 -251
  58. package/src/workspace.ts +0 -251
package/src/embeddings.ts DELETED
@@ -1,516 +0,0 @@
1
- import { createHash } from 'node:crypto';
2
- import type { Database } from 'bun:sqlite';
3
- import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
4
- import { assertProviderCredentials, parseModelRef, providerSettings, type AiProviderId } from './providers';
5
- import { sourceProvenance, type KnowledgeProvenance } from './provenance';
6
- import type { KnowledgeConfig } from './workspace';
7
-
8
/**
 * Options shared by every embedding entry point in this module.
 */
export interface EmbeddingRuntimeOptions {
  /** Workspace configuration; its optional `embeddings` section supplies defaults for model, dimensions and parallelism. */
  config?: KnowledgeConfig;
  /** Environment map used to look up the provider API key; `openAiEmbeddingModel` falls back to `process.env` when omitted. */
  env?: Record<string, string | undefined>;
  /** Model reference; when empty, `'default'` or `'embedding'`, the configured default model is used instead. */
  modelRef?: string;
  /** Requested vector size; falls back to the config value, then to `DEFAULT_EMBEDDING_DIMENSIONS`. */
  dimensions?: number;
  /** When true, `embedTexts` returns deterministic hash-derived vectors without calling a provider. */
  fake?: boolean;
  /** NOTE(review): declared but not read by any function in this module — confirm whether it is still needed. */
  batchSize?: number;
  /** Forwarded to `embedMany` as `maxParallelCalls`; falls back to the config's `max_parallel_calls`. */
  maxParallelCalls?: number;
}
17
-
18
/** Options accepted by `indexKnowledgeEmbeddings`. */
export interface EmbeddingIndexOptions extends EmbeddingRuntimeOptions {
  /** Path of the knowledge database handed to `migrateKnowledgeDb` and `openKnowledgeDb`. */
  dbPath: string;
  /** Maximum number of chunks to embed in one run; defaults to 100 and is clamped to 1–1000. */
  limit?: number;
  /** When set, only chunks belonging to this source revision are considered. */
  sourceRevisionId?: string;
  /** Timestamp written to the stored rows (as an ISO string); defaults to the current time. */
  now?: Date;
}
24
-
25
/** Options accepted by `searchVectorIndex`. */
export interface EmbeddingSearchOptions extends EmbeddingRuntimeOptions {
  /** Path of the knowledge database handed to `migrateKnowledgeDb` and `openKnowledgeDb`. */
  dbPath: string;
  /** Free-text query; it is embedded and compared against the stored vectors. */
  query: string;
  /** Maximum number of results; defaults to 10 and is clamped to 1–100. */
  limit?: number;
}
30
-
31
/** Token accounting for an embedding call. */
export interface EmbeddingUsage {
  /** Tokens reported by the provider (0 when unreported), or a word-count-based estimate in fake mode. */
  input_tokens: number;
}
34
-
35
/** Result of `embedTexts`. */
export interface EmbeddingVectorResult {
  /** Provider parsed from the resolved model reference. */
  provider: AiProviderId;
  /** Model identifier parsed from the resolved model reference. */
  model: string;
  /** Length of the first returned vector, or the requested dimensions when no vector was returned or fake mode is on. */
  dimensions: number;
  /** Embedding vectors; callers in this module pair them with their inputs by index. */
  vectors: number[][];
  /** Token usage for the call. */
  usage: EmbeddingUsage;
}
42
-
43
/** Summary returned by `indexKnowledgeEmbeddings`. */
export interface EmbeddingIndexResult {
  /** Provider used for the run. */
  provider: AiProviderId;
  /** Model used for the run. */
  model: string;
  /** Vector size reported by the embedding call, or the requested size when nothing was embedded. */
  dimensions: number;
  /** Number of candidate chunks selected for embedding. */
  chunks_seen: number;
  /** Set to the same count as `chunks_seen` by `indexKnowledgeEmbeddings` (0 when no candidates exist). */
  chunks_embedded: number;
  /** Value returned by `upsertVectors` for the `chunk_embeddings` table. */
  embeddings_upserted: number;
  /** Value returned by `upsertVectors` for the `vector_index_entries` table (same number as above). */
  vector_entries_upserted: number;
  /** Token usage of the embedding call. */
  usage: EmbeddingUsage;
}
53
-
54
/** Snapshot returned by `embeddingIndexStatus`. */
export interface EmbeddingStatusResult {
  /** Row count of the `chunk_embeddings` table. */
  total_embeddings: number;
  /** Row count of the `vector_index_entries` table. */
  total_vector_entries: number;
  /** One entry per (provider, model, dimensions) group in `vector_index_entries`. */
  indexes: Array<{
    provider: string;
    model: string;
    dimensions: number;
    /** Number of vector entries in the group. */
    entries: number;
    /** Latest `updated_at` value within the group. */
    updated_at: string | null;
  }>;
}
65
-
66
/** Result of `searchVectorIndex`. */
export interface SemanticSearchResult {
  /** Provider parsed from the resolved model reference. */
  provider: AiProviderId;
  /** Model parsed from the resolved model reference. */
  model: string;
  /** Dimensions reported by the query embedding. */
  dimensions: number;
  /** The query text as supplied by the caller. */
  query: string;
  /** Matches sorted by descending score and truncated to the requested limit. */
  results: Array<{
    chunk_id: string;
    /** Cosine similarity between the query vector and the stored vector. */
    score: number;
    /** Chunk text read from the `chunks` table. */
    text: string;
    source_uri: string | null;
    source_ref: string | null;
    revision: string | null;
    hash: string | null;
    /** The `provenance` object stored in the entry's metadata JSON, or null when absent or not an object. */
    provenance: KnowledgeProvenance | null;
  }>;
}
82
-
83
/**
 * Row shape returned by `selectCandidateChunks`: a chunk joined (via LEFT JOIN)
 * with its source revision and source, which is why those columns are nullable.
 */
interface CandidateChunk {
  id: string;
  /** Chunk text that gets embedded. */
  text: string;
  token_count: number | null;
  start_offset: number | null;
  end_offset: number | null;
  /** Raw JSON string; parsed with `parseJsonObject` before use. */
  metadata_json: string;
  source_revision_id: string | null;
  /** `source_revisions.revision`. */
  revision: string | null;
  /** `source_revisions.hash`. */
  hash: string | null;
  /** `sources.uri`. */
  source_uri: string | null;
  /** `sources.kind`. */
  source_kind: string | null;
}
96
-
97
/** Row shape read by `searchVectorIndex` from `vector_index_entries` joined with `chunks`. */
interface VectorRow {
  chunk_id: string;
  /** Chunk text from the `chunks` table. */
  text: string;
  /** JSON-encoded vector; parsed with `JSON.parse` at search time. */
  vector_json: string;
  /** Stored norm of the vector, passed to `cosineSimilarity` so it is not recomputed. */
  vector_norm: number;
  source_uri: string | null;
  source_ref: string | null;
  revision: string | null;
  hash: string | null;
  /** Raw JSON string; its `provenance` field is surfaced in search results. */
  metadata_json: string;
}
108
-
109
/** Model reference used when neither the caller nor the config names an embedding model. */
export const DEFAULT_EMBEDDING_MODEL_REF = 'openai:text-embedding-3-small';
/** Vector size requested when neither the caller nor the config specifies `dimensions`. */
export const DEFAULT_EMBEDDING_DIMENSIONS = 1536;
111
-
112
/**
 * Reads the optional `embeddings` section from a workspace config.
 *
 * The config type is widened locally through a cast so the section can be read.
 *
 * @param config - Workspace configuration, if any.
 * @returns The `embeddings` settings, or an empty object when the config or
 *   the section is missing.
 */
function embeddingConfig(config?: KnowledgeConfig) {
  type ConfigWithEmbeddings = KnowledgeConfig & {
    embeddings?: {
      default_model?: string;
      dimensions?: number;
      batch_size?: number;
      max_parallel_calls?: number;
    };
  };
  const widened = config as ConfigWithEmbeddings | undefined;
  const section = widened?.embeddings;
  return section ?? {};
}
122
-
123
/**
 * Derives a deterministic identifier from a prefix and an arbitrary string.
 *
 * @param prefix - Label placed before the underscore.
 * @param value - Text that is hashed with SHA-256.
 * @returns `<prefix>_<first 20 hex characters of the digest>`.
 */
function stableId(prefix: string, value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  const shortDigest = hasher.digest('hex').slice(0, 20);
  return prefix + '_' + shortDigest;
}
126
-
127
/**
 * Parses a JSON string and returns it only when it decodes to a plain object.
 *
 * @param value - JSON text, or a nullish/empty value.
 * @returns The decoded object; an empty object for empty input, invalid JSON,
 *   arrays, `null` and primitives.
 */
function parseJsonObject(value: string | null | undefined): Record<string, unknown> {
  if (!value) return {};

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is treated the same as missing metadata.
    return {};
  }

  const isPlainObject = typeof decoded === 'object' && decoded !== null && !Array.isArray(decoded);
  return isPlainObject ? (decoded as Record<string, unknown>) : {};
}
136
-
137
/**
 * Looks up the first non-empty string stored under any of the given keys.
 *
 * @param metadata - Parsed metadata object.
 * @param keys - Keys to try, in priority order.
 * @returns The first value that is a string of length > 0, or null when none matches.
 */
function metadataString(metadata: Record<string, unknown>, keys: string[]): string | null {
  const match = keys
    .map((key) => metadata[key])
    .find((candidate): candidate is string => typeof candidate === 'string' && candidate.length > 0);
  return match ?? null;
}
144
-
145
/**
 * Looks up the first finite number stored under any of the given keys.
 *
 * @param metadata - Parsed metadata object.
 * @param keys - Keys to try, in priority order.
 * @returns The first value that is a finite number (0 included), or null when none matches.
 */
function metadataNumber(metadata: Record<string, unknown>, keys: string[]): number | null {
  const match = keys
    .map((key) => metadata[key])
    .find((candidate): candidate is number => typeof candidate === 'number' && Number.isFinite(candidate));
  // `??` keeps a legitimate 0 instead of replacing it with null.
  return match ?? null;
}
152
-
153
/**
 * Computes the Euclidean (L2) norm of a vector.
 *
 * @param vector - Vector components.
 * @returns Square root of the sum of squared components; 0 for an empty vector.
 */
function vectorNorm(vector: number[]): number {
  let sumOfSquares = 0;
  vector.forEach((component) => {
    sumOfSquares += component * component;
  });
  return Math.sqrt(sumOfSquares);
}
156
-
157
/**
 * Computes the cosine similarity of two vectors.
 *
 * The dot product only covers the components both vectors have in common
 * (the shorter length), while the norms cover each vector in full.
 *
 * @param a - First vector; its norm is always computed here.
 * @param b - Second vector.
 * @param bNorm - Norm of `b`; computed from `b` when omitted.
 * @returns The similarity, or 0 when either norm is 0.
 */
function cosineSimilarity(a: number[], b: number[], bNorm = vectorNorm(b)): number {
  const aNorm = vectorNorm(a);
  const degenerate = aNorm === 0 || bNorm === 0;
  if (degenerate) return 0;

  const overlap = Math.min(a.length, b.length);
  let dotProduct = 0;
  let position = 0;
  while (position < overlap) {
    dotProduct += a[position] * b[position];
    position += 1;
  }
  return dotProduct / (aNorm * bNorm);
}
165
-
166
/**
 * Builds a reproducible pseudo-embedding from the SHA-256 digest of a text.
 *
 * Digest bytes are reused cyclically, so components repeat once `dimensions`
 * exceeds the digest length.
 *
 * @param text - Text to hash.
 * @param dimensions - Number of components to produce.
 * @returns Components in the range [-1, 1], each rounded to six decimals.
 */
function deterministicVector(text: string, dimensions: number): number[] {
  const digest = createHash('sha256').update(text).digest();

  const componentAt = (position: number): number => {
    const unit = digest[position % digest.length] / 255;
    const centered = unit * 2 - 1;
    return Number(centered.toFixed(6));
  };

  return Array.from({ length: dimensions }, (_unused, position) => componentAt(position));
}
173
-
174
/**
 * Creates an OpenAI embedding model handle through `@ai-sdk/openai`.
 *
 * The provider object is probed for the first available embedding factory
 * among `embeddingModel`, `textEmbedding` and `textEmbeddingModel`, in that
 * order.
 *
 * @param model - Embedding model identifier.
 * @param config - Workspace configuration used for credential checks and provider settings.
 * @param env - Environment map holding the API key; defaults to `process.env`.
 * @returns The model object produced by the provider factory.
 * @throws Error when the provider exposes none of the known factories; whatever
 *   `assertProviderCredentials` raises also propagates.
 */
async function openAiEmbeddingModel(model: string, config?: KnowledgeConfig, env: Record<string, string | undefined> = process.env): Promise<unknown> {
  assertProviderCredentials('openai', config, env);
  const settings = providerSettings(config, 'openai');
  const sdk = await import('@ai-sdk/openai');

  type EmbeddingFactory = (modelId: string) => unknown;
  const provider = sdk.createOpenAI({
    apiKey: env[settings.api_key_env],
    baseURL: settings.base_url,
  }) as unknown as {
    embeddingModel?: EmbeddingFactory;
    textEmbedding?: EmbeddingFactory;
    textEmbeddingModel?: EmbeddingFactory;
  };

  const factoryNames = ['embeddingModel', 'textEmbedding', 'textEmbeddingModel'] as const;
  for (const factoryName of factoryNames) {
    const factory = provider[factoryName];
    // Invoke as a method so the factory keeps the provider as its receiver.
    if (factory) return factory.call(provider, model);
  }
  throw new Error('OpenAI provider does not expose an embedding model factory.');
}
191
-
192
/**
 * Resolves the embedding model reference to use.
 *
 * @param modelRef - Explicit reference; empty, `'default'` and `'embedding'` mean "use the default".
 * @param config - Workspace configuration that may define `embeddings.default_model`.
 * @returns The explicit reference, otherwise the configured default, otherwise
 *   `DEFAULT_EMBEDDING_MODEL_REF`.
 */
export function resolveEmbeddingModelRef(modelRef?: string, config?: KnowledgeConfig): string {
  // An explicit, non-alias reference wins outright.
  if (modelRef && modelRef !== 'default' && modelRef !== 'embedding') return modelRef;

  const configured = embeddingConfig(config).default_model;
  return configured ?? DEFAULT_EMBEDDING_MODEL_REF;
}
198
-
199
/**
 * Embeds a list of texts with the resolved embedding model.
 *
 * Only the `openai` provider is accepted. In fake mode no provider is called:
 * vectors come from `deterministicVector` and token usage is estimated from
 * the whitespace-separated word count (x1.25, at least 1 per text).
 *
 * @param texts - Texts to embed.
 * @param options - Runtime options (model, dimensions, fake mode, parallelism).
 * @returns Provider, model, dimensions, vectors and token usage.
 * @throws Error when the resolved provider is not `openai`.
 */
export async function embedTexts(texts: string[], options: EmbeddingRuntimeOptions = {}): Promise<EmbeddingVectorResult> {
  const { provider, model: modelId } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  if (provider !== 'openai') {
    throw new Error(`Embedding provider ${provider} is not supported yet. Use openai:text-embedding-3-small.`);
  }

  const settings = embeddingConfig(options.config);
  const dimensions = options.dimensions ?? settings.dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS;

  if (options.fake) {
    const vectors = texts.map((text) => deterministicVector(text, dimensions));
    let estimatedTokens = 0;
    texts.forEach((text) => {
      const wordCount = text.split(/\s+/).filter(Boolean).length;
      estimatedTokens += Math.max(1, Math.ceil(wordCount * 1.25));
    });
    return {
      provider,
      model: modelId,
      dimensions,
      vectors,
      usage: { input_tokens: estimatedTokens },
    };
  }

  const { embedMany } = await import('ai');
  const embeddingModel = await openAiEmbeddingModel(modelId, options.config, options.env);
  const response = await embedMany({
    model: embeddingModel as never,
    values: texts,
    maxParallelCalls: options.maxParallelCalls ?? settings.max_parallel_calls,
    providerOptions: {
      openai: {
        dimensions,
      },
    },
  });

  const vectors = response.embeddings as number[][];
  // Report the size of the first returned vector, falling back to the requested size.
  const actualDimensions = vectors[0]?.length ?? dimensions;
  return {
    provider,
    model: modelId,
    dimensions: actualDimensions,
    vectors,
    usage: { input_tokens: response.usage?.tokens ?? 0 },
  };
}
238
-
239
/**
 * Selects chunks that have no vector index entry yet for the given provider
 * and model, oldest first.
 *
 * @param db - Open knowledge database.
 * @param options - Provider/model to check against, row limit, and an optional
 *   source revision filter.
 * @returns Chunk rows joined with their source revision and source columns.
 */
function selectCandidateChunks(db: Database, options: {
  provider: AiProviderId;
  model: string;
  limit: number;
  sourceRevisionId?: string;
}): CandidateChunk[] {
  const { provider, model, limit, sourceRevisionId } = options;

  // The anti-join (`v.id IS NULL`) keeps only chunks without an entry for this provider/model.
  const missingEntriesSql =
    `SELECT
      c.id,
      c.text,
      c.token_count,
      c.start_offset,
      c.end_offset,
      c.metadata_json,
      c.source_revision_id,
      sr.revision,
      sr.hash,
      s.uri AS source_uri,
      s.kind AS source_kind
    FROM chunks c
    LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
    LEFT JOIN sources s ON s.id = sr.source_id
    LEFT JOIN vector_index_entries v
      ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
    WHERE v.id IS NULL`;
  const orderingSql = `
    ORDER BY c.created_at ASC, c.ordinal ASC
    LIMIT ?`;

  if (!sourceRevisionId) {
    return db
      .query<CandidateChunk, [string, string, number]>(`${missingEntriesSql}${orderingSql}`)
      .all(provider, model, limit);
  }

  return db
    .query<CandidateChunk, [string, string, string, number]>(
      `${missingEntriesSql} AND c.source_revision_id = ?${orderingSql}`,
    )
    .all(provider, model, sourceRevisionId, limit);
}
276
-
277
/**
 * Returns the provenance record for a candidate chunk.
 *
 * A `provenance` object already present in the chunk metadata is returned
 * as-is. Otherwise one is built with `sourceProvenance`, preferring the joined
 * row columns and falling back to same-named metadata fields.
 *
 * @param row - Candidate chunk row.
 * @returns The stored or freshly built provenance.
 */
function provenanceForChunk(row: CandidateChunk): KnowledgeProvenance {
  const metadata = parseJsonObject(row.metadata_json);

  const stored = metadata.provenance;
  const storedIsObject = Boolean(stored) && typeof stored === 'object' && !Array.isArray(stored);
  if (storedIsObject) return stored as KnowledgeProvenance;

  const textField = (key: string): string | null => metadataString(metadata, [key]);
  const numberField = (key: string): number | null => metadataNumber(metadata, [key]);

  return sourceProvenance({
    source_ref: textField('source_ref'),
    source_uri: row.source_uri ?? textField('source_uri'),
    source_kind: row.source_kind ?? textField('source_kind'),
    source_revision_id: row.source_revision_id,
    revision: row.revision ?? textField('revision'),
    hash: row.hash ?? textField('hash'),
    chunk_id: row.id,
    start_offset: row.start_offset ?? numberField('start_offset'),
    end_offset: row.end_offset ?? numberField('end_offset'),
    status: textField('status'),
    resolver: 'open-files-read-only',
  });
}
295
-
296
/**
 * Writes one `chunk_embeddings` row and one `vector_index_entries` row for
 * every embedded chunk, inside a single transaction.
 *
 * Rows are paired with `embedding.vectors` by index. A chunk that has no
 * vector at its index is skipped and is NOT counted in the return value, so
 * the result reflects what was really written rather than `rows.length`.
 *
 * @param db - Open knowledge database.
 * @param rows - Candidate chunks, in the same order as the embedded texts.
 * @param embedding - Embedding result holding the vectors for `rows`.
 * @param now - ISO timestamp stored as `created_at` / `updated_at` / `embedded_at`.
 * @returns Number of chunks written (each one gets a row in both tables).
 */
function upsertVectors(db: Database, rows: CandidateChunk[], embedding: EmbeddingVectorResult, now: string): number {
  const insertEmbedding = db.prepare(`
    INSERT INTO chunk_embeddings (id, chunk_id, provider, model, dimensions, vector_json, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      created_at = excluded.created_at
  `);
  const insertVector = db.prepare(`
    INSERT INTO vector_index_entries (
      id, chunk_id, source_revision_id, provider, model, dimensions, vector_json, vector_norm,
      source_uri, source_ref, revision, hash, start_offset, end_offset, token_count, status,
      metadata_json, created_at, updated_at
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      source_revision_id = excluded.source_revision_id,
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      vector_norm = excluded.vector_norm,
      source_uri = excluded.source_uri,
      source_ref = excluded.source_ref,
      revision = excluded.revision,
      hash = excluded.hash,
      start_offset = excluded.start_offset,
      end_offset = excluded.end_offset,
      token_count = excluded.token_count,
      status = excluded.status,
      metadata_json = excluded.metadata_json,
      updated_at = excluded.updated_at
  `);

  // Counted inside the transaction; if the transaction throws, the error
  // propagates and this value is never returned.
  let written = 0;

  const write = db.transaction(() => {
    for (let index = 0; index < rows.length; index += 1) {
      const row = rows[index];
      const vector = embedding.vectors[index];
      // The provider returned fewer vectors than chunks: leave this chunk for a later run.
      if (!vector) continue;

      const metadata = parseJsonObject(row.metadata_json);
      const provenance = provenanceForChunk(row);
      const sourceRef = provenance.source_ref ?? metadataString(metadata, ['source_ref']);
      const sourceUri = provenance.source_uri ?? row.source_uri ?? metadataString(metadata, ['source_uri']);
      const revision = provenance.revision ?? row.revision ?? metadataString(metadata, ['revision']);
      const hash = provenance.hash ?? row.hash ?? metadataString(metadata, ['hash']);
      const status = provenance.status ?? metadataString(metadata, ['status']) ?? 'active';
      const vectorJson = JSON.stringify(vector);
      // Both ids derive from the same (chunk, provider, model) key; only the prefix differs.
      const identityKey = `${row.id}\u0000${embedding.provider}\u0000${embedding.model}`;

      insertEmbedding.run(
        stableId('emb', identityKey),
        row.id,
        embedding.provider,
        embedding.model,
        embedding.dimensions,
        vectorJson,
        now,
      );
      insertVector.run(
        stableId('vec', identityKey),
        row.id,
        row.source_revision_id,
        embedding.provider,
        embedding.model,
        embedding.dimensions,
        vectorJson,
        vectorNorm(vector),
        sourceUri,
        sourceRef,
        revision,
        hash,
        provenance.start_offset,
        provenance.end_offset,
        row.token_count,
        status,
        JSON.stringify({
          ...metadata,
          provenance,
          embedded_at: now,
        }),
        now,
        now,
      );
      written += 1;
    }
  });
  write();
  return written;
}
381
-
382
/**
 * Embeds chunks that are not yet indexed for the resolved provider/model and
 * stores the resulting vectors.
 *
 * The database is migrated, opened once to select candidates, closed while the
 * embedding call runs, and opened again for the write.
 *
 * @param options - Database path, model/runtime options, row limit
 *   (default 100, clamped to 1–1000), optional source revision filter and timestamp.
 * @returns Counts of chunks seen/embedded/upserted plus token usage; all
 *   counts are 0 when there is nothing to index.
 * @throws Error when the resolved provider is not `openai`.
 */
export async function indexKnowledgeEmbeddings(options: EmbeddingIndexOptions): Promise<EmbeddingIndexResult> {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  if (provider !== 'openai') throw new Error(`Embedding provider ${provider} is not supported yet.`);

  const timestamp = (options.now ?? new Date()).toISOString();
  const requestedLimit = options.limit ?? 100;
  const limit = Math.max(1, Math.min(requestedLimit, 1000));

  migrateKnowledgeDb(options.dbPath);

  let candidates: CandidateChunk[];
  const readDb = openKnowledgeDb(options.dbPath);
  try {
    candidates = selectCandidateChunks(readDb, {
      provider,
      model,
      limit,
      sourceRevisionId: options.sourceRevisionId,
    });
  } finally {
    readDb.close();
  }

  const nothingToIndex = candidates.length === 0;
  if (nothingToIndex) {
    const requestedDimensions =
      options.dimensions ?? embeddingConfig(options.config).dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS;
    return {
      provider,
      model,
      dimensions: requestedDimensions,
      chunks_seen: 0,
      chunks_embedded: 0,
      embeddings_upserted: 0,
      vector_entries_upserted: 0,
      usage: { input_tokens: 0 },
    };
  }

  const chunkTexts = candidates.map((candidate) => candidate.text);
  const embedding = await embedTexts(chunkTexts, options);

  const writeDb = openKnowledgeDb(options.dbPath);
  try {
    const upserted = upsertVectors(writeDb, candidates, embedding, timestamp);
    const summary: EmbeddingIndexResult = {
      provider: embedding.provider,
      model: embedding.model,
      dimensions: embedding.dimensions,
      chunks_seen: candidates.length,
      chunks_embedded: candidates.length,
      embeddings_upserted: upserted,
      vector_entries_upserted: upserted,
      usage: embedding.usage,
    };
    return summary;
  } finally {
    writeDb.close();
  }
}
433
-
434
/**
 * Reports how many embeddings and vector index entries the database holds.
 *
 * @param dbPath - Path of the knowledge database; it is migrated before being read.
 * @returns Table totals plus one summary per (provider, model, dimensions)
 *   group, ordered by provider and model.
 */
export function embeddingIndexStatus(dbPath: string): EmbeddingStatusResult {
  migrateKnowledgeDb(dbPath);
  const db = openKnowledgeDb(dbPath);
  try {
    // Runs a `COUNT(*) AS n` query and treats a missing row as 0.
    const countRows = (sql: string): number => db.query<{ n: number }, []>(sql).get()?.n ?? 0;

    const embeddingCount = countRows('SELECT COUNT(*) AS n FROM chunk_embeddings');
    const vectorEntryCount = countRows('SELECT COUNT(*) AS n FROM vector_index_entries');

    type IndexSummary = {
      provider: string;
      model: string;
      dimensions: number;
      entries: number;
      updated_at: string | null;
    };
    const groupedSql =
      `SELECT provider, model, dimensions, COUNT(*) AS entries, MAX(updated_at) AS updated_at
       FROM vector_index_entries
       GROUP BY provider, model, dimensions
       ORDER BY provider, model`;
    const indexes = db.query<IndexSummary, []>(groupedSql).all();

    return {
      total_embeddings: embeddingCount,
      total_vector_entries: vectorEntryCount,
      indexes,
    };
  } finally {
    db.close();
  }
}
461
-
462
/**
 * Runs a brute-force semantic search over the stored vectors.
 *
 * The query is embedded first; every `active` vector entry for the resolved
 * provider/model is then scored with cosine similarity, sorted by descending
 * score and cut to the limit.
 *
 * @param options - Database path, query text, model/runtime options and
 *   result limit (default 10, clamped to 1–100).
 * @returns The scored matches together with the provider, model, dimensions and query.
 */
export async function searchVectorIndex(options: EmbeddingSearchOptions): Promise<SemanticSearchResult> {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  const requestedLimit = options.limit ?? 10;
  const limit = Math.max(1, Math.min(requestedLimit, 100));

  const embedded = await embedTexts([options.query], options);
  const queryVector = embedded.vectors[0] ?? [];

  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    const activeEntriesSql =
      `SELECT
         v.chunk_id,
         c.text,
         v.vector_json,
         v.vector_norm,
         v.source_uri,
         v.source_ref,
         v.revision,
         v.hash,
         v.metadata_json
       FROM vector_index_entries v
       JOIN chunks c ON c.id = v.chunk_id
       WHERE v.provider = ? AND v.model = ? AND v.status = 'active'`;
    const rows = db.query<VectorRow, [string, string]>(activeEntriesSql).all(provider, model);

    const hits: SemanticSearchResult['results'] = [];
    for (const row of rows) {
      const storedVector = JSON.parse(row.vector_json) as number[];
      const storedProvenance = parseJsonObject(row.metadata_json).provenance;
      const hasProvenance =
        Boolean(storedProvenance) && typeof storedProvenance === 'object' && !Array.isArray(storedProvenance);
      hits.push({
        chunk_id: row.chunk_id,
        // The stored norm is reused so it is not recomputed per row.
        score: cosineSimilarity(queryVector, storedVector, row.vector_norm),
        text: row.text,
        source_uri: row.source_uri,
        source_ref: row.source_ref,
        revision: row.revision,
        hash: row.hash,
        provenance: hasProvenance ? (storedProvenance as KnowledgeProvenance) : null,
      });
    }
    hits.sort((left, right) => right.score - left.score);

    return {
      provider,
      model,
      dimensions: embedded.dimensions,
      query: options.query,
      results: hits.slice(0, limit),
    };
  } finally {
    db.close();
  }
}