@hasna/knowledge 0.2.12 → 0.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13660,7 +13660,7 @@ import { existsSync as existsSync7, readFileSync as readFileSync7, writeFileSync
13660
13660
  // package.json
13661
13661
  var package_default = {
13662
13662
  name: "@hasna/knowledge",
13663
- version: "0.2.12",
13663
+ version: "0.2.14",
13664
13664
  description: "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
13665
13665
  type: "module",
13666
13666
  bin: {
@@ -13790,6 +13790,12 @@ function defaultKnowledgeConfig() {
13790
13790
  default_model: "deepseek-chat"
13791
13791
  }
13792
13792
  },
13793
+ embeddings: {
13794
+ default_model: "openai:text-embedding-3-small",
13795
+ dimensions: 1536,
13796
+ batch_size: 64,
13797
+ max_parallel_calls: 4
13798
+ },
13793
13799
  safety: {
13794
13800
  network: {
13795
13801
  web_search_enabled: false,
@@ -14128,10 +14134,8 @@ function createArtifactStore(config2, workspace) {
14128
14134
  return new LocalArtifactStore(workspace.artifactsDir);
14129
14135
  }
14130
14136
 
14131
- // src/outbox-consume.ts
14132
- import { createHash as createHash2, randomUUID as randomUUID3 } from "crypto";
14133
- import { existsSync as existsSync4, readFileSync as readFileSync4 } from "fs";
14134
- import { basename } from "path";
14137
+ // src/embeddings.ts
14138
+ import { createHash } from "crypto";
14135
14139
 
14136
14140
  // src/knowledge-db.ts
14137
14141
  import { Database } from "bun:sqlite";
@@ -14349,10 +14353,43 @@ CREATE INDEX IF NOT EXISTS idx_approval_gates_status ON approval_gates(status);
14349
14353
  INSERT OR IGNORE INTO schema_versions(version, applied_at)
14350
14354
  VALUES (3, datetime('now'));
14351
14355
  `;
14356
// Schema migration 4: creates the vector_index_entries table (one row per
// chunk/provider/model, enforced by the UNIQUE constraint), its four lookup
// indexes, and records schema version 4. Every statement is idempotent
// (IF NOT EXISTS / INSERT OR IGNORE), so re-running the script is harmless.
var MIGRATION_4 = `
CREATE TABLE IF NOT EXISTS vector_index_entries (
id TEXT PRIMARY KEY,
chunk_id TEXT NOT NULL REFERENCES chunks(id) ON DELETE CASCADE,
source_revision_id TEXT REFERENCES source_revisions(id) ON DELETE CASCADE,
provider TEXT NOT NULL,
model TEXT NOT NULL,
dimensions INTEGER NOT NULL,
vector_json TEXT NOT NULL,
vector_norm REAL NOT NULL,
source_uri TEXT,
source_ref TEXT,
revision TEXT,
hash TEXT,
start_offset INTEGER,
end_offset INTEGER,
token_count INTEGER,
status TEXT NOT NULL DEFAULT 'active',
metadata_json TEXT NOT NULL DEFAULT '{}',
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
UNIQUE(chunk_id, provider, model)
);

CREATE INDEX IF NOT EXISTS idx_vector_index_provider_model ON vector_index_entries(provider, model);
CREATE INDEX IF NOT EXISTS idx_vector_index_source_revision ON vector_index_entries(source_revision_id);
CREATE INDEX IF NOT EXISTS idx_vector_index_source_uri ON vector_index_entries(source_uri);
CREATE INDEX IF NOT EXISTS idx_vector_index_status ON vector_index_entries(status);

INSERT OR IGNORE INTO schema_versions(version, applied_at)
VALUES (4, datetime('now'));
`;
14352
14388
  function openKnowledgeDb(path) {
14353
14389
  ensureParentDir(path);
14354
14390
  const db = new Database(path);
14355
14391
  db.exec("PRAGMA foreign_keys = ON;");
14392
+ db.exec("PRAGMA busy_timeout = 5000;");
14356
14393
  return db;
14357
14394
  }
14358
14395
  function migrateKnowledgeDb(path) {
@@ -14363,6 +14400,8 @@ function migrateKnowledgeDb(path) {
14363
14400
  db.exec(MIGRATION_2);
14364
14401
  if (getSchemaVersion(db) < 3)
14365
14402
  db.exec(MIGRATION_3);
14403
+ if (getSchemaVersion(db) < 4)
14404
+ db.exec(MIGRATION_4);
14366
14405
  return { path, schema_version: getSchemaVersion(db) };
14367
14406
  } finally {
14368
14407
  db.close();
@@ -14392,15 +14431,530 @@ function getKnowledgeDbStats(path) {
14392
14431
  redaction_findings: count(db, "redaction_findings"),
14393
14432
  audit_events: count(db, "audit_events"),
14394
14433
  approval_gates: count(db, "approval_gates"),
14395
- storage_objects: count(db, "storage_objects")
14434
+ storage_objects: count(db, "storage_objects"),
14435
+ embeddings: count(db, "chunk_embeddings"),
14436
+ vector_entries: count(db, "vector_index_entries")
14396
14437
  };
14397
14438
  } finally {
14398
14439
  db.close();
14399
14440
  }
14400
14441
  }
14401
14442
 
14443
+ // src/providers.ts
14444
// Built-in per-provider settings: the environment variable that holds the API
// key and the model used when none is configured. Values from the user's
// config are layered on top of these by providerSettings().
var DEFAULT_PROVIDER_SETTINGS = {
  openai: { api_key_env: "OPENAI_API_KEY", default_model: "gpt-5.2" },
  anthropic: { api_key_env: "ANTHROPIC_API_KEY", default_model: "claude-sonnet-4-6" },
  deepseek: { api_key_env: "DEEPSEEK_API_KEY", default_model: "deepseek-chat" }
};
14458
// Static capability matrix, keyed by provider name. Each entry is attached
// verbatim to the rows produced by listModelRegistry(), so the key order
// below is also the order in which the flags appear in serialized output.
var PROVIDER_CAPABILITIES = {
  // The only provider flagged with embeddings and native web search.
  openai: {
    text_generation: true,
    structured_output: true,
    tool_usage: true,
    tool_streaming: true,
    image_input: true,
    native_web_search: true,
    reasoning: true,
    embeddings: true
  },
  // Image input, but neither native web search nor embeddings.
  anthropic: {
    text_generation: true,
    structured_output: true,
    tool_usage: true,
    tool_streaming: true,
    image_input: true,
    native_web_search: false,
    reasoning: true,
    embeddings: false
  },
  // Text-only: no image input, native web search, or embeddings.
  deepseek: {
    text_generation: true,
    structured_output: true,
    tool_usage: true,
    tool_streaming: true,
    image_input: false,
    native_web_search: false,
    reasoning: true,
    embeddings: false
  }
};
14490
// Short alias -> "provider:model" reference. modelAliases() merges these with
// any aliases from the user's config, which take precedence.
var BUILTIN_ALIASES = {
  default: "openai:gpt-5.2",
  fast: "openai:gpt-5-mini",
  reasoning: "anthropic:claude-opus-4-6",
  sonnet: "anthropic:claude-sonnet-4-6",
  deepseek: "deepseek:deepseek-chat",
  "deepseek-reasoning": "deepseek:deepseek-reasoner"
};
14498
/**
 * Returns the `providers` section of the knowledge config.
 *
 * @param config2 Knowledge config object (must not be null/undefined).
 * @returns The configured `providers` object, or a fresh empty object when
 *   the section is missing or null.
 */
function providerConfig(config2) {
  const { providers } = config2;
  return providers ?? {};
}
14501
/**
 * Resolves the effective settings for one provider.
 *
 * @param config2 Knowledge config object.
 * @param provider Provider name used as the lookup key.
 * @returns A new object containing the built-in defaults for the provider,
 *   overridden key-by-key by whatever the config declares for it.
 */
function providerSettings(config2, provider) {
  const overrides = providerConfig(config2)[provider] ?? {};
  const defaults = DEFAULT_PROVIDER_SETTINGS[provider];
  // Spread order matters: configured values win over defaults.
  return { ...defaults, ...overrides };
}
14508
/**
 * Builds the alias table used to resolve short model names.
 *
 * Precedence, lowest to highest: built-in aliases, the configured
 * `providers.default_model` (exposed as the `default` alias), then the
 * configured `providers.aliases`.
 *
 * @param config2 Knowledge config object.
 * @returns A new alias -> model-ref object.
 */
function modelAliases(config2) {
  const providers = providerConfig(config2);
  const defaultOverride = providers.default_model ? { default: providers.default_model } : {};
  const userAliases = providers.aliases ?? {};
  return { ...BUILTIN_ALIASES, ...defaultOverride, ...userAliases };
}
14516
/**
 * Splits a `provider:model` reference into its two parts.
 *
 * Only the first `:` separates provider from model, so model names that
 * themselves contain colons are preserved intact.
 *
 * @param modelRef Reference such as `openai:text-embedding-3-small`.
 * @returns `{ provider, model }`.
 * @throws {Error} "Invalid model ref" when the value is not a string, has no
 *   `:` separator, or has an empty model part.
 * @throws {Error} "Unsupported AI provider" when the provider part is not
 *   `openai`, `anthropic`, or `deepseek`.
 */
function parseModelRef(modelRef) {
  if (typeof modelRef !== "string") {
    throw new Error(`Invalid model ref: ${String(modelRef)}. Expected provider:model.`);
  }
  const separator = modelRef.indexOf(":");
  // A bare name such as "gpt-5.2" is a malformed ref, not an unknown provider;
  // report it as such so the message tells the user what shape is expected.
  if (separator === -1) {
    throw new Error(`Invalid model ref: ${modelRef}. Expected provider:model.`);
  }
  const provider = modelRef.slice(0, separator);
  const model = modelRef.slice(separator + 1);
  if (provider !== "openai" && provider !== "anthropic" && provider !== "deepseek") {
    throw new Error(`Unsupported AI provider: ${provider}`);
  }
  if (!model) {
    throw new Error(`Invalid model ref: ${modelRef}. Expected provider:model.`);
  }
  return { provider, model };
}
14526
/**
 * Resolves an alias to its model reference.
 *
 * @param aliasOrRef Either an alias name or an already-qualified model ref.
 * @param config2 Knowledge config object.
 * @returns The aliased model ref when `aliasOrRef` is a known alias,
 *   otherwise `aliasOrRef` unchanged.
 */
function resolveModelRef(aliasOrRef, config2) {
  const aliases = modelAliases(config2);
  // Own-property check: a plain `aliases[key]` lookup would also find
  // inherited members such as "constructor" or "toString" and return a
  // function instead of a string.
  const isAlias = Object.prototype.hasOwnProperty.call(aliases, aliasOrRef);
  return isAlias ? aliases[aliasOrRef] ?? aliasOrRef : aliasOrRef;
}
14530
/**
 * Lists every known alias together with its parsed model ref and the
 * capability flags of its provider.
 *
 * @param config2 Knowledge config object.
 * @returns One entry per alias, in alias-table order.
 * @throws {Error} Propagated from parseModelRef when an alias points at a
 *   malformed or unsupported model ref.
 */
function listModelRegistry(config2) {
  const registry = [];
  for (const [alias, modelRef] of Object.entries(modelAliases(config2))) {
    const { provider, model } = parseModelRef(modelRef);
    registry.push({
      alias,
      model_ref: modelRef,
      provider,
      model,
      default: alias === "default",
      capabilities: PROVIDER_CAPABILITIES[provider]
    });
  }
  return registry;
}
14544
/**
 * Reports, for every built-in provider, whether its API key environment
 * variable is set.
 *
 * @param config2 Knowledge config object.
 * @param env Environment map to inspect; defaults to `process.env`.
 * @returns One status entry per provider. `configured` is true when the
 *   variable holds a non-empty value; the key itself is never included.
 */
function providerCredentialStatus(config2, env = process.env) {
  const statuses = [];
  for (const provider of Object.keys(DEFAULT_PROVIDER_SETTINGS)) {
    const settings = providerSettings(config2, provider);
    const hasKey = Boolean(env[settings.api_key_env]);
    statuses.push({
      provider,
      api_key_env: settings.api_key_env,
      configured: hasKey,
      source: hasKey ? "env" : "missing",
      base_url: settings.base_url ?? null,
      default_model: settings.default_model
    });
  }
  return statuses;
}
14558
/**
 * Assembles the full provider overview: the resolved default model, the
 * credential status of each provider, and the model registry.
 *
 * @param config2 Knowledge config object.
 * @param env Environment map to inspect; defaults to `process.env`.
 * @returns `{ default_model, providers, models }`.
 */
function providerStatus(config2, env = process.env) {
  const defaultModel = resolveModelRef("default", config2);
  const providers = providerCredentialStatus(config2, env);
  const models = listModelRegistry(config2);
  return { default_model: defaultModel, providers, models };
}
14565
/**
 * Ensures the given provider is known and has its API key set.
 *
 * @param provider Provider name to check.
 * @param config2 Knowledge config object.
 * @param env Environment map to inspect; defaults to `process.env`.
 * @returns The provider's credential status entry.
 * @throws {Error} When the provider is not one of the built-in providers, or
 *   when its API key environment variable is not set.
 */
function assertProviderCredentials(provider, config2, env = process.env) {
  const match = providerCredentialStatus(config2, env).find(({ provider: name }) => name === provider);
  if (match === undefined) {
    throw new Error(`Unsupported AI provider: ${provider}`);
  }
  if (match.configured) {
    return match;
  }
  throw new Error(`Missing ${match.api_key_env} for ${provider}. Set the env var to use this provider.`);
}
14573
+
14574
+ // src/provenance.ts
14575
/**
 * Tells whether a status string marks content as no longer current.
 *
 * @param status Status value; null/undefined is treated as an empty string.
 * @returns True for `deleted`, `stale`, `invalidated`, or `reindex_required`
 *   (compared case-insensitively), false otherwise.
 */
function isStaleStatus(status) {
  switch ((status ?? "").toLowerCase()) {
    case "deleted":
    case "stale":
    case "invalidated":
    case "reindex_required":
      return true;
    default:
      return false;
  }
}
14578
/**
 * Builds a normalized provenance record for a source chunk.
 *
 * Every optional input field is copied through with missing values replaced
 * by null; `source_owner`, `read_only` and `citation_required` are fixed, and
 * `stale` is derived from the status via isStaleStatus().
 *
 * @param input Object carrying any of the provenance fields.
 * @returns The provenance record.
 */
function sourceProvenance(input) {
  const orNull = (value) => value ?? null;
  const status = orNull(input.status);
  return {
    source_owner: "open-files",
    source_ref: orNull(input.source_ref),
    source_uri: orNull(input.source_uri),
    source_kind: orNull(input.source_kind),
    source_revision_id: orNull(input.source_revision_id),
    revision: orNull(input.revision),
    hash: orNull(input.hash),
    chunk_id: orNull(input.chunk_id),
    start_offset: orNull(input.start_offset),
    end_offset: orNull(input.end_offset),
    status,
    read_only: true,
    citation_required: true,
    resolver: orNull(input.resolver),
    stale: isStaleStatus(status)
  };
}
14598
/**
 * Builds the provenance record attached to a generated artifact.
 *
 * @param input Object with `generated_from`, `artifact_key`, and optionally
 *   `source_refs` (defaults to an empty array) and `citation_required`
 *   (defaults to true).
 * @returns The provenance record; the ownership and read-only flags are fixed.
 */
function generatedArtifactProvenance(input) {
  const { generated_from, artifact_key, source_refs, citation_required } = input;
  return {
    source_owner: "open-files",
    generated_from,
    artifact_key,
    source_refs: source_refs ?? [],
    read_only_sources: true,
    citation_required: citation_required ?? true,
    raw_source_bytes_stored_in_open_knowledge: false
  };
}
14609
/**
 * Returns a shallow copy of `metadata` with its `provenance` key set.
 *
 * @param metadata Metadata object to copy (not mutated).
 * @param provenance Value stored under `provenance`, replacing any existing one.
 * @returns The new metadata object.
 */
function withProvenance(metadata, provenance) {
  const merged = { ...metadata };
  merged.provenance = provenance;
  return merged;
}
14615
+
14616
+ // src/embeddings.ts
14617
// Fallbacks used when neither the caller nor the config's `embeddings`
// section names a model ref or a vector dimensionality.
var DEFAULT_EMBEDDING_MODEL_REF = "openai:text-embedding-3-small";
var DEFAULT_EMBEDDING_DIMENSIONS = 1536;
14619
/**
 * Returns the `embeddings` section of the knowledge config.
 *
 * @param config2 Knowledge config object; may be null/undefined.
 * @returns The configured section, or a fresh empty object when the config
 *   or the section is missing.
 */
function embeddingConfig(config2) {
  const section = config2 == null ? undefined : config2.embeddings;
  return section ?? {};
}
14622
/**
 * Derives a deterministic identifier from a value.
 *
 * @param prefix Text placed before the underscore.
 * @param value Data hashed with SHA-256.
 * @returns `<prefix>_<first 20 hex characters of the SHA-256 digest>`.
 */
function stableId(prefix, value) {
  const digestHex = createHash("sha256").update(value).digest("hex");
  const shortDigest = digestHex.slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
14625
/**
 * Leniently parses a JSON string that is expected to hold an object.
 *
 * @param value JSON text; falsy values are treated as "no data".
 * @returns The parsed value when it is a non-array object; otherwise an empty
 *   object (including for falsy input, invalid JSON, arrays, and primitives).
 */
function parseJsonObject(value) {
  if (!value) {
    return {};
  }
  let decoded;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Malformed JSON is deliberately treated the same as missing metadata.
    return {};
  }
  const isPlainObject = decoded && typeof decoded === "object" && !Array.isArray(decoded);
  return isPlainObject ? decoded : {};
}
14635
/**
 * Finds the first usable string among several metadata keys.
 *
 * @param metadata Object to read from.
 * @param keys Candidate keys, tried in order.
 * @returns The first value that is a non-empty string, or null when none is.
 */
function metadataString(metadata, keys) {
  for (const key of keys) {
    const candidate = metadata[key];
    if (typeof candidate !== "string" || candidate.length === 0) {
      continue;
    }
    return candidate;
  }
  return null;
}
14643
/**
 * Finds the first usable number among several metadata keys.
 *
 * @param metadata Object to read from.
 * @param keys Candidate keys, tried in order.
 * @returns The first value that is a finite number, or null when none is
 *   (NaN, Infinity, and numeric strings do not count).
 */
function metadataNumber(metadata, keys) {
  for (const key of keys) {
    const candidate = metadata[key];
    if (typeof candidate !== "number" || !Number.isFinite(candidate)) {
      continue;
    }
    return candidate;
  }
  return null;
}
14651
/**
 * Computes the Euclidean (L2) length of a vector.
 *
 * @param vector Array of numbers.
 * @returns The square root of the sum of squared components; 0 for an empty
 *   array.
 */
function vectorNorm(vector) {
  let sumOfSquares = 0;
  vector.forEach((component) => {
    sumOfSquares += component * component;
  });
  return Math.sqrt(sumOfSquares);
}
14654
/**
 * Computes the cosine similarity of two vectors.
 *
 * The dot product covers only the components both vectors have (the shorter
 * length), while each norm covers the whole of its own vector.
 *
 * @param a First vector.
 * @param b Second vector.
 * @param bNorm Length of `b`; computed from `b` when omitted, so callers that
 *   already store the norm can skip the recomputation.
 * @returns `dot(a, b) / (|a| * |b|)`, or 0 when either norm is 0.
 */
function cosineSimilarity(a, b, bNorm = vectorNorm(b)) {
  const aNorm = vectorNorm(a);
  const hasZeroNorm = aNorm === 0 || bNorm === 0;
  if (hasZeroNorm) {
    return 0;
  }
  const overlap = Math.min(a.length, b.length);
  let dot = 0;
  let position = 0;
  while (position < overlap) {
    dot += a[position] * b[position];
    position += 1;
  }
  return dot / (aNorm * bNorm);
}
14664
/**
 * Produces a repeatable pseudo-embedding for a text without calling a
 * provider (used by the `fake` path of embedTexts).
 *
 * Components come from the bytes of the text's SHA-256 digest, cycling
 * through the digest when more components are requested than it has bytes.
 *
 * @param text Text to hash.
 * @param dimensions Number of components to generate.
 * @returns Array of `dimensions` numbers in [-1, 1], rounded to 6 decimals.
 */
function deterministicVector(text, dimensions) {
  const digest = createHash("sha256").update(text).digest();
  const componentAt = (_, position) => {
    const unit = digest[position % digest.length] / 255;
    // Map the byte from [0, 1] onto [-1, 1].
    const centered = unit * 2 - 1;
    return Number(centered.toFixed(6));
  };
  return Array.from({ length: dimensions }, componentAt);
}
14671
/**
 * Creates an OpenAI embedding model handle through `@ai-sdk/openai`.
 *
 * The SDK is imported lazily, after the credential check. The provider object
 * is probed for `embeddingModel`, `textEmbedding`, then `textEmbeddingModel`,
 * and the first factory that exists is used.
 *
 * @param model Embedding model name (without the provider prefix).
 * @param config2 Knowledge config object.
 * @param env Environment map holding the API key; defaults to `process.env`.
 * @returns The embedding model object produced by the SDK.
 * @throws {Error} When the OpenAI credentials are missing, or when the
 *   provider exposes none of the known embedding factories.
 */
async function openAiEmbeddingModel(model, config2, env = process.env) {
  assertProviderCredentials("openai", config2, env);
  const { api_key_env: apiKeyEnv, base_url: baseUrl } = providerSettings(config2, "openai");
  const { createOpenAI } = await import("@ai-sdk/openai");
  const client = createOpenAI({ apiKey: env[apiKeyEnv], baseURL: baseUrl });
  const factoryNames = ["embeddingModel", "textEmbedding", "textEmbeddingModel"];
  for (const factoryName of factoryNames) {
    if (client[factoryName]) {
      return client[factoryName](model);
    }
  }
  throw new Error("OpenAI provider does not expose an embedding model factory.");
}
14687
/**
 * Picks the embedding model ref to use.
 *
 * @param modelRef Requested ref; a falsy value, `"default"`, or `"embedding"`
 *   selects the default.
 * @param config2 Knowledge config object; may be null/undefined.
 * @returns `modelRef` unchanged when it names a specific model, otherwise the
 *   config's `embeddings.default_model` or the built-in default.
 */
function resolveEmbeddingModelRef(modelRef, config2) {
  const wantsDefault = !modelRef || modelRef === "default" || modelRef === "embedding";
  if (!wantsDefault) {
    return modelRef;
  }
  return embeddingConfig(config2).default_model ?? DEFAULT_EMBEDDING_MODEL_REF;
}
14693
/**
 * Embeds a list of texts with the selected embedding model.
 *
 * With `options.fake` set, vectors are generated locally by
 * deterministicVector() and token usage is estimated from the whitespace
 * word count (x1.25, at least 1 per text); no provider is contacted.
 * Otherwise the texts are sent through the `ai` package's `embedMany`.
 *
 * @param texts Array of strings to embed.
 * @param options Optional settings: `modelRef`, `config`, `dimensions`,
 *   `fake`, `maxParallelCalls`, `env`.
 * @returns `{ provider, model, dimensions, vectors, usage: { input_tokens } }`.
 *   On the real path `dimensions` is the length of the first returned vector
 *   (falling back to the requested value when there is none).
 * @throws {Error} When the model ref is invalid or its provider is not
 *   `openai`; credential and provider errors propagate on the real path.
 */
async function embedTexts(texts, options = {}) {
  const { provider, model: modelName } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  if (provider !== "openai") {
    throw new Error(`Embedding provider ${provider} is not supported yet. Use openai:text-embedding-3-small.`);
  }
  const settings = embeddingConfig(options.config);
  const dimensions = options.dimensions ?? settings.dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS;

  if (options.fake) {
    const estimateTokens = (text) => Math.max(1, Math.ceil(text.split(/\s+/).filter(Boolean).length * 1.25));
    const fakeVectors = texts.map((text) => deterministicVector(text, dimensions));
    const inputTokens = texts.reduce((total, text) => total + estimateTokens(text), 0);
    return {
      provider,
      model: modelName,
      dimensions,
      vectors: fakeVectors,
      usage: { input_tokens: inputTokens }
    };
  }

  const { embedMany } = await import("ai");
  const embeddingModel = await openAiEmbeddingModel(modelName, options.config, options.env);
  const response = await embedMany({
    model: embeddingModel,
    values: texts,
    maxParallelCalls: options.maxParallelCalls ?? settings.max_parallel_calls,
    providerOptions: {
      openai: { dimensions }
    }
  });
  const vectors = response.embeddings;
  return {
    provider,
    model: modelName,
    dimensions: vectors[0]?.length ?? dimensions,
    vectors,
    usage: { input_tokens: response.usage?.tokens ?? 0 }
  };
}
14730
+ function selectCandidateChunks(db, options) {
14731
+ const baseQuery = `SELECT
14732
+ c.id,
14733
+ c.text,
14734
+ c.token_count,
14735
+ c.start_offset,
14736
+ c.end_offset,
14737
+ c.metadata_json,
14738
+ c.source_revision_id,
14739
+ sr.revision,
14740
+ sr.hash,
14741
+ s.uri AS source_uri,
14742
+ s.kind AS source_kind
14743
+ FROM chunks c
14744
+ LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
14745
+ LEFT JOIN sources s ON s.id = sr.source_id
14746
+ LEFT JOIN vector_index_entries v
14747
+ ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
14748
+ WHERE v.id IS NULL`;
14749
+ const suffix = `
14750
+ ORDER BY c.created_at ASC, c.ordinal ASC
14751
+ LIMIT ?`;
14752
+ if (options.sourceRevisionId) {
14753
+ return db.query(`${baseQuery} AND c.source_revision_id = ?${suffix}`).all(options.provider, options.model, options.sourceRevisionId, options.limit);
14754
+ }
14755
+ return db.query(`${baseQuery}${suffix}`).all(options.provider, options.model, options.limit);
14756
+ }
14757
/**
 * Returns the provenance record for a chunk row.
 *
 * A `provenance` object already stored in the chunk's metadata is returned
 * as-is. Otherwise one is built with sourceProvenance(), preferring the row's
 * joined columns and falling back to same-named metadata fields.
 *
 * @param row Chunk row as returned by selectCandidateChunks().
 * @returns The stored or freshly built provenance object.
 */
function provenanceForChunk(row) {
  const metadata = parseJsonObject(row.metadata_json);
  const stored = metadata.provenance;
  const hasStored = stored && typeof stored === "object" && !Array.isArray(stored);
  if (hasStored) {
    return stored;
  }
  const textFromMetadata = (key) => metadataString(metadata, [key]);
  const numberFromMetadata = (key) => metadataNumber(metadata, [key]);
  return sourceProvenance({
    source_ref: textFromMetadata("source_ref"),
    source_uri: row.source_uri ?? textFromMetadata("source_uri"),
    source_kind: row.source_kind ?? textFromMetadata("source_kind"),
    source_revision_id: row.source_revision_id,
    revision: row.revision ?? textFromMetadata("revision"),
    hash: row.hash ?? textFromMetadata("hash"),
    chunk_id: row.id,
    start_offset: row.start_offset ?? numberFromMetadata("start_offset"),
    end_offset: row.end_offset ?? numberFromMetadata("end_offset"),
    status: textFromMetadata("status"),
    resolver: "open-files-read-only"
  });
}
14776
/**
 * Persists embeddings for a batch of chunks inside one transaction.
 *
 * For every row that has a vector at the same index in `embedding.vectors`,
 * one row is upserted into chunk_embeddings and one into vector_index_entries
 * (both keyed on chunk/provider/model). Rows without a vector are skipped.
 *
 * @param db Open database handle.
 * @param rows Chunk rows, index-aligned with `embedding.vectors`.
 * @param embedding Result of embedTexts(): `{ provider, model, dimensions, vectors }`.
 * @param now ISO timestamp stored as created/updated/embedded time.
 * @returns Number of chunks actually written. Skipped rows are not counted,
 *   so the figure can be lower than `rows.length`.
 */
function upsertVectors(db, rows, embedding, now) {
  const insertEmbedding = db.prepare(`
    INSERT INTO chunk_embeddings (id, chunk_id, provider, model, dimensions, vector_json, created_at)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      created_at = excluded.created_at
  `);
  const insertVector = db.prepare(`
    INSERT INTO vector_index_entries (
      id, chunk_id, source_revision_id, provider, model, dimensions, vector_json, vector_norm,
      source_uri, source_ref, revision, hash, start_offset, end_offset, token_count, status,
      metadata_json, created_at, updated_at
    )
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(chunk_id, provider, model) DO UPDATE SET
      source_revision_id = excluded.source_revision_id,
      dimensions = excluded.dimensions,
      vector_json = excluded.vector_json,
      vector_norm = excluded.vector_norm,
      source_uri = excluded.source_uri,
      source_ref = excluded.source_ref,
      revision = excluded.revision,
      hash = excluded.hash,
      start_offset = excluded.start_offset,
      end_offset = excluded.end_offset,
      token_count = excluded.token_count,
      status = excluded.status,
      metadata_json = excluded.metadata_json,
      updated_at = excluded.updated_at
  `);
  let written = 0;
  const write = db.transaction(() => {
    for (let index = 0; index < rows.length; index += 1) {
      const row = rows[index];
      const vector = embedding.vectors[index];
      // The provider returned no vector for this chunk: leave it unindexed so
      // a later run can pick it up again.
      if (!vector)
        continue;
      const metadata = parseJsonObject(row.metadata_json);
      const provenance = provenanceForChunk(row);
      const sourceRef = provenance.source_ref ?? metadataString(metadata, ["source_ref"]);
      const sourceUri = provenance.source_uri ?? row.source_uri ?? metadataString(metadata, ["source_uri"]);
      const revision = provenance.revision ?? row.revision ?? metadataString(metadata, ["revision"]);
      const hash2 = provenance.hash ?? row.hash ?? metadataString(metadata, ["hash"]);
      const status = provenance.status ?? metadataString(metadata, ["status"]) ?? "active";
      // A provenance object stored in chunk metadata may lack offsets; fall
      // back to the chunk's own columns, like the other fields above do.
      const startOffset = provenance.start_offset ?? row.start_offset ?? null;
      const endOffset = provenance.end_offset ?? row.end_offset ?? null;
      const vectorJson = JSON.stringify(vector);
      // Same hash input for both ids; only the prefix differs.
      const idSeed = `${row.id}\x00${embedding.provider}\x00${embedding.model}`;
      insertEmbedding.run(stableId("emb", idSeed), row.id, embedding.provider, embedding.model, embedding.dimensions, vectorJson, now);
      insertVector.run(stableId("vec", idSeed), row.id, row.source_revision_id, embedding.provider, embedding.model, embedding.dimensions, vectorJson, vectorNorm(vector), sourceUri, sourceRef, revision, hash2, startOffset, endOffset, row.token_count, status, JSON.stringify({
        ...metadata,
        provenance,
        embedded_at: now
      }), now, now);
      written += 1;
    }
  });
  write();
  return written;
}
14833
/**
 * Embeds chunks that are not yet indexed for the selected model and stores
 * the resulting vectors.
 *
 * The candidate read and the write use separate database connections; no
 * connection is held open while the embedding call is in flight.
 *
 * @param options `{ dbPath, modelRef?, config?, limit?, sourceRevisionId?,
 *   dimensions?, now?, ... }`. `limit` defaults to 100 and is clamped to
 *   1..1000. The whole object is also forwarded to embedTexts().
 * @returns Summary with provider, model, dimensions, chunk and upsert counts,
 *   and token usage. All counts are 0 when nothing needed indexing.
 * @throws {Error} When the model ref is invalid or its provider is not
 *   `openai`; embedding and database errors propagate.
 */
async function indexKnowledgeEmbeddings(options) {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  if (provider !== "openai") {
    throw new Error(`Embedding provider ${provider} is not supported yet.`);
  }
  const now = (options.now ?? new Date()).toISOString();
  const limit = Math.max(1, Math.min(options.limit ?? 100, 1000));
  migrateKnowledgeDb(options.dbPath);

  // Phase 1: collect the chunks that still need a vector.
  let pending;
  const readDb = openKnowledgeDb(options.dbPath);
  try {
    pending = selectCandidateChunks(readDb, {
      provider,
      model,
      limit,
      sourceRevisionId: options.sourceRevisionId
    });
  } finally {
    readDb.close();
  }

  if (pending.length === 0) {
    return {
      provider,
      model,
      dimensions: options.dimensions ?? embeddingConfig(options.config).dimensions ?? DEFAULT_EMBEDDING_DIMENSIONS,
      chunks_seen: 0,
      chunks_embedded: 0,
      embeddings_upserted: 0,
      vector_entries_upserted: 0,
      usage: { input_tokens: 0 }
    };
  }

  // Phase 2: embed, then persist on a fresh connection.
  const chunkTexts = pending.map((chunk) => chunk.text);
  const embedding = await embedTexts(chunkTexts, options);
  const writeDb = openKnowledgeDb(options.dbPath);
  try {
    const upserted = upsertVectors(writeDb, pending, embedding, now);
    return {
      provider: embedding.provider,
      model: embedding.model,
      dimensions: embedding.dimensions,
      chunks_seen: pending.length,
      chunks_embedded: pending.length,
      embeddings_upserted: upserted,
      vector_entries_upserted: upserted,
      usage: embedding.usage
    };
  } finally {
    writeDb.close();
  }
}
14883
/**
 * Summarizes what the embedding index currently holds.
 *
 * @param dbPath Path of the knowledge database (migrated first if needed).
 * @returns `{ total_embeddings, total_vector_entries, indexes }`, where
 *   `indexes` has one row per provider/model/dimensions combination with its
 *   entry count and latest `updated_at`.
 */
function embeddingIndexStatus(dbPath) {
  migrateKnowledgeDb(dbPath);
  const db = openKnowledgeDb(dbPath);
  try {
    const countRows = (table) => db.query(`SELECT COUNT(*) AS n FROM ${table}`).get()?.n ?? 0;
    const embeddingCount = countRows("chunk_embeddings");
    const vectorEntryCount = countRows("vector_index_entries");
    const perModel = db.query(`SELECT provider, model, dimensions, COUNT(*) AS entries, MAX(updated_at) AS updated_at
FROM vector_index_entries
GROUP BY provider, model, dimensions
ORDER BY provider, model`).all();
    return {
      total_embeddings: embeddingCount,
      total_vector_entries: vectorEntryCount,
      indexes: perModel
    };
  } finally {
    db.close();
  }
}
14902
/**
 * Runs a brute-force cosine-similarity search over the active vector-index
 * entries of one provider/model.
 *
 * The query text is embedded first, then every active entry for the model is
 * loaded and scored in memory. Entries whose stored vector cannot be parsed,
 * or whose length differs from the query vector, are skipped and counted
 * rather than aborting the search or being ranked on a score that is not a
 * real cosine similarity.
 *
 * @param options `{ dbPath, query, modelRef?, config?, limit?, ... }`.
 *   `limit` defaults to 10 and is clamped to 1..100. The whole object is also
 *   forwarded to embedTexts().
 * @returns `{ provider, model, dimensions, query, results, skipped_entries }`,
 *   with `results` sorted by descending score and cut to `limit`.
 * @throws {Error} When the model ref is invalid or unsupported; embedding and
 *   database errors propagate.
 */
async function searchVectorIndex(options) {
  const modelRef = resolveEmbeddingModelRef(options.modelRef, options.config);
  const parsed = parseModelRef(modelRef);
  const limit = Math.max(1, Math.min(options.limit ?? 10, 100));
  const embedded = await embedTexts([options.query], options);
  const queryVector = embedded.vectors[0] ?? [];
  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    const rows = db.query(`SELECT
        v.chunk_id,
        c.text,
        v.vector_json,
        v.vector_norm,
        v.source_uri,
        v.source_ref,
        v.revision,
        v.hash,
        v.metadata_json
      FROM vector_index_entries v
      JOIN chunks c ON c.id = v.chunk_id
      WHERE v.provider = ? AND v.model = ? AND v.status = 'active'`).all(parsed.provider, parsed.model);
    const scored = [];
    let skippedEntries = 0;
    for (const row of rows) {
      let vector;
      try {
        vector = JSON.parse(row.vector_json);
      } catch {
        // One corrupt row must not take the whole search down.
        skippedEntries += 1;
        continue;
      }
      // Vectors of a different dimensionality (e.g. indexed with another
      // `dimensions` setting) are not comparable with the query vector.
      if (!Array.isArray(vector) || vector.length !== queryVector.length) {
        skippedEntries += 1;
        continue;
      }
      const metadata = parseJsonObject(row.metadata_json);
      const provenance = metadata.provenance && typeof metadata.provenance === "object" && !Array.isArray(metadata.provenance) ? metadata.provenance : null;
      scored.push({
        chunk_id: row.chunk_id,
        score: cosineSimilarity(queryVector, vector, row.vector_norm),
        text: row.text,
        source_uri: row.source_uri,
        source_ref: row.source_ref,
        revision: row.revision,
        hash: row.hash,
        provenance
      });
    }
    scored.sort((a, b) => b.score - a.score);
    return {
      provider: parsed.provider,
      model: parsed.model,
      dimensions: embedded.dimensions,
      query: options.query,
      results: scored.slice(0, limit),
      skipped_entries: skippedEntries
    };
  } finally {
    db.close();
  }
}
14950
+
14951
+ // src/outbox-consume.ts
14952
+ import { createHash as createHash3, randomUUID as randomUUID3 } from "crypto";
14953
+ import { existsSync as existsSync4, readFileSync as readFileSync4 } from "fs";
14954
+ import { basename } from "path";
14955
+
14402
14956
  // src/safety.ts
14403
- import { createHash, randomUUID as randomUUID2 } from "crypto";
14957
+ import { createHash as createHash2, randomUUID as randomUUID2 } from "crypto";
14404
14958
  import { relative as relative2, resolve as resolve2, sep as sep2 } from "path";
14405
14959
  function envEnabled(name) {
14406
14960
  const value = process.env[name];
@@ -14495,7 +15049,7 @@ function redactSecrets(text, policy) {
14495
15049
  return { text: output, findings };
14496
15050
  }
14497
15051
  function auditId(input) {
14498
- return `audit_${createHash("sha256").update(`${input.event_type}\x00${input.action}\x00${input.target_uri ?? ""}\x00${input.created_at ?? ""}\x00${JSON.stringify(input.metadata ?? {})}\x00${randomUUID2()}`).digest("hex").slice(0, 24)}`;
15052
+ return `audit_${createHash2("sha256").update(`${input.event_type}\x00${input.action}\x00${input.target_uri ?? ""}\x00${input.created_at ?? ""}\x00${JSON.stringify(input.metadata ?? {})}\x00${randomUUID2()}`).digest("hex").slice(0, 24)}`;
14499
15053
  }
14500
15054
  function recordAuditEvent(db, input) {
14501
15055
  const createdAt = input.created_at ?? new Date().toISOString();
@@ -14530,8 +15084,8 @@ function recordRedactionFindings(db, input) {
14530
15084
  }
14531
15085
 
14532
15086
  // src/outbox-consume.ts
14533
- function stableId(prefix, value) {
14534
- return `${prefix}_${createHash2("sha256").update(value).digest("hex").slice(0, 20)}`;
15087
+ function stableId2(prefix, value) {
15088
+ return `${prefix}_${createHash3("sha256").update(value).digest("hex").slice(0, 20)}`;
14535
15089
  }
14536
15090
  function asObject(value) {
14537
15091
  return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
@@ -14685,7 +15239,7 @@ function mergeJson(existing, patch) {
14685
15239
  return JSON.stringify({ ...base, ...patch });
14686
15240
  }
14687
15241
  function ensureSource(db, event, now) {
14688
- const id = stableId("src", event.sourceUri);
15242
+ const id = stableId2("src", event.sourceUri);
14689
15243
  db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
14690
15244
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
14691
15245
  ON CONFLICT(uri) DO UPDATE SET
@@ -14726,7 +15280,7 @@ function ensureSource(db, event, now) {
14726
15280
  function ensureRevision(db, sourceId, event, now) {
14727
15281
  if (!event.revision)
14728
15282
  return null;
14729
- const id = stableId("rev", `${sourceId}\x00${event.revision}`);
15283
+ const id = stableId2("rev", `${sourceId}\x00${event.revision}`);
14730
15284
  const metadata = {
14731
15285
  source_ref: event.sourceRef,
14732
15286
  source_uri: event.sourceUri,
@@ -14754,16 +15308,20 @@ function revisionIdsForEvent(db, sourceId, event) {
14754
15308
  function invalidateRevision(db, revisionId) {
14755
15309
  const chunks = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(revisionId);
14756
15310
  let embeddingsDeleted = 0;
15311
+ let vectorEntriesDeleted = 0;
14757
15312
  for (const chunk of chunks) {
14758
15313
  const row = db.query("SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?").get(chunk.id);
14759
15314
  embeddingsDeleted += row?.n ?? 0;
15315
+ const vectorRow = db.query("SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?").get(chunk.id);
15316
+ vectorEntriesDeleted += vectorRow?.n ?? 0;
15317
+ db.run("DELETE FROM vector_index_entries WHERE chunk_id = ?", [chunk.id]);
14760
15318
  db.run("DELETE FROM chunk_embeddings WHERE chunk_id = ?", [chunk.id]);
14761
15319
  db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [chunk.id]);
14762
15320
  }
14763
15321
  db.run("DELETE FROM chunks WHERE source_revision_id = ?", [revisionId]);
14764
15322
  const revision = db.query("SELECT metadata_json FROM source_revisions WHERE id = ?").get(revisionId);
14765
15323
  db.run("UPDATE source_revisions SET metadata_json = ? WHERE id = ?", [mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId]);
14766
- return { chunksDeleted: chunks.length, embeddingsDeleted };
15324
+ return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
14767
15325
  }
14768
15326
  function isDeleteEvent(eventType2, status) {
14769
15327
  return status === "deleted" || ["delete", "deleted", "remove", "removed"].includes(eventType2);
@@ -14801,6 +15359,7 @@ async function consumeOpenFilesOutbox(options) {
14801
15359
  const revisionsTouched = new Set;
14802
15360
  let chunksDeleted = 0;
14803
15361
  let embeddingsDeleted = 0;
15362
+ let vectorEntriesDeleted = 0;
14804
15363
  let staleRevisions = 0;
14805
15364
  let deletedSources = 0;
14806
15365
  let movedSources = 0;
@@ -14826,6 +15385,7 @@ async function consumeOpenFilesOutbox(options) {
14826
15385
  const invalidation = invalidateRevision(db, revisionId);
14827
15386
  chunksDeleted += invalidation.chunksDeleted;
14828
15387
  embeddingsDeleted += invalidation.embeddingsDeleted;
15388
+ vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
14829
15389
  staleRevisions += 1;
14830
15390
  }
14831
15391
  if (isDeleteEvent(event.eventType, event.status))
@@ -14836,7 +15396,7 @@ async function consumeOpenFilesOutbox(options) {
14836
15396
  permissionUpdates += 1;
14837
15397
  db.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
14838
15398
  VALUES (?, ?, ?, ?, ?, ?)`, [
14839
- stableId("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
15399
+ stableId2("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
14840
15400
  runId,
14841
15401
  "info",
14842
15402
  event.eventType,
@@ -14853,7 +15413,7 @@ async function consumeOpenFilesOutbox(options) {
14853
15413
  });
14854
15414
  db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
14855
15415
  VALUES (?, ?, ?, ?, 0, 0, 0, ?, ?)`, [
14856
- stableId("usage", runId),
15416
+ stableId2("usage", runId),
14857
15417
  runId,
14858
15418
  "local",
14859
15419
  "open-files-outbox",
@@ -14871,7 +15431,8 @@ async function consumeOpenFilesOutbox(options) {
14871
15431
  sources: sourcesTouched.size,
14872
15432
  revisions: revisionsTouched.size,
14873
15433
  chunks_deleted: chunksDeleted,
14874
- embeddings_deleted: embeddingsDeleted
15434
+ embeddings_deleted: embeddingsDeleted,
15435
+ vector_entries_deleted: vectorEntriesDeleted
14875
15436
  },
14876
15437
  created_at: now
14877
15438
  });
@@ -14884,6 +15445,7 @@ async function consumeOpenFilesOutbox(options) {
14884
15445
  revisions_touched: revisionsTouched.size,
14885
15446
  chunks_deleted: chunksDeleted,
14886
15447
  embeddings_deleted: embeddingsDeleted,
15448
+ vector_entries_deleted: vectorEntriesDeleted,
14887
15449
  stale_revisions: staleRevisions,
14888
15450
  deleted_sources: deletedSources,
14889
15451
  moved_sources: movedSources,
@@ -14896,11 +15458,11 @@ async function consumeOpenFilesOutbox(options) {
14896
15458
  }
14897
15459
 
14898
15460
  // src/manifest-ingest.ts
14899
- import { createHash as createHash3 } from "crypto";
15461
+ import { createHash as createHash4 } from "crypto";
14900
15462
  import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
14901
15463
  import { basename as basename2 } from "path";
14902
- function stableId2(prefix, value) {
14903
- return `${prefix}_${createHash3("sha256").update(value).digest("hex").slice(0, 20)}`;
15464
+ function stableId3(prefix, value) {
15465
+ return `${prefix}_${createHash4("sha256").update(value).digest("hex").slice(0, 20)}`;
14904
15466
  }
14905
15467
  function asObject2(value) {
14906
15468
  return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
@@ -15120,7 +15682,7 @@ function deleteChunksForRevision(db, sourceRevisionId) {
15120
15682
  return rows.length;
15121
15683
  }
15122
15684
  function upsertSource(db, item, now) {
15123
- const sourceId = stableId2("src", item.sourceUri);
15685
+ const sourceId = stableId3("src", item.sourceUri);
15124
15686
  db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
15125
15687
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
15126
15688
  ON CONFLICT(uri) DO UPDATE SET
@@ -15144,7 +15706,7 @@ function upsertSource(db, item, now) {
15144
15706
  return row.id;
15145
15707
  }
15146
15708
  function upsertRevision(db, sourceId, item, now) {
15147
- const revisionId = stableId2("rev", `${sourceId}\x00${item.revision}`);
15709
+ const revisionId = stableId3("rev", `${sourceId}\x00${item.revision}`);
15148
15710
  db.run(`INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
15149
15711
  VALUES (?, ?, ?, ?, ?, ?, ?)
15150
15712
  ON CONFLICT(source_id, revision) DO UPDATE SET
@@ -15186,16 +15748,32 @@ function insertChunks(db, sourceRevisionId, item, now, maxChars, overlapChars, s
15186
15748
  }
15187
15749
  const chunks = chunkText(redacted.text, maxChars, overlapChars);
15188
15750
  for (const chunk of chunks) {
15189
- const chunkId = stableId2("chk", `${sourceRevisionId}\x00${chunk.ordinal}\x00${chunk.text}`);
15190
- const metadata = {
15751
+ const chunkId = stableId3("chk", `${sourceRevisionId}\x00${chunk.ordinal}\x00${chunk.text}`);
15752
+ const provenance = sourceProvenance({
15753
+ source_ref: item.sourceRef,
15754
+ source_uri: item.sourceUri,
15755
+ source_kind: item.kind,
15756
+ source_revision_id: sourceRevisionId,
15757
+ revision: item.revision,
15758
+ hash: item.hash,
15759
+ chunk_id: chunkId,
15760
+ start_offset: chunk.startOffset,
15761
+ end_offset: chunk.endOffset,
15762
+ status: item.status,
15763
+ resolver: "open-files-read-only"
15764
+ });
15765
+ const metadata = withProvenance({
15191
15766
  source_ref: item.sourceRef,
15192
15767
  source_uri: item.sourceUri,
15768
+ source_kind: item.kind,
15769
+ source_revision_id: sourceRevisionId,
15770
+ revision: item.revision,
15193
15771
  hash: item.hash,
15194
15772
  status: item.status,
15195
15773
  path: asString2(item.raw.path) ?? null,
15196
15774
  mime: asString2(item.raw.mime) ?? asString2(item.raw.content_type) ?? null,
15197
15775
  size: asNumber(item.raw.size) ?? null
15198
- };
15776
+ }, provenance);
15199
15777
  db.run(`INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
15200
15778
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
15201
15779
  chunkId,
@@ -15298,12 +15876,12 @@ async function ingestOpenFilesManifestItems(options) {
15298
15876
  }
15299
15877
 
15300
15878
  // src/source-ingest.ts
15301
- import { createHash as createHash4 } from "crypto";
15879
+ import { createHash as createHash5 } from "crypto";
15302
15880
  import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
15303
15881
  import { basename as basename3 } from "path";
15304
15882
 
15305
15883
  // src/source-resolver.ts
15306
- function parseJsonObject(value) {
15884
+ function parseJsonObject2(value) {
15307
15885
  if (!value)
15308
15886
  return {};
15309
15887
  try {
@@ -15313,7 +15891,7 @@ function parseJsonObject(value) {
15313
15891
  return {};
15314
15892
  }
15315
15893
  }
15316
- function metadataString(metadata, keys) {
15894
+ function metadataString2(metadata, keys) {
15317
15895
  for (const key of keys) {
15318
15896
  const value = metadata[key];
15319
15897
  if (typeof value === "string" && value.length > 0)
@@ -15321,7 +15899,7 @@ function metadataString(metadata, keys) {
15321
15899
  }
15322
15900
  return null;
15323
15901
  }
15324
- function metadataNumber(metadata, keys) {
15902
+ function metadataNumber2(metadata, keys) {
15325
15903
  for (const key of keys) {
15326
15904
  const value = metadata[key];
15327
15905
  if (typeof value === "number" && Number.isFinite(value))
@@ -15446,8 +16024,8 @@ async function resolveOpenFilesSource(options) {
15446
16024
  citations: []
15447
16025
  };
15448
16026
  }
15449
- const sourceMetadata = parseJsonObject(source.metadata_json);
15450
- const permissions = parseJsonObject(source.acl_json);
16027
+ const sourceMetadata = parseJsonObject2(source.metadata_json);
16028
+ const permissions = parseJsonObject2(source.acl_json);
15451
16029
  try {
15452
16030
  assertPurposeAllowed(permissions, purpose);
15453
16031
  } catch (error48) {
@@ -15467,27 +16045,40 @@ async function resolveOpenFilesSource(options) {
15467
16045
  throw error48;
15468
16046
  }
15469
16047
  const revision = selectRevision(db, source.id, requestedRevision);
15470
- const revisionMetadata = parseJsonObject(revision?.metadata_json);
16048
+ const revisionMetadata = parseJsonObject2(revision?.metadata_json);
15471
16049
  const totalChunks = countChunks(db, revision?.id ?? null);
15472
16050
  const rows = selectChunks(db, revision?.id ?? null, limit);
15473
16051
  const effectiveSourceRef = sourceRevisionRef(source.uri, revision, options.sourceRef);
15474
16052
  const chunks = rows.map((row) => {
15475
- const metadata = parseJsonObject(row.metadata_json);
16053
+ const metadata = parseJsonObject2(row.metadata_json);
15476
16054
  const evidence = {
15477
16055
  resolver: "open-files-read-only",
15478
16056
  mode: "local_catalog",
15479
16057
  purpose,
15480
16058
  read_only: true,
15481
- source_ref: metadataString(metadata, ["source_ref"]) ?? effectiveSourceRef,
16059
+ source_ref: metadataString2(metadata, ["source_ref"]) ?? effectiveSourceRef,
15482
16060
  source_uri: source.uri,
15483
16061
  source_revision_id: revision?.id ?? null,
15484
16062
  revision: revision?.revision ?? null,
15485
- hash: revision?.hash ?? metadataString(metadata, ["hash"]),
16063
+ hash: revision?.hash ?? metadataString2(metadata, ["hash"]),
15486
16064
  chunk_id: row.id,
15487
16065
  start_offset: row.start_offset,
15488
16066
  end_offset: row.end_offset,
15489
16067
  resolved_at: resolvedAt
15490
16068
  };
16069
+ const provenance = sourceProvenance({
16070
+ source_ref: evidence.source_ref,
16071
+ source_uri: evidence.source_uri,
16072
+ source_kind: source.kind,
16073
+ source_revision_id: evidence.source_revision_id,
16074
+ revision: evidence.revision,
16075
+ hash: evidence.hash,
16076
+ chunk_id: row.id,
16077
+ start_offset: row.start_offset,
16078
+ end_offset: row.end_offset,
16079
+ status: metadataString2(metadata, ["status"]),
16080
+ resolver: evidence.resolver
16081
+ });
15491
16082
  return {
15492
16083
  id: row.id,
15493
16084
  kind: row.kind,
@@ -15497,7 +16088,8 @@ async function resolveOpenFilesSource(options) {
15497
16088
  start_offset: row.start_offset,
15498
16089
  end_offset: row.end_offset,
15499
16090
  metadata,
15500
- evidence
16091
+ evidence,
16092
+ provenance
15501
16093
  };
15502
16094
  });
15503
16095
  const citations = chunks.map((chunk) => ({
@@ -15507,7 +16099,8 @@ async function resolveOpenFilesSource(options) {
15507
16099
  quote: chunk.text.slice(0, 500),
15508
16100
  start_offset: chunk.start_offset,
15509
16101
  end_offset: chunk.end_offset,
15510
- evidence: chunk.evidence
16102
+ evidence: chunk.evidence,
16103
+ provenance: chunk.provenance
15511
16104
  }));
15512
16105
  recordAuditEvent(db, {
15513
16106
  event_type: "source_read",
@@ -15524,8 +16117,8 @@ async function resolveOpenFilesSource(options) {
15524
16117
  },
15525
16118
  created_at: resolvedAt
15526
16119
  });
15527
- const mime = metadataString(sourceMetadata, ["mime", "content_type"]) ?? metadataString(revisionMetadata, ["mime", "content_type"]);
15528
- const size = metadataNumber(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber(revisionMetadata, ["size", "size_bytes"]);
16120
+ const mime = metadataString2(sourceMetadata, ["mime", "content_type"]) ?? metadataString2(revisionMetadata, ["mime", "content_type"]);
16121
+ const size = metadataNumber2(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber2(revisionMetadata, ["size", "size_bytes"]);
15529
16122
  return {
15530
16123
  source_ref: effectiveSourceRef,
15531
16124
  source_uri: source.uri,
@@ -15558,12 +16151,12 @@ async function resolveOpenFilesSource(options) {
15558
16151
  content: {
15559
16152
  mime,
15560
16153
  size,
15561
- hash: revision?.hash ?? metadataString(sourceMetadata, ["hash", "checksum", "sha256"]),
16154
+ hash: revision?.hash ?? metadataString2(sourceMetadata, ["hash", "checksum", "sha256"]),
15562
16155
  text_available: totalChunks > 0,
15563
16156
  chunks_total: totalChunks,
15564
16157
  chunks_returned: chunks.length,
15565
16158
  char_count_returned: chunks.reduce((sum, chunk) => sum + chunk.text.length, 0),
15566
- extracted_text_ref: revision?.extracted_text_uri ?? metadataString(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
16159
+ extracted_text_ref: revision?.extracted_text_uri ?? metadataString2(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
15567
16160
  bytes_available: false,
15568
16161
  bytes_exposed: false
15569
16162
  },
@@ -15578,7 +16171,7 @@ async function resolveOpenFilesSource(options) {
15578
16171
 
15579
16172
  // src/source-ingest.ts
15580
16173
  function sha256Text(text) {
15581
- return `sha256:${createHash4("sha256").update(text).digest("hex")}`;
16174
+ return `sha256:${createHash5("sha256").update(text).digest("hex")}`;
15582
16175
  }
15583
16176
  function stripHtml(html) {
15584
16177
  return html.replace(/<script[\s\S]*?<\/script>/gi, " ").replace(/<style[\s\S]*?<\/style>/gi, " ").replace(/<[^>]+>/g, " ").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/\s+\n/g, `
@@ -15800,131 +16393,8 @@ async function ingestSourceRef(options) {
15800
16393
  };
15801
16394
  }
15802
16395
 
15803
- // src/providers.ts
15804
- var DEFAULT_PROVIDER_SETTINGS = {
15805
- openai: {
15806
- api_key_env: "OPENAI_API_KEY",
15807
- default_model: "gpt-5.2"
15808
- },
15809
- anthropic: {
15810
- api_key_env: "ANTHROPIC_API_KEY",
15811
- default_model: "claude-sonnet-4-6"
15812
- },
15813
- deepseek: {
15814
- api_key_env: "DEEPSEEK_API_KEY",
15815
- default_model: "deepseek-chat"
15816
- }
15817
- };
15818
- var PROVIDER_CAPABILITIES = {
15819
- openai: {
15820
- text_generation: true,
15821
- structured_output: true,
15822
- tool_usage: true,
15823
- tool_streaming: true,
15824
- image_input: true,
15825
- native_web_search: true,
15826
- reasoning: true,
15827
- embeddings: true
15828
- },
15829
- anthropic: {
15830
- text_generation: true,
15831
- structured_output: true,
15832
- tool_usage: true,
15833
- tool_streaming: true,
15834
- image_input: true,
15835
- native_web_search: false,
15836
- reasoning: true,
15837
- embeddings: false
15838
- },
15839
- deepseek: {
15840
- text_generation: true,
15841
- structured_output: true,
15842
- tool_usage: true,
15843
- tool_streaming: true,
15844
- image_input: false,
15845
- native_web_search: false,
15846
- reasoning: true,
15847
- embeddings: false
15848
- }
15849
- };
15850
- var BUILTIN_ALIASES = {
15851
- default: "openai:gpt-5.2",
15852
- fast: "openai:gpt-5-mini",
15853
- reasoning: "anthropic:claude-opus-4-6",
15854
- sonnet: "anthropic:claude-sonnet-4-6",
15855
- deepseek: "deepseek:deepseek-chat",
15856
- "deepseek-reasoning": "deepseek:deepseek-reasoner"
15857
- };
15858
- function providerConfig(config2) {
15859
- return config2.providers ?? {};
15860
- }
15861
- function providerSettings(config2, provider) {
15862
- const configured = providerConfig(config2)[provider] ?? {};
15863
- return {
15864
- ...DEFAULT_PROVIDER_SETTINGS[provider],
15865
- ...configured
15866
- };
15867
- }
15868
- function modelAliases(config2) {
15869
- const configured = providerConfig(config2);
15870
- return {
15871
- ...BUILTIN_ALIASES,
15872
- ...configured.default_model ? { default: configured.default_model } : {},
15873
- ...configured.aliases ?? {}
15874
- };
15875
- }
15876
- function parseModelRef(modelRef) {
15877
- const [provider, ...rest] = modelRef.split(":");
15878
- const model = rest.join(":");
15879
- if (provider !== "openai" && provider !== "anthropic" && provider !== "deepseek") {
15880
- throw new Error(`Unsupported AI provider: ${provider}`);
15881
- }
15882
- if (!model)
15883
- throw new Error(`Invalid model ref: ${modelRef}. Expected provider:model.`);
15884
- return { provider, model };
15885
- }
15886
- function resolveModelRef(aliasOrRef, config2) {
15887
- const aliases = modelAliases(config2);
15888
- return aliases[aliasOrRef] ?? aliasOrRef;
15889
- }
15890
- function listModelRegistry(config2) {
15891
- const aliases = modelAliases(config2);
15892
- return Object.entries(aliases).map(([alias, modelRef]) => {
15893
- const parsed = parseModelRef(modelRef);
15894
- return {
15895
- alias,
15896
- model_ref: modelRef,
15897
- provider: parsed.provider,
15898
- model: parsed.model,
15899
- default: alias === "default",
15900
- capabilities: PROVIDER_CAPABILITIES[parsed.provider]
15901
- };
15902
- });
15903
- }
15904
- function providerCredentialStatus(config2, env = process.env) {
15905
- return Object.keys(DEFAULT_PROVIDER_SETTINGS).map((provider) => {
15906
- const settings = providerSettings(config2, provider);
15907
- const configured = Boolean(env[settings.api_key_env]);
15908
- return {
15909
- provider,
15910
- api_key_env: settings.api_key_env,
15911
- configured,
15912
- source: configured ? "env" : "missing",
15913
- base_url: settings.base_url ?? null,
15914
- default_model: settings.default_model
15915
- };
15916
- });
15917
- }
15918
- function providerStatus(config2, env = process.env) {
15919
- return {
15920
- default_model: resolveModelRef("default", config2),
15921
- providers: providerCredentialStatus(config2, env),
15922
- models: listModelRegistry(config2)
15923
- };
15924
- }
15925
-
15926
16396
  // src/storage-contract.ts
15927
- import { createHash as createHash5, randomUUID as randomUUID4 } from "crypto";
16397
+ import { createHash as createHash6, randomUUID as randomUUID4 } from "crypto";
15928
16398
  var GENERATED_ARTIFACTS = [
15929
16399
  {
15930
16400
  kind: "schema",
@@ -15960,7 +16430,7 @@ var GENERATED_ARTIFACTS = [
15960
16430
  function hashArtifactBody(body) {
15961
16431
  const bytes = typeof body === "string" ? Buffer.from(body) : Buffer.from(body);
15962
16432
  return {
15963
- hash: `sha256:${createHash5("sha256").update(bytes).digest("hex")}`,
16433
+ hash: `sha256:${createHash6("sha256").update(bytes).digest("hex")}`,
15964
16434
  size_bytes: bytes.byteLength
15965
16435
  };
15966
16436
  }
@@ -16095,12 +16565,16 @@ function recordStorageObjects(db, objects, now = new Date) {
16095
16565
  }
16096
16566
 
16097
16567
  // src/wiki-layout.ts
16568
+ import { createHash as createHash7 } from "crypto";
16098
16569
  function todayParts(now) {
16099
16570
  const year = String(now.getUTCFullYear());
16100
16571
  const month = String(now.getUTCMonth() + 1).padStart(2, "0");
16101
16572
  const day = String(now.getUTCDate()).padStart(2, "0");
16102
16573
  return { year, month, day };
16103
16574
  }
16575
+ function stableId4(prefix, value) {
16576
+ return `${prefix}_${createHash7("sha256").update(value).digest("hex").slice(0, 20)}`;
16577
+ }
16104
16578
  function agentSchemaTemplate() {
16105
16579
  return `# Knowledge Agent Schema v1
16106
16580
 
@@ -16182,6 +16656,13 @@ async function initializeWikiLayout(store, now = new Date) {
16182
16656
  uri: result.uri,
16183
16657
  kind: artifactKindForKey(entry.key),
16184
16658
  content_type: entry.content_type,
16659
+ metadata: {
16660
+ provenance: generatedArtifactProvenance({
16661
+ generated_from: "wiki_layout_init",
16662
+ artifact_key: entry.key,
16663
+ citation_required: entry.key.startsWith("wiki/") || entry.key.startsWith("indexes/")
16664
+ })
16665
+ },
16185
16666
  ...hashArtifactBody(entry.body)
16186
16667
  };
16187
16668
  }));
@@ -16194,6 +16675,66 @@ async function initializeWikiLayout(store, now = new Date) {
16194
16675
  written: [schemaKey, rootIndexKey, wikiReadmeKey, logKey]
16195
16676
  };
16196
16677
  }
16678
+ function provenanceFor(artifact) {
16679
+ const existing = artifact.metadata?.provenance;
16680
+ if (existing && typeof existing === "object" && !Array.isArray(existing)) {
16681
+ return existing;
16682
+ }
16683
+ return generatedArtifactProvenance({
16684
+ generated_from: "wiki_layout_init",
16685
+ artifact_key: artifact.key
16686
+ });
16687
+ }
16688
+ function recordWikiLayoutCatalog(db, artifacts, now = new Date) {
16689
+ const timestamp = now.toISOString();
16690
+ const rootIndex = artifacts.find((artifact) => artifact.key.endsWith("indexes/root.md"));
16691
+ const wikiReadme = artifacts.find((artifact) => artifact.key.endsWith("wiki/README.md"));
16692
+ if (rootIndex) {
16693
+ db.run(`INSERT INTO knowledge_indexes (id, kind, name, artifact_uri, shard_key, metadata_json, created_at, updated_at)
16694
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
16695
+ ON CONFLICT(kind, name, shard_key) DO UPDATE SET
16696
+ artifact_uri = excluded.artifact_uri,
16697
+ metadata_json = excluded.metadata_json,
16698
+ updated_at = excluded.updated_at`, [
16699
+ stableId4("idx", "root:indexes/root.md"),
16700
+ "root",
16701
+ "root",
16702
+ rootIndex.uri,
16703
+ "root",
16704
+ JSON.stringify({
16705
+ artifact_key: rootIndex.key,
16706
+ content_hash: rootIndex.hash ?? null,
16707
+ provenance: provenanceFor(rootIndex)
16708
+ }),
16709
+ timestamp,
16710
+ timestamp
16711
+ ]);
16712
+ }
16713
+ if (wikiReadme) {
16714
+ db.run(`INSERT INTO wiki_pages (id, path, title, artifact_uri, content_hash, status, metadata_json, created_at, updated_at)
16715
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
16716
+ ON CONFLICT(path) DO UPDATE SET
16717
+ title = excluded.title,
16718
+ artifact_uri = excluded.artifact_uri,
16719
+ content_hash = excluded.content_hash,
16720
+ status = excluded.status,
16721
+ metadata_json = excluded.metadata_json,
16722
+ updated_at = excluded.updated_at`, [
16723
+ stableId4("wiki", "wiki/README.md"),
16724
+ "wiki/README.md",
16725
+ "Wiki",
16726
+ wikiReadme.uri,
16727
+ wikiReadme.hash ?? null,
16728
+ "active",
16729
+ JSON.stringify({
16730
+ artifact_key: wikiReadme.key,
16731
+ provenance: provenanceFor(wikiReadme)
16732
+ }),
16733
+ timestamp,
16734
+ timestamp
16735
+ ]);
16736
+ }
16737
+ }
16197
16738
 
16198
16739
  // src/service.ts
16199
16740
  class KnowledgeService {
@@ -16270,6 +16811,7 @@ class KnowledgeService {
16270
16811
  const db = openKnowledgeDb(workspace.knowledgeDbPath);
16271
16812
  try {
16272
16813
  recordStorageObjects(db, result.artifacts);
16814
+ recordWikiLayoutCatalog(db, result.artifacts);
16273
16815
  } finally {
16274
16816
  db.close();
16275
16817
  }
@@ -16319,6 +16861,26 @@ class KnowledgeService {
16319
16861
  modelRegistry() {
16320
16862
  return listModelRegistry(this.config());
16321
16863
  }
16864
+ embeddingStatus() {
16865
+ const workspace = this.ensureWorkspace();
16866
+ return embeddingIndexStatus(workspace.knowledgeDbPath);
16867
+ }
16868
+ async indexEmbeddings(options = {}) {
16869
+ const workspace = this.ensureWorkspace();
16870
+ return indexKnowledgeEmbeddings({
16871
+ ...options,
16872
+ dbPath: workspace.knowledgeDbPath,
16873
+ config: this.config()
16874
+ });
16875
+ }
16876
+ async semanticSearch(options) {
16877
+ const workspace = this.ensureWorkspace();
16878
+ return searchVectorIndex({
16879
+ ...options,
16880
+ dbPath: workspace.knowledgeDbPath,
16881
+ config: this.config()
16882
+ });
16883
+ }
16322
16884
  }
16323
16885
  function createKnowledgeService(options = {}) {
16324
16886
  return new KnowledgeService(options);
@@ -16433,6 +16995,41 @@ function buildServer() {
16433
16995
  const service = createKnowledgeService({ scope });
16434
16996
  return jsonText({ ok: true, models: service.modelRegistry() });
16435
16997
  });
16998
+ registerTool(server, "ok_embeddings_status", "Embedding index status", "Inspect local embedding/vector index counts by provider and model", {
16999
+ scope: scopeField
17000
+ }, async ({ scope }) => {
17001
+ const service = createKnowledgeService({ scope });
17002
+ return jsonText({ ok: true, ...service.embeddingStatus() });
17003
+ });
17004
+ registerTool(server, "ok_embeddings_index", "Index embeddings", "Embed unindexed knowledge chunks into the local vector index", {
17005
+ scope: scopeField,
17006
+ limit: exports_external.number().optional().describe("Maximum chunks to embed"),
17007
+ model: exports_external.string().optional().describe("Embedding model ref, default openai:text-embedding-3-small"),
17008
+ dimensions: exports_external.number().optional().describe("Embedding dimensions for deterministic fake mode"),
17009
+ fake: exports_external.boolean().optional().describe("Use deterministic fake embeddings for local tests")
17010
+ }, async ({ scope, limit, model, dimensions, fake }) => {
17011
+ const service = createKnowledgeService({ scope });
17012
+ try {
17013
+ return jsonText({ ok: true, ...await service.indexEmbeddings({ limit, modelRef: model, dimensions, fake }) });
17014
+ } catch (error48) {
17015
+ return errorText(error48 instanceof Error ? error48.message : String(error48));
17016
+ }
17017
+ });
17018
+ registerTool(server, "ok_semantic_search", "Semantic search", "Search the local vector index and return cited chunks with provenance", {
17019
+ scope: scopeField,
17020
+ query: exports_external.string().describe("Semantic query"),
17021
+ limit: exports_external.number().optional().describe("Maximum results"),
17022
+ model: exports_external.string().optional().describe("Embedding model ref, default openai:text-embedding-3-small"),
17023
+ dimensions: exports_external.number().optional().describe("Embedding dimensions for deterministic fake mode"),
17024
+ fake: exports_external.boolean().optional().describe("Use deterministic fake embeddings for local tests")
17025
+ }, async ({ scope, query, limit, model, dimensions, fake }) => {
17026
+ const service = createKnowledgeService({ scope });
17027
+ try {
17028
+ return jsonText({ ok: true, ...await service.semanticSearch({ query, limit, modelRef: model, dimensions, fake }) });
17029
+ } catch (error48) {
17030
+ return errorText(error48 instanceof Error ? error48.message : String(error48));
17031
+ }
17032
+ });
16436
17033
  registerTool(server, "ok_add", "Add a knowledge item", "Add a new item to the knowledge store", {
16437
17034
  title: exports_external.string().describe("Item title"),
16438
17035
  content: exports_external.string().describe("Item content/body"),