@hasna/knowledge 0.2.16 → 0.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13660,10 +13660,11 @@ import { existsSync as existsSync7, readFileSync as readFileSync7, writeFileSync
13660
13660
  // package.json
13661
13661
  var package_default = {
13662
13662
  name: "@hasna/knowledge",
13663
- version: "0.2.16",
13663
+ version: "0.2.17",
13664
13664
  description: "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
13665
13665
  type: "module",
13666
13666
  bin: {
13667
+ knowledge: "bin/open-knowledge.js",
13667
13668
  "open-knowledge": "bin/open-knowledge.js",
13668
13669
  "open-knowledge-mcp": "bin/open-knowledge-mcp.js"
13669
13670
  },
@@ -14134,8 +14135,8 @@ function createArtifactStore(config2, workspace) {
14134
14135
  return new LocalArtifactStore(workspace.artifactsDir);
14135
14136
  }
14136
14137
 
14137
- // src/embeddings.ts
14138
- import { createHash } from "crypto";
14138
+ // src/agent.ts
14139
+ import { randomUUID as randomUUID3 } from "crypto";
14139
14140
 
14140
14141
  // src/knowledge-db.ts
14141
14142
  import { Database } from "bun:sqlite";
@@ -14441,6 +14442,7 @@ function getKnowledgeDbStats(path) {
14441
14442
  }
14442
14443
 
14443
14444
  // src/providers.ts
14445
+ import { randomUUID as randomUUID2 } from "crypto";
14444
14446
  var DEFAULT_PROVIDER_SETTINGS = {
14445
14447
  openai: {
14446
14448
  api_key_env: "OPENAI_API_KEY",
@@ -14496,7 +14498,7 @@ var BUILTIN_ALIASES = {
14496
14498
  "deepseek-reasoning": "deepseek:deepseek-reasoner"
14497
14499
  };
14498
14500
  function providerConfig(config2) {
14499
- return config2.providers ?? {};
14501
+ return config2?.providers ?? {};
14500
14502
  }
14501
14503
  function providerSettings(config2, provider) {
14502
14504
  const configured = providerConfig(config2)[provider] ?? {};
@@ -14570,6 +14572,80 @@ function assertProviderCredentials(provider, config2, env = process.env) {
14570
14572
  throw new Error(`Missing ${status.api_key_env} for ${provider}. Set the env var to use this provider.`);
14571
14573
  return status;
14572
14574
  }
14575
+ async function defaultFactory(provider) {
14576
+ if (provider === "openai") {
14577
+ const { createOpenAI } = await import("@ai-sdk/openai");
14578
+ return createOpenAI;
14579
+ }
14580
+ if (provider === "anthropic") {
14581
+ const { createAnthropic } = await import("@ai-sdk/anthropic");
14582
+ return createAnthropic;
14583
+ }
14584
+ const { createDeepSeek } = await import("@ai-sdk/deepseek");
14585
+ return createDeepSeek;
14586
+ }
14587
+ async function createAiSdkProviderRegistry(options = {}) {
14588
+ const { createProviderRegistry } = await import("ai");
14589
+ const env = options.env ?? process.env;
14590
+ const providers = {};
14591
+ for (const provider of Object.keys(DEFAULT_PROVIDER_SETTINGS)) {
14592
+ const settings = providerSettings(options.config, provider);
14593
+ const apiKey = env[settings.api_key_env];
14594
+ if (!apiKey)
14595
+ continue;
14596
+ const factory = options.factories?.[provider] ?? await defaultFactory(provider);
14597
+ providers[provider] = factory({ apiKey, baseURL: settings.base_url });
14598
+ }
14599
+ return createProviderRegistry(providers);
14600
+ }
14601
+ async function languageModelFor(aliasOrRef, options = {}) {
14602
+ const modelRef = resolveModelRef(aliasOrRef, options.config);
14603
+ const parsed = parseModelRef(modelRef);
14604
+ assertProviderCredentials(parsed.provider, options.config, options.env);
14605
+ const registry2 = await createAiSdkProviderRegistry(options);
14606
+ return registry2.languageModel(modelRef);
14607
+ }
14608
+ function usageNumber(usage, keys) {
14609
+ for (const key of keys) {
14610
+ const value = usage[key];
14611
+ if (typeof value === "number" && Number.isFinite(value))
14612
+ return value;
14613
+ }
14614
+ return 0;
14615
+ }
14616
+ function normalizeAiSdkUsage(input) {
14617
+ const usage = input.usage ?? {};
14618
+ return {
14619
+ provider: input.provider,
14620
+ model: input.model,
14621
+ input_tokens: usageNumber(usage, ["inputTokens", "promptTokens", "input_tokens", "prompt_tokens"]),
14622
+ output_tokens: usageNumber(usage, ["outputTokens", "completionTokens", "output_tokens", "completion_tokens"]),
14623
+ cost_usd: input.costUsd ?? 0,
14624
+ metadata: {
14625
+ usage,
14626
+ provider_metadata: input.providerMetadata ?? {}
14627
+ }
14628
+ };
14629
+ }
14630
+ function recordProviderUsage(db, input) {
14631
+ const id = `usage_${randomUUID2()}`;
14632
+ db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
14633
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
14634
+ id,
14635
+ input.run_id ?? null,
14636
+ input.provider,
14637
+ input.model,
14638
+ input.input_tokens,
14639
+ input.output_tokens,
14640
+ input.cost_usd,
14641
+ JSON.stringify(input.metadata),
14642
+ input.created_at ?? new Date().toISOString()
14643
+ ]);
14644
+ return id;
14645
+ }
14646
+
14647
+ // src/retrieval.ts
14648
+ import { createHash as createHash2 } from "crypto";
14573
14649
 
14574
14650
  // src/provenance.ts
14575
14651
  function isStaleStatus(status) {
@@ -14614,6 +14690,7 @@ function withProvenance(metadata, provenance) {
14614
14690
  }
14615
14691
 
14616
14692
  // src/embeddings.ts
14693
+ import { createHash } from "crypto";
14617
14694
  var DEFAULT_EMBEDDING_MODEL_REF = "openai:text-embedding-3-small";
14618
14695
  var DEFAULT_EMBEDDING_DIMENSIONS = 1536;
14619
14696
  function embeddingConfig(config2) {
@@ -14948,1242 +15025,1547 @@ async function searchVectorIndex(options) {
14948
15025
  }
14949
15026
  }
14950
15027
 
14951
- // src/outbox-consume.ts
14952
- import { createHash as createHash3, randomUUID as randomUUID3 } from "crypto";
14953
- import { existsSync as existsSync4, readFileSync as readFileSync4 } from "fs";
14954
- import { basename } from "path";
14955
-
14956
- // src/safety.ts
14957
- import { createHash as createHash2, randomUUID as randomUUID2 } from "crypto";
14958
- import { relative as relative2, resolve as resolve2, sep as sep2 } from "path";
14959
- function envEnabled(name) {
14960
- const value = process.env[name];
14961
- return value === "1" || value === "true" || value === "yes";
14962
- }
14963
- function resolveSafetyPolicy(config2, workspace) {
14964
- const extended = config2;
14965
- const configuredBuckets = new Set(extended.safety?.network?.allowed_s3_buckets ?? []);
14966
- if (config2.storage.type === "s3" && config2.storage.s3?.bucket)
14967
- configuredBuckets.add(config2.storage.s3.bucket);
14968
- if (process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS) {
14969
- for (const bucket of process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.split(",").map((entry) => entry.trim()).filter(Boolean)) {
14970
- configuredBuckets.add(bucket);
14971
- }
14972
- }
14973
- return {
14974
- mode: config2.mode,
14975
- allowWriteRoots: [
14976
- workspace.home,
14977
- workspace.artifactsDir,
14978
- workspace.cacheDir,
14979
- workspace.exportsDir,
14980
- workspace.indexesDir,
14981
- workspace.logsDir,
14982
- workspace.runsDir,
14983
- workspace.schemasDir,
14984
- workspace.wikiDir
14985
- ].map((entry) => resolve2(entry)),
14986
- readOnlySourceAccess: true,
14987
- network: {
14988
- webSearchEnabled: extended.safety?.network?.web_search_enabled ?? envEnabled("HASNA_KNOWLEDGE_WEB_SEARCH"),
14989
- s3ReadsEnabled: extended.safety?.network?.s3_reads_enabled ?? envEnabled("HASNA_KNOWLEDGE_ALLOW_S3_READS"),
14990
- allowedS3Buckets: [...configuredBuckets].sort()
14991
- },
14992
- redaction: {
14993
- enabled: extended.safety?.redaction?.enabled ?? true
14994
- },
14995
- approvals: {
14996
- generatedWritesRequireApproval: extended.safety?.approvals?.generated_writes_require_approval ?? true
14997
- }
14998
- };
14999
- }
15000
- function isInside(root, target) {
15001
- const rel = relative2(root, target);
15002
- return rel === "" || !rel.startsWith("..") && rel !== ".." && !rel.startsWith(`..${sep2}`);
15003
- }
15004
- function assertWriteAllowed(targetPath, policy) {
15005
- const resolved = resolve2(targetPath);
15006
- if (!policy.allowWriteRoots.some((root) => isInside(root, resolved))) {
15007
- throw new Error(`Safety policy denied write outside .hasna/apps/knowledge: ${targetPath}`);
15008
- }
15009
- }
15010
- function assertS3ReadAllowed(uri, policy) {
15011
- const parsed = new URL(uri);
15012
- const bucket = parsed.hostname;
15013
- if (!policy.network.s3ReadsEnabled) {
15014
- throw new Error("Safety policy denied S3 read. Set safety.network.s3_reads_enabled=true or HASNA_KNOWLEDGE_ALLOW_S3_READS=1.");
15015
- }
15016
- if (!policy.network.allowedS3Buckets.includes(bucket)) {
15017
- throw new Error(`Safety policy denied S3 bucket "${bucket}". Add it to safety.network.allowed_s3_buckets or HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.`);
15028
+ // src/search.ts
15029
+ function parseJsonObject2(value) {
15030
+ if (!value)
15031
+ return {};
15032
+ try {
15033
+ const parsed = JSON.parse(value);
15034
+ return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
15035
+ } catch {
15036
+ return {};
15018
15037
  }
15019
15038
  }
15020
- function assertWebSearchAllowed(policy) {
15021
- if (!policy.network.webSearchEnabled) {
15022
- throw new Error("Safety policy denied web search. Set safety.network.web_search_enabled=true or HASNA_KNOWLEDGE_WEB_SEARCH=1.");
15039
+ function metadataString2(metadata, keys) {
15040
+ for (const key of keys) {
15041
+ const value = metadata[key];
15042
+ if (typeof value === "string" && value.length > 0)
15043
+ return value;
15023
15044
  }
15045
+ return null;
15024
15046
  }
15025
- var REDACTION_PATTERNS = [
15026
- { type: "private_key_block", severity: "high", regex: /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/g, replacement: "[REDACTED:private_key_block]" },
15027
- { type: "secret_assignment", severity: "high", regex: /\b(?:api[_-]?key|secret|token|password)\s*[:=]\s*['"]?[^'"\s]{8,}/gi, replacement: "[REDACTED:secret_assignment]" },
15028
- { type: "openai_api_key", severity: "high", regex: /\bsk-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:openai_api_key]" },
15029
- { type: "anthropic_api_key", severity: "high", regex: /\bsk-ant-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:anthropic_api_key]" },
15030
- { type: "aws_access_key_id", severity: "high", regex: /\bA(?:KIA|SIA)[A-Z0-9]{16}\b/g, replacement: "[REDACTED:aws_access_key_id]" }
15031
- ];
15032
- function redactSecrets(text, policy) {
15033
- if (policy && !policy.redaction.enabled)
15034
- return { text, findings: [] };
15035
- let output = text;
15036
- const findings = [];
15037
- for (const pattern of REDACTION_PATTERNS) {
15038
- output = output.replace(pattern.regex, (match, ...args) => {
15039
- const offset = typeof args.at(-2) === "number" ? args.at(-2) : output.indexOf(match);
15040
- findings.push({
15041
- type: pattern.type,
15042
- severity: pattern.severity,
15043
- start: Math.max(0, offset),
15044
- end: Math.max(0, offset + match.length)
15045
- });
15046
- return pattern.replacement;
15047
- });
15047
+ function metadataNumber2(metadata, keys) {
15048
+ for (const key of keys) {
15049
+ const value = metadata[key];
15050
+ if (typeof value === "number" && Number.isFinite(value))
15051
+ return value;
15048
15052
  }
15049
- return { text: output, findings };
15053
+ return null;
15050
15054
  }
15051
- function auditId(input) {
15052
- return `audit_${createHash2("sha256").update(`${input.event_type}\x00${input.action}\x00${input.target_uri ?? ""}\x00${input.created_at ?? ""}\x00${JSON.stringify(input.metadata ?? {})}\x00${randomUUID2()}`).digest("hex").slice(0, 24)}`;
15055
+ function unique(values) {
15056
+ return Array.from(new Set(values));
15053
15057
  }
15054
- function recordAuditEvent(db, input) {
15055
- const createdAt = input.created_at ?? new Date().toISOString();
15056
- const id = auditId({ ...input, created_at: createdAt });
15057
- db.run(`INSERT INTO audit_events (id, event_type, action, target_uri, decision, metadata_json, created_at)
15058
- VALUES (?, ?, ?, ?, ?, ?, ?)`, [
15059
- id,
15060
- input.event_type,
15061
- input.action,
15062
- input.target_uri ?? null,
15063
- input.decision,
15064
- JSON.stringify(input.metadata ?? {}),
15065
- createdAt
15066
- ]);
15067
- return id;
15058
+ function queryTerms(query) {
15059
+ const terms = query.normalize("NFKC").toLowerCase().match(/[\p{L}\p{N}_]+/gu) ?? [];
15060
+ return unique(terms.filter((term) => term.length > 0)).slice(0, 16);
15068
15061
  }
15069
- function recordRedactionFindings(db, input) {
15070
- const createdAt = input.created_at ?? new Date().toISOString();
15071
- for (const finding of input.findings) {
15072
- db.run(`INSERT INTO redaction_findings (id, source_uri, run_id, severity, finding_type, metadata_json, created_at)
15073
- VALUES (?, ?, ?, ?, ?, ?, ?)`, [
15074
- `redact_${randomUUID2()}`,
15075
- input.source_uri ?? null,
15076
- input.run_id ?? null,
15077
- finding.severity,
15078
- finding.type,
15079
- JSON.stringify({ ...input.metadata ?? {}, start: finding.start, end: finding.end }),
15080
- createdAt
15081
- ]);
15082
- }
15083
- return input.findings.length;
15062
+ function ftsQueryForTerms(terms) {
15063
+ if (terms.length === 0)
15064
+ return null;
15065
+ return terms.map((term) => `${term}*`).join(" OR ");
15084
15066
  }
15085
-
15086
- // src/outbox-consume.ts
15087
- function stableId2(prefix, value) {
15088
- return `${prefix}_${createHash3("sha256").update(value).digest("hex").slice(0, 20)}`;
15067
+ function escapeLikeTerm(term) {
15068
+ return term.replace(/[\\%_]/g, (char) => `\\${char}`);
15089
15069
  }
15090
- function asObject(value) {
15091
- return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
15070
+ function likeParams(terms, fieldsPerTerm) {
15071
+ return terms.flatMap((term) => Array.from({ length: fieldsPerTerm }, () => `%${escapeLikeTerm(term)}%`));
15092
15072
  }
15093
- function asString(value) {
15094
- return typeof value === "string" && value.length > 0 ? value : undefined;
15073
+ function scoreFromRank(rank, index) {
15074
+ const rankScore = Number.isFinite(rank) ? 1 / (1 + Math.abs(rank)) : 0;
15075
+ const orderScore = 1 / (1 + index);
15076
+ return roundScore(Math.max(rankScore, orderScore));
15095
15077
  }
15096
- function buildSourceRef(event) {
15097
- const explicit = asString(event.source_ref) ?? asString(event.source_uri) ?? asString(event.uri);
15098
- if (explicit)
15099
- return explicit;
15100
- const fileId = asString(event.file_id);
15101
- if (fileId) {
15102
- const revision = asString(event.revision_id) ?? asString(event.revision);
15103
- const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
15104
- return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
15105
- }
15106
- const sourceId = asString(event.source_id);
15107
- const path = asString(event.path);
15108
- if (sourceId && path) {
15109
- return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
15110
- }
15111
- throw new Error("Outbox event is missing source_ref, file_id, or source_id/path.");
15078
+ function catalogScore(haystack, terms) {
15079
+ if (terms.length === 0)
15080
+ return 0;
15081
+ const matched = terms.filter((term) => haystack.includes(term)).length;
15082
+ if (matched === 0)
15083
+ return 0;
15084
+ return roundScore(Math.min(0.85, 0.35 + matched / terms.length * 0.5));
15112
15085
  }
15113
- function baseSourceUri(sourceRef, parsed) {
15114
- if (parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id) {
15115
- return sourceRef.replace(/\/revision\/[^/]+$/, "");
15116
- }
15117
- return sourceRef;
15086
+ function semanticScore(score) {
15087
+ return roundScore(Math.max(0, Math.min(1, (score + 1) / 2)));
15118
15088
  }
15119
- function hashFromEvent(event) {
15120
- return asString(event.hash) ?? asString(event.checksum) ?? asString(event.sha256) ?? null;
15089
+ function roundScore(score) {
15090
+ return Number(score.toFixed(6));
15121
15091
  }
15122
- function revisionFromEvent(event, parsed, hash2) {
15123
- return asString(event.revision_id) ?? asString(event.revision) ?? asString(event.version_id) ?? (parsed.kind === "open-files" ? parsed.revision_id : undefined) ?? hash2 ?? null;
15092
+ function combinedScore(scores, citation) {
15093
+ const keyword = scores.keyword ?? 0;
15094
+ const semantic = scores.semantic ?? 0;
15095
+ const catalog = scores.catalog ?? 0;
15096
+ const citationBoost = citation?.chunk_id ? 0.05 : 0;
15097
+ return roundScore(Math.min(1, keyword * 0.55 + semantic * 0.4 + catalog * 0.35 + citationBoost));
15124
15098
  }
15125
- function eventType(event) {
15126
- return (asString(event.event) ?? asString(event.type) ?? asString(event.action) ?? asString(event.change_type) ?? "changed").toLowerCase();
15099
+ function existingProvenance(metadata) {
15100
+ const provenance = metadata.provenance;
15101
+ return provenance && typeof provenance === "object" && !Array.isArray(provenance) ? provenance : null;
15127
15102
  }
15128
- function titleFromEvent(event) {
15129
- const path = asString(event.path);
15130
- return asString(event.title) ?? asString(event.name) ?? (path ? basename(path) : null);
15103
+ function provenanceForChunk2(row) {
15104
+ const metadata = parseJsonObject2(row.chunk_metadata_json);
15105
+ const existing = existingProvenance(metadata);
15106
+ if (existing)
15107
+ return existing;
15108
+ if (!row.source_revision_id && !row.source_uri)
15109
+ return null;
15110
+ return sourceProvenance({
15111
+ source_ref: metadataString2(metadata, ["source_ref"]),
15112
+ source_uri: row.source_uri ?? metadataString2(metadata, ["source_uri"]),
15113
+ source_kind: row.source_kind ?? metadataString2(metadata, ["source_kind"]),
15114
+ source_revision_id: row.source_revision_id,
15115
+ revision: row.revision ?? metadataString2(metadata, ["revision"]),
15116
+ hash: row.hash ?? metadataString2(metadata, ["hash"]),
15117
+ chunk_id: row.chunk_id,
15118
+ start_offset: row.start_offset ?? metadataNumber2(metadata, ["start_offset"]),
15119
+ end_offset: row.end_offset ?? metadataNumber2(metadata, ["end_offset"]),
15120
+ status: metadataString2(metadata, ["status"]),
15121
+ resolver: "open-files-read-only"
15122
+ });
15131
15123
  }
15132
- function normalizeEvent(event, now) {
15133
- const sourceRef = buildSourceRef(event);
15134
- const parsed = parseSourceRef(sourceRef);
15135
- const hash2 = hashFromEvent(event);
15136
- return {
15137
- raw: event,
15138
- eventType: eventType(event),
15139
- sourceRef,
15140
- sourceUri: baseSourceUri(sourceRef, parsed),
15141
- kind: parsed.kind,
15142
- title: titleFromEvent(event),
15143
- revision: revisionFromEvent(event, parsed, hash2),
15144
- hash: hash2,
15145
- status: asString(event.status)?.toLowerCase() ?? null,
15146
- updatedAt: asString(event.updated_at) ?? now,
15147
- acl: event.permissions ?? event.acl ?? undefined
15148
- };
15149
- }
15150
- function parseOutboxText(text) {
15151
- const trimmed = text.trim();
15152
- if (!trimmed)
15124
+ function selectFtsChunks(db, ftsQuery, limit) {
15125
+ if (!ftsQuery)
15153
15126
  return [];
15154
- if (trimmed.startsWith("[")) {
15155
- const parsed = JSON.parse(trimmed);
15156
- if (!Array.isArray(parsed))
15157
- throw new Error("Outbox array parse failed.");
15158
- return parsed.map((entry) => {
15159
- const event = asObject(entry);
15160
- if (!event)
15161
- throw new Error("Outbox array entries must be objects.");
15162
- return event;
15163
- });
15164
- }
15165
- if (trimmed.startsWith("{")) {
15166
- try {
15167
- const parsed = JSON.parse(trimmed);
15168
- const object2 = asObject(parsed);
15169
- if (!object2)
15170
- throw new Error("Outbox object parse failed.");
15171
- if (Array.isArray(object2.events)) {
15172
- return object2.events.map((entry) => {
15173
- const event = asObject(entry);
15174
- if (!event)
15175
- throw new Error("Outbox events entries must be objects.");
15176
- return event;
15177
- });
15178
- }
15179
- if ("source_ref" in object2 || "source_uri" in object2 || "file_id" in object2)
15180
- return [object2];
15181
- } catch (error48) {
15182
- const lines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
15183
- if (lines.length <= 1)
15184
- throw error48;
15185
- return lines.map((line) => {
15186
- const event = asObject(JSON.parse(line));
15187
- if (!event)
15188
- throw new Error("Outbox JSONL entries must be objects.");
15189
- return event;
15190
- });
15191
- }
15192
- }
15193
- return trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => {
15194
- const event = asObject(JSON.parse(line));
15195
- if (!event)
15196
- throw new Error("Outbox JSONL entries must be objects.");
15197
- return event;
15198
- });
15127
+ return db.query(`SELECT
15128
+ chunks_fts.chunk_id,
15129
+ c.kind AS chunk_kind,
15130
+ c.wiki_page_id,
15131
+ c.text,
15132
+ c.token_count,
15133
+ c.start_offset,
15134
+ c.end_offset,
15135
+ c.metadata_json AS chunk_metadata_json,
15136
+ c.source_revision_id,
15137
+ sr.revision,
15138
+ sr.hash,
15139
+ s.uri AS source_uri,
15140
+ s.kind AS source_kind,
15141
+ s.title AS source_title,
15142
+ wp.path AS wiki_path,
15143
+ wp.title AS wiki_title,
15144
+ wp.artifact_uri AS wiki_artifact_uri,
15145
+ wp.content_hash AS wiki_content_hash,
15146
+ wp.status AS wiki_status,
15147
+ wp.metadata_json AS wiki_metadata_json,
15148
+ bm25(chunks_fts) AS rank
15149
+ FROM chunks_fts
15150
+ JOIN chunks c ON c.id = chunks_fts.chunk_id
15151
+ LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
15152
+ LEFT JOIN sources s ON s.id = sr.source_id
15153
+ LEFT JOIN wiki_pages wp ON wp.id = c.wiki_page_id
15154
+ WHERE chunks_fts MATCH ?
15155
+ ORDER BY rank ASC
15156
+ LIMIT ?`).all(ftsQuery, limit);
15199
15157
  }
15200
- async function readS3Text(uri, config2, safetyPolicy) {
15201
- const parsed = new URL(uri);
15202
- const bucket = parsed.hostname;
15203
- const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ""));
15204
- if (!bucket || !key)
15205
- throw new Error(`Invalid S3 outbox URI: ${uri}`);
15206
- if (safetyPolicy)
15207
- assertS3ReadAllowed(uri, safetyPolicy);
15208
- const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
15209
- import("@aws-sdk/client-s3"),
15210
- import("@aws-sdk/credential-providers")
15211
- ]);
15212
- const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
15213
- const client = new S3Client({
15214
- region: s3Config?.region,
15215
- credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
15216
- maxAttempts: s3Config?.max_attempts
15217
- });
15218
- const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
15219
- if (!response.Body)
15220
- return "";
15221
- return await response.Body.transformToString();
15158
+ function catalogWhere(fields, terms) {
15159
+ if (terms.length === 0)
15160
+ return "1 = 0";
15161
+ const clauses = terms.map(() => `(${fields.map((field) => `lower(COALESCE(${field}, '')) LIKE ? ESCAPE '\\'`).join(" OR ")})`);
15162
+ return clauses.join(" OR ");
15222
15163
  }
15223
- async function readOutboxInput(input, config2, safetyPolicy) {
15224
- if (input.startsWith("s3://"))
15225
- return readS3Text(input, config2, safetyPolicy);
15226
- if (!existsSync4(input))
15227
- throw new Error(`Outbox not found: ${input}`);
15228
- return readFileSync4(input, "utf8");
15164
+ function selectWikiPages(db, terms, limit) {
15165
+ const fields = ["path", "title", "artifact_uri", "metadata_json"];
15166
+ return db.query(`SELECT id, path, title, artifact_uri, content_hash, status, metadata_json
15167
+ FROM wiki_pages
15168
+ WHERE status = 'active' AND (${catalogWhere(fields, terms)})
15169
+ ORDER BY updated_at DESC
15170
+ LIMIT ?`).all(...likeParams(terms, fields.length), limit);
15229
15171
  }
15230
- function mergeJson(existing, patch) {
15231
- let base = {};
15232
- if (existing) {
15233
- try {
15234
- base = asObject(JSON.parse(existing)) ?? {};
15235
- } catch {
15236
- base = {};
15237
- }
15238
- }
15239
- return JSON.stringify({ ...base, ...patch });
15172
+ function selectKnowledgeIndexes(db, terms, limit) {
15173
+ const fields = ["kind", "name", "shard_key", "artifact_uri", "metadata_json"];
15174
+ return db.query(`SELECT id, kind, name, artifact_uri, shard_key, metadata_json
15175
+ FROM knowledge_indexes
15176
+ WHERE ${catalogWhere(fields, terms)}
15177
+ ORDER BY updated_at DESC
15178
+ LIMIT ?`).all(...likeParams(terms, fields.length), limit);
15240
15179
  }
15241
- function ensureSource(db, event, now) {
15242
- const id = stableId2("src", event.sourceUri);
15243
- db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
15244
- VALUES (?, ?, ?, ?, ?, ?, ?, ?)
15245
- ON CONFLICT(uri) DO UPDATE SET
15246
- kind = excluded.kind,
15247
- title = COALESCE(excluded.title, sources.title),
15248
- updated_at = excluded.updated_at`, [
15249
- id,
15250
- event.sourceUri,
15251
- event.kind,
15252
- event.title,
15253
- JSON.stringify({ source_ref: event.sourceRef, source_uri: event.sourceUri, status: event.status, last_outbox_event: event.eventType }),
15254
- JSON.stringify(event.acl ?? {}),
15255
- now,
15256
- event.updatedAt
15257
- ]);
15258
- const row = db.query("SELECT id, metadata_json, acl_json FROM sources WHERE uri = ?").get(event.sourceUri);
15259
- if (!row)
15260
- throw new Error(`Failed to upsert source for outbox event: ${event.sourceUri}`);
15261
- const patch = {
15262
- source_ref: event.sourceRef,
15263
- source_uri: event.sourceUri,
15264
- last_outbox_event: event.eventType,
15265
- last_outbox_at: event.updatedAt
15180
+ function chunkResult(row, keywordScore) {
15181
+ const metadata = parseJsonObject2(row.chunk_metadata_json);
15182
+ const provenance = provenanceForChunk2(row);
15183
+ const sourceRef = metadataString2(metadata, ["source_ref"]);
15184
+ const sourceUri = row.source_uri ?? metadataString2(metadata, ["source_uri"]);
15185
+ const isWiki = Boolean(row.wiki_page_id);
15186
+ const result = {
15187
+ kind: isWiki ? "wiki_chunk" : "source_chunk",
15188
+ id: row.chunk_id,
15189
+ title: isWiki ? row.wiki_title : row.source_title,
15190
+ text: row.text,
15191
+ score: 0,
15192
+ scores: { keyword: keywordScore },
15193
+ source: sourceUri || sourceRef ? {
15194
+ uri: sourceUri,
15195
+ ref: sourceRef,
15196
+ kind: row.source_kind ?? metadataString2(metadata, ["source_kind"]),
15197
+ revision: row.revision ?? metadataString2(metadata, ["revision"]),
15198
+ hash: row.hash ?? metadataString2(metadata, ["hash"])
15199
+ } : null,
15200
+ citation: {
15201
+ chunk_id: row.chunk_id,
15202
+ start_offset: row.start_offset,
15203
+ end_offset: row.end_offset
15204
+ },
15205
+ artifact: isWiki ? {
15206
+ uri: row.wiki_artifact_uri,
15207
+ path: row.wiki_path,
15208
+ hash: row.wiki_content_hash,
15209
+ shard_key: row.wiki_path
15210
+ } : null,
15211
+ provenance,
15212
+ reasons: ["keyword_match"]
15266
15213
  };
15267
- if (event.status)
15268
- patch.status = event.status;
15269
- if (asString(event.raw.path))
15270
- patch.path = event.raw.path;
15271
- db.run("UPDATE sources SET metadata_json = ?, acl_json = CASE WHEN ? IS NULL THEN acl_json ELSE ? END, updated_at = ? WHERE id = ?", [
15272
- mergeJson(row.metadata_json, patch),
15273
- event.acl === undefined ? null : JSON.stringify(event.acl),
15274
- event.acl === undefined ? null : JSON.stringify(event.acl),
15275
- event.updatedAt,
15276
- row.id
15277
- ]);
15278
- return row.id;
15214
+ result.score = combinedScore(result.scores, result.citation);
15215
+ return result;
15279
15216
  }
15280
- function ensureRevision(db, sourceId, event, now) {
15281
- if (!event.revision)
15282
- return null;
15283
- const id = stableId2("rev", `${sourceId}\x00${event.revision}`);
15284
- const metadata = {
15285
- source_ref: event.sourceRef,
15286
- source_uri: event.sourceUri,
15287
- status: event.status,
15288
- last_outbox_event: event.eventType,
15289
- reindex_required: true
15217
+ function wikiPageResult(row, terms) {
15218
+ const metadata = parseJsonObject2(row.metadata_json);
15219
+ const score = catalogScore(`${row.path} ${row.title} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
15220
+ const result = {
15221
+ kind: "wiki_page",
15222
+ id: row.id,
15223
+ title: row.title,
15224
+ text: null,
15225
+ score: 0,
15226
+ scores: { catalog: score },
15227
+ source: null,
15228
+ citation: null,
15229
+ artifact: {
15230
+ uri: row.artifact_uri,
15231
+ path: row.path,
15232
+ hash: row.content_hash,
15233
+ shard_key: row.path
15234
+ },
15235
+ provenance: existingProvenance(metadata),
15236
+ reasons: ["wiki_catalog_match"]
15290
15237
  };
15291
- db.run(`INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
15292
- VALUES (?, ?, ?, ?, ?, ?, ?)
15293
- ON CONFLICT(source_id, revision) DO UPDATE SET
15294
- hash = COALESCE(excluded.hash, source_revisions.hash),
15295
- metadata_json = excluded.metadata_json`, [id, sourceId, event.revision, event.hash, asString(event.raw.extracted_text_ref) ?? null, JSON.stringify(metadata), now]);
15296
- const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, event.revision);
15297
- return row?.id ?? null;
15238
+ result.score = combinedScore(result.scores, result.citation);
15239
+ return result;
15298
15240
  }
15299
- function revisionIdsForEvent(db, sourceId, event) {
15300
- if (event.revision) {
15301
- return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").all(sourceId, event.revision).map((row) => row.id);
15302
- }
15303
- if (event.hash) {
15304
- return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND hash = ?").all(sourceId, event.hash).map((row) => row.id);
15305
- }
15306
- return db.query("SELECT id FROM source_revisions WHERE source_id = ?").all(sourceId).map((row) => row.id);
15241
+ function indexResult(row, terms) {
15242
+ const metadata = parseJsonObject2(row.metadata_json);
15243
+ const score = catalogScore(`${row.kind} ${row.name} ${row.shard_key ?? ""} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
15244
+ const result = {
15245
+ kind: "knowledge_index",
15246
+ id: row.id,
15247
+ title: row.name,
15248
+ text: null,
15249
+ score: 0,
15250
+ scores: { catalog: score },
15251
+ source: null,
15252
+ citation: null,
15253
+ artifact: {
15254
+ uri: row.artifact_uri,
15255
+ path: metadataString2(metadata, ["artifact_key"]),
15256
+ hash: metadataString2(metadata, ["content_hash"]),
15257
+ shard_key: row.shard_key
15258
+ },
15259
+ provenance: existingProvenance(metadata),
15260
+ reasons: ["index_catalog_match"]
15261
+ };
15262
+ result.score = combinedScore(result.scores, result.citation);
15263
+ return result;
15307
15264
  }
15308
- function invalidateRevision(db, revisionId) {
15309
- const chunks = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(revisionId);
15310
- let embeddingsDeleted = 0;
15311
- let vectorEntriesDeleted = 0;
15312
- for (const chunk of chunks) {
15313
- const row = db.query("SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?").get(chunk.id);
15314
- embeddingsDeleted += row?.n ?? 0;
15315
- const vectorRow = db.query("SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?").get(chunk.id);
15316
- vectorEntriesDeleted += vectorRow?.n ?? 0;
15317
- db.run("DELETE FROM vector_index_entries WHERE chunk_id = ?", [chunk.id]);
15318
- db.run("DELETE FROM chunk_embeddings WHERE chunk_id = ?", [chunk.id]);
15319
- db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [chunk.id]);
15265
+ function mergeResult(results, entry) {
15266
+ const key = `${entry.kind}:${entry.id}`;
15267
+ const existing = results.get(key);
15268
+ if (!existing) {
15269
+ results.set(key, entry);
15270
+ return;
15320
15271
  }
15321
- db.run("DELETE FROM chunks WHERE source_revision_id = ?", [revisionId]);
15322
- const revision = db.query("SELECT metadata_json FROM source_revisions WHERE id = ?").get(revisionId);
15323
- db.run("UPDATE source_revisions SET metadata_json = ? WHERE id = ?", [mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId]);
15324
- return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
15325
- }
15326
- function isDeleteEvent(eventType2, status) {
15327
- return status === "deleted" || ["delete", "deleted", "remove", "removed"].includes(eventType2);
15328
- }
15329
- function isMoveEvent(eventType2) {
15330
- return ["move", "moved", "rename", "renamed", "path_changed"].includes(eventType2);
15272
+ existing.scores = {
15273
+ keyword: Math.max(existing.scores.keyword ?? 0, entry.scores.keyword ?? 0) || undefined,
15274
+ semantic: Math.max(existing.scores.semantic ?? 0, entry.scores.semantic ?? 0) || undefined,
15275
+ catalog: Math.max(existing.scores.catalog ?? 0, entry.scores.catalog ?? 0) || undefined
15276
+ };
15277
+ existing.reasons = unique([...existing.reasons, ...entry.reasons]);
15278
+ existing.text = existing.text ?? entry.text;
15279
+ existing.title = existing.title ?? entry.title;
15280
+ existing.source = existing.source ?? entry.source;
15281
+ existing.citation = existing.citation ?? entry.citation;
15282
+ existing.artifact = existing.artifact ?? entry.artifact;
15283
+ existing.provenance = existing.provenance ?? entry.provenance;
15284
+ existing.score = combinedScore(existing.scores, existing.citation);
15331
15285
  }
15332
- function isPermissionEvent(eventType2) {
15333
- return ["permission", "permissions", "permission_changed", "acl_changed"].includes(eventType2);
15286
+ function sortResults(results) {
15287
+ const kindOrder = {
15288
+ source_chunk: 0,
15289
+ wiki_chunk: 1,
15290
+ wiki_page: 2,
15291
+ knowledge_index: 3
15292
+ };
15293
+ return results.sort((a, b) => {
15294
+ if (b.score !== a.score)
15295
+ return b.score - a.score;
15296
+ return kindOrder[a.kind] - kindOrder[b.kind] || a.id.localeCompare(b.id);
15297
+ });
15334
15298
  }
15335
- async function consumeOpenFilesOutbox(options) {
15336
- const now = (options.now ?? new Date).toISOString();
15337
- if (options.safetyPolicy)
15338
- assertWriteAllowed(options.dbPath, options.safetyPolicy);
15299
+ async function hybridSearch(options) {
15300
+ const query = options.query.trim();
15301
+ if (!query)
15302
+ throw new Error("Search query is required.");
15303
+ const limit = Math.max(1, Math.min(options.limit ?? 10, 100));
15304
+ const terms = queryTerms(query);
15305
+ const ftsQuery = ftsQueryForTerms(terms);
15306
+ const semanticEnabled = options.semantic === true || options.fake === true || Boolean(options.modelRef);
15307
+ const warnings = [];
15308
+ let semanticProvider = null;
15309
+ let semanticModel = null;
15310
+ let semanticDimensions = null;
15311
+ let keywordCount = 0;
15312
+ let catalogCount = 0;
15313
+ let semanticCount = 0;
15314
+ const merged = new Map;
15339
15315
  migrateKnowledgeDb(options.dbPath);
15340
- const text = await readOutboxInput(options.input, options.config, options.safetyPolicy);
15341
- const events = parseOutboxText(text);
15342
15316
  const db = openKnowledgeDb(options.dbPath);
15343
- const runId = `run_${randomUUID3()}`;
15344
15317
  try {
15345
- return db.transaction(() => {
15346
- db.run(`INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
15347
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
15348
- runId,
15349
- "open-files-outbox",
15350
- options.input,
15351
- "completed",
15352
- "local",
15353
- "open-files-outbox",
15354
- JSON.stringify({ path: options.input, events: events.length }),
15355
- now,
15356
- now
15357
- ]);
15358
- const sourcesTouched = new Set;
15359
- const revisionsTouched = new Set;
15360
- let chunksDeleted = 0;
15361
- let embeddingsDeleted = 0;
15362
- let vectorEntriesDeleted = 0;
15363
- let staleRevisions = 0;
15364
- let deletedSources = 0;
15365
- let movedSources = 0;
15366
- let permissionUpdates = 0;
15367
- recordAuditEvent(db, {
15368
- event_type: "source_read",
15369
- action: options.input.startsWith("s3://") ? "s3_outbox_read" : "local_outbox_read",
15370
- target_uri: options.input,
15371
- decision: "allow",
15372
- metadata: { events: events.length, read_only: true },
15373
- created_at: now
15374
- });
15375
- events.forEach((raw, index) => {
15376
- const event = normalizeEvent(raw, now);
15377
- const sourceId = ensureSource(db, event, now);
15378
- sourcesTouched.add(sourceId);
15379
- const createdRevisionId = ensureRevision(db, sourceId, event, now);
15380
- if (createdRevisionId)
15381
- revisionsTouched.add(createdRevisionId);
15382
- const affectedRevisionIds = revisionIdsForEvent(db, sourceId, event);
15383
- for (const revisionId of affectedRevisionIds) {
15384
- revisionsTouched.add(revisionId);
15385
- const invalidation = invalidateRevision(db, revisionId);
15386
- chunksDeleted += invalidation.chunksDeleted;
15387
- embeddingsDeleted += invalidation.embeddingsDeleted;
15388
- vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
15389
- staleRevisions += 1;
15390
- }
15391
- if (isDeleteEvent(event.eventType, event.status))
15392
- deletedSources += 1;
15393
- if (isMoveEvent(event.eventType))
15394
- movedSources += 1;
15395
- if (isPermissionEvent(event.eventType) || event.acl !== undefined)
15396
- permissionUpdates += 1;
15397
- db.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
15398
- VALUES (?, ?, ?, ?, ?, ?)`, [
15399
- stableId2("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
15400
- runId,
15401
- "info",
15402
- event.eventType,
15403
- JSON.stringify({
15404
- source_ref: event.sourceRef,
15405
- source_uri: event.sourceUri,
15406
- revision: event.revision,
15407
- hash: event.hash,
15408
- status: event.status,
15409
- affected_revisions: affectedRevisionIds.length
15410
- }),
15411
- event.updatedAt
15412
- ]);
15413
- });
15414
- db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
15415
- VALUES (?, ?, ?, ?, 0, 0, 0, ?, ?)`, [
15416
- stableId2("usage", runId),
15417
- runId,
15418
- "local",
15419
- "open-files-outbox",
15420
- JSON.stringify({ note: "No model provider used for outbox invalidation." }),
15421
- now
15422
- ]);
15423
- recordAuditEvent(db, {
15424
- event_type: "write",
15425
- action: "knowledge_outbox_invalidation",
15426
- target_uri: options.dbPath,
15427
- decision: "allow",
15428
- metadata: {
15429
- run_id: runId,
15430
- events: events.length,
15431
- sources: sourcesTouched.size,
15432
- revisions: revisionsTouched.size,
15433
- chunks_deleted: chunksDeleted,
15434
- embeddings_deleted: embeddingsDeleted,
15435
- vector_entries_deleted: vectorEntriesDeleted
15436
- },
15437
- created_at: now
15438
- });
15439
- return {
15440
- path: options.input,
15441
- db_path: options.dbPath,
15442
- run_id: runId,
15443
- events_seen: events.length,
15444
- sources_touched: sourcesTouched.size,
15445
- revisions_touched: revisionsTouched.size,
15446
- chunks_deleted: chunksDeleted,
15447
- embeddings_deleted: embeddingsDeleted,
15448
- vector_entries_deleted: vectorEntriesDeleted,
15449
- stale_revisions: staleRevisions,
15450
- deleted_sources: deletedSources,
15451
- moved_sources: movedSources,
15452
- permission_updates: permissionUpdates
15453
- };
15454
- })();
15318
+ const ftsRows = selectFtsChunks(db, ftsQuery, Math.max(limit * 3, 20));
15319
+ keywordCount = ftsRows.length;
15320
+ ftsRows.forEach((row, index) => mergeResult(merged, chunkResult(row, scoreFromRank(row.rank, index))));
15321
+ const wikiRows = selectWikiPages(db, terms, Math.max(limit, 10));
15322
+ const indexRows = selectKnowledgeIndexes(db, terms, Math.max(limit, 10));
15323
+ catalogCount = wikiRows.length + indexRows.length;
15324
+ wikiRows.forEach((row) => mergeResult(merged, wikiPageResult(row, terms)));
15325
+ indexRows.forEach((row) => mergeResult(merged, indexResult(row, terms)));
15455
15326
  } finally {
15456
15327
  db.close();
15457
15328
  }
15458
- }
15459
-
15460
- // src/manifest-ingest.ts
15461
- import { createHash as createHash4 } from "crypto";
15462
- import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
15463
- import { basename as basename2 } from "path";
15464
- function stableId3(prefix, value) {
15465
- return `${prefix}_${createHash4("sha256").update(value).digest("hex").slice(0, 20)}`;
15466
- }
15467
- function asObject2(value) {
15468
- return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
15469
- }
15470
- function asString2(value) {
15471
- return typeof value === "string" && value.length > 0 ? value : undefined;
15472
- }
15473
- function asNumber(value) {
15474
- return typeof value === "number" && Number.isFinite(value) ? value : undefined;
15475
- }
15476
- function buildSourceRefFromItem(item) {
15477
- const explicit = asString2(item.source_ref) ?? asString2(item.source_uri) ?? asString2(item.uri);
15478
- if (explicit)
15479
- return explicit;
15480
- const fileId = asString2(item.file_id);
15481
- if (fileId) {
15482
- const revision = asString2(item.revision_id) ?? asString2(item.revision);
15483
- const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
15484
- return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
15485
- }
15486
- const sourceId = asString2(item.source_id);
15487
- const path = asString2(item.path);
15488
- if (sourceId && path) {
15489
- return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
15490
- }
15491
- throw new Error("Manifest item is missing source_ref, file_id, or source_id/path.");
15492
- }
15493
- function baseSourceUri2(sourceRef, parsed) {
15494
- if (parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id) {
15495
- return sourceRef.replace(/\/revision\/[^/]+$/, "");
15496
- }
15497
- return sourceRef;
15498
- }
15499
- function textFromItem(item) {
15500
- const direct = asString2(item.extracted_text) ?? asString2(item.text) ?? asString2(item.content_text) ?? asString2(item.markdown);
15501
- if (direct !== undefined)
15502
- return direct;
15503
- const content = item.content;
15504
- return typeof content === "string" ? content : null;
15505
- }
15506
- function extractedTextUriFromItem(item) {
15507
- const direct = asString2(item.extracted_text_ref) ?? asString2(item.extracted_text_uri) ?? asString2(item.text_ref);
15508
- if (direct)
15509
- return direct;
15510
- const content = asObject2(item.content);
15511
- return asString2(content?.extracted_text_ref) ?? asString2(content?.extracted_text_uri) ?? null;
15512
- }
15513
- function titleFromItem(item) {
15514
- const path = asString2(item.path);
15515
- return asString2(item.title) ?? asString2(item.name) ?? (path ? basename2(path) : null);
15516
- }
15517
- function hashFromItem(item) {
15518
- return asString2(item.hash) ?? asString2(item.checksum) ?? asString2(item.sha256) ?? null;
15519
- }
15520
- function revisionFromItem(item, parsed, hash2) {
15521
- const revision = asString2(item.revision_id) ?? asString2(item.revision) ?? asString2(item.version_id) ?? (parsed.kind === "open-files" ? parsed.revision_id : undefined) ?? hash2 ?? asString2(item.updated_at);
15522
- return revision ?? "current";
15523
- }
15524
- function metadataFromItem(item, normalized) {
15525
- const metadata = {};
15526
- for (const [key, value] of Object.entries(item)) {
15527
- if (["text", "content", "content_text", "extracted_text", "markdown"].includes(key))
15528
- continue;
15529
- metadata[key] = value;
15530
- }
15531
- metadata.source_ref = normalized.sourceRef;
15532
- metadata.source_uri = normalized.sourceUri;
15533
- metadata.status = normalized.status;
15534
- return metadata;
15535
- }
15536
- function normalizeManifestItem(item, now) {
15537
- const sourceRef = buildSourceRefFromItem(item);
15538
- const parsed = parseSourceRef(sourceRef);
15539
- const sourceUri = baseSourceUri2(sourceRef, parsed);
15540
- const hash2 = hashFromItem(item);
15541
- const status = asString2(item.status) ?? "active";
15542
- return {
15543
- raw: item,
15544
- sourceRef,
15545
- sourceUri,
15546
- kind: parsed.kind,
15547
- title: titleFromItem(item),
15548
- revision: revisionFromItem(item, parsed, hash2),
15549
- hash: hash2,
15550
- extractedTextUri: extractedTextUriFromItem(item),
15551
- text: textFromItem(item),
15552
- metadata: metadataFromItem(item, { sourceRef, sourceUri, status }),
15553
- acl: item.permissions ?? item.acl ?? {},
15554
- status,
15555
- updatedAt: asString2(item.updated_at) ?? now
15556
- };
15557
- }
15558
- function parseManifestText(text) {
15559
- const trimmed = text.trim();
15560
- if (!trimmed)
15561
- return [];
15562
- if (trimmed.startsWith("[")) {
15563
- const parsed = JSON.parse(trimmed);
15564
- if (!Array.isArray(parsed))
15565
- throw new Error("Manifest array parse failed.");
15566
- return parsed.map((entry) => {
15567
- const item = asObject2(entry);
15568
- if (!item)
15569
- throw new Error("Manifest array entries must be objects.");
15570
- return item;
15571
- });
15572
- }
15573
- if (trimmed.startsWith("{")) {
15329
+ if (semanticEnabled) {
15574
15330
  try {
15575
- const parsed = JSON.parse(trimmed);
15576
- const object2 = asObject2(parsed);
15577
- if (!object2)
15578
- throw new Error("Manifest object parse failed.");
15579
- if (Array.isArray(object2.items)) {
15580
- return object2.items.map((entry) => {
15581
- const item = asObject2(entry);
15582
- if (!item)
15583
- throw new Error("Manifest items entries must be objects.");
15584
- return item;
15585
- });
15331
+ const semantic = await searchVectorIndex({
15332
+ dbPath: options.dbPath,
15333
+ query,
15334
+ limit: Math.max(limit * 3, 20),
15335
+ config: options.config,
15336
+ env: options.env,
15337
+ modelRef: options.modelRef,
15338
+ dimensions: options.dimensions,
15339
+ fake: options.fake,
15340
+ batchSize: options.batchSize,
15341
+ maxParallelCalls: options.maxParallelCalls
15342
+ });
15343
+ semanticProvider = semantic.provider;
15344
+ semanticModel = semantic.model;
15345
+ semanticDimensions = semantic.dimensions;
15346
+ semanticCount = semantic.results.length;
15347
+ for (const row of semantic.results) {
15348
+ const result = {
15349
+ kind: "source_chunk",
15350
+ id: row.chunk_id,
15351
+ title: null,
15352
+ text: row.text,
15353
+ score: 0,
15354
+ scores: { semantic: semanticScore(row.score) },
15355
+ source: {
15356
+ uri: row.source_uri,
15357
+ ref: row.source_ref,
15358
+ kind: row.provenance?.source_kind ?? null,
15359
+ revision: row.revision,
15360
+ hash: row.hash
15361
+ },
15362
+ citation: {
15363
+ chunk_id: row.chunk_id,
15364
+ start_offset: row.provenance?.start_offset ?? null,
15365
+ end_offset: row.provenance?.end_offset ?? null
15366
+ },
15367
+ artifact: null,
15368
+ provenance: row.provenance,
15369
+ reasons: ["semantic_match"]
15370
+ };
15371
+ result.score = combinedScore(result.scores, result.citation);
15372
+ mergeResult(merged, result);
15586
15373
  }
15587
- if ("source_ref" in object2 || "source_uri" in object2 || "file_id" in object2)
15588
- return [object2];
15589
15374
  } catch (error48) {
15590
- const lines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
15591
- if (lines.length <= 1)
15592
- throw error48;
15593
- return lines.map((line) => {
15594
- const item = asObject2(JSON.parse(line));
15595
- if (!item)
15596
- throw new Error("Manifest JSONL entries must be objects.");
15597
- return item;
15598
- });
15375
+ warnings.push(`semantic_search_failed: ${error48 instanceof Error ? error48.message : String(error48)}`);
15599
15376
  }
15600
15377
  }
15601
- return trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => {
15602
- const item = asObject2(JSON.parse(line));
15603
- if (!item)
15604
- throw new Error("Manifest JSONL entries must be objects.");
15605
- return item;
15606
- });
15378
+ const results = sortResults(Array.from(merged.values())).slice(0, limit);
15379
+ return {
15380
+ query,
15381
+ limit,
15382
+ mode: {
15383
+ keyword: true,
15384
+ catalog: true,
15385
+ semantic: semanticEnabled
15386
+ },
15387
+ semantic_provider: semanticProvider,
15388
+ semantic_model: semanticModel,
15389
+ semantic_dimensions: semanticDimensions,
15390
+ counts: {
15391
+ keyword_results: keywordCount,
15392
+ catalog_results: catalogCount,
15393
+ semantic_results: semanticCount,
15394
+ merged_results: results.length
15395
+ },
15396
+ warnings,
15397
+ results
15398
+ };
15607
15399
  }
15608
- async function readS3Text2(uri, config2, safetyPolicy) {
15609
- const parsed = new URL(uri);
15610
- const bucket = parsed.hostname;
15611
- const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ""));
15612
- if (!bucket || !key)
15613
- throw new Error(`Invalid S3 manifest URI: ${uri}`);
15614
- if (safetyPolicy)
15615
- assertS3ReadAllowed(uri, safetyPolicy);
15616
- const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
15617
- import("@aws-sdk/client-s3"),
15618
- import("@aws-sdk/credential-providers")
15619
- ]);
15620
- const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
15621
- const client = new S3Client({
15622
- region: s3Config?.region,
15623
- credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
15624
- maxAttempts: s3Config?.max_attempts
15625
- });
15626
- const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
15627
- if (!response.Body)
15628
- return "";
15629
- return await response.Body.transformToString();
15400
+
15401
+ // src/retrieval.ts
15402
+ function stableId2(prefix, value) {
15403
+ return `${prefix}_${createHash2("sha256").update(value).digest("hex").slice(0, 20)}`;
15630
15404
  }
15631
- async function readManifestInput(input, config2, safetyPolicy) {
15632
- if (input.startsWith("s3://"))
15633
- return readS3Text2(input, config2, safetyPolicy);
15634
- if (!existsSync5(input))
15635
- throw new Error(`Manifest not found: ${input}`);
15636
- return readFileSync5(input, "utf8");
15405
+ function normalizeQuery(query) {
15406
+ return query.normalize("NFKC").trim().replace(/\s+/g, " ").toLowerCase();
15637
15407
  }
15638
- function chunkText(text, maxChars, overlapChars) {
15639
- const normalized = text.replace(/\r\n/g, `
15640
- `);
15641
- if (!normalized.trim())
15642
- return [];
15643
- const chunks = [];
15644
- let start = 0;
15645
- while (start < normalized.length) {
15646
- const hardEnd = Math.min(normalized.length, start + maxChars);
15647
- let end = hardEnd;
15648
- if (hardEnd < normalized.length) {
15649
- const paragraphBreak = normalized.lastIndexOf(`
15650
-
15651
- `, hardEnd);
15652
- const sentenceBreak = normalized.lastIndexOf(". ", hardEnd);
15653
- const candidate = Math.max(paragraphBreak, sentenceBreak);
15654
- if (candidate > start + Math.floor(maxChars * 0.5))
15655
- end = candidate + (candidate === paragraphBreak ? 2 : 1);
15656
- }
15657
- const chunk = normalized.slice(start, end).trim();
15658
- if (chunk) {
15659
- chunks.push({
15660
- ordinal: chunks.length,
15661
- text: chunk,
15662
- startOffset: start,
15663
- endOffset: end
15664
- });
15665
- }
15666
- if (end >= normalized.length)
15667
- break;
15668
- start = Math.max(0, end - overlapChars);
15669
- }
15670
- return chunks;
15408
+ function queryTerms2(query) {
15409
+ return Array.from(new Set(normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu) ?? [])).slice(0, 16);
15671
15410
  }
15672
- function estimateTokenCount(text) {
15673
- const words = text.trim().split(/\s+/).filter(Boolean).length;
15674
- return Math.max(1, Math.ceil(words * 1.25));
15411
+ function textForResult(result) {
15412
+ return [result.title, result.text].filter(Boolean).join(" ").toLowerCase();
15675
15413
  }
15676
- function deleteChunksForRevision(db, sourceRevisionId) {
15677
- const rows = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(sourceRevisionId);
15678
- for (const row of rows) {
15679
- db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [row.id]);
15680
- }
15681
- db.run("DELETE FROM chunks WHERE source_revision_id = ?", [sourceRevisionId]);
15682
- return rows.length;
15414
+ function exactScore(result, terms) {
15415
+ if (terms.length === 0)
15416
+ return 0;
15417
+ const text = textForResult(result);
15418
+ const matched = terms.filter((term) => text.includes(term)).length;
15419
+ return Number((matched / terms.length).toFixed(6));
15683
15420
  }
15684
- function upsertSource(db, item, now) {
15685
- const sourceId = stableId3("src", item.sourceUri);
15421
+ function hasReadOnlyProvenance(provenance) {
15422
+ if (!provenance)
15423
+ return true;
15424
+ if ("read_only" in provenance)
15425
+ return provenance.read_only === true;
15426
+ if ("read_only_sources" in provenance)
15427
+ return provenance.read_only_sources === true;
15428
+ return true;
15429
+ }
15430
+ function isStale(provenance) {
15431
+ if (!provenance)
15432
+ return false;
15433
+ if ("stale" in provenance && provenance.stale)
15434
+ return true;
15435
+ if ("status" in provenance)
15436
+ return isStaleStatus(provenance.status);
15437
+ return false;
15438
+ }
15439
+ function freshnessScore(result) {
15440
+ if (isStale(result.provenance))
15441
+ return 0;
15442
+ if (result.source?.hash || result.source?.revision)
15443
+ return 1;
15444
+ if (result.artifact?.hash)
15445
+ return 0.85;
15446
+ if (result.provenance && "source_refs" in result.provenance && result.provenance.source_refs.length > 0)
15447
+ return 0.75;
15448
+ return 0.55;
15449
+ }
15450
+ function citationScore(result) {
15451
+ if (result.citation?.chunk_id && (result.source?.uri || result.artifact?.uri))
15452
+ return 1;
15453
+ if (result.provenance && "citation_required" in result.provenance && result.provenance.citation_required)
15454
+ return 0.75;
15455
+ if (result.artifact?.uri)
15456
+ return 0.65;
15457
+ return 0.35;
15458
+ }
15459
+ function authorityScore(result) {
15460
+ if (result.kind === "wiki_chunk")
15461
+ return 0.85;
15462
+ if (result.kind === "source_chunk")
15463
+ return 0.8;
15464
+ if (result.kind === "wiki_page")
15465
+ return 0.65;
15466
+ return 0.55;
15467
+ }
15468
+ function rerank(result, terms) {
15469
+ const scores = {
15470
+ base_score: result.score,
15471
+ exact_score: exactScore(result, terms),
15472
+ citation_score: citationScore(result),
15473
+ freshness_score: freshnessScore(result),
15474
+ authority_score: authorityScore(result)
15475
+ };
15476
+ const final = Math.min(1, scores.base_score * 0.65 + scores.exact_score * 0.1 + scores.citation_score * 0.1 + scores.freshness_score * 0.1 + scores.authority_score * 0.05);
15477
+ const reasons = new Set(result.reasons);
15478
+ if (scores.exact_score > 0.5)
15479
+ reasons.add("exact_term");
15480
+ if (scores.citation_score >= 0.75)
15481
+ reasons.add("cited_source");
15482
+ if (scores.freshness_score >= 0.85)
15483
+ reasons.add("fresh_source");
15484
+ return {
15485
+ ...result,
15486
+ score: Number(final.toFixed(6)),
15487
+ reasons: Array.from(reasons),
15488
+ rerank: {
15489
+ ...scores,
15490
+ final_score: Number(final.toFixed(6))
15491
+ }
15492
+ };
15493
+ }
15494
+ function quoteFor(result, maxChars) {
15495
+ const source = result.text ?? result.title;
15496
+ if (!source)
15497
+ return null;
15498
+ const normalized = source.replace(/\s+/g, " ").trim();
15499
+ return normalized.length <= maxChars ? normalized : `${normalized.slice(0, Math.max(0, maxChars - 1)).trim()}...`;
15500
+ }
15501
+ function citationFor(result) {
15502
+ const id = stableId2("cite", `${result.kind}\x00${result.id}\x00${result.source?.uri ?? ""}\x00${result.artifact?.uri ?? ""}`);
15503
+ return {
15504
+ id,
15505
+ result_id: result.id,
15506
+ kind: result.kind,
15507
+ source_uri: result.source?.uri ?? null,
15508
+ source_ref: result.source?.ref ?? null,
15509
+ artifact_uri: result.artifact?.uri ?? null,
15510
+ artifact_path: result.artifact?.path ?? null,
15511
+ revision: result.source?.revision ?? null,
15512
+ hash: result.source?.hash ?? result.artifact?.hash ?? null,
15513
+ chunk_id: result.citation?.chunk_id ?? null,
15514
+ start_offset: result.citation?.start_offset ?? null,
15515
+ end_offset: result.citation?.end_offset ?? null,
15516
+ quote: quoteFor(result, 500),
15517
+ provenance: result.provenance
15518
+ };
15519
+ }
15520
+ function excerptFor(result, citation, contextChars) {
15521
+ const text = quoteFor(result, contextChars);
15522
+ if (!text)
15523
+ return null;
15524
+ return {
15525
+ id: stableId2("excerpt", `${result.kind}\x00${result.id}`),
15526
+ result_id: result.id,
15527
+ citation_id: citation.id,
15528
+ kind: result.kind,
15529
+ text,
15530
+ score: result.score
15531
+ };
15532
+ }
15533
+ function placeholders(values) {
15534
+ return values.map(() => "?").join(", ");
15535
+ }
15536
+ function loadGraphEvidence(dbPath, results) {
15537
+ const chunkIds = results.map((result) => result.citation?.chunk_id).filter((id) => Boolean(id));
15538
+ const wikiPageIds = results.filter((result) => result.kind === "wiki_page").map((result) => result.id);
15539
+ const citations = [];
15540
+ const backlinks = [];
15541
+ if (chunkIds.length === 0 && wikiPageIds.length === 0)
15542
+ return { citations, backlinks };
15543
+ const db = openKnowledgeDb(dbPath);
15544
+ try {
15545
+ if (chunkIds.length > 0) {
15546
+ citations.push(...db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
15547
+ FROM citations
15548
+ WHERE chunk_id IN (${placeholders(chunkIds)})
15549
+ ORDER BY created_at DESC
15550
+ LIMIT 50`).all(...chunkIds));
15551
+ }
15552
+ if (wikiPageIds.length > 0) {
15553
+ citations.push(...db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
15554
+ FROM citations
15555
+ WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
15556
+ ORDER BY created_at DESC
15557
+ LIMIT 50`).all(...wikiPageIds));
15558
+ backlinks.push(...db.query(`SELECT from_page_id, to_page_id, label
15559
+ FROM wiki_backlinks
15560
+ WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
15561
+ LIMIT 50`).all(...wikiPageIds, ...wikiPageIds));
15562
+ }
15563
+ } finally {
15564
+ db.close();
15565
+ }
15566
+ return { citations, backlinks };
15567
+ }
15568
+ async function retrieveKnowledgeContext(options) {
15569
+ const contextChars = Math.max(200, Math.min(options.contextChars ?? 1200, 4000));
15570
+ const search = await hybridSearch(options);
15571
+ const terms = queryTerms2(search.query);
15572
+ const warnings = [...search.warnings];
15573
+ const permissionNotes = new Set;
15574
+ const freshnessNotes = new Set;
15575
+ const filtered = search.results.filter((result) => {
15576
+ if (!hasReadOnlyProvenance(result.provenance)) {
15577
+ warnings.push(`permission_filtered: ${result.kind}:${result.id}`);
15578
+ permissionNotes.add("Dropped a result because provenance was not read-only.");
15579
+ return false;
15580
+ }
15581
+ if (isStale(result.provenance)) {
15582
+ warnings.push(`stale_filtered: ${result.kind}:${result.id}`);
15583
+ freshnessNotes.add("Dropped a stale result whose source status requires reindexing.");
15584
+ return false;
15585
+ }
15586
+ return true;
15587
+ });
15588
+ const results = filtered.map((result) => rerank(result, terms)).sort((a, b) => b.score - a.score || a.id.localeCompare(b.id)).slice(0, search.limit);
15589
+ const citations = results.map(citationFor);
15590
+ const excerpts = results.map((result, index) => excerptFor(result, citations[index], contextChars)).filter((entry) => Boolean(entry));
15591
+ for (const result of results) {
15592
+ if (result.provenance && "read_only" in result.provenance && result.provenance.read_only) {
15593
+ permissionNotes.add("All source-backed excerpts are read-only and citation-required.");
15594
+ }
15595
+ if (result.rerank.freshness_score >= 0.85) {
15596
+ freshnessNotes.add("Fresh source revision/hash or artifact hash is present for top context.");
15597
+ }
15598
+ }
15599
+ return {
15600
+ query: search.query,
15601
+ normalized_query: normalizeQuery(search.query),
15602
+ created_at: new Date().toISOString(),
15603
+ mode: search.mode,
15604
+ warnings,
15605
+ search_counts: search.counts,
15606
+ results,
15607
+ citations,
15608
+ excerpts,
15609
+ graph: loadGraphEvidence(options.dbPath, results),
15610
+ notes: {
15611
+ permissions: Array.from(permissionNotes),
15612
+ freshness: Array.from(freshnessNotes)
15613
+ }
15614
+ };
15615
+ }
15616
+
15617
+ // src/agent.ts
15618
+ function estimateTokens(text) {
15619
+ const words = text.trim().split(/\s+/).filter(Boolean).length;
15620
+ return Math.max(1, Math.ceil(words * 1.25));
15621
+ }
15622
+ function citationLabel(index) {
15623
+ return `C${index + 1}`;
15624
+ }
15625
+ function localAnswer(prompt, context) {
15626
+ if (context.excerpts.length === 0) {
15627
+ return `No indexed knowledge matched the prompt: ${prompt}`;
15628
+ }
15629
+ const lines = [
15630
+ `Found ${context.excerpts.length} relevant knowledge excerpt(s) for: ${prompt}`,
15631
+ "",
15632
+ ...context.excerpts.slice(0, 5).map((excerpt, index) => {
15633
+ const citation = context.citations.find((entry) => entry.id === excerpt.citation_id);
15634
+ const ref = citation?.source_ref ?? citation?.source_uri ?? citation?.artifact_path ?? citation?.artifact_uri ?? "unknown source";
15635
+ return `[${citationLabel(index)}] ${excerpt.text} (${ref})`;
15636
+ })
15637
+ ];
15638
+ return lines.join(`
15639
+ `);
15640
+ }
15641
+ function promptForModel(prompt, context) {
15642
+ const citations = context.citations.map((citation, index) => ({
15643
+ id: citationLabel(index),
15644
+ source_ref: citation.source_ref,
15645
+ source_uri: citation.source_uri,
15646
+ artifact_path: citation.artifact_path,
15647
+ revision: citation.revision,
15648
+ hash: citation.hash,
15649
+ quote: citation.quote
15650
+ }));
15651
+ const excerpts = context.excerpts.map((excerpt, index) => ({
15652
+ id: citationLabel(index),
15653
+ kind: excerpt.kind,
15654
+ text: excerpt.text,
15655
+ score: excerpt.score
15656
+ }));
15657
+ return [
15658
+ `Prompt: ${prompt}`,
15659
+ "",
15660
+ "Use only the provided context. Cite claims with citation ids like [C1]. If context is insufficient, say what is missing.",
15661
+ "",
15662
+ `Context excerpts:
15663
+ ${JSON.stringify(excerpts, null, 2)}`,
15664
+ "",
15665
+ `Citations:
15666
+ ${JSON.stringify(citations, null, 2)}`
15667
+ ].join(`
15668
+ `);
15669
+ }
15670
+ function proposedUpdates(prompt, context) {
15671
+ if (context.citations.length === 0)
15672
+ return [];
15673
+ return [{
15674
+ kind: "answer_note",
15675
+ title: prompt.length > 80 ? `${prompt.slice(0, 77)}...` : prompt,
15676
+ citations: context.citations.map((citation) => citation.id),
15677
+ requires_approval: true
15678
+ }];
15679
+ }
15680
/**
 * Insert a new row of type "knowledge-prompt" into the `runs` table.
 *
 * Opens the knowledge database at `dbPath`, writes the row, and always closes
 * the connection. `input.now` is used for both `created_at` and `updated_at`.
 *
 * @param {string} dbPath - Path passed to `openKnowledgeDb`.
 * @param {{runId: string, prompt: string, status: string, provider: string,
 *   model: string, metadata: object, now: string}} input - Row values.
 */
function insertRun(dbPath, input) {
  const sql = `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
      VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`;
  const db = openKnowledgeDb(dbPath);
  try {
    const { runId, prompt, status, provider, model, metadata, now } = input;
    db.run(sql, [
      runId,
      "knowledge-prompt",
      prompt,
      status,
      provider,
      model,
      JSON.stringify(metadata),
      now,
      now
    ]);
  } finally {
    db.close();
  }
}
15699
/**
 * Append an event row to `run_events` for the given run.
 *
 * The event id is `evt_` followed by a fresh random UUID. The database
 * connection is opened for this single insert and always closed afterwards.
 *
 * @param {string} dbPath - Path passed to `openKnowledgeDb`.
 * @param {{runId: string, level: string, event: string, metadata: object,
 *   now: string}} input - Event values.
 */
function addRunEvent(dbPath, input) {
  const sql = `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
      VALUES (?, ?, ?, ?, ?, ?)`;
  const db = openKnowledgeDb(dbPath);
  try {
    const eventId = `evt_${randomUUID3()}`;
    const { runId, level, event, metadata, now } = input;
    db.run(sql, [eventId, runId, level, event, JSON.stringify(metadata), now]);
  } finally {
    db.close();
  }
}
15715
/**
 * Overwrite the status, provider, model, metadata and `updated_at` of one run.
 *
 * The metadata column is replaced (not merged) with the JSON form of
 * `input.metadata`. The connection is always closed afterwards.
 *
 * @param {string} dbPath - Path passed to `openKnowledgeDb`.
 * @param {{runId: string, status: string, provider: string, model: string,
 *   metadata: object, now: string}} input - New values; `runId` selects the row.
 */
function updateRun(dbPath, input) {
  const sql = `UPDATE runs
      SET status = ?, provider = ?, model = ?, metadata_json = ?, updated_at = ?
      WHERE id = ?`;
  const db = openKnowledgeDb(dbPath);
  try {
    const { status, provider, model, metadata, now, runId } = input;
    db.run(sql, [status, provider, model, JSON.stringify(metadata), now, runId]);
  } finally {
    db.close();
  }
}
15732
/**
 * Record token and cost usage for a run through `recordProviderUsage`.
 *
 * @param {string} dbPath - Path passed to `openKnowledgeDb`.
 * @param {string} runId - Run the usage belongs to.
 * @param {{input_tokens: number, output_tokens: number, cost_usd: number}} usage
 * @param {string} provider - Provider name stored with the usage record.
 * @param {string} model - Model name stored with the usage record.
 * @param {string} now - Timestamp stored as `created_at`.
 * @param {object} [metadata={}] - Extra metadata stored with the record.
 */
function recordUsage(dbPath, runId, usage, provider, model, now, metadata = {}) {
  const db = openKnowledgeDb(dbPath);
  try {
    const { input_tokens, output_tokens, cost_usd } = usage;
    const record = {
      run_id: runId,
      provider,
      model,
      input_tokens,
      output_tokens,
      cost_usd,
      metadata,
      created_at: now
    };
    recordProviderUsage(db, record);
  } finally {
    // Close the connection whether or not the usage write succeeded.
    db.close();
  }
}
15749
/**
 * Answer a prompt from the local knowledge base and record the run.
 *
 * Steps: validate the prompt, insert a `runs` row, retrieve context, draft a
 * local answer, optionally generate an answer with a language model
 * (`options.generate`), then record events, usage and the final run status.
 * No durable wiki writes are performed here; proposed updates are only
 * returned.
 *
 * Failure handling: if context retrieval or answer generation throws, an
 * error event is recorded, the run is marked "failed", and the original error
 * is rethrown. (Retrieval failures previously left the run row stuck in its
 * initial status.)
 *
 * @param {object} options
 * @param {string} options.prompt - Prompt text; must be non-empty after trimming.
 * @param {string} options.dbPath - Knowledge database path.
 * @param {boolean} [options.generate] - Generate an answer with a model.
 * @param {boolean} [options.fake] - With `generate`, produce a fake answer
 *   instead of calling a model.
 * @param {boolean} [options.approveWrite] - Recorded approval flag.
 * @param {string} [options.modelRef] - Model reference; defaults to "default".
 * @param {Date} [options.now] - Clock override used for every timestamp.
 * @returns {Promise<object>} Run summary: answer, context, citations,
 *   proposed updates, write policy, usage and warnings.
 * @throws {Error} When the prompt is empty or missing, or when retrieval or
 *   generation fails.
 */
async function runKnowledgePrompt(options) {
  // A missing or non-string prompt is reported as "required" rather than as a TypeError.
  const prompt = typeof options.prompt === "string" ? options.prompt.trim() : "";
  if (!prompt)
    throw new Error("Knowledge prompt is required.");
  const now = (options.now ?? new Date()).toISOString();
  const runId = `run_${randomUUID3()}`;
  const modelRef = resolveModelRef(options.modelRef ?? "default", options.config);
  const parsed = parseModelRef(modelRef);
  const initialProvider = options.generate ? parsed.provider : "local";
  const initialModel = options.generate ? parsed.model : "context-draft";

  // Record an error event and flip the run to "failed" in one place.
  const markRunFailed = (event, error, failedProvider, failedModel) => {
    const message = error instanceof Error ? error.message : String(error);
    addRunEvent(options.dbPath, {
      runId,
      level: "error",
      event,
      metadata: { message },
      now
    });
    updateRun(options.dbPath, {
      runId,
      status: "failed",
      provider: failedProvider,
      model: failedModel,
      metadata: {
        generated: false,
        error: message
      },
      now
    });
  };

  migrateKnowledgeDb(options.dbPath);
  insertRun(options.dbPath, {
    runId,
    prompt,
    status: options.generate ? "running" : "dry_run",
    provider: initialProvider,
    model: initialModel,
    metadata: {
      semantic: options.semantic === true || options.fake === true || Boolean(options.modelRef),
      approve_write: options.approveWrite === true,
      generated: options.generate === true
    },
    now
  });

  // Strip the prompt-only options; everything else is forwarded to retrieval.
  const { prompt: _prompt, generate: _generate, approveWrite: _approveWrite, now: _now, ...retrievalOptions } = options;
  let context;
  try {
    context = await retrieveKnowledgeContext({
      ...retrievalOptions,
      query: prompt
    });
  } catch (error) {
    // The run row already exists, so it must not be left in "running"/"dry_run".
    markRunFailed("context_retrieval_failed", error, initialProvider, initialModel);
    throw error;
  }
  addRunEvent(options.dbPath, {
    runId,
    level: "info",
    event: "context_retrieved",
    metadata: {
      results: context.results.length,
      citations: context.citations.length,
      warnings: context.warnings
    },
    now
  });

  // Default outcome: a locally drafted answer with estimated token usage.
  let answer = localAnswer(prompt, context);
  let generated = false;
  let provider = "local";
  let model = "context-draft";
  let usage = {
    input_tokens: estimateTokens(prompt) + context.excerpts.reduce((sum, excerpt) => sum + estimateTokens(excerpt.text), 0),
    output_tokens: estimateTokens(answer),
    cost_usd: 0
  };
  const warnings = [...context.warnings];

  if (options.generate) {
    try {
      if (options.fake) {
        generated = true;
        provider = parsed.provider;
        model = parsed.model;
        answer = `Fake generated answer for: ${prompt}\n\n${answer}`;
      } else {
        const { generateText } = await import("ai");
        const languageModel = await languageModelFor(modelRef, {
          config: options.config,
          env: options.env
        });
        const result = await generateText({
          model: languageModel,
          system: "You answer company knowledge-base prompts using only provided context and citation ids.",
          prompt: promptForModel(prompt, context)
        });
        generated = true;
        provider = parsed.provider;
        model = parsed.model;
        answer = result.text;
        const normalized = normalizeAiSdkUsage({
          provider,
          model,
          usage: result.usage,
          providerMetadata: result.providerMetadata
        });
        usage = {
          input_tokens: normalized.input_tokens,
          output_tokens: normalized.output_tokens,
          cost_usd: normalized.cost_usd
        };
      }
    } catch (error) {
      markRunFailed("answer_generation_failed", error, parsed.provider, parsed.model);
      throw error;
    }
  }

  const updates = proposedUpdates(prompt, context);
  const writePolicy = {
    approved: options.approveWrite === true,
    durable_writes_performed: false,
    reason: options.approveWrite ? "Approval flag recorded; durable wiki writing is deferred to the wiki compile task." : "Dry-run mode: proposed wiki updates require approval before durable writes."
  };
  addRunEvent(options.dbPath, {
    runId,
    level: "info",
    event: generated ? "answer_generated" : "answer_drafted",
    metadata: {
      provider,
      model,
      proposed_updates: updates.length,
      durable_writes_performed: false
    },
    now
  });
  recordUsage(options.dbPath, runId, usage, provider, model, now, {
    generated,
    citations: context.citations.length
  });
  updateRun(options.dbPath, {
    runId,
    status: generated ? "completed" : "dry_run",
    provider,
    model,
    metadata: {
      generated,
      citations: context.citations.length,
      proposed_updates: updates.length,
      approve_write: options.approveWrite === true
    },
    now
  });
  return {
    run_id: runId,
    prompt,
    generated,
    provider,
    model,
    answer,
    context,
    citations: context.citations,
    proposed_wiki_updates: updates,
    write_policy: writePolicy,
    usage,
    warnings
  };
}
15905
+
15906
+ // src/outbox-consume.ts
15907
+ import { createHash as createHash4, randomUUID as randomUUID5 } from "crypto";
15908
+ import { existsSync as existsSync4, readFileSync as readFileSync4 } from "fs";
15909
+ import { basename } from "path";
15910
+
15911
+ // src/safety.ts
15912
+ import { createHash as createHash3, randomUUID as randomUUID4 } from "crypto";
15913
+ import { relative as relative2, resolve as resolve2, sep as sep2 } from "path";
15914
/**
 * Report whether an environment variable is switched on.
 *
 * Only the exact (case-sensitive) values "1", "true" and "yes" count as
 * enabled; anything else, including an unset variable, is disabled.
 *
 * @param {string} name - Environment variable name.
 * @returns {boolean}
 */
function envEnabled(name) {
  const enabledValues = ["1", "true", "yes"];
  return enabledValues.includes(process.env[name]);
}
15918
/**
 * Derive the effective safety policy from config, workspace and environment.
 *
 * Allowed S3 buckets are the union of `safety.network.allowed_s3_buckets`,
 * the configured S3 storage bucket (when storage type is "s3"), and the
 * comma-separated `HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS` variable, returned
 * sorted. Network toggles fall back to environment flags when unset in
 * config; redaction and write approval default to enabled.
 *
 * @param {object} config2 - Loaded configuration (`mode`, `storage`, optional `safety`).
 * @param {object} workspace - Workspace directories that may be written to.
 * @returns {object} Resolved policy object.
 */
function resolveSafetyPolicy(config2, workspace) {
  const safety = config2.safety;
  const buckets = new Set(safety?.network?.allowed_s3_buckets ?? []);
  if (config2.storage.type === "s3" && config2.storage.s3?.bucket) {
    buckets.add(config2.storage.s3.bucket);
  }
  const envBuckets = process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS;
  if (envBuckets) {
    envBuckets
      .split(",")
      .map((name) => name.trim())
      .filter(Boolean)
      .forEach((name) => {
        buckets.add(name);
      });
  }
  const writableDirs = [
    workspace.home,
    workspace.artifactsDir,
    workspace.cacheDir,
    workspace.exportsDir,
    workspace.indexesDir,
    workspace.logsDir,
    workspace.runsDir,
    workspace.schemasDir,
    workspace.wikiDir
  ];
  return {
    mode: config2.mode,
    allowWriteRoots: writableDirs.map((dir) => resolve2(dir)),
    readOnlySourceAccess: true,
    network: {
      webSearchEnabled: safety?.network?.web_search_enabled ?? envEnabled("HASNA_KNOWLEDGE_WEB_SEARCH"),
      s3ReadsEnabled: safety?.network?.s3_reads_enabled ?? envEnabled("HASNA_KNOWLEDGE_ALLOW_S3_READS"),
      allowedS3Buckets: [...buckets].sort()
    },
    redaction: {
      enabled: safety?.redaction?.enabled ?? true
    },
    approvals: {
      generatedWritesRequireApproval: safety?.approvals?.generated_writes_require_approval ?? true
    }
  };
}
15955
/**
 * Report whether `target` is `root` itself or lies somewhere beneath it.
 *
 * Both arguments are expected to be absolute paths; the caller in this file
 * resolves them first.
 *
 * @param {string} root - Directory that bounds the allowed area.
 * @param {string} target - Path to test.
 * @returns {boolean} True when `target` does not escape `root`.
 */
function isInside(root, target) {
  const rel = relative2(root, target);
  if (rel === "")
    return true;
  // Only a leading ".." path *segment* escapes the root. A name that merely
  // begins with two dots (e.g. "..cache") is an ordinary entry inside it.
  if (rel === ".." || rel.startsWith(`..${sep2}`))
    return false;
  // When no relative route exists (different Windows drive or UNC root),
  // path.relative returns the absolute target. resolve() leaves such a path
  // unchanged, whereas a genuinely relative path always changes.
  return resolve2(rel) !== rel;
}
15959
/**
 * Throw unless `targetPath` resolves to a location inside one of the policy's
 * allowed write roots.
 *
 * @param {string} targetPath - Path about to be written.
 * @param {{allowWriteRoots: string[]}} policy - Resolved safety policy.
 * @throws {Error} When the resolved path is outside every allowed root.
 */
function assertWriteAllowed(targetPath, policy) {
  const absoluteTarget = resolve2(targetPath);
  for (const root of policy.allowWriteRoots) {
    if (isInside(root, absoluteTarget)) {
      return;
    }
  }
  throw new Error(`Safety policy denied write outside .hasna/apps/knowledge: ${targetPath}`);
}
15965
/**
 * Throw unless the policy permits reading the given S3 URI.
 *
 * The bucket is taken from the URI's hostname. Reads must be enabled and the
 * bucket must appear in the policy's allow-list.
 *
 * @param {string} uri - S3 URI to check; it is parsed with `new URL`.
 * @param {{network: {s3ReadsEnabled: boolean, allowedS3Buckets: string[]}}} policy
 * @throws {Error} When S3 reads are disabled or the bucket is not allowed.
 */
function assertS3ReadAllowed(uri, policy) {
  const { hostname: bucket } = new URL(uri);
  const { network } = policy;
  if (!network.s3ReadsEnabled) {
    throw new Error("Safety policy denied S3 read. Set safety.network.s3_reads_enabled=true or HASNA_KNOWLEDGE_ALLOW_S3_READS=1.");
  }
  const bucketAllowed = network.allowedS3Buckets.includes(bucket);
  if (!bucketAllowed) {
    throw new Error(`Safety policy denied S3 bucket "${bucket}". Add it to safety.network.allowed_s3_buckets or HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.`);
  }
}
15975
/**
 * Throw unless the policy has web search enabled.
 *
 * @param {{network: {webSearchEnabled: boolean}}} policy - Resolved safety policy.
 * @throws {Error} When web search is disabled.
 */
function assertWebSearchAllowed(policy) {
  const { webSearchEnabled } = policy.network;
  if (webSearchEnabled) {
    return;
  }
  throw new Error("Safety policy denied web search. Set safety.network.web_search_enabled=true or HASNA_KNOWLEDGE_WEB_SEARCH=1.");
}
15980
// Secret detectors, applied in array order. Order matters: a later pattern
// only sees text that earlier patterns have already rewritten. The Anthropic
// pattern must run before the generic OpenAI one, because every "sk-ant-..."
// key also matches /sk-[A-Za-z0-9_-]{20,}/ and would otherwise always be
// labelled as an OpenAI key.
var REDACTION_PATTERNS = [
  { type: "private_key_block", severity: "high", regex: /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/g, replacement: "[REDACTED:private_key_block]" },
  { type: "secret_assignment", severity: "high", regex: /\b(?:api[_-]?key|secret|token|password)\s*[:=]\s*['"]?[^'"\s]{8,}/gi, replacement: "[REDACTED:secret_assignment]" },
  { type: "anthropic_api_key", severity: "high", regex: /\bsk-ant-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:anthropic_api_key]" },
  { type: "openai_api_key", severity: "high", regex: /\bsk-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:openai_api_key]" },
  { type: "aws_access_key_id", severity: "high", regex: /\bA(?:KIA|SIA)[A-Z0-9]{16}\b/g, replacement: "[REDACTED:aws_access_key_id]" }
];
/**
 * Replace recognised secrets in `text` with `[REDACTED:<type>]` markers.
 *
 * Patterns are applied one after another to the progressively redacted text,
 * so each finding's `start`/`end` are offsets into the text as it stood when
 * that pattern ran, not necessarily into the original input.
 *
 * @param {string} text - Text to scan.
 * @param {{redaction: {enabled: boolean}}} [policy] - When given and
 *   redaction is disabled, the text is returned untouched.
 * @returns {{text: string, findings: Array<{type: string, severity: string,
 *   start: number, end: number}>}}
 */
function redactSecrets(text, policy) {
  if (policy && !policy.redaction.enabled)
    return { text, findings: [] };
  let output = text;
  const findings = [];
  for (const pattern of REDACTION_PATTERNS) {
    output = output.replace(pattern.regex, (match, ...args) => {
      // Replacer arguments end with (offset, wholeString) when the pattern has
      // no named groups; fall back to a search if that position is not a number.
      const candidate = args.at(-2);
      const offset = typeof candidate === "number" ? candidate : output.indexOf(match);
      findings.push({
        type: pattern.type,
        severity: pattern.severity,
        start: Math.max(0, offset),
        end: Math.max(0, offset + match.length)
      });
      return pattern.replacement;
    });
  }
  return { text: output, findings };
}
16006
/**
 * Create an id for an audit event: `audit_` plus the first 24 hex characters
 * of a SHA-256 digest over the event fields and a fresh random UUID.
 *
 * Because a random UUID is mixed in, two calls with identical input yield
 * different ids.
 *
 * @param {{event_type: string, action: string, target_uri?: string,
 *   created_at?: string, metadata?: object}} input - Event fields.
 * @returns {string} The audit event id.
 */
function auditId(input) {
  const targetUri = input.target_uri ?? "";
  const createdAt = input.created_at ?? "";
  const metadataJson = JSON.stringify(input.metadata ?? {});
  const material = `${input.event_type}\x00${input.action}\x00${targetUri}\x00${createdAt}\x00${metadataJson}\x00${randomUUID4()}`;
  const digest = createHash3("sha256").update(material).digest("hex");
  return `audit_${digest.slice(0, 24)}`;
}
16009
/**
 * Insert one row into `audit_events` and return its generated id.
 *
 * `created_at` defaults to the current time, `target_uri` to NULL, and
 * `metadata` to an empty object.
 *
 * @param {object} db - Open database handle exposing `run(sql, params)`.
 * @param {{event_type: string, action: string, decision: string,
 *   target_uri?: string, metadata?: object, created_at?: string}} input
 * @returns {string} Id of the inserted audit event.
 */
function recordAuditEvent(db, input) {
  const createdAt = input.created_at ?? new Date().toISOString();
  const id = auditId({ ...input, created_at: createdAt });
  const sql = `INSERT INTO audit_events (id, event_type, action, target_uri, decision, metadata_json, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)`;
  const row = [
    id,
    input.event_type,
    input.action,
    input.target_uri ?? null,
    input.decision,
    JSON.stringify(input.metadata ?? {}),
    createdAt
  ];
  db.run(sql, row);
  return id;
}
16024
/**
 * Insert one `redaction_findings` row per finding and return how many
 * findings were supplied.
 *
 * Each row gets a fresh `redact_<uuid>` id. The finding's `start`/`end`
 * offsets are merged into the stored metadata on top of `input.metadata`.
 *
 * @param {object} db - Open database handle exposing `run(sql, params)`.
 * @param {{findings: Array<{type: string, severity: string, start: number,
 *   end: number}>, source_uri?: string, run_id?: string, metadata?: object,
 *   created_at?: string}} input
 * @returns {number} `input.findings.length`.
 */
function recordRedactionFindings(db, input) {
  const createdAt = input.created_at ?? new Date().toISOString();
  const sql = `INSERT INTO redaction_findings (id, source_uri, run_id, severity, finding_type, metadata_json, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)`;
  for (const { severity, type, start, end } of input.findings) {
    const metadataJson = JSON.stringify({ ...input.metadata ?? {}, start, end });
    db.run(sql, [
      `redact_${randomUUID4()}`,
      input.source_uri ?? null,
      input.run_id ?? null,
      severity,
      type,
      metadataJson,
      createdAt
    ]);
  }
  return input.findings.length;
}
16040
+
16041
+ // src/outbox-consume.ts
16042
/**
 * Derive a deterministic id: `<prefix>_` plus the first 20 hex characters of
 * the SHA-256 digest of `value`.
 *
 * @param {string} prefix - Id namespace, e.g. "src" or "rev".
 * @param {string} value - Text to hash.
 * @returns {string} The stable id.
 */
function stableId3(prefix, value) {
  const hasher = createHash4("sha256");
  hasher.update(value);
  const shortDigest = hasher.digest("hex").slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
16045
/**
 * Return `value` when it is a non-null, non-array object; otherwise `undefined`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {object|undefined}
 */
function asObject(value) {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value;
}
16048
/**
 * Return `value` when it is a non-empty string; otherwise `undefined`.
 *
 * @param {unknown} value - Candidate value.
 * @returns {string|undefined}
 */
function asString(value) {
  if (typeof value !== "string" || value.length === 0) {
    return undefined;
  }
  return value;
}
16051
/**
 * Work out the source reference for an outbox event.
 *
 * Preference order:
 *   1. an explicit `source_ref`, `source_uri` or `uri`;
 *   2. `file_id` (plus `revision_id`/`revision` when present), rendered as
 *      `open-files://file/<id>[/revision/<rev>]`;
 *   3. `source_id` together with `path`, rendered as
 *      `open-files://source/<id>/path/<path>`.
 * Ids, revisions and paths are URI-component encoded.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string} The source reference.
 * @throws {Error} When none of the identifying fields are present.
 */
function buildSourceRef(event) {
  const explicitRef = asString(event.source_ref) ?? asString(event.source_uri) ?? asString(event.uri);
  if (explicitRef) {
    return explicitRef;
  }

  const fileId = asString(event.file_id);
  if (fileId) {
    const revision = asString(event.revision_id) ?? asString(event.revision);
    const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
    if (!revision) {
      return fileRef;
    }
    return `${fileRef}/revision/${encodeURIComponent(revision)}`;
  }

  const sourceId = asString(event.source_id);
  const path = asString(event.path);
  if (!sourceId || !path) {
    throw new Error("Outbox event is missing source_ref, file_id, or source_id/path.");
  }
  return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
}
16068
/**
 * Return the source URI without a trailing `/revision/<id>` segment.
 *
 * The segment is stripped only for parsed open-files *file* references that
 * carry a revision id; every other reference is returned unchanged.
 *
 * @param {string} sourceRef - Full source reference.
 * @param {{kind: string, entity?: string, revision_id?: string}} parsed -
 *   Parsed form of `sourceRef`.
 * @returns {string} Revision-independent source URI.
 */
function baseSourceUri(sourceRef, parsed) {
  const isRevisionedFile = parsed.kind === "open-files" && parsed.entity === "file" && Boolean(parsed.revision_id);
  if (!isRevisionedFile) {
    return sourceRef;
  }
  return sourceRef.replace(/\/revision\/[^/]+$/, "");
}
16074
/**
 * Pick the content hash from an event: the first non-empty string among
 * `hash`, `checksum` and `sha256`, or `null` when none is present.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string|null}
 */
function hashFromEvent(event) {
  for (const field of ["hash", "checksum", "sha256"]) {
    const candidate = asString(event[field]);
    if (candidate != null) {
      return candidate;
    }
  }
  return null;
}
16077
/**
 * Pick the revision identifier for an event.
 *
 * Preference order: the event's `revision_id`, `revision`, `version_id`; then
 * the revision id embedded in a parsed open-files reference; then the content
 * hash; finally `null`.
 *
 * @param {object} event - Raw outbox event.
 * @param {{kind: string, revision_id?: string}} parsed - Parsed source reference.
 * @param {string|null} hash2 - Content hash of the event, if any.
 * @returns {string|null}
 */
function revisionFromEvent(event, parsed, hash2) {
  for (const field of ["revision_id", "revision", "version_id"]) {
    const candidate = asString(event[field]);
    if (candidate != null) {
      return candidate;
    }
  }
  const embedded = parsed.kind === "open-files" ? parsed.revision_id : undefined;
  if (embedded != null) {
    return embedded;
  }
  return hash2 ?? null;
}
16080
/**
 * Determine the lower-cased event type of an outbox event.
 *
 * Uses the first non-empty string among `event`, `type`, `action` and
 * `change_type`, falling back to "changed".
 *
 * @param {object} event - Raw outbox event.
 * @returns {string} Lower-case event type.
 */
function eventType(event) {
  let label = "changed";
  for (const field of ["event", "type", "action", "change_type"]) {
    const candidate = asString(event[field]);
    if (candidate != null) {
      label = candidate;
      break;
    }
  }
  return label.toLowerCase();
}
16083
/**
 * Choose a display title for an event: its `title`, else its `name`, else
 * the base name of its `path`, else `null`.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string|null}
 */
function titleFromEvent(event) {
  const path = asString(event.path);
  const explicit = asString(event.title) ?? asString(event.name);
  if (explicit != null) {
    return explicit;
  }
  return path ? basename(path) : null;
}
16087
/**
 * Convert a raw outbox event into the normalized shape used by the consumer.
 *
 * @param {object} event - Raw outbox event; kept on the result as `raw`.
 * @param {string} now - Timestamp used for `updatedAt` when the event has no
 *   `updated_at`.
 * @returns {object} Normalized event with `eventType`, `sourceRef`,
 *   `sourceUri`, `kind`, `title`, `revision`, `hash`, `status` (lower-cased or
 *   null), `updatedAt` and `acl` (from `permissions`, else `acl`, else
 *   undefined).
 * @throws {Error} Propagated from `buildSourceRef` when the event cannot be
 *   identified.
 */
function normalizeEvent(event, now) {
  const sourceRef = buildSourceRef(event);
  const parsed = parseSourceRef(sourceRef);
  const hash2 = hashFromEvent(event);

  const normalizedType = eventType(event);
  const sourceUri = baseSourceUri(sourceRef, parsed);
  const title = titleFromEvent(event);
  const revision = revisionFromEvent(event, parsed, hash2);
  const rawStatus = asString(event.status);

  return {
    raw: event,
    eventType: normalizedType,
    sourceRef,
    sourceUri,
    kind: parsed.kind,
    title,
    revision,
    hash: hash2,
    status: rawStatus?.toLowerCase() ?? null,
    updatedAt: asString(event.updated_at) ?? now,
    acl: event.permissions ?? event.acl ?? undefined
  };
}
16105
/**
 * Parse the text of an outbox file into an array of event objects.
 *
 * Accepted layouts:
 *   - a JSON array of event objects;
 *   - a JSON object with an `events` array (envelope);
 *   - a single JSON event object (compact or pretty-printed);
 *   - JSON Lines: one event object per non-blank line.
 * Blank input yields an empty array.
 *
 * Only a failure to parse the whole text as one JSON document triggers the
 * JSON Lines fallback. Validation errors raised by this function are never
 * reinterpreted as a format mismatch, so their messages reach the caller.
 *
 * @param {string} text - Raw outbox contents.
 * @returns {object[]} Parsed events (not yet normalized).
 * @throws {SyntaxError} When the text is not valid JSON / JSON Lines.
 * @throws {Error} When an entry is not a plain object.
 */
function parseOutboxText(text) {
  const trimmed = text.trim();
  if (!trimmed)
    return [];

  // Return `entry` when it is a plain object, otherwise fail with `message`.
  const requireEvent = (entry, message) => {
    const event = asObject(entry);
    if (!event)
      throw new Error(message);
    return event;
  };
  const nonBlankLines = () => trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
  const parseLines = (lines) => lines.map((line) => requireEvent(JSON.parse(line), "Outbox JSONL entries must be objects."));

  if (trimmed.startsWith("[")) {
    const parsed = JSON.parse(trimmed);
    if (!Array.isArray(parsed))
      throw new Error("Outbox array parse failed.");
    return parsed.map((entry) => requireEvent(entry, "Outbox array entries must be objects."));
  }

  if (trimmed.startsWith("{")) {
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch (error) {
      // Not one JSON document. With several lines this is JSON Lines;
      // with a single line it is simply malformed JSON.
      const lines = nonBlankLines();
      if (lines.length <= 1)
        throw error;
      return parseLines(lines);
    }
    const envelope = requireEvent(parsed, "Outbox object parse failed.");
    if (Array.isArray(envelope.events))
      return envelope.events.map((entry) => requireEvent(entry, "Outbox events entries must be objects."));
    // A lone object is a single event, whichever identifying fields it uses
    // (source_ref, source_uri, uri, file_id or source_id/path).
    return [envelope];
  }

  return parseLines(nonBlankLines());
}
16155
/**
 * Download an S3 object and return its body as text.
 *
 * The bucket comes from the URI's hostname and the key from its decoded path.
 * When a safety policy is given, the read is checked against it before the
 * AWS SDK modules are loaded. Region, credentials profile and retry count are
 * taken from the configured S3 storage only when it refers to the same bucket.
 *
 * @param {string} uri - `s3://bucket/key` URI.
 * @param {object} [config2] - Loaded configuration.
 * @param {object} [safetyPolicy] - Resolved safety policy.
 * @returns {Promise<string>} Object contents, or "" when the response has no body.
 * @throws {Error} When the URI lacks a bucket or key, or the policy denies the read.
 */
async function readS3Text(uri, config2, safetyPolicy) {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ""));
  if (!bucket || !key) {
    throw new Error(`Invalid S3 outbox URI: ${uri}`);
  }
  if (safetyPolicy) {
    assertS3ReadAllowed(uri, safetyPolicy);
  }

  // Load the AWS SDK lazily so it is only required when S3 is actually used.
  const [s3Module, credentialsModule] = await Promise.all([
    import("@aws-sdk/client-s3"),
    import("@aws-sdk/credential-providers")
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialsModule;

  const usesConfiguredBucket = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket;
  const s3Config = usesConfiguredBucket ? config2.storage.s3 : undefined;
  const credentials = s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined;
  const client = new S3Client({
    region: s3Config?.region,
    credentials,
    maxAttempts: s3Config?.max_attempts
  });

  const request = new GetObjectCommand({ Bucket: bucket, Key: key });
  const response = await client.send(request);
  if (!response.Body) {
    return "";
  }
  return await response.Body.transformToString();
}
16178
/**
 * Read the outbox contents from an `s3://` URI or a local file path.
 *
 * @param {string} input - `s3://...` URI or local path.
 * @param {object} [config2] - Loaded configuration, forwarded for S3 reads.
 * @param {object} [safetyPolicy] - Resolved safety policy, forwarded for S3 reads.
 * @returns {Promise<string>} The outbox text (local files are read as UTF-8).
 * @throws {Error} When a local path does not exist.
 */
async function readOutboxInput(input, config2, safetyPolicy) {
  const isS3Uri = input.startsWith("s3://");
  if (isS3Uri) {
    return readS3Text(input, config2, safetyPolicy);
  }
  if (existsSync4(input)) {
    return readFileSync4(input, "utf8");
  }
  throw new Error(`Outbox not found: ${input}`);
}
16185
/**
 * Shallow-merge `patch` over a stored JSON object and return the result as a
 * JSON string.
 *
 * An empty, unparsable or non-object `existing` value is treated as `{}`.
 *
 * @param {string|null|undefined} existing - Stored JSON text.
 * @param {object} patch - Keys to add or overwrite.
 * @returns {string} JSON text of the merged object.
 */
function mergeJson(existing, patch) {
  const readBase = () => {
    try {
      return asObject(JSON.parse(existing)) ?? {};
    } catch {
      // Corrupt stored JSON is deliberately discarded; the patch still applies.
      return {};
    }
  };
  const base = existing ? readBase() : {};
  return JSON.stringify({ ...base, ...patch });
}
16196
+ function ensureSource(db, event, now) {
16197
+ const id = stableId3("src", event.sourceUri);
15686
16198
  db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
15687
16199
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
15688
16200
  ON CONFLICT(uri) DO UPDATE SET
15689
16201
  kind = excluded.kind,
15690
- title = excluded.title,
15691
- metadata_json = excluded.metadata_json,
15692
- acl_json = excluded.acl_json,
16202
+ title = COALESCE(excluded.title, sources.title),
15693
16203
  updated_at = excluded.updated_at`, [
15694
- sourceId,
15695
- item.sourceUri,
15696
- item.kind,
15697
- item.title,
15698
- JSON.stringify(item.metadata),
15699
- JSON.stringify(item.acl ?? {}),
16204
+ id,
16205
+ event.sourceUri,
16206
+ event.kind,
16207
+ event.title,
16208
+ JSON.stringify({ source_ref: event.sourceRef, source_uri: event.sourceUri, status: event.status, last_outbox_event: event.eventType }),
16209
+ JSON.stringify(event.acl ?? {}),
15700
16210
  now,
15701
- item.updatedAt
16211
+ event.updatedAt
15702
16212
  ]);
15703
- const row = db.query("SELECT id FROM sources WHERE uri = ?").get(item.sourceUri);
16213
+ const row = db.query("SELECT id, metadata_json, acl_json FROM sources WHERE uri = ?").get(event.sourceUri);
15704
16214
  if (!row)
15705
- throw new Error(`Failed to upsert source: ${item.sourceUri}`);
16215
+ throw new Error(`Failed to upsert source for outbox event: ${event.sourceUri}`);
16216
+ const patch = {
16217
+ source_ref: event.sourceRef,
16218
+ source_uri: event.sourceUri,
16219
+ last_outbox_event: event.eventType,
16220
+ last_outbox_at: event.updatedAt
16221
+ };
16222
+ if (event.status)
16223
+ patch.status = event.status;
16224
+ if (asString(event.raw.path))
16225
+ patch.path = event.raw.path;
16226
+ db.run("UPDATE sources SET metadata_json = ?, acl_json = CASE WHEN ? IS NULL THEN acl_json ELSE ? END, updated_at = ? WHERE id = ?", [
16227
+ mergeJson(row.metadata_json, patch),
16228
+ event.acl === undefined ? null : JSON.stringify(event.acl),
16229
+ event.acl === undefined ? null : JSON.stringify(event.acl),
16230
+ event.updatedAt,
16231
+ row.id
16232
+ ]);
15706
16233
  return row.id;
15707
16234
  }
15708
- function upsertRevision(db, sourceId, item, now) {
15709
- const revisionId = stableId3("rev", `${sourceId}\x00${item.revision}`);
16235
+ function ensureRevision(db, sourceId, event, now) {
16236
+ if (!event.revision)
16237
+ return null;
16238
+ const id = stableId3("rev", `${sourceId}\x00${event.revision}`);
16239
+ const metadata = {
16240
+ source_ref: event.sourceRef,
16241
+ source_uri: event.sourceUri,
16242
+ status: event.status,
16243
+ last_outbox_event: event.eventType,
16244
+ reindex_required: true
16245
+ };
15710
16246
  db.run(`INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
15711
16247
  VALUES (?, ?, ?, ?, ?, ?, ?)
15712
16248
  ON CONFLICT(source_id, revision) DO UPDATE SET
15713
- hash = excluded.hash,
15714
- extracted_text_uri = excluded.extracted_text_uri,
15715
- metadata_json = excluded.metadata_json`, [
15716
- revisionId,
15717
- sourceId,
15718
- item.revision,
15719
- item.hash,
15720
- item.extractedTextUri,
15721
- JSON.stringify(item.metadata),
15722
- now
15723
- ]);
15724
- const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, item.revision);
15725
- if (!row)
15726
- throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
15727
- return row.id;
16249
+ hash = COALESCE(excluded.hash, source_revisions.hash),
16250
+ metadata_json = excluded.metadata_json`, [id, sourceId, event.revision, event.hash, asString(event.raw.extracted_text_ref) ?? null, JSON.stringify(metadata), now]);
16251
+ const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, event.revision);
16252
+ return row?.id ?? null;
15728
16253
  }
15729
- function insertChunks(db, sourceRevisionId, item, now, maxChars, overlapChars, safetyPolicy) {
15730
- if (!item.text || item.status.toLowerCase() === "deleted")
15731
- return { chunksInserted: 0, redactions: 0 };
15732
- const redacted = redactSecrets(item.text, safetyPolicy);
15733
- if (redacted.findings.length > 0) {
15734
- recordRedactionFindings(db, {
15735
- source_uri: item.sourceUri,
15736
- findings: redacted.findings,
15737
- metadata: { source_ref: item.sourceRef, revision: item.revision },
15738
- created_at: now
15739
- });
15740
- recordAuditEvent(db, {
15741
- event_type: "redaction",
15742
- action: "source_text_redact",
15743
- target_uri: item.sourceUri,
15744
- decision: "redacted",
15745
- metadata: { findings: redacted.findings.length, source_ref: item.sourceRef, revision: item.revision },
15746
- created_at: now
15747
- });
16254
+ function revisionIdsForEvent(db, sourceId, event) {
16255
+ if (event.revision) {
16256
+ return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").all(sourceId, event.revision).map((row) => row.id);
15748
16257
  }
15749
- const chunks = chunkText(redacted.text, maxChars, overlapChars);
16258
+ if (event.hash) {
16259
+ return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND hash = ?").all(sourceId, event.hash).map((row) => row.id);
16260
+ }
16261
+ return db.query("SELECT id FROM source_revisions WHERE source_id = ?").all(sourceId).map((row) => row.id);
16262
+ }
16263
+ function invalidateRevision(db, revisionId) {
16264
+ const chunks = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(revisionId);
16265
+ let embeddingsDeleted = 0;
16266
+ let vectorEntriesDeleted = 0;
15750
16267
  for (const chunk of chunks) {
15751
- const chunkId = stableId3("chk", `${sourceRevisionId}\x00${chunk.ordinal}\x00${chunk.text}`);
15752
- const provenance = sourceProvenance({
15753
- source_ref: item.sourceRef,
15754
- source_uri: item.sourceUri,
15755
- source_kind: item.kind,
15756
- source_revision_id: sourceRevisionId,
15757
- revision: item.revision,
15758
- hash: item.hash,
15759
- chunk_id: chunkId,
15760
- start_offset: chunk.startOffset,
15761
- end_offset: chunk.endOffset,
15762
- status: item.status,
15763
- resolver: "open-files-read-only"
15764
- });
15765
- const metadata = withProvenance({
15766
- source_ref: item.sourceRef,
15767
- source_uri: item.sourceUri,
15768
- source_kind: item.kind,
15769
- source_revision_id: sourceRevisionId,
15770
- revision: item.revision,
15771
- hash: item.hash,
15772
- status: item.status,
15773
- path: asString2(item.raw.path) ?? null,
15774
- mime: asString2(item.raw.mime) ?? asString2(item.raw.content_type) ?? null,
15775
- size: asNumber(item.raw.size) ?? null
15776
- }, provenance);
15777
- db.run(`INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
15778
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
15779
- chunkId,
15780
- sourceRevisionId,
15781
- "source",
15782
- chunk.ordinal,
15783
- chunk.text,
15784
- estimateTokenCount(chunk.text),
15785
- chunk.startOffset,
15786
- chunk.endOffset,
15787
- JSON.stringify(metadata),
15788
- now
15789
- ]);
15790
- db.run("INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)", [chunkId, chunk.text, item.title ?? "", item.sourceUri]);
16268
+ const row = db.query("SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?").get(chunk.id);
16269
+ embeddingsDeleted += row?.n ?? 0;
16270
+ const vectorRow = db.query("SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?").get(chunk.id);
16271
+ vectorEntriesDeleted += vectorRow?.n ?? 0;
16272
+ db.run("DELETE FROM vector_index_entries WHERE chunk_id = ?", [chunk.id]);
16273
+ db.run("DELETE FROM chunk_embeddings WHERE chunk_id = ?", [chunk.id]);
16274
+ db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [chunk.id]);
15791
16275
  }
15792
- return { chunksInserted: chunks.length, redactions: redacted.findings.length };
16276
+ db.run("DELETE FROM chunks WHERE source_revision_id = ?", [revisionId]);
16277
+ const revision = db.query("SELECT metadata_json FROM source_revisions WHERE id = ?").get(revisionId);
16278
+ db.run("UPDATE source_revisions SET metadata_json = ? WHERE id = ?", [mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId]);
16279
+ return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
15793
16280
  }
15794
- async function ingestOpenFilesManifest(options) {
15795
- const now = options.now ?? new Date;
15796
- if (options.safetyPolicy)
15797
- assertWriteAllowed(options.dbPath, options.safetyPolicy);
15798
- migrateKnowledgeDb(options.dbPath);
15799
- const text = await readManifestInput(options.input, options.config, options.safetyPolicy);
15800
- const items = parseManifestText(text);
15801
- return ingestOpenFilesManifestItems({
15802
- dbPath: options.dbPath,
15803
- items,
15804
- sourceLabel: options.input,
15805
- safetyPolicy: options.safetyPolicy,
15806
- now,
15807
- maxChunkChars: options.maxChunkChars,
15808
- chunkOverlapChars: options.chunkOverlapChars
15809
- });
16281
/**
 * Report whether an event represents a deletion: either its status is
 * "deleted" or its type is one of delete/deleted/remove/removed.
 *
 * @param {string} eventType2 - Lower-cased event type.
 * @param {string|null} status - Lower-cased status, if any.
 * @returns {boolean}
 */
function isDeleteEvent(eventType2, status) {
  if (status === "deleted") {
    return true;
  }
  switch (eventType2) {
    case "delete":
    case "deleted":
    case "remove":
    case "removed":
      return true;
    default:
      return false;
  }
}
15811
- async function ingestOpenFilesManifestItems(options) {
16284
/**
 * Report whether an event type denotes a move or rename
 * (move, moved, rename, renamed, path_changed).
 *
 * @param {string} eventType2 - Lower-cased event type.
 * @returns {boolean}
 */
function isMoveEvent(eventType2) {
  switch (eventType2) {
    case "move":
    case "moved":
    case "rename":
    case "renamed":
    case "path_changed":
      return true;
    default:
      return false;
  }
}
16287
/**
 * Report whether an event type denotes a permission change
 * (permission, permissions, permission_changed, acl_changed).
 *
 * @param {string} eventType2 - Lower-cased event type.
 * @returns {boolean}
 */
function isPermissionEvent(eventType2) {
  switch (eventType2) {
    case "permission":
    case "permissions":
    case "permission_changed":
    case "acl_changed":
      return true;
    default:
      return false;
  }
}
16290
+ async function consumeOpenFilesOutbox(options) {
15812
16291
  const now = (options.now ?? new Date).toISOString();
15813
- const maxChunkChars = options.maxChunkChars ?? 4000;
15814
- const chunkOverlapChars = options.chunkOverlapChars ?? 200;
15815
- if (maxChunkChars < 500)
15816
- throw new Error("maxChunkChars must be at least 500.");
15817
- if (chunkOverlapChars < 0 || chunkOverlapChars >= maxChunkChars)
15818
- throw new Error("chunkOverlapChars must be less than maxChunkChars.");
15819
16292
  if (options.safetyPolicy)
15820
16293
  assertWriteAllowed(options.dbPath, options.safetyPolicy);
15821
16294
  migrateKnowledgeDb(options.dbPath);
16295
+ const text = await readOutboxInput(options.input, options.config, options.safetyPolicy);
16296
+ const events = parseOutboxText(text);
15822
16297
  const db = openKnowledgeDb(options.dbPath);
16298
+ const runId = `run_${randomUUID5()}`;
15823
16299
  try {
15824
- const result = db.transaction(() => {
15825
- const seenSources = new Set;
15826
- const seenRevisions = new Set;
15827
- let chunksInserted = 0;
16300
+ return db.transaction(() => {
16301
+ db.run(`INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
16302
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
16303
+ runId,
16304
+ "open-files-outbox",
16305
+ options.input,
16306
+ "completed",
16307
+ "local",
16308
+ "open-files-outbox",
16309
+ JSON.stringify({ path: options.input, events: events.length }),
16310
+ now,
16311
+ now
16312
+ ]);
16313
+ const sourcesTouched = new Set;
16314
+ const revisionsTouched = new Set;
15828
16315
  let chunksDeleted = 0;
15829
- let redactions = 0;
15830
- let skipped = 0;
16316
+ let embeddingsDeleted = 0;
16317
+ let vectorEntriesDeleted = 0;
16318
+ let staleRevisions = 0;
16319
+ let deletedSources = 0;
16320
+ let movedSources = 0;
16321
+ let permissionUpdates = 0;
15831
16322
  recordAuditEvent(db, {
15832
16323
  event_type: "source_read",
15833
- action: options.readAction ?? (options.sourceLabel.startsWith("s3://") ? "s3_manifest_read" : "local_manifest_read"),
15834
- target_uri: options.sourceLabel,
16324
+ action: options.input.startsWith("s3://") ? "s3_outbox_read" : "local_outbox_read",
16325
+ target_uri: options.input,
15835
16326
  decision: "allow",
15836
- metadata: { items: options.items.length, read_only: true },
16327
+ metadata: { events: events.length, read_only: true },
15837
16328
  created_at: now
15838
16329
  });
15839
- for (const raw of options.items) {
15840
- const item = normalizeManifestItem(raw, now);
15841
- const sourceId = upsertSource(db, item, now);
15842
- const revisionId = upsertRevision(db, sourceId, item, now);
15843
- seenSources.add(sourceId);
15844
- seenRevisions.add(revisionId);
15845
- if (item.text || item.status.toLowerCase() === "deleted") {
15846
- chunksDeleted += deleteChunksForRevision(db, revisionId);
16330
+ events.forEach((raw, index) => {
16331
+ const event = normalizeEvent(raw, now);
16332
+ const sourceId = ensureSource(db, event, now);
16333
+ sourcesTouched.add(sourceId);
16334
+ const createdRevisionId = ensureRevision(db, sourceId, event, now);
16335
+ if (createdRevisionId)
16336
+ revisionsTouched.add(createdRevisionId);
16337
+ const affectedRevisionIds = revisionIdsForEvent(db, sourceId, event);
16338
+ for (const revisionId of affectedRevisionIds) {
16339
+ revisionsTouched.add(revisionId);
16340
+ const invalidation = invalidateRevision(db, revisionId);
16341
+ chunksDeleted += invalidation.chunksDeleted;
16342
+ embeddingsDeleted += invalidation.embeddingsDeleted;
16343
+ vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
16344
+ staleRevisions += 1;
15847
16345
  }
15848
- const inserted = insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars, options.safetyPolicy);
15849
- chunksInserted += inserted.chunksInserted;
15850
- redactions += inserted.redactions;
15851
- }
16346
+ if (isDeleteEvent(event.eventType, event.status))
16347
+ deletedSources += 1;
16348
+ if (isMoveEvent(event.eventType))
16349
+ movedSources += 1;
16350
+ if (isPermissionEvent(event.eventType) || event.acl !== undefined)
16351
+ permissionUpdates += 1;
16352
+ db.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
16353
+ VALUES (?, ?, ?, ?, ?, ?)`, [
16354
+ stableId3("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
16355
+ runId,
16356
+ "info",
16357
+ event.eventType,
16358
+ JSON.stringify({
16359
+ source_ref: event.sourceRef,
16360
+ source_uri: event.sourceUri,
16361
+ revision: event.revision,
16362
+ hash: event.hash,
16363
+ status: event.status,
16364
+ affected_revisions: affectedRevisionIds.length
16365
+ }),
16366
+ event.updatedAt
16367
+ ]);
16368
+ });
16369
+ db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
16370
+ VALUES (?, ?, ?, ?, 0, 0, 0, ?, ?)`, [
16371
+ stableId3("usage", runId),
16372
+ runId,
16373
+ "local",
16374
+ "open-files-outbox",
16375
+ JSON.stringify({ note: "No model provider used for outbox invalidation." }),
16376
+ now
16377
+ ]);
15852
16378
  recordAuditEvent(db, {
15853
16379
  event_type: "write",
15854
- action: "knowledge_manifest_ingest",
16380
+ action: "knowledge_outbox_invalidation",
15855
16381
  target_uri: options.dbPath,
15856
16382
  decision: "allow",
15857
- metadata: { items: options.items.length, sources: seenSources.size, revisions: seenRevisions.size, chunks_inserted: chunksInserted, redactions },
16383
+ metadata: {
16384
+ run_id: runId,
16385
+ events: events.length,
16386
+ sources: sourcesTouched.size,
16387
+ revisions: revisionsTouched.size,
16388
+ chunks_deleted: chunksDeleted,
16389
+ embeddings_deleted: embeddingsDeleted,
16390
+ vector_entries_deleted: vectorEntriesDeleted
16391
+ },
15858
16392
  created_at: now
15859
16393
  });
15860
16394
  return {
15861
- path: options.sourceLabel,
16395
+ path: options.input,
15862
16396
  db_path: options.dbPath,
15863
- items_seen: options.items.length,
15864
- sources_upserted: seenSources.size,
15865
- revisions_upserted: seenRevisions.size,
15866
- chunks_inserted: chunksInserted,
16397
+ run_id: runId,
16398
+ events_seen: events.length,
16399
+ sources_touched: sourcesTouched.size,
16400
+ revisions_touched: revisionsTouched.size,
15867
16401
  chunks_deleted: chunksDeleted,
15868
- redactions,
15869
- skipped
16402
+ embeddings_deleted: embeddingsDeleted,
16403
+ vector_entries_deleted: vectorEntriesDeleted,
16404
+ stale_revisions: staleRevisions,
16405
+ deleted_sources: deletedSources,
16406
+ moved_sources: movedSources,
16407
+ permission_updates: permissionUpdates
15870
16408
  };
15871
16409
  })();
15872
- return result;
15873
16410
  } finally {
15874
16411
  db.close();
15875
16412
  }
15876
16413
  }
15877
16414
 
15878
- // src/source-ingest.ts
16415
+ // src/manifest-ingest.ts
15879
16416
  import { createHash as createHash5 } from "crypto";
15880
- import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
15881
- import { basename as basename3 } from "path";
15882
-
15883
- // src/source-resolver.ts
15884
- function parseJsonObject2(value) {
15885
- if (!value)
15886
- return {};
15887
- try {
15888
- const parsed = JSON.parse(value);
15889
- return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
15890
- } catch {
15891
- return {};
15892
- }
16417
+ import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
16418
+ import { basename as basename2 } from "path";
16419
+ function stableId4(prefix, value) {
16420
+ return `${prefix}_${createHash5("sha256").update(value).digest("hex").slice(0, 20)}`;
15893
16421
  }
15894
- function metadataString2(metadata, keys) {
15895
- for (const key of keys) {
15896
- const value = metadata[key];
15897
- if (typeof value === "string" && value.length > 0)
15898
- return value;
15899
- }
15900
- return null;
16422
+ function asObject2(value) {
16423
+ return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
15901
16424
  }
15902
- function metadataNumber2(metadata, keys) {
15903
- for (const key of keys) {
15904
- const value = metadata[key];
15905
- if (typeof value === "number" && Number.isFinite(value))
15906
- return value;
15907
- }
15908
- return null;
16425
+ function asString2(value) {
16426
+ return typeof value === "string" && value.length > 0 ? value : undefined;
15909
16427
  }
15910
- function assertPurposeAllowed(permissions, purpose) {
15911
- const mode = permissions.mode;
15912
- if (typeof mode === "string" && mode !== "read_only") {
15913
- throw new Error(`Source resolver denied ${purpose}. Permission mode is ${mode}, expected read_only.`);
15914
- }
15915
- const denied = permissions.denied_purposes;
15916
- if (Array.isArray(denied) && denied.includes(purpose)) {
15917
- throw new Error(`Source resolver denied ${purpose}. Purpose is explicitly denied.`);
16428
+ function asNumber(value) {
16429
+ return typeof value === "number" && Number.isFinite(value) ? value : undefined;
16430
+ }
16431
+ function buildSourceRefFromItem(item) {
16432
+ const explicit = asString2(item.source_ref) ?? asString2(item.source_uri) ?? asString2(item.uri);
16433
+ if (explicit)
16434
+ return explicit;
16435
+ const fileId = asString2(item.file_id);
16436
+ if (fileId) {
16437
+ const revision = asString2(item.revision_id) ?? asString2(item.revision);
16438
+ const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
16439
+ return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
15918
16440
  }
15919
- const allowed = permissions.allowed_purposes;
15920
- if (Array.isArray(allowed) && allowed.length > 0 && !allowed.includes(purpose)) {
15921
- throw new Error(`Source resolver denied ${purpose}. Allowed purposes: ${allowed.join(", ")}`);
16441
+ const sourceId = asString2(item.source_id);
16442
+ const path = asString2(item.path);
16443
+ if (sourceId && path) {
16444
+ return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
15922
16445
  }
16446
+ throw new Error("Manifest item is missing source_ref, file_id, or source_id/path.");
15923
16447
  }
15924
- function sourceRevisionRef(sourceUri, revision, fallback) {
15925
- if (!revision)
15926
- return fallback;
15927
- try {
15928
- const parsed = parseSourceRef(sourceUri);
15929
- if (parsed.kind === "open-files" && parsed.entity === "file") {
15930
- return `${sourceUri}/revision/${encodeURIComponent(revision.revision)}`;
15931
- }
15932
- } catch {
15933
- return fallback;
16448
+ function baseSourceUri2(sourceRef, parsed) {
16449
+ if (parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id) {
16450
+ return sourceRef.replace(/\/revision\/[^/]+$/, "");
15934
16451
  }
15935
- return fallback;
16452
+ return sourceRef;
15936
16453
  }
15937
- function selectSource(db, sourceUri, requestedRef) {
15938
- return db.query(`SELECT id, uri, kind, title, metadata_json, acl_json, updated_at
15939
- FROM sources
15940
- WHERE uri = ? OR uri = ?
15941
- ORDER BY CASE WHEN uri = ? THEN 0 ELSE 1 END
15942
- LIMIT 1`).get(sourceUri, requestedRef, sourceUri) ?? null;
16454
+ function textFromItem(item) {
16455
+ const direct = asString2(item.extracted_text) ?? asString2(item.text) ?? asString2(item.content_text) ?? asString2(item.markdown);
16456
+ if (direct !== undefined)
16457
+ return direct;
16458
+ const content = item.content;
16459
+ return typeof content === "string" ? content : null;
15943
16460
  }
15944
- function selectRevision(db, sourceId, revisionId) {
15945
- if (revisionId) {
15946
- return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
15947
- FROM source_revisions
15948
- WHERE source_id = ? AND revision = ?
15949
- LIMIT 1`).get(sourceId, revisionId) ?? null;
16461
+ function extractedTextUriFromItem(item) {
16462
+ const direct = asString2(item.extracted_text_ref) ?? asString2(item.extracted_text_uri) ?? asString2(item.text_ref);
16463
+ if (direct)
16464
+ return direct;
16465
+ const content = asObject2(item.content);
16466
+ return asString2(content?.extracted_text_ref) ?? asString2(content?.extracted_text_uri) ?? null;
16467
+ }
16468
+ function titleFromItem(item) {
16469
+ const path = asString2(item.path);
16470
+ return asString2(item.title) ?? asString2(item.name) ?? (path ? basename2(path) : null);
16471
+ }
16472
+ function hashFromItem(item) {
16473
+ return asString2(item.hash) ?? asString2(item.checksum) ?? asString2(item.sha256) ?? null;
16474
+ }
16475
+ function revisionFromItem(item, parsed, hash2) {
16476
+ const revision = asString2(item.revision_id) ?? asString2(item.revision) ?? asString2(item.version_id) ?? (parsed.kind === "open-files" ? parsed.revision_id : undefined) ?? hash2 ?? asString2(item.updated_at);
16477
+ return revision ?? "current";
16478
+ }
16479
+ function metadataFromItem(item, normalized) {
16480
+ const metadata = {};
16481
+ for (const [key, value] of Object.entries(item)) {
16482
+ if (["text", "content", "content_text", "extracted_text", "markdown"].includes(key))
16483
+ continue;
16484
+ metadata[key] = value;
15950
16485
  }
15951
- return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
15952
- FROM source_revisions
15953
- WHERE source_id = ?
15954
- ORDER BY created_at DESC, revision DESC
15955
- LIMIT 1`).get(sourceId) ?? null;
16486
+ metadata.source_ref = normalized.sourceRef;
16487
+ metadata.source_uri = normalized.sourceUri;
16488
+ metadata.status = normalized.status;
16489
+ return metadata;
15956
16490
  }
15957
- function countChunks(db, revisionId) {
15958
- if (!revisionId)
15959
- return 0;
15960
- const row = db.query("SELECT COUNT(*) AS n FROM chunks WHERE source_revision_id = ?").get(revisionId);
15961
- return row?.n ?? 0;
16491
+ function normalizeManifestItem(item, now) {
16492
+ const sourceRef = buildSourceRefFromItem(item);
16493
+ const parsed = parseSourceRef(sourceRef);
16494
+ const sourceUri = baseSourceUri2(sourceRef, parsed);
16495
+ const hash2 = hashFromItem(item);
16496
+ const status = asString2(item.status) ?? "active";
16497
+ return {
16498
+ raw: item,
16499
+ sourceRef,
16500
+ sourceUri,
16501
+ kind: parsed.kind,
16502
+ title: titleFromItem(item),
16503
+ revision: revisionFromItem(item, parsed, hash2),
16504
+ hash: hash2,
16505
+ extractedTextUri: extractedTextUriFromItem(item),
16506
+ text: textFromItem(item),
16507
+ metadata: metadataFromItem(item, { sourceRef, sourceUri, status }),
16508
+ acl: item.permissions ?? item.acl ?? {},
16509
+ status,
16510
+ updatedAt: asString2(item.updated_at) ?? now
16511
+ };
15962
16512
  }
15963
- function selectChunks(db, revisionId, limit) {
15964
- if (!revisionId || limit <= 0)
16513
+ function parseManifestText(text) {
16514
+ const trimmed = text.trim();
16515
+ if (!trimmed)
15965
16516
  return [];
15966
- return db.query(`SELECT id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json
15967
- FROM chunks
15968
- WHERE source_revision_id = ?
15969
- ORDER BY ordinal ASC
15970
- LIMIT ?`).all(revisionId, limit);
15971
- }
15972
- async function resolveOpenFilesSource(options) {
15973
- const purpose = options.purpose ?? "knowledge_answer";
15974
- const limit = Math.max(0, Math.min(options.limit ?? 10, 100));
15975
- const resolvedAt = (options.now ?? new Date).toISOString();
15976
- const parsed = parseSourceRef(options.sourceRef);
15977
- const sourceUri = catalogSourceUriForRef(options.sourceRef, parsed);
15978
- const requestedRevision = revisionIdForSourceRef(options.sourceRef);
15979
- if (options.safetyPolicy) {
15980
- if (!options.safetyPolicy.readOnlySourceAccess)
15981
- throw new Error("Safety policy denied source resolution.");
15982
- assertWriteAllowed(options.dbPath, options.safetyPolicy);
16517
+ if (trimmed.startsWith("[")) {
16518
+ const parsed = JSON.parse(trimmed);
16519
+ if (!Array.isArray(parsed))
16520
+ throw new Error("Manifest array parse failed.");
16521
+ return parsed.map((entry) => {
16522
+ const item = asObject2(entry);
16523
+ if (!item)
16524
+ throw new Error("Manifest array entries must be objects.");
16525
+ return item;
16526
+ });
15983
16527
  }
15984
- migrateKnowledgeDb(options.dbPath);
15985
- const db = openKnowledgeDb(options.dbPath);
15986
- try {
15987
- return db.transaction(() => {
15988
- const source = selectSource(db, sourceUri, options.sourceRef);
15989
- if (!source) {
15990
- recordAuditEvent(db, {
15991
- event_type: "source_read",
15992
- action: "open_files_resolve_missing",
15993
- target_uri: options.sourceRef,
15994
- decision: "allow",
15995
- metadata: { purpose, read_only: true, source_uri: sourceUri },
15996
- created_at: resolvedAt
15997
- });
15998
- return {
15999
- source_ref: options.sourceRef,
16000
- source_uri: sourceUri,
16001
- purpose,
16002
- read_only: true,
16003
- resolved: false,
16004
- resolver: {
16005
- name: "open-files-read-only",
16006
- mode: "local_catalog",
16007
- contract: "open-files-knowledge-source-v1"
16008
- },
16009
- source: null,
16010
- revision: null,
16011
- content: {
16012
- mime: null,
16013
- size: null,
16014
- hash: null,
16015
- text_available: false,
16016
- chunks_total: 0,
16017
- chunks_returned: 0,
16018
- char_count_returned: 0,
16019
- extracted_text_ref: null,
16020
- bytes_available: false,
16021
- bytes_exposed: false
16022
- },
16023
- chunks: [],
16024
- citations: []
16025
- };
16026
- }
16027
- const sourceMetadata = parseJsonObject2(source.metadata_json);
16028
- const permissions = parseJsonObject2(source.acl_json);
16029
- try {
16030
- assertPurposeAllowed(permissions, purpose);
16031
- } catch (error48) {
16032
- recordAuditEvent(db, {
16033
- event_type: "source_read",
16034
- action: "open_files_resolve",
16035
- target_uri: options.sourceRef,
16036
- decision: "deny",
16037
- metadata: {
16038
- purpose,
16039
- read_only: true,
16040
- source_uri: source.uri,
16041
- error: error48 instanceof Error ? error48.message : String(error48)
16042
- },
16043
- created_at: resolvedAt
16528
+ if (trimmed.startsWith("{")) {
16529
+ try {
16530
+ const parsed = JSON.parse(trimmed);
16531
+ const object2 = asObject2(parsed);
16532
+ if (!object2)
16533
+ throw new Error("Manifest object parse failed.");
16534
+ if (Array.isArray(object2.items)) {
16535
+ return object2.items.map((entry) => {
16536
+ const item = asObject2(entry);
16537
+ if (!item)
16538
+ throw new Error("Manifest items entries must be objects.");
16539
+ return item;
16044
16540
  });
16045
- throw error48;
16046
16541
  }
16047
- const revision = selectRevision(db, source.id, requestedRevision);
16048
- const revisionMetadata = parseJsonObject2(revision?.metadata_json);
16049
- const totalChunks = countChunks(db, revision?.id ?? null);
16050
- const rows = selectChunks(db, revision?.id ?? null, limit);
16051
- const effectiveSourceRef = sourceRevisionRef(source.uri, revision, options.sourceRef);
16052
- const chunks = rows.map((row) => {
16053
- const metadata = parseJsonObject2(row.metadata_json);
16054
- const evidence = {
16055
- resolver: "open-files-read-only",
16056
- mode: "local_catalog",
16057
- purpose,
16058
- read_only: true,
16059
- source_ref: metadataString2(metadata, ["source_ref"]) ?? effectiveSourceRef,
16060
- source_uri: source.uri,
16061
- source_revision_id: revision?.id ?? null,
16062
- revision: revision?.revision ?? null,
16063
- hash: revision?.hash ?? metadataString2(metadata, ["hash"]),
16064
- chunk_id: row.id,
16065
- start_offset: row.start_offset,
16066
- end_offset: row.end_offset,
16067
- resolved_at: resolvedAt
16068
- };
16069
- const provenance = sourceProvenance({
16070
- source_ref: evidence.source_ref,
16071
- source_uri: evidence.source_uri,
16072
- source_kind: source.kind,
16073
- source_revision_id: evidence.source_revision_id,
16074
- revision: evidence.revision,
16075
- hash: evidence.hash,
16076
- chunk_id: row.id,
16077
- start_offset: row.start_offset,
16078
- end_offset: row.end_offset,
16079
- status: metadataString2(metadata, ["status"]),
16080
- resolver: evidence.resolver
16081
- });
16082
- return {
16083
- id: row.id,
16084
- kind: row.kind,
16085
- ordinal: row.ordinal,
16086
- text: row.text,
16087
- token_count: row.token_count,
16088
- start_offset: row.start_offset,
16089
- end_offset: row.end_offset,
16090
- metadata,
16091
- evidence,
16092
- provenance
16093
- };
16094
- });
16095
- const citations = chunks.map((chunk) => ({
16096
- source_ref: chunk.evidence.source_ref,
16097
- source_uri: source.uri,
16098
- chunk_id: chunk.id,
16099
- quote: chunk.text.slice(0, 500),
16100
- start_offset: chunk.start_offset,
16101
- end_offset: chunk.end_offset,
16102
- evidence: chunk.evidence,
16103
- provenance: chunk.provenance
16104
- }));
16105
- recordAuditEvent(db, {
16106
- event_type: "source_read",
16107
- action: "open_files_resolve",
16108
- target_uri: options.sourceRef,
16109
- decision: "allow",
16110
- metadata: {
16111
- purpose,
16112
- read_only: true,
16113
- source_uri: source.uri,
16114
- revision: revision?.revision ?? null,
16115
- chunks_returned: chunks.length,
16116
- chunks_total: totalChunks
16117
- },
16118
- created_at: resolvedAt
16542
+ if ("source_ref" in object2 || "source_uri" in object2 || "file_id" in object2)
16543
+ return [object2];
16544
+ } catch (error48) {
16545
+ const lines = trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
16546
+ if (lines.length <= 1)
16547
+ throw error48;
16548
+ return lines.map((line) => {
16549
+ const item = asObject2(JSON.parse(line));
16550
+ if (!item)
16551
+ throw new Error("Manifest JSONL entries must be objects.");
16552
+ return item;
16119
16553
  });
16120
- const mime = metadataString2(sourceMetadata, ["mime", "content_type"]) ?? metadataString2(revisionMetadata, ["mime", "content_type"]);
16121
- const size = metadataNumber2(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber2(revisionMetadata, ["size", "size_bytes"]);
16122
- return {
16123
- source_ref: effectiveSourceRef,
16124
- source_uri: source.uri,
16125
- purpose,
16126
- read_only: true,
16127
- resolved: true,
16128
- resolver: {
16129
- name: "open-files-read-only",
16130
- mode: "local_catalog",
16131
- contract: "open-files-knowledge-source-v1"
16132
- },
16133
- source: {
16134
- id: source.id,
16135
- uri: source.uri,
16136
- kind: source.kind,
16137
- title: source.title,
16138
- metadata: sourceMetadata,
16139
- permissions,
16140
- updated_at: source.updated_at
16141
- },
16142
- revision: revision ? {
16143
- id: revision.id,
16144
- revision: revision.revision,
16145
- hash: revision.hash,
16146
- extracted_text_uri: revision.extracted_text_uri,
16147
- metadata: revisionMetadata,
16148
- created_at: revision.created_at,
16149
- reindex_required: revisionMetadata.reindex_required === true
16150
- } : null,
16151
- content: {
16152
- mime,
16153
- size,
16154
- hash: revision?.hash ?? metadataString2(sourceMetadata, ["hash", "checksum", "sha256"]),
16155
- text_available: totalChunks > 0,
16156
- chunks_total: totalChunks,
16157
- chunks_returned: chunks.length,
16158
- char_count_returned: chunks.reduce((sum, chunk) => sum + chunk.text.length, 0),
16159
- extracted_text_ref: revision?.extracted_text_uri ?? metadataString2(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
16160
- bytes_available: false,
16161
- bytes_exposed: false
16162
- },
16163
- chunks,
16164
- citations
16165
- };
16166
- })();
16167
- } finally {
16168
- db.close();
16554
+ }
16169
16555
  }
16556
+ return trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => {
16557
+ const item = asObject2(JSON.parse(line));
16558
+ if (!item)
16559
+ throw new Error("Manifest JSONL entries must be objects.");
16560
+ return item;
16561
+ });
16170
16562
  }
16171
-
16172
- // src/source-ingest.ts
16173
- function sha256Text(text) {
16174
- return `sha256:${createHash5("sha256").update(text).digest("hex")}`;
16175
- }
16176
- function stripHtml(html) {
16177
- return html.replace(/<script[\s\S]*?<\/script>/gi, " ").replace(/<style[\s\S]*?<\/style>/gi, " ").replace(/<[^>]+>/g, " ").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/\s+\n/g, `
16178
- `).replace(/\n\s+/g, `
16179
- `).replace(/[ \t]{2,}/g, " ").trim();
16180
- }
16181
- async function readS3Text3(uri, config2, safetyPolicy) {
16563
+ async function readS3Text2(uri, config2, safetyPolicy) {
16182
16564
  const parsed = new URL(uri);
16183
16565
  const bucket = parsed.hostname;
16184
16566
  const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ""));
16185
16567
  if (!bucket || !key)
16186
- throw new Error(`Invalid S3 source URI: ${uri}`);
16568
+ throw new Error(`Invalid S3 manifest URI: ${uri}`);
16187
16569
  if (safetyPolicy)
16188
16570
  assertS3ReadAllowed(uri, safetyPolicy);
16189
16571
  const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
@@ -16201,202 +16583,259 @@ async function readS3Text3(uri, config2, safetyPolicy) {
16201
16583
  return "";
16202
16584
  return await response.Body.transformToString();
16203
16585
  }
16204
- async function readWebText(uri, safetyPolicy) {
16205
- if (safetyPolicy)
16206
- assertWebSearchAllowed(safetyPolicy);
16207
- const response = await fetch(uri, {
16208
- headers: {
16209
- accept: "text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5",
16210
- "user-agent": "@hasna/knowledge source-ingest"
16211
- }
16212
- });
16213
- if (!response.ok)
16214
- throw new Error(`Web source read failed ${response.status}: ${uri}`);
16215
- const mime = response.headers.get("content-type");
16216
- const body = await response.text();
16217
- return { text: mime?.includes("html") ? stripHtml(body) : body, mime };
16218
- }
16219
- function titleForRef(parsed) {
16220
- if (parsed.kind === "file")
16221
- return basename3(parsed.path);
16222
- if (parsed.kind === "s3")
16223
- return basename3(parsed.key);
16224
- if (parsed.kind === "web")
16225
- return basename3(new URL(parsed.url).pathname) || parsed.url;
16226
- return parsed.path ? basename3(parsed.path) : parsed.id;
16586
+ async function readManifestInput(input, config2, safetyPolicy) {
16587
+ if (input.startsWith("s3://"))
16588
+ return readS3Text2(input, config2, safetyPolicy);
16589
+ if (!existsSync5(input))
16590
+ throw new Error(`Manifest not found: ${input}`);
16591
+ return readFileSync5(input, "utf8");
16227
16592
  }
16228
- async function readDirectSourceText(parsed, config2, safetyPolicy) {
16229
- if (parsed.kind === "file") {
16230
- if (!existsSync6(parsed.path))
16231
- throw new Error(`Source file not found: ${parsed.path}`);
16232
- const text = readFileSync6(parsed.path, "utf8");
16233
- return {
16234
- text,
16235
- contentSource: "file",
16236
- title: titleForRef(parsed),
16237
- mime: "text/plain",
16238
- size: text.length,
16239
- hash: sha256Text(text),
16240
- revision: null,
16241
- extractedTextRef: null,
16242
- metadata: { path: parsed.path },
16243
- permissions: { mode: "read_only" }
16244
- };
16245
- }
16246
- if (parsed.kind === "s3") {
16247
- const text = await readS3Text3(parsed.uri, config2, safetyPolicy);
16248
- return {
16249
- text,
16250
- contentSource: "s3",
16251
- title: titleForRef(parsed),
16252
- mime: "text/plain",
16253
- size: text.length,
16254
- hash: sha256Text(text),
16255
- revision: null,
16256
- extractedTextRef: null,
16257
- metadata: { bucket: parsed.bucket, key: parsed.key },
16258
- permissions: { mode: "read_only" }
16259
- };
16260
- }
16261
- if (parsed.kind === "web") {
16262
- const web = await readWebText(parsed.url, safetyPolicy);
16263
- return {
16264
- text: web.text,
16265
- contentSource: "web",
16266
- title: titleForRef(parsed),
16267
- mime: web.mime,
16268
- size: web.text.length,
16269
- hash: sha256Text(web.text),
16270
- revision: null,
16271
- extractedTextRef: null,
16272
- metadata: { url: parsed.url },
16273
- permissions: { mode: "read_only" }
16274
- };
16593
+ function chunkText(text, maxChars, overlapChars) {
16594
+ const normalized = text.replace(/\r\n/g, `
16595
+ `);
16596
+ if (!normalized.trim())
16597
+ return [];
16598
+ const chunks = [];
16599
+ let start = 0;
16600
+ while (start < normalized.length) {
16601
+ const hardEnd = Math.min(normalized.length, start + maxChars);
16602
+ let end = hardEnd;
16603
+ if (hardEnd < normalized.length) {
16604
+ const paragraphBreak = normalized.lastIndexOf(`
16605
+
16606
+ `, hardEnd);
16607
+ const sentenceBreak = normalized.lastIndexOf(". ", hardEnd);
16608
+ const candidate = Math.max(paragraphBreak, sentenceBreak);
16609
+ if (candidate > start + Math.floor(maxChars * 0.5))
16610
+ end = candidate + (candidate === paragraphBreak ? 2 : 1);
16611
+ }
16612
+ const chunk = normalized.slice(start, end).trim();
16613
+ if (chunk) {
16614
+ chunks.push({
16615
+ ordinal: chunks.length,
16616
+ text: chunk,
16617
+ startOffset: start,
16618
+ endOffset: end
16619
+ });
16620
+ }
16621
+ if (end >= normalized.length)
16622
+ break;
16623
+ start = Math.max(0, end - overlapChars);
16275
16624
  }
16276
- throw new Error(`Direct source reading is not available for ${parsed.uri}`);
16625
+ return chunks;
16277
16626
  }
16278
- async function readTextRef(uri, config2, safetyPolicy) {
16279
- if (uri.startsWith("open-files://")) {
16280
- throw new Error("Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.");
16627
+ function estimateTokenCount(text) {
16628
+ const words = text.trim().split(/\s+/).filter(Boolean).length;
16629
+ return Math.max(1, Math.ceil(words * 1.25));
16630
+ }
16631
+ function deleteChunksForRevision(db, sourceRevisionId) {
16632
+ const rows = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(sourceRevisionId);
16633
+ for (const row of rows) {
16634
+ db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [row.id]);
16281
16635
  }
16282
- const parsed = parseSourceRef(uri);
16283
- const direct = await readDirectSourceText(parsed, config2, safetyPolicy);
16284
- return { text: direct.text, contentSource: "extracted_text_ref" };
16636
+ db.run("DELETE FROM chunks WHERE source_revision_id = ?", [sourceRevisionId]);
16637
+ return rows.length;
16285
16638
  }
16286
- async function readOpenFilesSourceText(options) {
16287
- const resolved = await resolveOpenFilesSource({
16639
+ function upsertSource(db, item, now) {
16640
+ const sourceId = stableId4("src", item.sourceUri);
16641
+ db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
16642
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
16643
+ ON CONFLICT(uri) DO UPDATE SET
16644
+ kind = excluded.kind,
16645
+ title = excluded.title,
16646
+ metadata_json = excluded.metadata_json,
16647
+ acl_json = excluded.acl_json,
16648
+ updated_at = excluded.updated_at`, [
16649
+ sourceId,
16650
+ item.sourceUri,
16651
+ item.kind,
16652
+ item.title,
16653
+ JSON.stringify(item.metadata),
16654
+ JSON.stringify(item.acl ?? {}),
16655
+ now,
16656
+ item.updatedAt
16657
+ ]);
16658
+ const row = db.query("SELECT id FROM sources WHERE uri = ?").get(item.sourceUri);
16659
+ if (!row)
16660
+ throw new Error(`Failed to upsert source: ${item.sourceUri}`);
16661
+ return row.id;
16662
+ }
16663
+ function upsertRevision(db, sourceId, item, now) {
16664
+ const revisionId = stableId4("rev", `${sourceId}\x00${item.revision}`);
16665
+ db.run(`INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
16666
+ VALUES (?, ?, ?, ?, ?, ?, ?)
16667
+ ON CONFLICT(source_id, revision) DO UPDATE SET
16668
+ hash = excluded.hash,
16669
+ extracted_text_uri = excluded.extracted_text_uri,
16670
+ metadata_json = excluded.metadata_json`, [
16671
+ revisionId,
16672
+ sourceId,
16673
+ item.revision,
16674
+ item.hash,
16675
+ item.extractedTextUri,
16676
+ JSON.stringify(item.metadata),
16677
+ now
16678
+ ]);
16679
+ const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, item.revision);
16680
+ if (!row)
16681
+ throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
16682
+ return row.id;
16683
+ }
16684
+ function insertChunks(db, sourceRevisionId, item, now, maxChars, overlapChars, safetyPolicy) {
16685
+ if (!item.text || item.status.toLowerCase() === "deleted")
16686
+ return { chunksInserted: 0, redactions: 0 };
16687
+ const redacted = redactSecrets(item.text, safetyPolicy);
16688
+ if (redacted.findings.length > 0) {
16689
+ recordRedactionFindings(db, {
16690
+ source_uri: item.sourceUri,
16691
+ findings: redacted.findings,
16692
+ metadata: { source_ref: item.sourceRef, revision: item.revision },
16693
+ created_at: now
16694
+ });
16695
+ recordAuditEvent(db, {
16696
+ event_type: "redaction",
16697
+ action: "source_text_redact",
16698
+ target_uri: item.sourceUri,
16699
+ decision: "redacted",
16700
+ metadata: { findings: redacted.findings.length, source_ref: item.sourceRef, revision: item.revision },
16701
+ created_at: now
16702
+ });
16703
+ }
16704
+ const chunks = chunkText(redacted.text, maxChars, overlapChars);
16705
+ for (const chunk of chunks) {
16706
+ const chunkId = stableId4("chk", `${sourceRevisionId}\x00${chunk.ordinal}\x00${chunk.text}`);
16707
+ const provenance = sourceProvenance({
16708
+ source_ref: item.sourceRef,
16709
+ source_uri: item.sourceUri,
16710
+ source_kind: item.kind,
16711
+ source_revision_id: sourceRevisionId,
16712
+ revision: item.revision,
16713
+ hash: item.hash,
16714
+ chunk_id: chunkId,
16715
+ start_offset: chunk.startOffset,
16716
+ end_offset: chunk.endOffset,
16717
+ status: item.status,
16718
+ resolver: "open-files-read-only"
16719
+ });
16720
+ const metadata = withProvenance({
16721
+ source_ref: item.sourceRef,
16722
+ source_uri: item.sourceUri,
16723
+ source_kind: item.kind,
16724
+ source_revision_id: sourceRevisionId,
16725
+ revision: item.revision,
16726
+ hash: item.hash,
16727
+ status: item.status,
16728
+ path: asString2(item.raw.path) ?? null,
16729
+ mime: asString2(item.raw.mime) ?? asString2(item.raw.content_type) ?? null,
16730
+ size: asNumber(item.raw.size) ?? null
16731
+ }, provenance);
16732
+ db.run(`INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
16733
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
16734
+ chunkId,
16735
+ sourceRevisionId,
16736
+ "source",
16737
+ chunk.ordinal,
16738
+ chunk.text,
16739
+ estimateTokenCount(chunk.text),
16740
+ chunk.startOffset,
16741
+ chunk.endOffset,
16742
+ JSON.stringify(metadata),
16743
+ now
16744
+ ]);
16745
+ db.run("INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)", [chunkId, chunk.text, item.title ?? "", item.sourceUri]);
16746
+ }
16747
+ return { chunksInserted: chunks.length, redactions: redacted.findings.length };
16748
+ }
16749
+ async function ingestOpenFilesManifest(options) {
16750
+ const now = options.now ?? new Date;
16751
+ if (options.safetyPolicy)
16752
+ assertWriteAllowed(options.dbPath, options.safetyPolicy);
16753
+ migrateKnowledgeDb(options.dbPath);
16754
+ const text = await readManifestInput(options.input, options.config, options.safetyPolicy);
16755
+ const items = parseManifestText(text);
16756
+ return ingestOpenFilesManifestItems({
16288
16757
  dbPath: options.dbPath,
16289
- sourceRef: options.sourceRef,
16290
- purpose: options.purpose ?? "knowledge_index",
16291
- limit: 100,
16758
+ items,
16759
+ sourceLabel: options.input,
16292
16760
  safetyPolicy: options.safetyPolicy,
16293
- now: options.now
16761
+ now,
16762
+ maxChunkChars: options.maxChunkChars,
16763
+ chunkOverlapChars: options.chunkOverlapChars
16294
16764
  });
16295
- if (!resolved.resolved) {
16296
- throw new Error("Open-files source is not in the local knowledge catalog. Ingest an open-files manifest first or use the open-files resolver API.");
16297
- }
16298
- if (resolved.revision?.extracted_text_uri && !resolved.content.text_available) {
16299
- const textRef = await readTextRef(resolved.revision.extracted_text_uri, options.config, options.safetyPolicy);
16300
- return {
16301
- text: textRef.text,
16302
- contentSource: textRef.contentSource,
16303
- title: resolved.source?.title ?? null,
16304
- mime: resolved.content.mime,
16305
- size: textRef.text.length,
16306
- hash: resolved.revision.hash ?? sha256Text(textRef.text),
16307
- revision: resolved.revision.revision,
16308
- extractedTextRef: resolved.revision.extracted_text_uri,
16309
- metadata: resolved.source?.metadata ?? {},
16310
- permissions: resolved.source?.permissions ?? { mode: "read_only" }
16311
- };
16312
- }
16313
- if (resolved.chunks.length === 0) {
16314
- throw new Error("Open-files source has no extracted text chunks yet. Ingest an open-files manifest with extracted_text or extracted_text_ref first.");
16315
- }
16316
- const text = resolved.chunks.map((chunk) => chunk.text).join(`
16317
-
16318
- `);
16319
- return {
16320
- text,
16321
- contentSource: "catalog_chunks",
16322
- title: resolved.source?.title ?? null,
16323
- mime: resolved.content.mime,
16324
- size: text.length,
16325
- hash: resolved.revision?.hash ?? sha256Text(text),
16326
- revision: resolved.revision?.revision ?? null,
16327
- extractedTextRef: resolved.revision?.extracted_text_uri ?? null,
16328
- metadata: resolved.source?.metadata ?? {},
16329
- permissions: resolved.source?.permissions ?? { mode: "read_only" }
16330
- };
16331
16765
  }
16332
- function manifestItemForSource(sourceRef, parsed, resolved, purpose) {
16333
- const hash2 = resolved.hash ?? sha256Text(resolved.text);
16334
- const metadata = {
16335
- ...resolved.metadata,
16336
- source_ref: sourceRef,
16337
- content_source: resolved.contentSource,
16338
- read_only: true
16339
- };
16340
- const item = {
16341
- source_ref: sourceRef,
16342
- name: resolved.title ?? titleForRef(parsed),
16343
- mime: resolved.mime ?? "text/plain",
16344
- size: resolved.size ?? resolved.text.length,
16345
- hash: hash2,
16346
- revision: resolved.revision ?? hash2,
16347
- status: "active",
16348
- updated_at: new Date().toISOString(),
16349
- permissions: {
16350
- mode: "read_only",
16351
- allowed_purposes: [purpose],
16352
- ...resolved.permissions
16353
- },
16354
- metadata,
16355
- extracted_text_ref: resolved.extractedTextRef,
16356
- extracted_text: resolved.text
16357
- };
16358
- if (parsed.kind === "open-files") {
16359
- if (parsed.entity === "file")
16360
- item.file_id = parsed.id;
16361
- if (parsed.entity === "source") {
16362
- item.source_id = parsed.id;
16363
- item.path = parsed.path;
16364
- }
16766
+ async function ingestOpenFilesManifestItems(options) {
16767
+ const now = (options.now ?? new Date).toISOString();
16768
+ const maxChunkChars = options.maxChunkChars ?? 4000;
16769
+ const chunkOverlapChars = options.chunkOverlapChars ?? 200;
16770
+ if (maxChunkChars < 500)
16771
+ throw new Error("maxChunkChars must be at least 500.");
16772
+ if (chunkOverlapChars < 0 || chunkOverlapChars >= maxChunkChars)
16773
+ throw new Error("chunkOverlapChars must be less than maxChunkChars.");
16774
+ if (options.safetyPolicy)
16775
+ assertWriteAllowed(options.dbPath, options.safetyPolicy);
16776
+ migrateKnowledgeDb(options.dbPath);
16777
+ const db = openKnowledgeDb(options.dbPath);
16778
+ try {
16779
+ const result = db.transaction(() => {
16780
+ const seenSources = new Set;
16781
+ const seenRevisions = new Set;
16782
+ let chunksInserted = 0;
16783
+ let chunksDeleted = 0;
16784
+ let redactions = 0;
16785
+ let skipped = 0;
16786
+ recordAuditEvent(db, {
16787
+ event_type: "source_read",
16788
+ action: options.readAction ?? (options.sourceLabel.startsWith("s3://") ? "s3_manifest_read" : "local_manifest_read"),
16789
+ target_uri: options.sourceLabel,
16790
+ decision: "allow",
16791
+ metadata: { items: options.items.length, read_only: true },
16792
+ created_at: now
16793
+ });
16794
+ for (const raw of options.items) {
16795
+ const item = normalizeManifestItem(raw, now);
16796
+ const sourceId = upsertSource(db, item, now);
16797
+ const revisionId = upsertRevision(db, sourceId, item, now);
16798
+ seenSources.add(sourceId);
16799
+ seenRevisions.add(revisionId);
16800
+ if (item.text || item.status.toLowerCase() === "deleted") {
16801
+ chunksDeleted += deleteChunksForRevision(db, revisionId);
16802
+ }
16803
+ const inserted = insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars, options.safetyPolicy);
16804
+ chunksInserted += inserted.chunksInserted;
16805
+ redactions += inserted.redactions;
16806
+ }
16807
+ recordAuditEvent(db, {
16808
+ event_type: "write",
16809
+ action: "knowledge_manifest_ingest",
16810
+ target_uri: options.dbPath,
16811
+ decision: "allow",
16812
+ metadata: { items: options.items.length, sources: seenSources.size, revisions: seenRevisions.size, chunks_inserted: chunksInserted, redactions },
16813
+ created_at: now
16814
+ });
16815
+ return {
16816
+ path: options.sourceLabel,
16817
+ db_path: options.dbPath,
16818
+ items_seen: options.items.length,
16819
+ sources_upserted: seenSources.size,
16820
+ revisions_upserted: seenRevisions.size,
16821
+ chunks_inserted: chunksInserted,
16822
+ chunks_deleted: chunksDeleted,
16823
+ redactions,
16824
+ skipped
16825
+ };
16826
+ })();
16827
+ return result;
16828
+ } finally {
16829
+ db.close();
16365
16830
  }
16366
- if (parsed.kind === "file")
16367
- item.path = parsed.path;
16368
- if (parsed.kind === "s3")
16369
- item.path = parsed.key;
16370
- if (parsed.kind === "web")
16371
- item.url = parsed.url;
16372
- return item;
16373
- }
16374
- async function ingestSourceRef(options) {
16375
- const purpose = options.purpose ?? "knowledge_index";
16376
- const parsed = parseSourceRef(options.sourceRef);
16377
- const resolved = parsed.kind === "open-files" ? await readOpenFilesSourceText(options) : await readDirectSourceText(parsed, options.config, options.safetyPolicy);
16378
- const item = manifestItemForSource(options.sourceRef, parsed, resolved, purpose);
16379
- const result = await ingestOpenFilesManifestItems({
16380
- dbPath: options.dbPath,
16381
- items: [item],
16382
- sourceLabel: options.sourceRef,
16383
- readAction: "source_ref_ingest_read",
16384
- safetyPolicy: options.safetyPolicy,
16385
- now: options.now
16386
- });
16387
- return {
16388
- ...result,
16389
- source_ref: options.sourceRef,
16390
- content_source: resolved.contentSource,
16391
- read_only: true,
16392
- hash: String(item.hash)
16393
- };
16394
16831
  }
16395
16832
 
16396
- // src/retrieval.ts
16833
+ // src/source-ingest.ts
16397
16834
  import { createHash as createHash6 } from "crypto";
16835
+ import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
16836
+ import { basename as basename3 } from "path";
16398
16837
 
16399
- // src/search.ts
16838
+ // src/source-resolver.ts
16400
16839
  function parseJsonObject3(value) {
16401
16840
  if (!value)
16402
16841
  return {};
@@ -16423,570 +16862,494 @@ function metadataNumber3(metadata, keys) {
16423
16862
  }
16424
16863
  return null;
16425
16864
  }
16426
- function unique(values) {
16427
- return Array.from(new Set(values));
16428
- }
16429
- function queryTerms(query) {
16430
- const terms = query.normalize("NFKC").toLowerCase().match(/[\p{L}\p{N}_]+/gu) ?? [];
16431
- return unique(terms.filter((term) => term.length > 0)).slice(0, 16);
16432
- }
16433
- function ftsQueryForTerms(terms) {
16434
- if (terms.length === 0)
16435
- return null;
16436
- return terms.map((term) => `${term}*`).join(" OR ");
16437
- }
16438
- function escapeLikeTerm(term) {
16439
- return term.replace(/[\\%_]/g, (char) => `\\${char}`);
16440
- }
16441
- function likeParams(terms, fieldsPerTerm) {
16442
- return terms.flatMap((term) => Array.from({ length: fieldsPerTerm }, () => `%${escapeLikeTerm(term)}%`));
16443
- }
16444
- function scoreFromRank(rank, index) {
16445
- const rankScore = Number.isFinite(rank) ? 1 / (1 + Math.abs(rank)) : 0;
16446
- const orderScore = 1 / (1 + index);
16447
- return roundScore(Math.max(rankScore, orderScore));
16448
- }
16449
- function catalogScore(haystack, terms) {
16450
- if (terms.length === 0)
16451
- return 0;
16452
- const matched = terms.filter((term) => haystack.includes(term)).length;
16453
- if (matched === 0)
16454
- return 0;
16455
- return roundScore(Math.min(0.85, 0.35 + matched / terms.length * 0.5));
16456
- }
16457
- function semanticScore(score) {
16458
- return roundScore(Math.max(0, Math.min(1, (score + 1) / 2)));
16459
- }
16460
- function roundScore(score) {
16461
- return Number(score.toFixed(6));
16462
- }
16463
- function combinedScore(scores, citation) {
16464
- const keyword = scores.keyword ?? 0;
16465
- const semantic = scores.semantic ?? 0;
16466
- const catalog = scores.catalog ?? 0;
16467
- const citationBoost = citation?.chunk_id ? 0.05 : 0;
16468
- return roundScore(Math.min(1, keyword * 0.55 + semantic * 0.4 + catalog * 0.35 + citationBoost));
16469
- }
16470
- function existingProvenance(metadata) {
16471
- const provenance = metadata.provenance;
16472
- return provenance && typeof provenance === "object" && !Array.isArray(provenance) ? provenance : null;
16473
- }
16474
- function provenanceForChunk2(row) {
16475
- const metadata = parseJsonObject3(row.chunk_metadata_json);
16476
- const existing = existingProvenance(metadata);
16477
- if (existing)
16478
- return existing;
16479
- if (!row.source_revision_id && !row.source_uri)
16480
- return null;
16481
- return sourceProvenance({
16482
- source_ref: metadataString3(metadata, ["source_ref"]),
16483
- source_uri: row.source_uri ?? metadataString3(metadata, ["source_uri"]),
16484
- source_kind: row.source_kind ?? metadataString3(metadata, ["source_kind"]),
16485
- source_revision_id: row.source_revision_id,
16486
- revision: row.revision ?? metadataString3(metadata, ["revision"]),
16487
- hash: row.hash ?? metadataString3(metadata, ["hash"]),
16488
- chunk_id: row.chunk_id,
16489
- start_offset: row.start_offset ?? metadataNumber3(metadata, ["start_offset"]),
16490
- end_offset: row.end_offset ?? metadataNumber3(metadata, ["end_offset"]),
16491
- status: metadataString3(metadata, ["status"]),
16492
- resolver: "open-files-read-only"
16493
- });
16494
- }
16495
- function selectFtsChunks(db, ftsQuery, limit) {
16496
- if (!ftsQuery)
16497
- return [];
16498
- return db.query(`SELECT
16499
- chunks_fts.chunk_id,
16500
- c.kind AS chunk_kind,
16501
- c.wiki_page_id,
16502
- c.text,
16503
- c.token_count,
16504
- c.start_offset,
16505
- c.end_offset,
16506
- c.metadata_json AS chunk_metadata_json,
16507
- c.source_revision_id,
16508
- sr.revision,
16509
- sr.hash,
16510
- s.uri AS source_uri,
16511
- s.kind AS source_kind,
16512
- s.title AS source_title,
16513
- wp.path AS wiki_path,
16514
- wp.title AS wiki_title,
16515
- wp.artifact_uri AS wiki_artifact_uri,
16516
- wp.content_hash AS wiki_content_hash,
16517
- wp.status AS wiki_status,
16518
- wp.metadata_json AS wiki_metadata_json,
16519
- bm25(chunks_fts) AS rank
16520
- FROM chunks_fts
16521
- JOIN chunks c ON c.id = chunks_fts.chunk_id
16522
- LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
16523
- LEFT JOIN sources s ON s.id = sr.source_id
16524
- LEFT JOIN wiki_pages wp ON wp.id = c.wiki_page_id
16525
- WHERE chunks_fts MATCH ?
16526
- ORDER BY rank ASC
16527
- LIMIT ?`).all(ftsQuery, limit);
16528
- }
16529
- function catalogWhere(fields, terms) {
16530
- if (terms.length === 0)
16531
- return "1 = 0";
16532
- const clauses = terms.map(() => `(${fields.map((field) => `lower(COALESCE(${field}, '')) LIKE ? ESCAPE '\\'`).join(" OR ")})`);
16533
- return clauses.join(" OR ");
16534
- }
16535
- function selectWikiPages(db, terms, limit) {
16536
- const fields = ["path", "title", "artifact_uri", "metadata_json"];
16537
- return db.query(`SELECT id, path, title, artifact_uri, content_hash, status, metadata_json
16538
- FROM wiki_pages
16539
- WHERE status = 'active' AND (${catalogWhere(fields, terms)})
16540
- ORDER BY updated_at DESC
16541
- LIMIT ?`).all(...likeParams(terms, fields.length), limit);
16542
- }
16543
- function selectKnowledgeIndexes(db, terms, limit) {
16544
- const fields = ["kind", "name", "shard_key", "artifact_uri", "metadata_json"];
16545
- return db.query(`SELECT id, kind, name, artifact_uri, shard_key, metadata_json
16546
- FROM knowledge_indexes
16547
- WHERE ${catalogWhere(fields, terms)}
16548
- ORDER BY updated_at DESC
16549
- LIMIT ?`).all(...likeParams(terms, fields.length), limit);
16550
- }
16551
- function chunkResult(row, keywordScore) {
16552
- const metadata = parseJsonObject3(row.chunk_metadata_json);
16553
- const provenance = provenanceForChunk2(row);
16554
- const sourceRef = metadataString3(metadata, ["source_ref"]);
16555
- const sourceUri = row.source_uri ?? metadataString3(metadata, ["source_uri"]);
16556
- const isWiki = Boolean(row.wiki_page_id);
16557
- const result = {
16558
- kind: isWiki ? "wiki_chunk" : "source_chunk",
16559
- id: row.chunk_id,
16560
- title: isWiki ? row.wiki_title : row.source_title,
16561
- text: row.text,
16562
- score: 0,
16563
- scores: { keyword: keywordScore },
16564
- source: sourceUri || sourceRef ? {
16565
- uri: sourceUri,
16566
- ref: sourceRef,
16567
- kind: row.source_kind ?? metadataString3(metadata, ["source_kind"]),
16568
- revision: row.revision ?? metadataString3(metadata, ["revision"]),
16569
- hash: row.hash ?? metadataString3(metadata, ["hash"])
16570
- } : null,
16571
- citation: {
16572
- chunk_id: row.chunk_id,
16573
- start_offset: row.start_offset,
16574
- end_offset: row.end_offset
16575
- },
16576
- artifact: isWiki ? {
16577
- uri: row.wiki_artifact_uri,
16578
- path: row.wiki_path,
16579
- hash: row.wiki_content_hash,
16580
- shard_key: row.wiki_path
16581
- } : null,
16582
- provenance,
16583
- reasons: ["keyword_match"]
16584
- };
16585
- result.score = combinedScore(result.scores, result.citation);
16586
- return result;
16587
- }
16588
- function wikiPageResult(row, terms) {
16589
- const metadata = parseJsonObject3(row.metadata_json);
16590
- const score = catalogScore(`${row.path} ${row.title} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
16591
- const result = {
16592
- kind: "wiki_page",
16593
- id: row.id,
16594
- title: row.title,
16595
- text: null,
16596
- score: 0,
16597
- scores: { catalog: score },
16598
- source: null,
16599
- citation: null,
16600
- artifact: {
16601
- uri: row.artifact_uri,
16602
- path: row.path,
16603
- hash: row.content_hash,
16604
- shard_key: row.path
16605
- },
16606
- provenance: existingProvenance(metadata),
16607
- reasons: ["wiki_catalog_match"]
16608
- };
16609
- result.score = combinedScore(result.scores, result.citation);
16610
- return result;
16611
- }
16612
- function indexResult(row, terms) {
16613
- const metadata = parseJsonObject3(row.metadata_json);
16614
- const score = catalogScore(`${row.kind} ${row.name} ${row.shard_key ?? ""} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
16615
- const result = {
16616
- kind: "knowledge_index",
16617
- id: row.id,
16618
- title: row.name,
16619
- text: null,
16620
- score: 0,
16621
- scores: { catalog: score },
16622
- source: null,
16623
- citation: null,
16624
- artifact: {
16625
- uri: row.artifact_uri,
16626
- path: metadataString3(metadata, ["artifact_key"]),
16627
- hash: metadataString3(metadata, ["content_hash"]),
16628
- shard_key: row.shard_key
16629
- },
16630
- provenance: existingProvenance(metadata),
16631
- reasons: ["index_catalog_match"]
16632
- };
16633
- result.score = combinedScore(result.scores, result.citation);
16634
- return result;
16865
+ function assertPurposeAllowed(permissions, purpose) {
16866
+ const mode = permissions.mode;
16867
+ if (typeof mode === "string" && mode !== "read_only") {
16868
+ throw new Error(`Source resolver denied ${purpose}. Permission mode is ${mode}, expected read_only.`);
16869
+ }
16870
+ const denied = permissions.denied_purposes;
16871
+ if (Array.isArray(denied) && denied.includes(purpose)) {
16872
+ throw new Error(`Source resolver denied ${purpose}. Purpose is explicitly denied.`);
16873
+ }
16874
+ const allowed = permissions.allowed_purposes;
16875
+ if (Array.isArray(allowed) && allowed.length > 0 && !allowed.includes(purpose)) {
16876
+ throw new Error(`Source resolver denied ${purpose}. Allowed purposes: ${allowed.join(", ")}`);
16877
+ }
16635
16878
  }
16636
- function mergeResult(results, entry) {
16637
- const key = `${entry.kind}:${entry.id}`;
16638
- const existing = results.get(key);
16639
- if (!existing) {
16640
- results.set(key, entry);
16641
- return;
16879
+ function sourceRevisionRef(sourceUri, revision, fallback) {
16880
+ if (!revision)
16881
+ return fallback;
16882
+ try {
16883
+ const parsed = parseSourceRef(sourceUri);
16884
+ if (parsed.kind === "open-files" && parsed.entity === "file") {
16885
+ return `${sourceUri}/revision/${encodeURIComponent(revision.revision)}`;
16886
+ }
16887
+ } catch {
16888
+ return fallback;
16642
16889
  }
16643
- existing.scores = {
16644
- keyword: Math.max(existing.scores.keyword ?? 0, entry.scores.keyword ?? 0) || undefined,
16645
- semantic: Math.max(existing.scores.semantic ?? 0, entry.scores.semantic ?? 0) || undefined,
16646
- catalog: Math.max(existing.scores.catalog ?? 0, entry.scores.catalog ?? 0) || undefined
16647
- };
16648
- existing.reasons = unique([...existing.reasons, ...entry.reasons]);
16649
- existing.text = existing.text ?? entry.text;
16650
- existing.title = existing.title ?? entry.title;
16651
- existing.source = existing.source ?? entry.source;
16652
- existing.citation = existing.citation ?? entry.citation;
16653
- existing.artifact = existing.artifact ?? entry.artifact;
16654
- existing.provenance = existing.provenance ?? entry.provenance;
16655
- existing.score = combinedScore(existing.scores, existing.citation);
16890
+ return fallback;
16656
16891
  }
16657
- function sortResults(results) {
16658
- const kindOrder = {
16659
- source_chunk: 0,
16660
- wiki_chunk: 1,
16661
- wiki_page: 2,
16662
- knowledge_index: 3
16663
- };
16664
- return results.sort((a, b) => {
16665
- if (b.score !== a.score)
16666
- return b.score - a.score;
16667
- return kindOrder[a.kind] - kindOrder[b.kind] || a.id.localeCompare(b.id);
16668
- });
16892
+ function selectSource(db, sourceUri, requestedRef) {
16893
+ return db.query(`SELECT id, uri, kind, title, metadata_json, acl_json, updated_at
16894
+ FROM sources
16895
+ WHERE uri = ? OR uri = ?
16896
+ ORDER BY CASE WHEN uri = ? THEN 0 ELSE 1 END
16897
+ LIMIT 1`).get(sourceUri, requestedRef, sourceUri) ?? null;
16669
16898
  }
16670
- async function hybridSearch(options) {
16671
- const query = options.query.trim();
16672
- if (!query)
16673
- throw new Error("Search query is required.");
16674
- const limit = Math.max(1, Math.min(options.limit ?? 10, 100));
16675
- const terms = queryTerms(query);
16676
- const ftsQuery = ftsQueryForTerms(terms);
16677
- const semanticEnabled = options.semantic === true || options.fake === true || Boolean(options.modelRef);
16678
- const warnings = [];
16679
- let semanticProvider = null;
16680
- let semanticModel = null;
16681
- let semanticDimensions = null;
16682
- let keywordCount = 0;
16683
- let catalogCount = 0;
16684
- let semanticCount = 0;
16685
- const merged = new Map;
16899
+ function selectRevision(db, sourceId, revisionId) {
16900
+ if (revisionId) {
16901
+ return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
16902
+ FROM source_revisions
16903
+ WHERE source_id = ? AND revision = ?
16904
+ LIMIT 1`).get(sourceId, revisionId) ?? null;
16905
+ }
16906
+ return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
16907
+ FROM source_revisions
16908
+ WHERE source_id = ?
16909
+ ORDER BY created_at DESC, revision DESC
16910
+ LIMIT 1`).get(sourceId) ?? null;
16911
+ }
16912
+ function countChunks(db, revisionId) {
16913
+ if (!revisionId)
16914
+ return 0;
16915
+ const row = db.query("SELECT COUNT(*) AS n FROM chunks WHERE source_revision_id = ?").get(revisionId);
16916
+ return row?.n ?? 0;
16917
+ }
16918
+ function selectChunks(db, revisionId, limit) {
16919
+ if (!revisionId || limit <= 0)
16920
+ return [];
16921
+ return db.query(`SELECT id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json
16922
+ FROM chunks
16923
+ WHERE source_revision_id = ?
16924
+ ORDER BY ordinal ASC
16925
+ LIMIT ?`).all(revisionId, limit);
16926
+ }
16927
+ async function resolveOpenFilesSource(options) {
16928
+ const purpose = options.purpose ?? "knowledge_answer";
16929
+ const limit = Math.max(0, Math.min(options.limit ?? 10, 100));
16930
+ const resolvedAt = (options.now ?? new Date).toISOString();
16931
+ const parsed = parseSourceRef(options.sourceRef);
16932
+ const sourceUri = catalogSourceUriForRef(options.sourceRef, parsed);
16933
+ const requestedRevision = revisionIdForSourceRef(options.sourceRef);
16934
+ if (options.safetyPolicy) {
16935
+ if (!options.safetyPolicy.readOnlySourceAccess)
16936
+ throw new Error("Safety policy denied source resolution.");
16937
+ assertWriteAllowed(options.dbPath, options.safetyPolicy);
16938
+ }
16686
16939
  migrateKnowledgeDb(options.dbPath);
16687
16940
  const db = openKnowledgeDb(options.dbPath);
16688
16941
  try {
16689
- const ftsRows = selectFtsChunks(db, ftsQuery, Math.max(limit * 3, 20));
16690
- keywordCount = ftsRows.length;
16691
- ftsRows.forEach((row, index) => mergeResult(merged, chunkResult(row, scoreFromRank(row.rank, index))));
16692
- const wikiRows = selectWikiPages(db, terms, Math.max(limit, 10));
16693
- const indexRows = selectKnowledgeIndexes(db, terms, Math.max(limit, 10));
16694
- catalogCount = wikiRows.length + indexRows.length;
16695
- wikiRows.forEach((row) => mergeResult(merged, wikiPageResult(row, terms)));
16696
- indexRows.forEach((row) => mergeResult(merged, indexResult(row, terms)));
16942
+ return db.transaction(() => {
16943
+ const source = selectSource(db, sourceUri, options.sourceRef);
16944
+ if (!source) {
16945
+ recordAuditEvent(db, {
16946
+ event_type: "source_read",
16947
+ action: "open_files_resolve_missing",
16948
+ target_uri: options.sourceRef,
16949
+ decision: "allow",
16950
+ metadata: { purpose, read_only: true, source_uri: sourceUri },
16951
+ created_at: resolvedAt
16952
+ });
16953
+ return {
16954
+ source_ref: options.sourceRef,
16955
+ source_uri: sourceUri,
16956
+ purpose,
16957
+ read_only: true,
16958
+ resolved: false,
16959
+ resolver: {
16960
+ name: "open-files-read-only",
16961
+ mode: "local_catalog",
16962
+ contract: "open-files-knowledge-source-v1"
16963
+ },
16964
+ source: null,
16965
+ revision: null,
16966
+ content: {
16967
+ mime: null,
16968
+ size: null,
16969
+ hash: null,
16970
+ text_available: false,
16971
+ chunks_total: 0,
16972
+ chunks_returned: 0,
16973
+ char_count_returned: 0,
16974
+ extracted_text_ref: null,
16975
+ bytes_available: false,
16976
+ bytes_exposed: false
16977
+ },
16978
+ chunks: [],
16979
+ citations: []
16980
+ };
16981
+ }
16982
+ const sourceMetadata = parseJsonObject3(source.metadata_json);
16983
+ const permissions = parseJsonObject3(source.acl_json);
16984
+ try {
16985
+ assertPurposeAllowed(permissions, purpose);
16986
+ } catch (error48) {
16987
+ recordAuditEvent(db, {
16988
+ event_type: "source_read",
16989
+ action: "open_files_resolve",
16990
+ target_uri: options.sourceRef,
16991
+ decision: "deny",
16992
+ metadata: {
16993
+ purpose,
16994
+ read_only: true,
16995
+ source_uri: source.uri,
16996
+ error: error48 instanceof Error ? error48.message : String(error48)
16997
+ },
16998
+ created_at: resolvedAt
16999
+ });
17000
+ throw error48;
17001
+ }
17002
+ const revision = selectRevision(db, source.id, requestedRevision);
17003
+ const revisionMetadata = parseJsonObject3(revision?.metadata_json);
17004
+ const totalChunks = countChunks(db, revision?.id ?? null);
17005
+ const rows = selectChunks(db, revision?.id ?? null, limit);
17006
+ const effectiveSourceRef = sourceRevisionRef(source.uri, revision, options.sourceRef);
17007
+ const chunks = rows.map((row) => {
17008
+ const metadata = parseJsonObject3(row.metadata_json);
17009
+ const evidence = {
17010
+ resolver: "open-files-read-only",
17011
+ mode: "local_catalog",
17012
+ purpose,
17013
+ read_only: true,
17014
+ source_ref: metadataString3(metadata, ["source_ref"]) ?? effectiveSourceRef,
17015
+ source_uri: source.uri,
17016
+ source_revision_id: revision?.id ?? null,
17017
+ revision: revision?.revision ?? null,
17018
+ hash: revision?.hash ?? metadataString3(metadata, ["hash"]),
17019
+ chunk_id: row.id,
17020
+ start_offset: row.start_offset,
17021
+ end_offset: row.end_offset,
17022
+ resolved_at: resolvedAt
17023
+ };
17024
+ const provenance = sourceProvenance({
17025
+ source_ref: evidence.source_ref,
17026
+ source_uri: evidence.source_uri,
17027
+ source_kind: source.kind,
17028
+ source_revision_id: evidence.source_revision_id,
17029
+ revision: evidence.revision,
17030
+ hash: evidence.hash,
17031
+ chunk_id: row.id,
17032
+ start_offset: row.start_offset,
17033
+ end_offset: row.end_offset,
17034
+ status: metadataString3(metadata, ["status"]),
17035
+ resolver: evidence.resolver
17036
+ });
17037
+ return {
17038
+ id: row.id,
17039
+ kind: row.kind,
17040
+ ordinal: row.ordinal,
17041
+ text: row.text,
17042
+ token_count: row.token_count,
17043
+ start_offset: row.start_offset,
17044
+ end_offset: row.end_offset,
17045
+ metadata,
17046
+ evidence,
17047
+ provenance
17048
+ };
17049
+ });
17050
+ const citations = chunks.map((chunk) => ({
17051
+ source_ref: chunk.evidence.source_ref,
17052
+ source_uri: source.uri,
17053
+ chunk_id: chunk.id,
17054
+ quote: chunk.text.slice(0, 500),
17055
+ start_offset: chunk.start_offset,
17056
+ end_offset: chunk.end_offset,
17057
+ evidence: chunk.evidence,
17058
+ provenance: chunk.provenance
17059
+ }));
17060
+ recordAuditEvent(db, {
17061
+ event_type: "source_read",
17062
+ action: "open_files_resolve",
17063
+ target_uri: options.sourceRef,
17064
+ decision: "allow",
17065
+ metadata: {
17066
+ purpose,
17067
+ read_only: true,
17068
+ source_uri: source.uri,
17069
+ revision: revision?.revision ?? null,
17070
+ chunks_returned: chunks.length,
17071
+ chunks_total: totalChunks
17072
+ },
17073
+ created_at: resolvedAt
17074
+ });
17075
+ const mime = metadataString3(sourceMetadata, ["mime", "content_type"]) ?? metadataString3(revisionMetadata, ["mime", "content_type"]);
17076
+ const size = metadataNumber3(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber3(revisionMetadata, ["size", "size_bytes"]);
17077
+ return {
17078
+ source_ref: effectiveSourceRef,
17079
+ source_uri: source.uri,
17080
+ purpose,
17081
+ read_only: true,
17082
+ resolved: true,
17083
+ resolver: {
17084
+ name: "open-files-read-only",
17085
+ mode: "local_catalog",
17086
+ contract: "open-files-knowledge-source-v1"
17087
+ },
17088
+ source: {
17089
+ id: source.id,
17090
+ uri: source.uri,
17091
+ kind: source.kind,
17092
+ title: source.title,
17093
+ metadata: sourceMetadata,
17094
+ permissions,
17095
+ updated_at: source.updated_at
17096
+ },
17097
+ revision: revision ? {
17098
+ id: revision.id,
17099
+ revision: revision.revision,
17100
+ hash: revision.hash,
17101
+ extracted_text_uri: revision.extracted_text_uri,
17102
+ metadata: revisionMetadata,
17103
+ created_at: revision.created_at,
17104
+ reindex_required: revisionMetadata.reindex_required === true
17105
+ } : null,
17106
+ content: {
17107
+ mime,
17108
+ size,
17109
+ hash: revision?.hash ?? metadataString3(sourceMetadata, ["hash", "checksum", "sha256"]),
17110
+ text_available: totalChunks > 0,
17111
+ chunks_total: totalChunks,
17112
+ chunks_returned: chunks.length,
17113
+ char_count_returned: chunks.reduce((sum, chunk) => sum + chunk.text.length, 0),
17114
+ extracted_text_ref: revision?.extracted_text_uri ?? metadataString3(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
17115
+ bytes_available: false,
17116
+ bytes_exposed: false
17117
+ },
17118
+ chunks,
17119
+ citations
17120
+ };
17121
+ })();
16697
17122
  } finally {
16698
17123
  db.close();
16699
17124
  }
16700
- if (semanticEnabled) {
16701
- try {
16702
- const semantic = await searchVectorIndex({
16703
- dbPath: options.dbPath,
16704
- query,
16705
- limit: Math.max(limit * 3, 20),
16706
- config: options.config,
16707
- env: options.env,
16708
- modelRef: options.modelRef,
16709
- dimensions: options.dimensions,
16710
- fake: options.fake,
16711
- batchSize: options.batchSize,
16712
- maxParallelCalls: options.maxParallelCalls
16713
- });
16714
- semanticProvider = semantic.provider;
16715
- semanticModel = semantic.model;
16716
- semanticDimensions = semantic.dimensions;
16717
- semanticCount = semantic.results.length;
16718
- for (const row of semantic.results) {
16719
- const result = {
16720
- kind: "source_chunk",
16721
- id: row.chunk_id,
16722
- title: null,
16723
- text: row.text,
16724
- score: 0,
16725
- scores: { semantic: semanticScore(row.score) },
16726
- source: {
16727
- uri: row.source_uri,
16728
- ref: row.source_ref,
16729
- kind: row.provenance?.source_kind ?? null,
16730
- revision: row.revision,
16731
- hash: row.hash
16732
- },
16733
- citation: {
16734
- chunk_id: row.chunk_id,
16735
- start_offset: row.provenance?.start_offset ?? null,
16736
- end_offset: row.provenance?.end_offset ?? null
16737
- },
16738
- artifact: null,
16739
- provenance: row.provenance,
16740
- reasons: ["semantic_match"]
16741
- };
16742
- result.score = combinedScore(result.scores, result.citation);
16743
- mergeResult(merged, result);
16744
- }
16745
- } catch (error48) {
16746
- warnings.push(`semantic_search_failed: ${error48 instanceof Error ? error48.message : String(error48)}`);
16747
- }
16748
- }
16749
- const results = sortResults(Array.from(merged.values())).slice(0, limit);
16750
- return {
16751
- query,
16752
- limit,
16753
- mode: {
16754
- keyword: true,
16755
- catalog: true,
16756
- semantic: semanticEnabled
16757
- },
16758
- semantic_provider: semanticProvider,
16759
- semantic_model: semanticModel,
16760
- semantic_dimensions: semanticDimensions,
16761
- counts: {
16762
- keyword_results: keywordCount,
16763
- catalog_results: catalogCount,
16764
- semantic_results: semanticCount,
16765
- merged_results: results.length
16766
- },
16767
- warnings,
16768
- results
16769
- };
16770
17125
  }
16771
17126
 
16772
- // src/retrieval.ts
16773
- function stableId4(prefix, value) {
16774
- return `${prefix}_${createHash6("sha256").update(value).digest("hex").slice(0, 20)}`;
16775
- }
16776
- function normalizeQuery(query) {
16777
- return query.normalize("NFKC").trim().replace(/\s+/g, " ").toLowerCase();
16778
- }
16779
- function queryTerms2(query) {
16780
- return Array.from(new Set(normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu) ?? [])).slice(0, 16);
16781
- }
16782
- function textForResult(result) {
16783
- return [result.title, result.text].filter(Boolean).join(" ").toLowerCase();
16784
- }
16785
- function exactScore(result, terms) {
16786
- if (terms.length === 0)
16787
- return 0;
16788
- const text = textForResult(result);
16789
- const matched = terms.filter((term) => text.includes(term)).length;
16790
- return Number((matched / terms.length).toFixed(6));
16791
- }
16792
- function hasReadOnlyProvenance(provenance) {
16793
- if (!provenance)
16794
- return true;
16795
- if ("read_only" in provenance)
16796
- return provenance.read_only === true;
16797
- if ("read_only_sources" in provenance)
16798
- return provenance.read_only_sources === true;
16799
- return true;
16800
- }
16801
- function isStale(provenance) {
16802
- if (!provenance)
16803
- return false;
16804
- if ("stale" in provenance && provenance.stale)
16805
- return true;
16806
- if ("status" in provenance)
16807
- return isStaleStatus(provenance.status);
16808
- return false;
16809
- }
16810
- function freshnessScore(result) {
16811
- if (isStale(result.provenance))
16812
- return 0;
16813
- if (result.source?.hash || result.source?.revision)
16814
- return 1;
16815
- if (result.artifact?.hash)
16816
- return 0.85;
16817
- if (result.provenance && "source_refs" in result.provenance && result.provenance.source_refs.length > 0)
16818
- return 0.75;
16819
- return 0.55;
17127
+ // src/source-ingest.ts
17128
+ function sha256Text(text) {
17129
+ return `sha256:${createHash6("sha256").update(text).digest("hex")}`;
16820
17130
  }
16821
- function citationScore(result) {
16822
- if (result.citation?.chunk_id && (result.source?.uri || result.artifact?.uri))
16823
- return 1;
16824
- if (result.provenance && "citation_required" in result.provenance && result.provenance.citation_required)
16825
- return 0.75;
16826
- if (result.artifact?.uri)
16827
- return 0.65;
16828
- return 0.35;
17131
+ function stripHtml(html) {
17132
+ return html.replace(/<script[\s\S]*?<\/script>/gi, " ").replace(/<style[\s\S]*?<\/style>/gi, " ").replace(/<[^>]+>/g, " ").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&").replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/\s+\n/g, `
17133
+ `).replace(/\n\s+/g, `
17134
+ `).replace(/[ \t]{2,}/g, " ").trim();
16829
17135
  }
16830
- function authorityScore(result) {
16831
- if (result.kind === "wiki_chunk")
16832
- return 0.85;
16833
- if (result.kind === "source_chunk")
16834
- return 0.8;
16835
- if (result.kind === "wiki_page")
16836
- return 0.65;
16837
- return 0.55;
17136
+ async function readS3Text3(uri, config2, safetyPolicy) {
17137
+ const parsed = new URL(uri);
17138
+ const bucket = parsed.hostname;
17139
+ const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ""));
17140
+ if (!bucket || !key)
17141
+ throw new Error(`Invalid S3 source URI: ${uri}`);
17142
+ if (safetyPolicy)
17143
+ assertS3ReadAllowed(uri, safetyPolicy);
17144
+ const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
17145
+ import("@aws-sdk/client-s3"),
17146
+ import("@aws-sdk/credential-providers")
17147
+ ]);
17148
+ const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
17149
+ const client = new S3Client({
17150
+ region: s3Config?.region,
17151
+ credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
17152
+ maxAttempts: s3Config?.max_attempts
17153
+ });
17154
+ const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
17155
+ if (!response.Body)
17156
+ return "";
17157
+ return await response.Body.transformToString();
16838
17158
  }
16839
- function rerank(result, terms) {
16840
- const scores = {
16841
- base_score: result.score,
16842
- exact_score: exactScore(result, terms),
16843
- citation_score: citationScore(result),
16844
- freshness_score: freshnessScore(result),
16845
- authority_score: authorityScore(result)
16846
- };
16847
- const final = Math.min(1, scores.base_score * 0.65 + scores.exact_score * 0.1 + scores.citation_score * 0.1 + scores.freshness_score * 0.1 + scores.authority_score * 0.05);
16848
- const reasons = new Set(result.reasons);
16849
- if (scores.exact_score > 0.5)
16850
- reasons.add("exact_term");
16851
- if (scores.citation_score >= 0.75)
16852
- reasons.add("cited_source");
16853
- if (scores.freshness_score >= 0.85)
16854
- reasons.add("fresh_source");
16855
- return {
16856
- ...result,
16857
- score: Number(final.toFixed(6)),
16858
- reasons: Array.from(reasons),
16859
- rerank: {
16860
- ...scores,
16861
- final_score: Number(final.toFixed(6))
17159
+ async function readWebText(uri, safetyPolicy) {
17160
+ if (safetyPolicy)
17161
+ assertWebSearchAllowed(safetyPolicy);
17162
+ const response = await fetch(uri, {
17163
+ headers: {
17164
+ accept: "text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5",
17165
+ "user-agent": "@hasna/knowledge source-ingest"
16862
17166
  }
16863
- };
17167
+ });
17168
+ if (!response.ok)
17169
+ throw new Error(`Web source read failed ${response.status}: ${uri}`);
17170
+ const mime = response.headers.get("content-type");
17171
+ const body = await response.text();
17172
+ return { text: mime?.includes("html") ? stripHtml(body) : body, mime };
16864
17173
  }
16865
- function quoteFor(result, maxChars) {
16866
- const source = result.text ?? result.title;
16867
- if (!source)
16868
- return null;
16869
- const normalized = source.replace(/\s+/g, " ").trim();
16870
- return normalized.length <= maxChars ? normalized : `${normalized.slice(0, Math.max(0, maxChars - 1)).trim()}...`;
17174
+ function titleForRef(parsed) {
17175
+ if (parsed.kind === "file")
17176
+ return basename3(parsed.path);
17177
+ if (parsed.kind === "s3")
17178
+ return basename3(parsed.key);
17179
+ if (parsed.kind === "web")
17180
+ return basename3(new URL(parsed.url).pathname) || parsed.url;
17181
+ return parsed.path ? basename3(parsed.path) : parsed.id;
17182
+ }
17183
+ async function readDirectSourceText(parsed, config2, safetyPolicy) {
17184
+ if (parsed.kind === "file") {
17185
+ if (!existsSync6(parsed.path))
17186
+ throw new Error(`Source file not found: ${parsed.path}`);
17187
+ const text = readFileSync6(parsed.path, "utf8");
17188
+ return {
17189
+ text,
17190
+ contentSource: "file",
17191
+ title: titleForRef(parsed),
17192
+ mime: "text/plain",
17193
+ size: text.length,
17194
+ hash: sha256Text(text),
17195
+ revision: null,
17196
+ extractedTextRef: null,
17197
+ metadata: { path: parsed.path },
17198
+ permissions: { mode: "read_only" }
17199
+ };
17200
+ }
17201
+ if (parsed.kind === "s3") {
17202
+ const text = await readS3Text3(parsed.uri, config2, safetyPolicy);
17203
+ return {
17204
+ text,
17205
+ contentSource: "s3",
17206
+ title: titleForRef(parsed),
17207
+ mime: "text/plain",
17208
+ size: text.length,
17209
+ hash: sha256Text(text),
17210
+ revision: null,
17211
+ extractedTextRef: null,
17212
+ metadata: { bucket: parsed.bucket, key: parsed.key },
17213
+ permissions: { mode: "read_only" }
17214
+ };
17215
+ }
17216
+ if (parsed.kind === "web") {
17217
+ const web = await readWebText(parsed.url, safetyPolicy);
17218
+ return {
17219
+ text: web.text,
17220
+ contentSource: "web",
17221
+ title: titleForRef(parsed),
17222
+ mime: web.mime,
17223
+ size: web.text.length,
17224
+ hash: sha256Text(web.text),
17225
+ revision: null,
17226
+ extractedTextRef: null,
17227
+ metadata: { url: parsed.url },
17228
+ permissions: { mode: "read_only" }
17229
+ };
17230
+ }
17231
+ throw new Error(`Direct source reading is not available for ${parsed.uri}`);
16871
17232
  }
16872
- function citationFor(result) {
16873
- const id = stableId4("cite", `${result.kind}\x00${result.id}\x00${result.source?.uri ?? ""}\x00${result.artifact?.uri ?? ""}`);
16874
- return {
16875
- id,
16876
- result_id: result.id,
16877
- kind: result.kind,
16878
- source_uri: result.source?.uri ?? null,
16879
- source_ref: result.source?.ref ?? null,
16880
- artifact_uri: result.artifact?.uri ?? null,
16881
- artifact_path: result.artifact?.path ?? null,
16882
- revision: result.source?.revision ?? null,
16883
- hash: result.source?.hash ?? result.artifact?.hash ?? null,
16884
- chunk_id: result.citation?.chunk_id ?? null,
16885
- start_offset: result.citation?.start_offset ?? null,
16886
- end_offset: result.citation?.end_offset ?? null,
16887
- quote: quoteFor(result, 500),
16888
- provenance: result.provenance
16889
- };
17233
+ async function readTextRef(uri, config2, safetyPolicy) {
17234
+ if (uri.startsWith("open-files://")) {
17235
+ throw new Error("Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.");
17236
+ }
17237
+ const parsed = parseSourceRef(uri);
17238
+ const direct = await readDirectSourceText(parsed, config2, safetyPolicy);
17239
+ return { text: direct.text, contentSource: "extracted_text_ref" };
16890
17240
  }
16891
- function excerptFor(result, citation, contextChars) {
16892
- const text = quoteFor(result, contextChars);
16893
- if (!text)
16894
- return null;
17241
+ async function readOpenFilesSourceText(options) {
17242
+ const resolved = await resolveOpenFilesSource({
17243
+ dbPath: options.dbPath,
17244
+ sourceRef: options.sourceRef,
17245
+ purpose: options.purpose ?? "knowledge_index",
17246
+ limit: 100,
17247
+ safetyPolicy: options.safetyPolicy,
17248
+ now: options.now
17249
+ });
17250
+ if (!resolved.resolved) {
17251
+ throw new Error("Open-files source is not in the local knowledge catalog. Ingest an open-files manifest first or use the open-files resolver API.");
17252
+ }
17253
+ if (resolved.revision?.extracted_text_uri && !resolved.content.text_available) {
17254
+ const textRef = await readTextRef(resolved.revision.extracted_text_uri, options.config, options.safetyPolicy);
17255
+ return {
17256
+ text: textRef.text,
17257
+ contentSource: textRef.contentSource,
17258
+ title: resolved.source?.title ?? null,
17259
+ mime: resolved.content.mime,
17260
+ size: textRef.text.length,
17261
+ hash: resolved.revision.hash ?? sha256Text(textRef.text),
17262
+ revision: resolved.revision.revision,
17263
+ extractedTextRef: resolved.revision.extracted_text_uri,
17264
+ metadata: resolved.source?.metadata ?? {},
17265
+ permissions: resolved.source?.permissions ?? { mode: "read_only" }
17266
+ };
17267
+ }
17268
+ if (resolved.chunks.length === 0) {
17269
+ throw new Error("Open-files source has no extracted text chunks yet. Ingest an open-files manifest with extracted_text or extracted_text_ref first.");
17270
+ }
17271
+ const text = resolved.chunks.map((chunk) => chunk.text).join(`
17272
+
17273
+ `);
16895
17274
  return {
16896
- id: stableId4("excerpt", `${result.kind}\x00${result.id}`),
16897
- result_id: result.id,
16898
- citation_id: citation.id,
16899
- kind: result.kind,
16900
17275
  text,
16901
- score: result.score
17276
+ contentSource: "catalog_chunks",
17277
+ title: resolved.source?.title ?? null,
17278
+ mime: resolved.content.mime,
17279
+ size: text.length,
17280
+ hash: resolved.revision?.hash ?? sha256Text(text),
17281
+ revision: resolved.revision?.revision ?? null,
17282
+ extractedTextRef: resolved.revision?.extracted_text_uri ?? null,
17283
+ metadata: resolved.source?.metadata ?? {},
17284
+ permissions: resolved.source?.permissions ?? { mode: "read_only" }
16902
17285
  };
16903
17286
  }
16904
- function placeholders(values) {
16905
- return values.map(() => "?").join(", ");
16906
- }
16907
- function loadGraphEvidence(dbPath, results) {
16908
- const chunkIds = results.map((result) => result.citation?.chunk_id).filter((id) => Boolean(id));
16909
- const wikiPageIds = results.filter((result) => result.kind === "wiki_page").map((result) => result.id);
16910
- const citations = [];
16911
- const backlinks = [];
16912
- if (chunkIds.length === 0 && wikiPageIds.length === 0)
16913
- return { citations, backlinks };
16914
- const db = openKnowledgeDb(dbPath);
16915
- try {
16916
- if (chunkIds.length > 0) {
16917
- citations.push(...db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
16918
- FROM citations
16919
- WHERE chunk_id IN (${placeholders(chunkIds)})
16920
- ORDER BY created_at DESC
16921
- LIMIT 50`).all(...chunkIds));
16922
- }
16923
- if (wikiPageIds.length > 0) {
16924
- citations.push(...db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
16925
- FROM citations
16926
- WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
16927
- ORDER BY created_at DESC
16928
- LIMIT 50`).all(...wikiPageIds));
16929
- backlinks.push(...db.query(`SELECT from_page_id, to_page_id, label
16930
- FROM wiki_backlinks
16931
- WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
16932
- LIMIT 50`).all(...wikiPageIds, ...wikiPageIds));
17287
+ function manifestItemForSource(sourceRef, parsed, resolved, purpose) {
17288
+ const hash2 = resolved.hash ?? sha256Text(resolved.text);
17289
+ const metadata = {
17290
+ ...resolved.metadata,
17291
+ source_ref: sourceRef,
17292
+ content_source: resolved.contentSource,
17293
+ read_only: true
17294
+ };
17295
+ const item = {
17296
+ source_ref: sourceRef,
17297
+ name: resolved.title ?? titleForRef(parsed),
17298
+ mime: resolved.mime ?? "text/plain",
17299
+ size: resolved.size ?? resolved.text.length,
17300
+ hash: hash2,
17301
+ revision: resolved.revision ?? hash2,
17302
+ status: "active",
17303
+ updated_at: new Date().toISOString(),
17304
+ permissions: {
17305
+ mode: "read_only",
17306
+ allowed_purposes: [purpose],
17307
+ ...resolved.permissions
17308
+ },
17309
+ metadata,
17310
+ extracted_text_ref: resolved.extractedTextRef,
17311
+ extracted_text: resolved.text
17312
+ };
17313
+ if (parsed.kind === "open-files") {
17314
+ if (parsed.entity === "file")
17315
+ item.file_id = parsed.id;
17316
+ if (parsed.entity === "source") {
17317
+ item.source_id = parsed.id;
17318
+ item.path = parsed.path;
16933
17319
  }
16934
- } finally {
16935
- db.close();
16936
17320
  }
16937
- return { citations, backlinks };
17321
+ if (parsed.kind === "file")
17322
+ item.path = parsed.path;
17323
+ if (parsed.kind === "s3")
17324
+ item.path = parsed.key;
17325
+ if (parsed.kind === "web")
17326
+ item.url = parsed.url;
17327
+ return item;
16938
17328
  }
16939
- async function retrieveKnowledgeContext(options) {
16940
- const contextChars = Math.max(200, Math.min(options.contextChars ?? 1200, 4000));
16941
- const search = await hybridSearch(options);
16942
- const terms = queryTerms2(search.query);
16943
- const warnings = [...search.warnings];
16944
- const permissionNotes = new Set;
16945
- const freshnessNotes = new Set;
16946
- const filtered = search.results.filter((result) => {
16947
- if (!hasReadOnlyProvenance(result.provenance)) {
16948
- warnings.push(`permission_filtered: ${result.kind}:${result.id}`);
16949
- permissionNotes.add("Dropped a result because provenance was not read-only.");
16950
- return false;
16951
- }
16952
- if (isStale(result.provenance)) {
16953
- warnings.push(`stale_filtered: ${result.kind}:${result.id}`);
16954
- freshnessNotes.add("Dropped a stale result whose source status requires reindexing.");
16955
- return false;
16956
- }
16957
- return true;
17329
+ async function ingestSourceRef(options) {
17330
+ const purpose = options.purpose ?? "knowledge_index";
17331
+ const parsed = parseSourceRef(options.sourceRef);
17332
+ const resolved = parsed.kind === "open-files" ? await readOpenFilesSourceText(options) : await readDirectSourceText(parsed, options.config, options.safetyPolicy);
17333
+ const item = manifestItemForSource(options.sourceRef, parsed, resolved, purpose);
17334
+ const result = await ingestOpenFilesManifestItems({
17335
+ dbPath: options.dbPath,
17336
+ items: [item],
17337
+ sourceLabel: options.sourceRef,
17338
+ readAction: "source_ref_ingest_read",
17339
+ safetyPolicy: options.safetyPolicy,
17340
+ now: options.now
16958
17341
  });
16959
- const results = filtered.map((result) => rerank(result, terms)).sort((a, b) => b.score - a.score || a.id.localeCompare(b.id)).slice(0, search.limit);
16960
- const citations = results.map(citationFor);
16961
- const excerpts = results.map((result, index) => excerptFor(result, citations[index], contextChars)).filter((entry) => Boolean(entry));
16962
- for (const result of results) {
16963
- if (result.provenance && "read_only" in result.provenance && result.provenance.read_only) {
16964
- permissionNotes.add("All source-backed excerpts are read-only and citation-required.");
16965
- }
16966
- if (result.rerank.freshness_score >= 0.85) {
16967
- freshnessNotes.add("Fresh source revision/hash or artifact hash is present for top context.");
16968
- }
16969
- }
16970
17342
  return {
16971
- query: search.query,
16972
- normalized_query: normalizeQuery(search.query),
16973
- created_at: new Date().toISOString(),
16974
- mode: search.mode,
16975
- warnings,
16976
- search_counts: search.counts,
16977
- results,
16978
- citations,
16979
- excerpts,
16980
- graph: loadGraphEvidence(options.dbPath, results),
16981
- notes: {
16982
- permissions: Array.from(permissionNotes),
16983
- freshness: Array.from(freshnessNotes)
16984
- }
17343
+ ...result,
17344
+ source_ref: options.sourceRef,
17345
+ content_source: resolved.contentSource,
17346
+ read_only: true,
17347
+ hash: String(item.hash)
16985
17348
  };
16986
17349
  }
16987
17350
 
16988
17351
  // src/storage-contract.ts
16989
- import { createHash as createHash7, randomUUID as randomUUID4 } from "crypto";
17352
+ import { createHash as createHash7, randomUUID as randomUUID6 } from "crypto";
16990
17353
  var GENERATED_ARTIFACTS = [
16991
17354
  {
16992
17355
  kind: "schema",
@@ -17147,7 +17510,7 @@ function recordStorageObjects(db, objects, now = new Date) {
17147
17510
  `);
17148
17511
  const insert = db.transaction((entries) => {
17149
17512
  for (const entry of entries) {
17150
- statement.run(randomUUID4(), entry.uri, entry.kind, entry.content_type ?? null, entry.hash ?? null, entry.size_bytes ?? null, JSON.stringify({
17513
+ statement.run(randomUUID6(), entry.uri, entry.kind, entry.content_type ?? null, entry.hash ?? null, entry.size_bytes ?? null, JSON.stringify({
17151
17514
  key: entry.key,
17152
17515
  ...entry.metadata ?? {}
17153
17516
  }), timestamp, timestamp);
@@ -17522,6 +17885,14 @@ class KnowledgeService {
17522
17885
  config: this.config()
17523
17886
  });
17524
17887
  }
17888
+ async runPrompt(options) {
17889
+ const workspace = this.ensureWorkspace();
17890
+ return runKnowledgePrompt({
17891
+ ...options,
17892
+ dbPath: workspace.knowledgeDbPath,
17893
+ config: this.config()
17894
+ });
17895
+ }
17525
17896
  }
17526
17897
  function createKnowledgeService(options = {}) {
17527
17898
  return new KnowledgeService(options);
@@ -17703,6 +18074,24 @@ function buildServer() {
17703
18074
  return errorText(error48 instanceof Error ? error48.message : String(error48));
17704
18075
  }
17705
18076
  });
18077
+ registerTool(server, "knowledge_ask", "Knowledge prompt answer", "Answer a prompt using read-only knowledge context and optional AI SDK generation", {
18078
+ scope: scopeField,
18079
+ prompt: exports_external.string().describe("Prompt to answer with the knowledge base"),
18080
+ limit: exports_external.number().optional().describe("Maximum context results"),
18081
+ semantic: exports_external.boolean().optional().describe("Include vector semantic results"),
18082
+ generate: exports_external.boolean().optional().describe("Call AI SDK text generation; omitted returns a local citation draft"),
18083
+ approve_write: exports_external.boolean().optional().describe("Record approval intent for future durable wiki writes"),
18084
+ model: exports_external.string().optional().describe("Model alias/ref, default configured provider default"),
18085
+ dimensions: exports_external.number().optional().describe("Embedding dimensions for deterministic fake mode"),
18086
+ fake: exports_external.boolean().optional().describe("Use deterministic fake embeddings/generation for local tests")
18087
+ }, async ({ scope, prompt, limit, semantic, generate, approve_write, model, dimensions, fake }) => {
18088
+ const service = createKnowledgeService({ scope });
18089
+ try {
18090
+ return jsonText({ ok: true, ...await service.runPrompt({ prompt, limit, semantic, generate, approveWrite: approve_write, modelRef: model, dimensions, fake }) });
18091
+ } catch (error48) {
18092
+ return errorText(error48 instanceof Error ? error48.message : String(error48));
18093
+ }
18094
+ });
17706
18095
  registerTool(server, "ok_add", "Add a knowledge item", "Add a new item to the knowledge store", {
17707
18096
  title: exports_external.string().describe("Item title"),
17708
18097
  content: exports_external.string().describe("Item content/body"),