@hasna/knowledge 0.2.16 → 0.2.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -1
- package/bin/open-knowledge-mcp.js +2553 -1887
- package/bin/open-knowledge.js +104 -83
- package/docs/architecture/ai-native-knowledge-base.md +13 -0
- package/docs/architecture/hybrid-semantic-search.md +8 -0
- package/package.json +2 -1
- package/src/agent.ts +367 -0
- package/src/cli.ts +70 -5
- package/src/mcp.js +37 -0
- package/src/providers.ts +1 -1
- package/src/service.ts +21 -0
- package/src/web-search.ts +330 -0
|
@@ -13660,10 +13660,11 @@ import { existsSync as existsSync7, readFileSync as readFileSync7, writeFileSync
|
|
|
13660
13660
|
// package.json
|
|
13661
13661
|
var package_default = {
|
|
13662
13662
|
name: "@hasna/knowledge",
|
|
13663
|
-
version: "0.2.
|
|
13663
|
+
version: "0.2.18",
|
|
13664
13664
|
description: "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
|
|
13665
13665
|
type: "module",
|
|
13666
13666
|
bin: {
|
|
13667
|
+
knowledge: "bin/open-knowledge.js",
|
|
13667
13668
|
"open-knowledge": "bin/open-knowledge.js",
|
|
13668
13669
|
"open-knowledge-mcp": "bin/open-knowledge-mcp.js"
|
|
13669
13670
|
},
|
|
@@ -14134,8 +14135,8 @@ function createArtifactStore(config2, workspace) {
|
|
|
14134
14135
|
return new LocalArtifactStore(workspace.artifactsDir);
|
|
14135
14136
|
}
|
|
14136
14137
|
|
|
14137
|
-
// src/
|
|
14138
|
-
import {
|
|
14138
|
+
// src/agent.ts
|
|
14139
|
+
import { randomUUID as randomUUID3 } from "crypto";
|
|
14139
14140
|
|
|
14140
14141
|
// src/knowledge-db.ts
|
|
14141
14142
|
import { Database } from "bun:sqlite";
|
|
@@ -14441,6 +14442,7 @@ function getKnowledgeDbStats(path) {
|
|
|
14441
14442
|
}
|
|
14442
14443
|
|
|
14443
14444
|
// src/providers.ts
|
|
14445
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
14444
14446
|
var DEFAULT_PROVIDER_SETTINGS = {
|
|
14445
14447
|
openai: {
|
|
14446
14448
|
api_key_env: "OPENAI_API_KEY",
|
|
@@ -14496,7 +14498,7 @@ var BUILTIN_ALIASES = {
|
|
|
14496
14498
|
"deepseek-reasoning": "deepseek:deepseek-reasoner"
|
|
14497
14499
|
};
|
|
14498
14500
|
function providerConfig(config2) {
|
|
14499
|
-
return config2
|
|
14501
|
+
return config2?.providers ?? {};
|
|
14500
14502
|
}
|
|
14501
14503
|
function providerSettings(config2, provider) {
|
|
14502
14504
|
const configured = providerConfig(config2)[provider] ?? {};
|
|
@@ -14570,6 +14572,80 @@ function assertProviderCredentials(provider, config2, env = process.env) {
|
|
|
14570
14572
|
throw new Error(`Missing ${status.api_key_env} for ${provider}. Set the env var to use this provider.`);
|
|
14571
14573
|
return status;
|
|
14572
14574
|
}
|
|
14575
|
+
async function defaultFactory(provider) {
|
|
14576
|
+
if (provider === "openai") {
|
|
14577
|
+
const { createOpenAI } = await import("@ai-sdk/openai");
|
|
14578
|
+
return createOpenAI;
|
|
14579
|
+
}
|
|
14580
|
+
if (provider === "anthropic") {
|
|
14581
|
+
const { createAnthropic } = await import("@ai-sdk/anthropic");
|
|
14582
|
+
return createAnthropic;
|
|
14583
|
+
}
|
|
14584
|
+
const { createDeepSeek } = await import("@ai-sdk/deepseek");
|
|
14585
|
+
return createDeepSeek;
|
|
14586
|
+
}
|
|
14587
|
+
async function createAiSdkProviderRegistry(options = {}) {
|
|
14588
|
+
const { createProviderRegistry } = await import("ai");
|
|
14589
|
+
const env = options.env ?? process.env;
|
|
14590
|
+
const providers = {};
|
|
14591
|
+
for (const provider of Object.keys(DEFAULT_PROVIDER_SETTINGS)) {
|
|
14592
|
+
const settings = providerSettings(options.config, provider);
|
|
14593
|
+
const apiKey = env[settings.api_key_env];
|
|
14594
|
+
if (!apiKey)
|
|
14595
|
+
continue;
|
|
14596
|
+
const factory = options.factories?.[provider] ?? await defaultFactory(provider);
|
|
14597
|
+
providers[provider] = factory({ apiKey, baseURL: settings.base_url });
|
|
14598
|
+
}
|
|
14599
|
+
return createProviderRegistry(providers);
|
|
14600
|
+
}
|
|
14601
|
+
async function languageModelFor(aliasOrRef, options = {}) {
|
|
14602
|
+
const modelRef = resolveModelRef(aliasOrRef, options.config);
|
|
14603
|
+
const parsed = parseModelRef(modelRef);
|
|
14604
|
+
assertProviderCredentials(parsed.provider, options.config, options.env);
|
|
14605
|
+
const registry2 = await createAiSdkProviderRegistry(options);
|
|
14606
|
+
return registry2.languageModel(modelRef);
|
|
14607
|
+
}
|
|
14608
|
+
function usageNumber(usage, keys) {
|
|
14609
|
+
for (const key of keys) {
|
|
14610
|
+
const value = usage[key];
|
|
14611
|
+
if (typeof value === "number" && Number.isFinite(value))
|
|
14612
|
+
return value;
|
|
14613
|
+
}
|
|
14614
|
+
return 0;
|
|
14615
|
+
}
|
|
14616
|
+
function normalizeAiSdkUsage(input) {
|
|
14617
|
+
const usage = input.usage ?? {};
|
|
14618
|
+
return {
|
|
14619
|
+
provider: input.provider,
|
|
14620
|
+
model: input.model,
|
|
14621
|
+
input_tokens: usageNumber(usage, ["inputTokens", "promptTokens", "input_tokens", "prompt_tokens"]),
|
|
14622
|
+
output_tokens: usageNumber(usage, ["outputTokens", "completionTokens", "output_tokens", "completion_tokens"]),
|
|
14623
|
+
cost_usd: input.costUsd ?? 0,
|
|
14624
|
+
metadata: {
|
|
14625
|
+
usage,
|
|
14626
|
+
provider_metadata: input.providerMetadata ?? {}
|
|
14627
|
+
}
|
|
14628
|
+
};
|
|
14629
|
+
}
|
|
14630
|
+
function recordProviderUsage(db, input) {
|
|
14631
|
+
const id = `usage_${randomUUID2()}`;
|
|
14632
|
+
db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
|
|
14633
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
|
|
14634
|
+
id,
|
|
14635
|
+
input.run_id ?? null,
|
|
14636
|
+
input.provider,
|
|
14637
|
+
input.model,
|
|
14638
|
+
input.input_tokens,
|
|
14639
|
+
input.output_tokens,
|
|
14640
|
+
input.cost_usd,
|
|
14641
|
+
JSON.stringify(input.metadata),
|
|
14642
|
+
input.created_at ?? new Date().toISOString()
|
|
14643
|
+
]);
|
|
14644
|
+
return id;
|
|
14645
|
+
}
|
|
14646
|
+
|
|
14647
|
+
// src/retrieval.ts
|
|
14648
|
+
import { createHash as createHash2 } from "crypto";
|
|
14573
14649
|
|
|
14574
14650
|
// src/provenance.ts
|
|
14575
14651
|
function isStaleStatus(status) {
|
|
@@ -14614,6 +14690,7 @@ function withProvenance(metadata, provenance) {
|
|
|
14614
14690
|
}
|
|
14615
14691
|
|
|
14616
14692
|
// src/embeddings.ts
|
|
14693
|
+
import { createHash } from "crypto";
|
|
14617
14694
|
var DEFAULT_EMBEDDING_MODEL_REF = "openai:text-embedding-3-small";
|
|
14618
14695
|
var DEFAULT_EMBEDDING_DIMENSIONS = 1536;
|
|
14619
14696
|
function embeddingConfig(config2) {
|
|
@@ -14948,1220 +15025,1386 @@ async function searchVectorIndex(options) {
|
|
|
14948
15025
|
}
|
|
14949
15026
|
}
|
|
14950
15027
|
|
|
14951
|
-
// src/
|
|
14952
|
-
|
|
14953
|
-
|
|
14954
|
-
|
|
14955
|
-
|
|
14956
|
-
|
|
14957
|
-
|
|
14958
|
-
|
|
14959
|
-
|
|
14960
|
-
const value = process.env[name];
|
|
14961
|
-
return value === "1" || value === "true" || value === "yes";
|
|
14962
|
-
}
|
|
14963
|
-
function resolveSafetyPolicy(config2, workspace) {
|
|
14964
|
-
const extended = config2;
|
|
14965
|
-
const configuredBuckets = new Set(extended.safety?.network?.allowed_s3_buckets ?? []);
|
|
14966
|
-
if (config2.storage.type === "s3" && config2.storage.s3?.bucket)
|
|
14967
|
-
configuredBuckets.add(config2.storage.s3.bucket);
|
|
14968
|
-
if (process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS) {
|
|
14969
|
-
for (const bucket of process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.split(",").map((entry) => entry.trim()).filter(Boolean)) {
|
|
14970
|
-
configuredBuckets.add(bucket);
|
|
14971
|
-
}
|
|
14972
|
-
}
|
|
14973
|
-
return {
|
|
14974
|
-
mode: config2.mode,
|
|
14975
|
-
allowWriteRoots: [
|
|
14976
|
-
workspace.home,
|
|
14977
|
-
workspace.artifactsDir,
|
|
14978
|
-
workspace.cacheDir,
|
|
14979
|
-
workspace.exportsDir,
|
|
14980
|
-
workspace.indexesDir,
|
|
14981
|
-
workspace.logsDir,
|
|
14982
|
-
workspace.runsDir,
|
|
14983
|
-
workspace.schemasDir,
|
|
14984
|
-
workspace.wikiDir
|
|
14985
|
-
].map((entry) => resolve2(entry)),
|
|
14986
|
-
readOnlySourceAccess: true,
|
|
14987
|
-
network: {
|
|
14988
|
-
webSearchEnabled: extended.safety?.network?.web_search_enabled ?? envEnabled("HASNA_KNOWLEDGE_WEB_SEARCH"),
|
|
14989
|
-
s3ReadsEnabled: extended.safety?.network?.s3_reads_enabled ?? envEnabled("HASNA_KNOWLEDGE_ALLOW_S3_READS"),
|
|
14990
|
-
allowedS3Buckets: [...configuredBuckets].sort()
|
|
14991
|
-
},
|
|
14992
|
-
redaction: {
|
|
14993
|
-
enabled: extended.safety?.redaction?.enabled ?? true
|
|
14994
|
-
},
|
|
14995
|
-
approvals: {
|
|
14996
|
-
generatedWritesRequireApproval: extended.safety?.approvals?.generated_writes_require_approval ?? true
|
|
14997
|
-
}
|
|
14998
|
-
};
|
|
14999
|
-
}
|
|
15000
|
-
function isInside(root, target) {
|
|
15001
|
-
const rel = relative2(root, target);
|
|
15002
|
-
return rel === "" || !rel.startsWith("..") && rel !== ".." && !rel.startsWith(`..${sep2}`);
|
|
15003
|
-
}
|
|
15004
|
-
function assertWriteAllowed(targetPath, policy) {
|
|
15005
|
-
const resolved = resolve2(targetPath);
|
|
15006
|
-
if (!policy.allowWriteRoots.some((root) => isInside(root, resolved))) {
|
|
15007
|
-
throw new Error(`Safety policy denied write outside .hasna/apps/knowledge: ${targetPath}`);
|
|
15008
|
-
}
|
|
15009
|
-
}
|
|
15010
|
-
function assertS3ReadAllowed(uri, policy) {
|
|
15011
|
-
const parsed = new URL(uri);
|
|
15012
|
-
const bucket = parsed.hostname;
|
|
15013
|
-
if (!policy.network.s3ReadsEnabled) {
|
|
15014
|
-
throw new Error("Safety policy denied S3 read. Set safety.network.s3_reads_enabled=true or HASNA_KNOWLEDGE_ALLOW_S3_READS=1.");
|
|
15015
|
-
}
|
|
15016
|
-
if (!policy.network.allowedS3Buckets.includes(bucket)) {
|
|
15017
|
-
throw new Error(`Safety policy denied S3 bucket "${bucket}". Add it to safety.network.allowed_s3_buckets or HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.`);
|
|
15028
|
+
// src/search.ts
|
|
15029
|
+
function parseJsonObject2(value) {
|
|
15030
|
+
if (!value)
|
|
15031
|
+
return {};
|
|
15032
|
+
try {
|
|
15033
|
+
const parsed = JSON.parse(value);
|
|
15034
|
+
return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
|
|
15035
|
+
} catch {
|
|
15036
|
+
return {};
|
|
15018
15037
|
}
|
|
15019
15038
|
}
|
|
15020
|
-
function
|
|
15021
|
-
|
|
15022
|
-
|
|
15039
|
+
function metadataString2(metadata, keys) {
|
|
15040
|
+
for (const key of keys) {
|
|
15041
|
+
const value = metadata[key];
|
|
15042
|
+
if (typeof value === "string" && value.length > 0)
|
|
15043
|
+
return value;
|
|
15023
15044
|
}
|
|
15045
|
+
return null;
|
|
15024
15046
|
}
|
|
15025
|
-
|
|
15026
|
-
|
|
15027
|
-
|
|
15028
|
-
|
|
15029
|
-
|
|
15030
|
-
{ type: "aws_access_key_id", severity: "high", regex: /\bA(?:KIA|SIA)[A-Z0-9]{16}\b/g, replacement: "[REDACTED:aws_access_key_id]" }
|
|
15031
|
-
];
|
|
15032
|
-
function redactSecrets(text, policy) {
|
|
15033
|
-
if (policy && !policy.redaction.enabled)
|
|
15034
|
-
return { text, findings: [] };
|
|
15035
|
-
let output = text;
|
|
15036
|
-
const findings = [];
|
|
15037
|
-
for (const pattern of REDACTION_PATTERNS) {
|
|
15038
|
-
output = output.replace(pattern.regex, (match, ...args) => {
|
|
15039
|
-
const offset = typeof args.at(-2) === "number" ? args.at(-2) : output.indexOf(match);
|
|
15040
|
-
findings.push({
|
|
15041
|
-
type: pattern.type,
|
|
15042
|
-
severity: pattern.severity,
|
|
15043
|
-
start: Math.max(0, offset),
|
|
15044
|
-
end: Math.max(0, offset + match.length)
|
|
15045
|
-
});
|
|
15046
|
-
return pattern.replacement;
|
|
15047
|
-
});
|
|
15047
|
+
function metadataNumber2(metadata, keys) {
|
|
15048
|
+
for (const key of keys) {
|
|
15049
|
+
const value = metadata[key];
|
|
15050
|
+
if (typeof value === "number" && Number.isFinite(value))
|
|
15051
|
+
return value;
|
|
15048
15052
|
}
|
|
15049
|
-
return
|
|
15053
|
+
return null;
|
|
15050
15054
|
}
|
|
15051
|
-
function
|
|
15052
|
-
return
|
|
15055
|
+
function unique(values) {
|
|
15056
|
+
return Array.from(new Set(values));
|
|
15053
15057
|
}
|
|
15054
|
-
function
|
|
15055
|
-
const
|
|
15056
|
-
|
|
15057
|
-
db.run(`INSERT INTO audit_events (id, event_type, action, target_uri, decision, metadata_json, created_at)
|
|
15058
|
-
VALUES (?, ?, ?, ?, ?, ?, ?)`, [
|
|
15059
|
-
id,
|
|
15060
|
-
input.event_type,
|
|
15061
|
-
input.action,
|
|
15062
|
-
input.target_uri ?? null,
|
|
15063
|
-
input.decision,
|
|
15064
|
-
JSON.stringify(input.metadata ?? {}),
|
|
15065
|
-
createdAt
|
|
15066
|
-
]);
|
|
15067
|
-
return id;
|
|
15058
|
+
function queryTerms(query) {
|
|
15059
|
+
const terms = query.normalize("NFKC").toLowerCase().match(/[\p{L}\p{N}_]+/gu) ?? [];
|
|
15060
|
+
return unique(terms.filter((term) => term.length > 0)).slice(0, 16);
|
|
15068
15061
|
}
|
|
15069
|
-
function
|
|
15070
|
-
|
|
15071
|
-
|
|
15072
|
-
|
|
15073
|
-
VALUES (?, ?, ?, ?, ?, ?, ?)`, [
|
|
15074
|
-
`redact_${randomUUID2()}`,
|
|
15075
|
-
input.source_uri ?? null,
|
|
15076
|
-
input.run_id ?? null,
|
|
15077
|
-
finding.severity,
|
|
15078
|
-
finding.type,
|
|
15079
|
-
JSON.stringify({ ...input.metadata ?? {}, start: finding.start, end: finding.end }),
|
|
15080
|
-
createdAt
|
|
15081
|
-
]);
|
|
15082
|
-
}
|
|
15083
|
-
return input.findings.length;
|
|
15062
|
+
function ftsQueryForTerms(terms) {
|
|
15063
|
+
if (terms.length === 0)
|
|
15064
|
+
return null;
|
|
15065
|
+
return terms.map((term) => `${term}*`).join(" OR ");
|
|
15084
15066
|
}
|
|
15085
|
-
|
|
15086
|
-
|
|
15087
|
-
function stableId2(prefix, value) {
|
|
15088
|
-
return `${prefix}_${createHash3("sha256").update(value).digest("hex").slice(0, 20)}`;
|
|
15067
|
+
function escapeLikeTerm(term) {
|
|
15068
|
+
return term.replace(/[\\%_]/g, (char) => `\\${char}`);
|
|
15089
15069
|
}
|
|
15090
|
-
function
|
|
15091
|
-
return
|
|
15070
|
+
function likeParams(terms, fieldsPerTerm) {
|
|
15071
|
+
return terms.flatMap((term) => Array.from({ length: fieldsPerTerm }, () => `%${escapeLikeTerm(term)}%`));
|
|
15092
15072
|
}
|
|
15093
|
-
function
|
|
15094
|
-
|
|
15073
|
+
function scoreFromRank(rank, index) {
|
|
15074
|
+
const rankScore = Number.isFinite(rank) ? 1 / (1 + Math.abs(rank)) : 0;
|
|
15075
|
+
const orderScore = 1 / (1 + index);
|
|
15076
|
+
return roundScore(Math.max(rankScore, orderScore));
|
|
15095
15077
|
}
|
|
15096
|
-
function
|
|
15097
|
-
|
|
15098
|
-
|
|
15099
|
-
|
|
15100
|
-
|
|
15101
|
-
|
|
15102
|
-
|
|
15103
|
-
const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
|
|
15104
|
-
return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
|
|
15105
|
-
}
|
|
15106
|
-
const sourceId = asString(event.source_id);
|
|
15107
|
-
const path = asString(event.path);
|
|
15108
|
-
if (sourceId && path) {
|
|
15109
|
-
return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
|
|
15110
|
-
}
|
|
15111
|
-
throw new Error("Outbox event is missing source_ref, file_id, or source_id/path.");
|
|
15078
|
+
function catalogScore(haystack, terms) {
|
|
15079
|
+
if (terms.length === 0)
|
|
15080
|
+
return 0;
|
|
15081
|
+
const matched = terms.filter((term) => haystack.includes(term)).length;
|
|
15082
|
+
if (matched === 0)
|
|
15083
|
+
return 0;
|
|
15084
|
+
return roundScore(Math.min(0.85, 0.35 + matched / terms.length * 0.5));
|
|
15112
15085
|
}
|
|
15113
|
-
function
|
|
15114
|
-
|
|
15115
|
-
return sourceRef.replace(/\/revision\/[^/]+$/, "");
|
|
15116
|
-
}
|
|
15117
|
-
return sourceRef;
|
|
15086
|
+
function semanticScore(score) {
|
|
15087
|
+
return roundScore(Math.max(0, Math.min(1, (score + 1) / 2)));
|
|
15118
15088
|
}
|
|
15119
|
-
function
|
|
15120
|
-
return
|
|
15089
|
+
function roundScore(score) {
|
|
15090
|
+
return Number(score.toFixed(6));
|
|
15121
15091
|
}
|
|
15122
|
-
function
|
|
15123
|
-
|
|
15092
|
+
function combinedScore(scores, citation) {
|
|
15093
|
+
const keyword = scores.keyword ?? 0;
|
|
15094
|
+
const semantic = scores.semantic ?? 0;
|
|
15095
|
+
const catalog = scores.catalog ?? 0;
|
|
15096
|
+
const citationBoost = citation?.chunk_id ? 0.05 : 0;
|
|
15097
|
+
return roundScore(Math.min(1, keyword * 0.55 + semantic * 0.4 + catalog * 0.35 + citationBoost));
|
|
15124
15098
|
}
|
|
15125
|
-
function
|
|
15126
|
-
|
|
15099
|
+
function existingProvenance(metadata) {
|
|
15100
|
+
const provenance = metadata.provenance;
|
|
15101
|
+
return provenance && typeof provenance === "object" && !Array.isArray(provenance) ? provenance : null;
|
|
15127
15102
|
}
|
|
15128
|
-
function
|
|
15129
|
-
const
|
|
15130
|
-
|
|
15103
|
+
function provenanceForChunk2(row) {
|
|
15104
|
+
const metadata = parseJsonObject2(row.chunk_metadata_json);
|
|
15105
|
+
const existing = existingProvenance(metadata);
|
|
15106
|
+
if (existing)
|
|
15107
|
+
return existing;
|
|
15108
|
+
if (!row.source_revision_id && !row.source_uri)
|
|
15109
|
+
return null;
|
|
15110
|
+
return sourceProvenance({
|
|
15111
|
+
source_ref: metadataString2(metadata, ["source_ref"]),
|
|
15112
|
+
source_uri: row.source_uri ?? metadataString2(metadata, ["source_uri"]),
|
|
15113
|
+
source_kind: row.source_kind ?? metadataString2(metadata, ["source_kind"]),
|
|
15114
|
+
source_revision_id: row.source_revision_id,
|
|
15115
|
+
revision: row.revision ?? metadataString2(metadata, ["revision"]),
|
|
15116
|
+
hash: row.hash ?? metadataString2(metadata, ["hash"]),
|
|
15117
|
+
chunk_id: row.chunk_id,
|
|
15118
|
+
start_offset: row.start_offset ?? metadataNumber2(metadata, ["start_offset"]),
|
|
15119
|
+
end_offset: row.end_offset ?? metadataNumber2(metadata, ["end_offset"]),
|
|
15120
|
+
status: metadataString2(metadata, ["status"]),
|
|
15121
|
+
resolver: "open-files-read-only"
|
|
15122
|
+
});
|
|
15131
15123
|
}
|
|
15132
|
-
function
|
|
15133
|
-
|
|
15134
|
-
const parsed = parseSourceRef(sourceRef);
|
|
15135
|
-
const hash2 = hashFromEvent(event);
|
|
15136
|
-
return {
|
|
15137
|
-
raw: event,
|
|
15138
|
-
eventType: eventType(event),
|
|
15139
|
-
sourceRef,
|
|
15140
|
-
sourceUri: baseSourceUri(sourceRef, parsed),
|
|
15141
|
-
kind: parsed.kind,
|
|
15142
|
-
title: titleFromEvent(event),
|
|
15143
|
-
revision: revisionFromEvent(event, parsed, hash2),
|
|
15144
|
-
hash: hash2,
|
|
15145
|
-
status: asString(event.status)?.toLowerCase() ?? null,
|
|
15146
|
-
updatedAt: asString(event.updated_at) ?? now,
|
|
15147
|
-
acl: event.permissions ?? event.acl ?? undefined
|
|
15148
|
-
};
|
|
15149
|
-
}
|
|
15150
|
-
function parseOutboxText(text) {
|
|
15151
|
-
const trimmed = text.trim();
|
|
15152
|
-
if (!trimmed)
|
|
15124
|
+
function selectFtsChunks(db, ftsQuery, limit) {
|
|
15125
|
+
if (!ftsQuery)
|
|
15153
15126
|
return [];
|
|
15154
|
-
|
|
15155
|
-
|
|
15156
|
-
|
|
15157
|
-
|
|
15158
|
-
|
|
15159
|
-
|
|
15160
|
-
|
|
15161
|
-
|
|
15162
|
-
|
|
15163
|
-
|
|
15164
|
-
|
|
15165
|
-
|
|
15166
|
-
|
|
15167
|
-
|
|
15168
|
-
|
|
15169
|
-
|
|
15170
|
-
|
|
15171
|
-
|
|
15172
|
-
|
|
15173
|
-
|
|
15174
|
-
|
|
15175
|
-
|
|
15176
|
-
|
|
15177
|
-
|
|
15178
|
-
|
|
15179
|
-
|
|
15180
|
-
|
|
15181
|
-
|
|
15182
|
-
|
|
15183
|
-
|
|
15184
|
-
throw error48;
|
|
15185
|
-
return lines.map((line) => {
|
|
15186
|
-
const event = asObject(JSON.parse(line));
|
|
15187
|
-
if (!event)
|
|
15188
|
-
throw new Error("Outbox JSONL entries must be objects.");
|
|
15189
|
-
return event;
|
|
15190
|
-
});
|
|
15191
|
-
}
|
|
15192
|
-
}
|
|
15193
|
-
return trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0).map((line) => {
|
|
15194
|
-
const event = asObject(JSON.parse(line));
|
|
15195
|
-
if (!event)
|
|
15196
|
-
throw new Error("Outbox JSONL entries must be objects.");
|
|
15197
|
-
return event;
|
|
15198
|
-
});
|
|
15127
|
+
return db.query(`SELECT
|
|
15128
|
+
chunks_fts.chunk_id,
|
|
15129
|
+
c.kind AS chunk_kind,
|
|
15130
|
+
c.wiki_page_id,
|
|
15131
|
+
c.text,
|
|
15132
|
+
c.token_count,
|
|
15133
|
+
c.start_offset,
|
|
15134
|
+
c.end_offset,
|
|
15135
|
+
c.metadata_json AS chunk_metadata_json,
|
|
15136
|
+
c.source_revision_id,
|
|
15137
|
+
sr.revision,
|
|
15138
|
+
sr.hash,
|
|
15139
|
+
s.uri AS source_uri,
|
|
15140
|
+
s.kind AS source_kind,
|
|
15141
|
+
s.title AS source_title,
|
|
15142
|
+
wp.path AS wiki_path,
|
|
15143
|
+
wp.title AS wiki_title,
|
|
15144
|
+
wp.artifact_uri AS wiki_artifact_uri,
|
|
15145
|
+
wp.content_hash AS wiki_content_hash,
|
|
15146
|
+
wp.status AS wiki_status,
|
|
15147
|
+
wp.metadata_json AS wiki_metadata_json,
|
|
15148
|
+
bm25(chunks_fts) AS rank
|
|
15149
|
+
FROM chunks_fts
|
|
15150
|
+
JOIN chunks c ON c.id = chunks_fts.chunk_id
|
|
15151
|
+
LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
|
|
15152
|
+
LEFT JOIN sources s ON s.id = sr.source_id
|
|
15153
|
+
LEFT JOIN wiki_pages wp ON wp.id = c.wiki_page_id
|
|
15154
|
+
WHERE chunks_fts MATCH ?
|
|
15155
|
+
ORDER BY rank ASC
|
|
15156
|
+
LIMIT ?`).all(ftsQuery, limit);
|
|
15199
15157
|
}
|
|
15200
|
-
|
|
15201
|
-
|
|
15202
|
-
|
|
15203
|
-
const
|
|
15204
|
-
|
|
15205
|
-
throw new Error(`Invalid S3 outbox URI: ${uri}`);
|
|
15206
|
-
if (safetyPolicy)
|
|
15207
|
-
assertS3ReadAllowed(uri, safetyPolicy);
|
|
15208
|
-
const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
|
|
15209
|
-
import("@aws-sdk/client-s3"),
|
|
15210
|
-
import("@aws-sdk/credential-providers")
|
|
15211
|
-
]);
|
|
15212
|
-
const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
|
|
15213
|
-
const client = new S3Client({
|
|
15214
|
-
region: s3Config?.region,
|
|
15215
|
-
credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
|
|
15216
|
-
maxAttempts: s3Config?.max_attempts
|
|
15217
|
-
});
|
|
15218
|
-
const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
|
|
15219
|
-
if (!response.Body)
|
|
15220
|
-
return "";
|
|
15221
|
-
return await response.Body.transformToString();
|
|
15158
|
+
function catalogWhere(fields, terms) {
|
|
15159
|
+
if (terms.length === 0)
|
|
15160
|
+
return "1 = 0";
|
|
15161
|
+
const clauses = terms.map(() => `(${fields.map((field) => `lower(COALESCE(${field}, '')) LIKE ? ESCAPE '\\'`).join(" OR ")})`);
|
|
15162
|
+
return clauses.join(" OR ");
|
|
15222
15163
|
}
|
|
15223
|
-
|
|
15224
|
-
|
|
15225
|
-
|
|
15226
|
-
|
|
15227
|
-
|
|
15228
|
-
|
|
15164
|
+
function selectWikiPages(db, terms, limit) {
|
|
15165
|
+
const fields = ["path", "title", "artifact_uri", "metadata_json"];
|
|
15166
|
+
return db.query(`SELECT id, path, title, artifact_uri, content_hash, status, metadata_json
|
|
15167
|
+
FROM wiki_pages
|
|
15168
|
+
WHERE status = 'active' AND (${catalogWhere(fields, terms)})
|
|
15169
|
+
ORDER BY updated_at DESC
|
|
15170
|
+
LIMIT ?`).all(...likeParams(terms, fields.length), limit);
|
|
15229
15171
|
}
|
|
15230
|
-
function
|
|
15231
|
-
|
|
15232
|
-
|
|
15233
|
-
|
|
15234
|
-
|
|
15235
|
-
|
|
15236
|
-
|
|
15237
|
-
}
|
|
15238
|
-
}
|
|
15239
|
-
return JSON.stringify({ ...base, ...patch });
|
|
15172
|
+
function selectKnowledgeIndexes(db, terms, limit) {
|
|
15173
|
+
const fields = ["kind", "name", "shard_key", "artifact_uri", "metadata_json"];
|
|
15174
|
+
return db.query(`SELECT id, kind, name, artifact_uri, shard_key, metadata_json
|
|
15175
|
+
FROM knowledge_indexes
|
|
15176
|
+
WHERE ${catalogWhere(fields, terms)}
|
|
15177
|
+
ORDER BY updated_at DESC
|
|
15178
|
+
LIMIT ?`).all(...likeParams(terms, fields.length), limit);
|
|
15240
15179
|
}
|
|
15241
|
-
function
|
|
15242
|
-
const
|
|
15243
|
-
|
|
15244
|
-
|
|
15245
|
-
|
|
15246
|
-
|
|
15247
|
-
|
|
15248
|
-
|
|
15249
|
-
id,
|
|
15250
|
-
|
|
15251
|
-
|
|
15252
|
-
|
|
15253
|
-
|
|
15254
|
-
|
|
15255
|
-
|
|
15256
|
-
|
|
15257
|
-
|
|
15258
|
-
|
|
15259
|
-
|
|
15260
|
-
|
|
15261
|
-
|
|
15262
|
-
|
|
15263
|
-
|
|
15264
|
-
|
|
15265
|
-
|
|
15180
|
+
function chunkResult(row, keywordScore) {
|
|
15181
|
+
const metadata = parseJsonObject2(row.chunk_metadata_json);
|
|
15182
|
+
const provenance = provenanceForChunk2(row);
|
|
15183
|
+
const sourceRef = metadataString2(metadata, ["source_ref"]);
|
|
15184
|
+
const sourceUri = row.source_uri ?? metadataString2(metadata, ["source_uri"]);
|
|
15185
|
+
const isWiki = Boolean(row.wiki_page_id);
|
|
15186
|
+
const result = {
|
|
15187
|
+
kind: isWiki ? "wiki_chunk" : "source_chunk",
|
|
15188
|
+
id: row.chunk_id,
|
|
15189
|
+
title: isWiki ? row.wiki_title : row.source_title,
|
|
15190
|
+
text: row.text,
|
|
15191
|
+
score: 0,
|
|
15192
|
+
scores: { keyword: keywordScore },
|
|
15193
|
+
source: sourceUri || sourceRef ? {
|
|
15194
|
+
uri: sourceUri,
|
|
15195
|
+
ref: sourceRef,
|
|
15196
|
+
kind: row.source_kind ?? metadataString2(metadata, ["source_kind"]),
|
|
15197
|
+
revision: row.revision ?? metadataString2(metadata, ["revision"]),
|
|
15198
|
+
hash: row.hash ?? metadataString2(metadata, ["hash"])
|
|
15199
|
+
} : null,
|
|
15200
|
+
citation: {
|
|
15201
|
+
chunk_id: row.chunk_id,
|
|
15202
|
+
start_offset: row.start_offset,
|
|
15203
|
+
end_offset: row.end_offset
|
|
15204
|
+
},
|
|
15205
|
+
artifact: isWiki ? {
|
|
15206
|
+
uri: row.wiki_artifact_uri,
|
|
15207
|
+
path: row.wiki_path,
|
|
15208
|
+
hash: row.wiki_content_hash,
|
|
15209
|
+
shard_key: row.wiki_path
|
|
15210
|
+
} : null,
|
|
15211
|
+
provenance,
|
|
15212
|
+
reasons: ["keyword_match"]
|
|
15266
15213
|
};
|
|
15267
|
-
|
|
15268
|
-
|
|
15269
|
-
if (asString(event.raw.path))
|
|
15270
|
-
patch.path = event.raw.path;
|
|
15271
|
-
db.run("UPDATE sources SET metadata_json = ?, acl_json = CASE WHEN ? IS NULL THEN acl_json ELSE ? END, updated_at = ? WHERE id = ?", [
|
|
15272
|
-
mergeJson(row.metadata_json, patch),
|
|
15273
|
-
event.acl === undefined ? null : JSON.stringify(event.acl),
|
|
15274
|
-
event.acl === undefined ? null : JSON.stringify(event.acl),
|
|
15275
|
-
event.updatedAt,
|
|
15276
|
-
row.id
|
|
15277
|
-
]);
|
|
15278
|
-
return row.id;
|
|
15214
|
+
result.score = combinedScore(result.scores, result.citation);
|
|
15215
|
+
return result;
|
|
15279
15216
|
}
|
|
15280
|
-
function
|
|
15281
|
-
|
|
15282
|
-
|
|
15283
|
-
const
|
|
15284
|
-
|
|
15285
|
-
|
|
15286
|
-
|
|
15287
|
-
|
|
15288
|
-
|
|
15289
|
-
|
|
15217
|
+
function wikiPageResult(row, terms) {
|
|
15218
|
+
const metadata = parseJsonObject2(row.metadata_json);
|
|
15219
|
+
const score = catalogScore(`${row.path} ${row.title} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
|
|
15220
|
+
const result = {
|
|
15221
|
+
kind: "wiki_page",
|
|
15222
|
+
id: row.id,
|
|
15223
|
+
title: row.title,
|
|
15224
|
+
text: null,
|
|
15225
|
+
score: 0,
|
|
15226
|
+
scores: { catalog: score },
|
|
15227
|
+
source: null,
|
|
15228
|
+
citation: null,
|
|
15229
|
+
artifact: {
|
|
15230
|
+
uri: row.artifact_uri,
|
|
15231
|
+
path: row.path,
|
|
15232
|
+
hash: row.content_hash,
|
|
15233
|
+
shard_key: row.path
|
|
15234
|
+
},
|
|
15235
|
+
provenance: existingProvenance(metadata),
|
|
15236
|
+
reasons: ["wiki_catalog_match"]
|
|
15290
15237
|
};
|
|
15291
|
-
|
|
15292
|
-
|
|
15293
|
-
ON CONFLICT(source_id, revision) DO UPDATE SET
|
|
15294
|
-
hash = COALESCE(excluded.hash, source_revisions.hash),
|
|
15295
|
-
metadata_json = excluded.metadata_json`, [id, sourceId, event.revision, event.hash, asString(event.raw.extracted_text_ref) ?? null, JSON.stringify(metadata), now]);
|
|
15296
|
-
const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, event.revision);
|
|
15297
|
-
return row?.id ?? null;
|
|
15238
|
+
result.score = combinedScore(result.scores, result.citation);
|
|
15239
|
+
return result;
|
|
15298
15240
|
}
|
|
15299
|
-
function
|
|
15300
|
-
|
|
15301
|
-
|
|
15302
|
-
|
|
15303
|
-
|
|
15304
|
-
|
|
15305
|
-
|
|
15306
|
-
|
|
15241
|
+
function indexResult(row, terms) {
|
|
15242
|
+
const metadata = parseJsonObject2(row.metadata_json);
|
|
15243
|
+
const score = catalogScore(`${row.kind} ${row.name} ${row.shard_key ?? ""} ${row.artifact_uri ?? ""} ${row.metadata_json}`.toLowerCase(), terms);
|
|
15244
|
+
const result = {
|
|
15245
|
+
kind: "knowledge_index",
|
|
15246
|
+
id: row.id,
|
|
15247
|
+
title: row.name,
|
|
15248
|
+
text: null,
|
|
15249
|
+
score: 0,
|
|
15250
|
+
scores: { catalog: score },
|
|
15251
|
+
source: null,
|
|
15252
|
+
citation: null,
|
|
15253
|
+
artifact: {
|
|
15254
|
+
uri: row.artifact_uri,
|
|
15255
|
+
path: metadataString2(metadata, ["artifact_key"]),
|
|
15256
|
+
hash: metadataString2(metadata, ["content_hash"]),
|
|
15257
|
+
shard_key: row.shard_key
|
|
15258
|
+
},
|
|
15259
|
+
provenance: existingProvenance(metadata),
|
|
15260
|
+
reasons: ["index_catalog_match"]
|
|
15261
|
+
};
|
|
15262
|
+
result.score = combinedScore(result.scores, result.citation);
|
|
15263
|
+
return result;
|
|
15307
15264
|
}
|
|
15308
|
-
function
|
|
15309
|
-
const
|
|
15310
|
-
|
|
15311
|
-
|
|
15312
|
-
|
|
15313
|
-
|
|
15314
|
-
embeddingsDeleted += row?.n ?? 0;
|
|
15315
|
-
const vectorRow = db.query("SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?").get(chunk.id);
|
|
15316
|
-
vectorEntriesDeleted += vectorRow?.n ?? 0;
|
|
15317
|
-
db.run("DELETE FROM vector_index_entries WHERE chunk_id = ?", [chunk.id]);
|
|
15318
|
-
db.run("DELETE FROM chunk_embeddings WHERE chunk_id = ?", [chunk.id]);
|
|
15319
|
-
db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [chunk.id]);
|
|
15265
|
+
function mergeResult(results, entry) {
|
|
15266
|
+
const key = `${entry.kind}:${entry.id}`;
|
|
15267
|
+
const existing = results.get(key);
|
|
15268
|
+
if (!existing) {
|
|
15269
|
+
results.set(key, entry);
|
|
15270
|
+
return;
|
|
15320
15271
|
}
|
|
15321
|
-
|
|
15322
|
-
|
|
15323
|
-
|
|
15324
|
-
|
|
15325
|
-
}
|
|
15326
|
-
|
|
15327
|
-
|
|
15328
|
-
|
|
15329
|
-
|
|
15330
|
-
|
|
15272
|
+
existing.scores = {
|
|
15273
|
+
keyword: Math.max(existing.scores.keyword ?? 0, entry.scores.keyword ?? 0) || undefined,
|
|
15274
|
+
semantic: Math.max(existing.scores.semantic ?? 0, entry.scores.semantic ?? 0) || undefined,
|
|
15275
|
+
catalog: Math.max(existing.scores.catalog ?? 0, entry.scores.catalog ?? 0) || undefined
|
|
15276
|
+
};
|
|
15277
|
+
existing.reasons = unique([...existing.reasons, ...entry.reasons]);
|
|
15278
|
+
existing.text = existing.text ?? entry.text;
|
|
15279
|
+
existing.title = existing.title ?? entry.title;
|
|
15280
|
+
existing.source = existing.source ?? entry.source;
|
|
15281
|
+
existing.citation = existing.citation ?? entry.citation;
|
|
15282
|
+
existing.artifact = existing.artifact ?? entry.artifact;
|
|
15283
|
+
existing.provenance = existing.provenance ?? entry.provenance;
|
|
15284
|
+
existing.score = combinedScore(existing.scores, existing.citation);
|
|
15331
15285
|
}
|
|
15332
|
-
function
|
|
15333
|
-
|
|
15286
|
+
function sortResults(results) {
|
|
15287
|
+
const kindOrder = {
|
|
15288
|
+
source_chunk: 0,
|
|
15289
|
+
wiki_chunk: 1,
|
|
15290
|
+
wiki_page: 2,
|
|
15291
|
+
knowledge_index: 3
|
|
15292
|
+
};
|
|
15293
|
+
return results.sort((a, b) => {
|
|
15294
|
+
if (b.score !== a.score)
|
|
15295
|
+
return b.score - a.score;
|
|
15296
|
+
return kindOrder[a.kind] - kindOrder[b.kind] || a.id.localeCompare(b.id);
|
|
15297
|
+
});
|
|
15334
15298
|
}
|
|
15335
|
-
async function
|
|
15336
|
-
const
|
|
15337
|
-
if (
|
|
15338
|
-
|
|
15299
|
+
async function hybridSearch(options) {
|
|
15300
|
+
const query = options.query.trim();
|
|
15301
|
+
if (!query)
|
|
15302
|
+
throw new Error("Search query is required.");
|
|
15303
|
+
const limit = Math.max(1, Math.min(options.limit ?? 10, 100));
|
|
15304
|
+
const terms = queryTerms(query);
|
|
15305
|
+
const ftsQuery = ftsQueryForTerms(terms);
|
|
15306
|
+
const semanticEnabled = options.semantic === true || options.fake === true || Boolean(options.modelRef);
|
|
15307
|
+
const warnings = [];
|
|
15308
|
+
let semanticProvider = null;
|
|
15309
|
+
let semanticModel = null;
|
|
15310
|
+
let semanticDimensions = null;
|
|
15311
|
+
let keywordCount = 0;
|
|
15312
|
+
let catalogCount = 0;
|
|
15313
|
+
let semanticCount = 0;
|
|
15314
|
+
const merged = new Map;
|
|
15339
15315
|
migrateKnowledgeDb(options.dbPath);
|
|
15340
|
-
const text = await readOutboxInput(options.input, options.config, options.safetyPolicy);
|
|
15341
|
-
const events = parseOutboxText(text);
|
|
15342
15316
|
const db = openKnowledgeDb(options.dbPath);
|
|
15343
|
-
const runId = `run_${randomUUID3()}`;
|
|
15344
15317
|
try {
|
|
15345
|
-
|
|
15346
|
-
|
|
15347
|
-
|
|
15348
|
-
|
|
15349
|
-
|
|
15350
|
-
|
|
15351
|
-
|
|
15352
|
-
|
|
15353
|
-
"open-files-outbox",
|
|
15354
|
-
JSON.stringify({ path: options.input, events: events.length }),
|
|
15355
|
-
now,
|
|
15356
|
-
now
|
|
15357
|
-
]);
|
|
15358
|
-
const sourcesTouched = new Set;
|
|
15359
|
-
const revisionsTouched = new Set;
|
|
15360
|
-
let chunksDeleted = 0;
|
|
15361
|
-
let embeddingsDeleted = 0;
|
|
15362
|
-
let vectorEntriesDeleted = 0;
|
|
15363
|
-
let staleRevisions = 0;
|
|
15364
|
-
let deletedSources = 0;
|
|
15365
|
-
let movedSources = 0;
|
|
15366
|
-
let permissionUpdates = 0;
|
|
15367
|
-
recordAuditEvent(db, {
|
|
15368
|
-
event_type: "source_read",
|
|
15369
|
-
action: options.input.startsWith("s3://") ? "s3_outbox_read" : "local_outbox_read",
|
|
15370
|
-
target_uri: options.input,
|
|
15371
|
-
decision: "allow",
|
|
15372
|
-
metadata: { events: events.length, read_only: true },
|
|
15373
|
-
created_at: now
|
|
15374
|
-
});
|
|
15375
|
-
events.forEach((raw, index) => {
|
|
15376
|
-
const event = normalizeEvent(raw, now);
|
|
15377
|
-
const sourceId = ensureSource(db, event, now);
|
|
15378
|
-
sourcesTouched.add(sourceId);
|
|
15379
|
-
const createdRevisionId = ensureRevision(db, sourceId, event, now);
|
|
15380
|
-
if (createdRevisionId)
|
|
15381
|
-
revisionsTouched.add(createdRevisionId);
|
|
15382
|
-
const affectedRevisionIds = revisionIdsForEvent(db, sourceId, event);
|
|
15383
|
-
for (const revisionId of affectedRevisionIds) {
|
|
15384
|
-
revisionsTouched.add(revisionId);
|
|
15385
|
-
const invalidation = invalidateRevision(db, revisionId);
|
|
15386
|
-
chunksDeleted += invalidation.chunksDeleted;
|
|
15387
|
-
embeddingsDeleted += invalidation.embeddingsDeleted;
|
|
15388
|
-
vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
|
|
15389
|
-
staleRevisions += 1;
|
|
15390
|
-
}
|
|
15391
|
-
if (isDeleteEvent(event.eventType, event.status))
|
|
15392
|
-
deletedSources += 1;
|
|
15393
|
-
if (isMoveEvent(event.eventType))
|
|
15394
|
-
movedSources += 1;
|
|
15395
|
-
if (isPermissionEvent(event.eventType) || event.acl !== undefined)
|
|
15396
|
-
permissionUpdates += 1;
|
|
15397
|
-
db.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
|
|
15398
|
-
VALUES (?, ?, ?, ?, ?, ?)`, [
|
|
15399
|
-
stableId2("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
|
|
15400
|
-
runId,
|
|
15401
|
-
"info",
|
|
15402
|
-
event.eventType,
|
|
15403
|
-
JSON.stringify({
|
|
15404
|
-
source_ref: event.sourceRef,
|
|
15405
|
-
source_uri: event.sourceUri,
|
|
15406
|
-
revision: event.revision,
|
|
15407
|
-
hash: event.hash,
|
|
15408
|
-
status: event.status,
|
|
15409
|
-
affected_revisions: affectedRevisionIds.length
|
|
15410
|
-
}),
|
|
15411
|
-
event.updatedAt
|
|
15412
|
-
]);
|
|
15413
|
-
});
|
|
15414
|
-
db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
|
|
15415
|
-
VALUES (?, ?, ?, ?, 0, 0, 0, ?, ?)`, [
|
|
15416
|
-
stableId2("usage", runId),
|
|
15417
|
-
runId,
|
|
15418
|
-
"local",
|
|
15419
|
-
"open-files-outbox",
|
|
15420
|
-
JSON.stringify({ note: "No model provider used for outbox invalidation." }),
|
|
15421
|
-
now
|
|
15422
|
-
]);
|
|
15423
|
-
recordAuditEvent(db, {
|
|
15424
|
-
event_type: "write",
|
|
15425
|
-
action: "knowledge_outbox_invalidation",
|
|
15426
|
-
target_uri: options.dbPath,
|
|
15427
|
-
decision: "allow",
|
|
15428
|
-
metadata: {
|
|
15429
|
-
run_id: runId,
|
|
15430
|
-
events: events.length,
|
|
15431
|
-
sources: sourcesTouched.size,
|
|
15432
|
-
revisions: revisionsTouched.size,
|
|
15433
|
-
chunks_deleted: chunksDeleted,
|
|
15434
|
-
embeddings_deleted: embeddingsDeleted,
|
|
15435
|
-
vector_entries_deleted: vectorEntriesDeleted
|
|
15436
|
-
},
|
|
15437
|
-
created_at: now
|
|
15438
|
-
});
|
|
15439
|
-
return {
|
|
15440
|
-
path: options.input,
|
|
15441
|
-
db_path: options.dbPath,
|
|
15442
|
-
run_id: runId,
|
|
15443
|
-
events_seen: events.length,
|
|
15444
|
-
sources_touched: sourcesTouched.size,
|
|
15445
|
-
revisions_touched: revisionsTouched.size,
|
|
15446
|
-
chunks_deleted: chunksDeleted,
|
|
15447
|
-
embeddings_deleted: embeddingsDeleted,
|
|
15448
|
-
vector_entries_deleted: vectorEntriesDeleted,
|
|
15449
|
-
stale_revisions: staleRevisions,
|
|
15450
|
-
deleted_sources: deletedSources,
|
|
15451
|
-
moved_sources: movedSources,
|
|
15452
|
-
permission_updates: permissionUpdates
|
|
15453
|
-
};
|
|
15454
|
-
})();
|
|
15318
|
+
const ftsRows = selectFtsChunks(db, ftsQuery, Math.max(limit * 3, 20));
|
|
15319
|
+
keywordCount = ftsRows.length;
|
|
15320
|
+
ftsRows.forEach((row, index) => mergeResult(merged, chunkResult(row, scoreFromRank(row.rank, index))));
|
|
15321
|
+
const wikiRows = selectWikiPages(db, terms, Math.max(limit, 10));
|
|
15322
|
+
const indexRows = selectKnowledgeIndexes(db, terms, Math.max(limit, 10));
|
|
15323
|
+
catalogCount = wikiRows.length + indexRows.length;
|
|
15324
|
+
wikiRows.forEach((row) => mergeResult(merged, wikiPageResult(row, terms)));
|
|
15325
|
+
indexRows.forEach((row) => mergeResult(merged, indexResult(row, terms)));
|
|
15455
15326
|
} finally {
|
|
15456
15327
|
db.close();
|
|
15457
15328
|
}
|
|
15458
|
-
|
|
15459
|
-
|
|
15460
|
-
// src/manifest-ingest.ts
|
|
15461
|
-
import { createHash as createHash4 } from "crypto";
|
|
15462
|
-
import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
|
|
15463
|
-
import { basename as basename2 } from "path";
|
|
15464
|
-
function stableId3(prefix, value) {
|
|
15465
|
-
return `${prefix}_${createHash4("sha256").update(value).digest("hex").slice(0, 20)}`;
|
|
15466
|
-
}
|
|
15467
|
-
function asObject2(value) {
|
|
15468
|
-
return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
|
|
15469
|
-
}
|
|
15470
|
-
function asString2(value) {
|
|
15471
|
-
return typeof value === "string" && value.length > 0 ? value : undefined;
|
|
15472
|
-
}
|
|
15473
|
-
function asNumber(value) {
|
|
15474
|
-
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
|
15475
|
-
}
|
|
15476
|
-
function buildSourceRefFromItem(item) {
|
|
15477
|
-
const explicit = asString2(item.source_ref) ?? asString2(item.source_uri) ?? asString2(item.uri);
|
|
15478
|
-
if (explicit)
|
|
15479
|
-
return explicit;
|
|
15480
|
-
const fileId = asString2(item.file_id);
|
|
15481
|
-
if (fileId) {
|
|
15482
|
-
const revision = asString2(item.revision_id) ?? asString2(item.revision);
|
|
15483
|
-
const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
|
|
15484
|
-
return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
|
|
15485
|
-
}
|
|
15486
|
-
const sourceId = asString2(item.source_id);
|
|
15487
|
-
const path = asString2(item.path);
|
|
15488
|
-
if (sourceId && path) {
|
|
15489
|
-
return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
|
|
15490
|
-
}
|
|
15491
|
-
throw new Error("Manifest item is missing source_ref, file_id, or source_id/path.");
|
|
15492
|
-
}
|
|
15493
|
-
function baseSourceUri2(sourceRef, parsed) {
|
|
15494
|
-
if (parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id) {
|
|
15495
|
-
return sourceRef.replace(/\/revision\/[^/]+$/, "");
|
|
15496
|
-
}
|
|
15497
|
-
return sourceRef;
|
|
15498
|
-
}
|
|
15499
|
-
function textFromItem(item) {
|
|
15500
|
-
const direct = asString2(item.extracted_text) ?? asString2(item.text) ?? asString2(item.content_text) ?? asString2(item.markdown);
|
|
15501
|
-
if (direct !== undefined)
|
|
15502
|
-
return direct;
|
|
15503
|
-
const content = item.content;
|
|
15504
|
-
return typeof content === "string" ? content : null;
|
|
15505
|
-
}
|
|
15506
|
-
function extractedTextUriFromItem(item) {
|
|
15507
|
-
const direct = asString2(item.extracted_text_ref) ?? asString2(item.extracted_text_uri) ?? asString2(item.text_ref);
|
|
15508
|
-
if (direct)
|
|
15509
|
-
return direct;
|
|
15510
|
-
const content = asObject2(item.content);
|
|
15511
|
-
return asString2(content?.extracted_text_ref) ?? asString2(content?.extracted_text_uri) ?? null;
|
|
15512
|
-
}
|
|
15513
|
-
function titleFromItem(item) {
|
|
15514
|
-
const path = asString2(item.path);
|
|
15515
|
-
return asString2(item.title) ?? asString2(item.name) ?? (path ? basename2(path) : null);
|
|
15516
|
-
}
|
|
15517
|
-
function hashFromItem(item) {
|
|
15518
|
-
return asString2(item.hash) ?? asString2(item.checksum) ?? asString2(item.sha256) ?? null;
|
|
15519
|
-
}
|
|
15520
|
-
function revisionFromItem(item, parsed, hash2) {
|
|
15521
|
-
const revision = asString2(item.revision_id) ?? asString2(item.revision) ?? asString2(item.version_id) ?? (parsed.kind === "open-files" ? parsed.revision_id : undefined) ?? hash2 ?? asString2(item.updated_at);
|
|
15522
|
-
return revision ?? "current";
|
|
15523
|
-
}
|
|
15524
|
-
function metadataFromItem(item, normalized) {
|
|
15525
|
-
const metadata = {};
|
|
15526
|
-
for (const [key, value] of Object.entries(item)) {
|
|
15527
|
-
if (["text", "content", "content_text", "extracted_text", "markdown"].includes(key))
|
|
15528
|
-
continue;
|
|
15529
|
-
metadata[key] = value;
|
|
15530
|
-
}
|
|
15531
|
-
metadata.source_ref = normalized.sourceRef;
|
|
15532
|
-
metadata.source_uri = normalized.sourceUri;
|
|
15533
|
-
metadata.status = normalized.status;
|
|
15534
|
-
return metadata;
|
|
15535
|
-
}
|
|
15536
|
-
function normalizeManifestItem(item, now) {
|
|
15537
|
-
const sourceRef = buildSourceRefFromItem(item);
|
|
15538
|
-
const parsed = parseSourceRef(sourceRef);
|
|
15539
|
-
const sourceUri = baseSourceUri2(sourceRef, parsed);
|
|
15540
|
-
const hash2 = hashFromItem(item);
|
|
15541
|
-
const status = asString2(item.status) ?? "active";
|
|
15542
|
-
return {
|
|
15543
|
-
raw: item,
|
|
15544
|
-
sourceRef,
|
|
15545
|
-
sourceUri,
|
|
15546
|
-
kind: parsed.kind,
|
|
15547
|
-
title: titleFromItem(item),
|
|
15548
|
-
revision: revisionFromItem(item, parsed, hash2),
|
|
15549
|
-
hash: hash2,
|
|
15550
|
-
extractedTextUri: extractedTextUriFromItem(item),
|
|
15551
|
-
text: textFromItem(item),
|
|
15552
|
-
metadata: metadataFromItem(item, { sourceRef, sourceUri, status }),
|
|
15553
|
-
acl: item.permissions ?? item.acl ?? {},
|
|
15554
|
-
status,
|
|
15555
|
-
updatedAt: asString2(item.updated_at) ?? now
|
|
15556
|
-
};
|
|
15557
|
-
}
|
|
15558
|
-
function parseManifestText(text) {
|
|
15559
|
-
const trimmed = text.trim();
|
|
15560
|
-
if (!trimmed)
|
|
15561
|
-
return [];
|
|
15562
|
-
if (trimmed.startsWith("[")) {
|
|
15563
|
-
const parsed = JSON.parse(trimmed);
|
|
15564
|
-
if (!Array.isArray(parsed))
|
|
15565
|
-
throw new Error("Manifest array parse failed.");
|
|
15566
|
-
return parsed.map((entry) => {
|
|
15567
|
-
const item = asObject2(entry);
|
|
15568
|
-
if (!item)
|
|
15569
|
-
throw new Error("Manifest array entries must be objects.");
|
|
15570
|
-
return item;
|
|
15571
|
-
});
|
|
15572
|
-
}
|
|
15573
|
-
if (trimmed.startsWith("{")) {
|
|
15329
|
+
if (semanticEnabled) {
|
|
15574
15330
|
try {
|
|
15575
|
-
const
|
|
15576
|
-
|
|
15577
|
-
|
|
15578
|
-
|
|
15579
|
-
|
|
15580
|
-
|
|
15581
|
-
|
|
15582
|
-
|
|
15583
|
-
|
|
15584
|
-
|
|
15585
|
-
|
|
15331
|
+
const semantic = await searchVectorIndex({
|
|
15332
|
+
dbPath: options.dbPath,
|
|
15333
|
+
query,
|
|
15334
|
+
limit: Math.max(limit * 3, 20),
|
|
15335
|
+
config: options.config,
|
|
15336
|
+
env: options.env,
|
|
15337
|
+
modelRef: options.modelRef,
|
|
15338
|
+
dimensions: options.dimensions,
|
|
15339
|
+
fake: options.fake,
|
|
15340
|
+
batchSize: options.batchSize,
|
|
15341
|
+
maxParallelCalls: options.maxParallelCalls
|
|
15342
|
+
});
|
|
15343
|
+
semanticProvider = semantic.provider;
|
|
15344
|
+
semanticModel = semantic.model;
|
|
15345
|
+
semanticDimensions = semantic.dimensions;
|
|
15346
|
+
semanticCount = semantic.results.length;
|
|
15347
|
+
for (const row of semantic.results) {
|
|
15348
|
+
const result = {
|
|
15349
|
+
kind: "source_chunk",
|
|
15350
|
+
id: row.chunk_id,
|
|
15351
|
+
title: null,
|
|
15352
|
+
text: row.text,
|
|
15353
|
+
score: 0,
|
|
15354
|
+
scores: { semantic: semanticScore(row.score) },
|
|
15355
|
+
source: {
|
|
15356
|
+
uri: row.source_uri,
|
|
15357
|
+
ref: row.source_ref,
|
|
15358
|
+
kind: row.provenance?.source_kind ?? null,
|
|
15359
|
+
revision: row.revision,
|
|
15360
|
+
hash: row.hash
|
|
15361
|
+
},
|
|
15362
|
+
citation: {
|
|
15363
|
+
chunk_id: row.chunk_id,
|
|
15364
|
+
start_offset: row.provenance?.start_offset ?? null,
|
|
15365
|
+
end_offset: row.provenance?.end_offset ?? null
|
|
15366
|
+
},
|
|
15367
|
+
artifact: null,
|
|
15368
|
+
provenance: row.provenance,
|
|
15369
|
+
reasons: ["semantic_match"]
|
|
15370
|
+
};
|
|
15371
|
+
result.score = combinedScore(result.scores, result.citation);
|
|
15372
|
+
mergeResult(merged, result);
|
|
15586
15373
|
}
|
|
15587
|
-
if ("source_ref" in object2 || "source_uri" in object2 || "file_id" in object2)
|
|
15588
|
-
return [object2];
|
|
15589
15374
|
} catch (error48) {
|
|
15590
|
-
|
|
15591
|
-
if (lines.length <= 1)
|
|
15592
|
-
throw error48;
|
|
15593
|
-
return lines.map((line) => {
|
|
15594
|
-
const item = asObject2(JSON.parse(line));
|
|
15595
|
-
if (!item)
|
|
15596
|
-
throw new Error("Manifest JSONL entries must be objects.");
|
|
15597
|
-
return item;
|
|
15598
|
-
});
|
|
15375
|
+
warnings.push(`semantic_search_failed: ${error48 instanceof Error ? error48.message : String(error48)}`);
|
|
15599
15376
|
}
|
|
15600
15377
|
}
|
|
15601
|
-
|
|
15602
|
-
|
|
15603
|
-
|
|
15604
|
-
|
|
15605
|
-
|
|
15606
|
-
|
|
15378
|
+
const results = sortResults(Array.from(merged.values())).slice(0, limit);
|
|
15379
|
+
return {
|
|
15380
|
+
query,
|
|
15381
|
+
limit,
|
|
15382
|
+
mode: {
|
|
15383
|
+
keyword: true,
|
|
15384
|
+
catalog: true,
|
|
15385
|
+
semantic: semanticEnabled
|
|
15386
|
+
},
|
|
15387
|
+
semantic_provider: semanticProvider,
|
|
15388
|
+
semantic_model: semanticModel,
|
|
15389
|
+
semantic_dimensions: semanticDimensions,
|
|
15390
|
+
counts: {
|
|
15391
|
+
keyword_results: keywordCount,
|
|
15392
|
+
catalog_results: catalogCount,
|
|
15393
|
+
semantic_results: semanticCount,
|
|
15394
|
+
merged_results: results.length
|
|
15395
|
+
},
|
|
15396
|
+
warnings,
|
|
15397
|
+
results
|
|
15398
|
+
};
|
|
15607
15399
|
}
|
|
15608
|
-
|
|
15609
|
-
|
|
15610
|
-
|
|
15611
|
-
|
|
15612
|
-
if (!bucket || !key)
|
|
15613
|
-
throw new Error(`Invalid S3 manifest URI: ${uri}`);
|
|
15614
|
-
if (safetyPolicy)
|
|
15615
|
-
assertS3ReadAllowed(uri, safetyPolicy);
|
|
15616
|
-
const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
|
|
15617
|
-
import("@aws-sdk/client-s3"),
|
|
15618
|
-
import("@aws-sdk/credential-providers")
|
|
15619
|
-
]);
|
|
15620
|
-
const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
|
|
15621
|
-
const client = new S3Client({
|
|
15622
|
-
region: s3Config?.region,
|
|
15623
|
-
credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
|
|
15624
|
-
maxAttempts: s3Config?.max_attempts
|
|
15625
|
-
});
|
|
15626
|
-
const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
|
|
15627
|
-
if (!response.Body)
|
|
15628
|
-
return "";
|
|
15629
|
-
return await response.Body.transformToString();
|
|
15400
|
+
|
|
15401
|
+
// src/retrieval.ts
|
|
15402
|
+
/**
 * Builds a deterministic identifier from a prefix and a value.
 *
 * @param {string} prefix - Label placed before the underscore.
 * @param {string} value - Content hashed with SHA-256.
 * @returns {string} `<prefix>_<first 20 hex characters of the digest>`.
 */
function stableId2(prefix, value) {
  const digest = createHash2("sha256").update(value).digest("hex");
  const shortDigest = digest.slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
|
|
15631
|
-
|
|
15632
|
-
|
|
15633
|
-
return readS3Text2(input, config2, safetyPolicy);
|
|
15634
|
-
if (!existsSync5(input))
|
|
15635
|
-
throw new Error(`Manifest not found: ${input}`);
|
|
15636
|
-
return readFileSync5(input, "utf8");
|
|
15405
|
+
/**
 * Canonicalizes a query string for matching.
 *
 * Applies Unicode NFKC normalization, trims the ends, collapses every run of
 * whitespace to a single space and lowercases the result.
 *
 * @param {string} query - Raw query text.
 * @returns {string} The normalized query.
 */
function normalizeQuery(query) {
  const folded = query.normalize("NFKC");
  const collapsed = folded.trim().replace(/\s+/g, " ");
  return collapsed.toLowerCase();
}
|
|
15638
|
-
function
|
|
15639
|
-
|
|
15640
|
-
`);
|
|
15641
|
-
if (!normalized.trim())
|
|
15642
|
-
return [];
|
|
15643
|
-
const chunks = [];
|
|
15644
|
-
let start = 0;
|
|
15645
|
-
while (start < normalized.length) {
|
|
15646
|
-
const hardEnd = Math.min(normalized.length, start + maxChars);
|
|
15647
|
-
let end = hardEnd;
|
|
15648
|
-
if (hardEnd < normalized.length) {
|
|
15649
|
-
const paragraphBreak = normalized.lastIndexOf(`
|
|
15650
|
-
|
|
15651
|
-
`, hardEnd);
|
|
15652
|
-
const sentenceBreak = normalized.lastIndexOf(". ", hardEnd);
|
|
15653
|
-
const candidate = Math.max(paragraphBreak, sentenceBreak);
|
|
15654
|
-
if (candidate > start + Math.floor(maxChars * 0.5))
|
|
15655
|
-
end = candidate + (candidate === paragraphBreak ? 2 : 1);
|
|
15656
|
-
}
|
|
15657
|
-
const chunk = normalized.slice(start, end).trim();
|
|
15658
|
-
if (chunk) {
|
|
15659
|
-
chunks.push({
|
|
15660
|
-
ordinal: chunks.length,
|
|
15661
|
-
text: chunk,
|
|
15662
|
-
startOffset: start,
|
|
15663
|
-
endOffset: end
|
|
15664
|
-
});
|
|
15665
|
-
}
|
|
15666
|
-
if (end >= normalized.length)
|
|
15667
|
-
break;
|
|
15668
|
-
start = Math.max(0, end - overlapChars);
|
|
15669
|
-
}
|
|
15670
|
-
return chunks;
|
|
15408
|
+
/**
 * Splits a query into at most 16 distinct terms.
 *
 * The query is normalized first; a term is a run of Unicode letters, digits
 * or underscores. Duplicates are dropped, keeping first-seen order.
 *
 * @param {string} query - Raw query text.
 * @returns {string[]} Unique terms, empty when the query contains none.
 */
function queryTerms2(query) {
  const tokens = normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu);
  if (!tokens) {
    return [];
  }
  return [...new Set(tokens)].slice(0, 16);
}
|
|
15672
|
-
function
|
|
15673
|
-
|
|
15674
|
-
return Math.max(1, Math.ceil(words * 1.25));
|
|
15411
|
+
/**
 * Produces the lowercase text that term matching runs against.
 *
 * @param {{title?: string|null, text?: string|null}} result - Search result.
 * @returns {string} Title and text (whichever are truthy) joined by a space, lowercased.
 */
function textForResult(result) {
  const parts = [];
  for (const part of [result.title, result.text]) {
    if (part) {
      parts.push(part);
    }
  }
  return parts.join(" ").toLowerCase();
}
|
|
15676
|
-
function
|
|
15677
|
-
|
|
15678
|
-
|
|
15679
|
-
|
|
15680
|
-
|
|
15681
|
-
|
|
15682
|
-
return rows.length;
|
|
15414
|
+
/**
 * Computes the fraction of query terms that occur in a result's text.
 *
 * @param {object} result - Search result passed through `textForResult`.
 * @param {string[]} terms - Query terms to look for (substring match).
 * @returns {number} Matched share in [0, 1], rounded to 6 decimals; 0 when there are no terms.
 */
function exactScore(result, terms) {
  const total = terms.length;
  if (total === 0) {
    return 0;
  }
  const haystack = textForResult(result);
  const hits = terms.reduce((count, term) => count + (haystack.includes(term) ? 1 : 0), 0);
  return Number((hits / total).toFixed(6));
}
|
|
15684
|
-
function
|
|
15685
|
-
|
|
15421
|
+
/**
 * Decides whether a result's provenance is marked read-only.
 *
 * The first flag present (`read_only`, then `read_only_sources`) decides and
 * must be strictly `true`. Missing provenance, or provenance carrying neither
 * flag, is treated as read-only.
 *
 * @param {object|null|undefined} provenance - Provenance record of a result.
 * @returns {boolean} `true` when the result may be used as read-only context.
 */
function hasReadOnlyProvenance(provenance) {
  if (!provenance) {
    return true;
  }
  for (const flag of ["read_only", "read_only_sources"]) {
    if (flag in provenance) {
      return provenance[flag] === true;
    }
  }
  return true;
}
|
|
15430
|
+
/**
 * Reports whether provenance marks its result as stale.
 *
 * A truthy `stale` flag wins outright; otherwise a present `status` field is
 * judged by `isStaleStatus`. Missing provenance is never stale.
 *
 * @param {object|null|undefined} provenance - Provenance record of a result.
 * @returns {boolean} `true` for a truthy `stale` flag, otherwise whatever `isStaleStatus` returns for `status`, otherwise `false`.
 */
function isStale(provenance) {
  if (!provenance) {
    return false;
  }
  const flaggedStale = "stale" in provenance && Boolean(provenance.stale);
  if (flaggedStale) {
    return true;
  }
  return "status" in provenance ? isStaleStatus(provenance.status) : false;
}
|
|
15439
|
+
/**
 * Scores how well a result's freshness can be verified.
 *
 * Tiers, first match wins: stale provenance 0; source hash or revision 1;
 * artifact hash 0.85; non-empty `provenance.source_refs` 0.75; otherwise 0.55.
 *
 * @param {object} result - Search result with optional `source`, `artifact`, `provenance`.
 * @returns {number} Freshness score in [0, 1].
 */
function freshnessScore(result) {
  if (isStale(result.provenance)) {
    return 0;
  }
  const { source, artifact, provenance } = result;
  if (source?.hash || source?.revision) {
    return 1;
  }
  if (artifact?.hash) {
    return 0.85;
  }
  const hasSourceRefs = provenance && "source_refs" in provenance && provenance.source_refs.length > 0;
  return hasSourceRefs ? 0.75 : 0.55;
}
|
|
15450
|
+
/**
 * Scores how precisely a result can be cited.
 *
 * Tiers, first match wins: chunk id plus a source or artifact URI 1;
 * truthy `provenance.citation_required` 0.75; artifact URI alone 0.65;
 * otherwise 0.35.
 *
 * @param {object} result - Search result with optional `citation`, `source`, `artifact`, `provenance`.
 * @returns {number} Citation score in [0, 1].
 */
function citationScore(result) {
  const { citation, source, artifact, provenance } = result;
  const hasLocation = Boolean(source?.uri || artifact?.uri);
  if (citation?.chunk_id && hasLocation) {
    return 1;
  }
  if (provenance && "citation_required" in provenance && provenance.citation_required) {
    return 0.75;
  }
  return artifact?.uri ? 0.65 : 0.35;
}
|
|
15459
|
+
/**
 * Maps a result kind to its fixed authority weight.
 *
 * @param {{kind: string}} result - Search result.
 * @returns {number} 0.85 for wiki chunks, 0.8 for source chunks, 0.65 for wiki pages, 0.55 for anything else.
 */
function authorityScore(result) {
  switch (result.kind) {
    case "wiki_chunk":
      return 0.85;
    case "source_chunk":
      return 0.8;
    case "wiki_page":
      return 0.65;
    default:
      return 0.55;
  }
}
|
|
15468
|
+
/**
 * Re-scores a search result by blending its search score with exact-term,
 * citation, freshness and authority signals.
 *
 * The blend is `0.65 * base + 0.1 * exact + 0.1 * citation + 0.1 * freshness
 * + 0.05 * authority`, capped at 1 and rounded to 6 decimals. Reason tags are
 * added when a signal clears its threshold.
 *
 * @param {object} result - Search result; its `score` is the base score.
 * @param {string[]} terms - Query terms used for the exact-match signal.
 * @returns {object} Copy of `result` with updated `score`, de-duplicated `reasons`, and a `rerank` breakdown.
 */
function rerank(result, terms) {
  const signals = {
    base_score: result.score,
    exact_score: exactScore(result, terms),
    citation_score: citationScore(result),
    freshness_score: freshnessScore(result),
    authority_score: authorityScore(result)
  };
  // Summation order is kept as-is so floating-point results do not shift.
  const blended = Math.min(1, signals.base_score * 0.65 + signals.exact_score * 0.1 + signals.citation_score * 0.1 + signals.freshness_score * 0.1 + signals.authority_score * 0.05);
  const finalScore = Number(blended.toFixed(6));

  const reasonTags = new Set(result.reasons);
  if (signals.exact_score > 0.5) {
    reasonTags.add("exact_term");
  }
  if (signals.citation_score >= 0.75) {
    reasonTags.add("cited_source");
  }
  if (signals.freshness_score >= 0.85) {
    reasonTags.add("fresh_source");
  }

  return {
    ...result,
    score: finalScore,
    reasons: [...reasonTags],
    rerank: {
      ...signals,
      final_score: finalScore
    }
  };
}
|
|
15494
|
+
/**
 * Builds a single-line quote for a result, no longer than `maxChars`.
 *
 * Uses `result.text`, falling back to `result.title` when text is null or
 * undefined. Whitespace runs are collapsed to one space. Text that does not
 * fit is cut and suffixed with `...`; the suffix counts towards the limit,
 * so the returned string never exceeds `maxChars` (the previous version
 * could overshoot by two characters).
 *
 * @param {{text?: string|null, title?: string|null}} result - Search result.
 * @param {number} maxChars - Maximum length of the returned quote.
 * @returns {string|null} The quote, or `null` when the result has no text or title.
 */
function quoteFor(result, maxChars) {
  const source = result.text ?? result.title;
  if (!source) {
    return null;
  }
  const normalized = source.replace(/\s+/g, " ").trim();
  if (normalized.length <= maxChars) {
    return normalized;
  }
  const ellipsis = "...";
  if (maxChars <= ellipsis.length) {
    // No room for any text plus the marker: hard-cut instead of overshooting.
    return normalized.slice(0, Math.max(0, maxChars));
  }
  const head = normalized.slice(0, maxChars - ellipsis.length).trim();
  return `${head}${ellipsis}`;
}
|
|
15501
|
+
/**
 * Builds the citation record for a search result.
 *
 * The citation id is a stable hash of the result kind, result id, source URI
 * and artifact URI. Absent fields become `null`; the quote is capped at 500
 * characters via `quoteFor`.
 *
 * @param {object} result - Search result with optional `source`, `artifact`, `citation`, `provenance`.
 * @returns {object} Citation record describing where the result came from.
 */
function citationFor(result) {
  const { source, artifact, citation: span } = result;
  const identity = `${result.kind}\x00${result.id}\x00${source?.uri ?? ""}\x00${artifact?.uri ?? ""}`;
  return {
    id: stableId2("cite", identity),
    result_id: result.id,
    kind: result.kind,
    source_uri: source?.uri ?? null,
    source_ref: source?.ref ?? null,
    artifact_uri: artifact?.uri ?? null,
    artifact_path: artifact?.path ?? null,
    revision: source?.revision ?? null,
    hash: source?.hash ?? artifact?.hash ?? null,
    chunk_id: span?.chunk_id ?? null,
    start_offset: span?.start_offset ?? null,
    end_offset: span?.end_offset ?? null,
    quote: quoteFor(result, 500),
    provenance: result.provenance
  };
}
|
|
15520
|
+
/**
 * Builds the context excerpt for a result, linked to its citation.
 *
 * @param {object} result - Reranked search result.
 * @param {{id: string}} citation - Citation record created for the same result.
 * @param {number} contextChars - Maximum excerpt length handed to `quoteFor`.
 * @returns {object|null} Excerpt record, or `null` when `quoteFor` yields nothing.
 */
function excerptFor(result, citation, contextChars) {
  const text = quoteFor(result, contextChars);
  return text
    ? {
        id: stableId2("excerpt", `${result.kind}\x00${result.id}`),
        result_id: result.id,
        citation_id: citation.id,
        kind: result.kind,
        text,
        score: result.score
      }
    : null;
}
|
|
15533
|
+
/**
 * Produces the `?, ?, ...` parameter list for an SQL `IN (...)` clause.
 *
 * @param {Array} values - Values that will be bound; only the count matters.
 * @returns {string} One `?` per value, comma-separated; empty for an empty list.
 */
function placeholders(values) {
  const marks = Array.from(values, () => "?");
  return marks.join(", ");
}
|
|
15536
|
+
/**
 * Loads stored citations and wiki backlinks related to a set of results.
 *
 * Citations are looked up by the chunk ids the results cite and by the ids of
 * wiki-page results; backlinks are looked up for wiki-page results in either
 * direction. Each query is capped at 50 rows. The database is opened only
 * when there is something to look up and is always closed afterwards.
 *
 * Ids are de-duplicated before binding. A citation row returned by both the
 * chunk query and the wiki-page query is reported once; previously it was
 * pushed twice.
 *
 * @param {string} dbPath - Path handed to `openKnowledgeDb`.
 * @param {Array<object>} results - Search results (`kind`, `id`, optional `citation.chunk_id`).
 * @returns {{citations: object[], backlinks: object[]}} Related rows; both empty when nothing applies.
 */
function loadGraphEvidence(dbPath, results) {
  const chunkIds = [...new Set(results.map((result) => result.citation?.chunk_id).filter((id) => Boolean(id)))];
  const wikiPageIds = [...new Set(results.filter((result) => result.kind === "wiki_page").map((result) => result.id))];
  const citations = [];
  const backlinks = [];
  if (chunkIds.length === 0 && wikiPageIds.length === 0) {
    return { citations, backlinks };
  }

  // NOTE(review): assumes `citations.id` is unique per row — confirm against the schema.
  const seenCitationIds = new Set();
  const addCitations = (rows) => {
    for (const row of rows) {
      if (row.id != null) {
        if (seenCitationIds.has(row.id)) {
          continue;
        }
        seenCitationIds.add(row.id);
      }
      citations.push(row);
    }
  };

  const db = openKnowledgeDb(dbPath);
  try {
    if (chunkIds.length > 0) {
      addCitations(db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
FROM citations
WHERE chunk_id IN (${placeholders(chunkIds)})
ORDER BY created_at DESC
LIMIT 50`).all(...chunkIds));
    }
    if (wikiPageIds.length > 0) {
      addCitations(db.query(`SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
FROM citations
WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
ORDER BY created_at DESC
LIMIT 50`).all(...wikiPageIds));
      // The id list is bound twice: once for each side of the OR.
      backlinks.push(...db.query(`SELECT from_page_id, to_page_id, label
FROM wiki_backlinks
WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
LIMIT 50`).all(...wikiPageIds, ...wikiPageIds));
    }
  } finally {
    db.close();
  }
  return { citations, backlinks };
}
|
|
15568
|
+
/**
 * Runs a hybrid search and turns the hits into a cited context bundle.
 *
 * Steps: search; drop results whose provenance is not read-only or is stale
 * (each drop adds a warning and a note); rerank the rest; keep the top
 * `search.limit`; derive citations, excerpts and graph evidence from them.
 *
 * @param {object} options - Search options passed to `hybridSearch`; also `contextChars`
 *   (excerpt length, default 1200, clamped to 200..4000) and `dbPath`.
 * @returns {Promise<object>} Bundle with query, mode, warnings, counts, results, citations,
 *   excerpts, graph evidence and permission/freshness notes.
 */
async function retrieveKnowledgeContext(options) {
  const requestedChars = options.contextChars ?? 1200;
  const contextChars = Math.max(200, Math.min(requestedChars, 4000));
  const search = await hybridSearch(options);
  const terms = queryTerms2(search.query);
  const warnings = [...search.warnings];
  const permissionNotes = new Set();
  const freshnessNotes = new Set();

  const admissible = [];
  for (const candidate of search.results) {
    if (!hasReadOnlyProvenance(candidate.provenance)) {
      warnings.push(`permission_filtered: ${candidate.kind}:${candidate.id}`);
      permissionNotes.add("Dropped a result because provenance was not read-only.");
      continue;
    }
    if (isStale(candidate.provenance)) {
      warnings.push(`stale_filtered: ${candidate.kind}:${candidate.id}`);
      freshnessNotes.add("Dropped a stale result whose source status requires reindexing.");
      continue;
    }
    admissible.push(candidate);
  }

  const byScoreThenId = (a, b) => b.score - a.score || a.id.localeCompare(b.id);
  const results = admissible
    .map((candidate) => rerank(candidate, terms))
    .sort(byScoreThenId)
    .slice(0, search.limit);
  const citations = results.map(citationFor);

  // citations[i] belongs to results[i]; excerpts may be shorter because
  // results without quotable text produce no excerpt.
  const excerpts = [];
  results.forEach((entry, position) => {
    const excerpt = excerptFor(entry, citations[position], contextChars);
    if (excerpt) {
      excerpts.push(excerpt);
    }
  });

  for (const entry of results) {
    const { provenance } = entry;
    if (provenance && "read_only" in provenance && provenance.read_only) {
      permissionNotes.add("All source-backed excerpts are read-only and citation-required.");
    }
    if (entry.rerank.freshness_score >= 0.85) {
      freshnessNotes.add("Fresh source revision/hash or artifact hash is present for top context.");
    }
  }

  return {
    query: search.query,
    normalized_query: normalizeQuery(search.query),
    created_at: new Date().toISOString(),
    mode: search.mode,
    warnings,
    search_counts: search.counts,
    results,
    citations,
    excerpts,
    graph: loadGraphEvidence(options.dbPath, results),
    notes: {
      permissions: [...permissionNotes],
      freshness: [...freshnessNotes]
    }
  };
}
|
|
15616
|
+
|
|
15617
|
+
// src/agent.ts
|
|
15618
|
+
/**
 * Estimates a token count from whitespace-separated words.
 *
 * @param {string} text - Text to measure.
 * @returns {number} `ceil(words * 1.25)`, never less than 1.
 */
function estimateTokens(text) {
  const words = text.match(/\S+/g) ?? [];
  const estimate = Math.ceil(words.length * 1.25);
  return Math.max(1, estimate);
}
|
|
15622
|
+
/**
 * Formats the citation label for a zero-based position.
 *
 * @param {number} index - Zero-based position.
 * @returns {string} `C1` for index 0, `C2` for index 1, and so on.
 */
function citationLabel(index) {
  const ordinal = index + 1;
  return `C${ordinal}`;
}
|
|
15625
|
+
/**
 * Builds a plain-text answer from retrieved context.
 *
 * Lists up to five excerpts, each labelled `[C<n>]` and followed by the best
 * available reference for its citation (source ref, source URI, artifact
 * path, artifact URI, or "unknown source").
 *
 * @param {string} prompt - The user's prompt, echoed in the answer.
 * @param {{excerpts: object[], citations: object[]}} context - Retrieved context bundle.
 * @returns {string} Newline-joined answer, or a "no match" message when there are no excerpts.
 */
function localAnswer(prompt, context) {
  const { excerpts, citations } = context;
  if (excerpts.length === 0) {
    return `No indexed knowledge matched the prompt: ${prompt}`;
  }
  const describe = (excerpt, index) => {
    const citation = citations.find((entry) => entry.id === excerpt.citation_id);
    const ref = citation?.source_ref ?? citation?.source_uri ?? citation?.artifact_path ?? citation?.artifact_uri ?? "unknown source";
    return `[${citationLabel(index)}] ${excerpt.text} (${ref})`;
  };
  const header = `Found ${excerpts.length} relevant knowledge excerpt(s) for: ${prompt}`;
  return [header, "", ...excerpts.slice(0, 5).map(describe)].join("\n");
}
|
|
15641
|
+
/**
 * Renders the model prompt: the user prompt, the citation instructions, and
 * the context excerpts and citations as pretty-printed JSON.
 *
 * Citations are labelled `C1`, `C2`, ... by position. Each excerpt carries
 * the label of the citation it references (`excerpt.citation_id`), so a
 * `[Cn]` label means the same thing in both lists even when
 * `context.excerpts` is shorter than `context.citations`. Previously excerpts
 * were labelled by their own position, which could pair an excerpt with the
 * wrong citation. An excerpt whose citation cannot be found falls back to
 * its own position.
 *
 * @param {string} prompt - The user's prompt.
 * @param {{excerpts: object[], citations: object[]}} context - Retrieved context bundle.
 * @returns {string} Prompt text for the model.
 */
function promptForModel(prompt, context) {
  const labelByCitationId = new Map();
  const citations = context.citations.map((citation, index) => {
    const label = citationLabel(index);
    // First occurrence wins if the same citation id appears more than once.
    if (!labelByCitationId.has(citation.id)) {
      labelByCitationId.set(citation.id, label);
    }
    return {
      id: label,
      source_ref: citation.source_ref,
      source_uri: citation.source_uri,
      artifact_path: citation.artifact_path,
      revision: citation.revision,
      hash: citation.hash,
      quote: citation.quote
    };
  });
  const excerpts = context.excerpts.map((excerpt, index) => ({
    id: labelByCitationId.get(excerpt.citation_id) ?? citationLabel(index),
    kind: excerpt.kind,
    text: excerpt.text,
    score: excerpt.score
  }));
  return [
    `Prompt: ${prompt}`,
    "",
    "Use only the provided context. Cite claims with citation ids like [C1]. If context is insufficient, say what is missing.",
    "",
    `Context excerpts:\n${JSON.stringify(excerpts, null, 2)}`,
    "",
    `Citations:\n${JSON.stringify(citations, null, 2)}`
  ].join("\n");
}
|
|
15670
|
+
/**
 * Proposes knowledge-base updates that follow from an answered prompt.
 *
 * @param {string} prompt - The user's prompt; used as the note title, shortened to
 *   77 characters plus `...` when longer than 80.
 * @param {{citations: Array<{id: string}>}} context - Retrieved context bundle.
 * @returns {object[]} A single approval-gated `answer_note` proposal, or an empty list when nothing was cited.
 */
function proposedUpdates(prompt, context) {
  const { citations } = context;
  if (citations.length === 0) {
    return [];
  }
  const title = prompt.length > 80 ? `${prompt.slice(0, 77)}...` : prompt;
  const note = {
    kind: "answer_note",
    title,
    citations: citations.map(({ id }) => id),
    requires_approval: true
  };
  return [note];
}
|
|
15680
|
+
/**
 * Inserts a `knowledge-prompt` row into the `runs` table.
 *
 * Opens the database for this one statement and always closes it again.
 *
 * @param {string} dbPath - Path handed to `openKnowledgeDb`.
 * @param {{runId: string, prompt: string, status: string, provider: *, model: *, metadata: *, now: string}} input -
 *   Row values; `metadata` is stored as JSON and `now` fills both timestamps.
 * @returns {void}
 */
function insertRun(dbPath, input) {
  const db = openKnowledgeDb(dbPath);
  try {
    const { runId, prompt, status, provider, model, metadata, now } = input;
    const params = [
      runId,
      "knowledge-prompt",
      prompt,
      status,
      provider,
      model,
      JSON.stringify(metadata),
      now,
      now
    ];
    db.run(`INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, params);
  } finally {
    db.close();
  }
}
|
|
15699
|
+
/**
 * Appends one row to `run_events` for the given run.
 *
 * The event id is generated here as `evt_<uuid>`. The database handle is
 * opened for this one statement and closed afterwards, even on failure.
 *
 * @param {string} dbPath - Path handed to `openKnowledgeDb`.
 * @param {{ runId: string, level: string, event: string, metadata: object,
 *           now: string }} input
 */
function addRunEvent(dbPath, input) {
  const handle = openKnowledgeDb(dbPath);
  try {
    const values = [
      `evt_${randomUUID3()}`,
      input.runId,
      input.level,
      input.event,
      JSON.stringify(input.metadata),
      input.now
    ];
    handle.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
      VALUES (?, ?, ?, ?, ?, ?)`, values);
  } finally {
    handle.close();
  }
}
|
|
15715
|
+
/**
 * Overwrites the status, provider, model, metadata and `updated_at` of an
 * existing `runs` row identified by `input.runId`.
 *
 * The metadata JSON is replaced wholesale, not merged. The database handle is
 * opened for this one statement and closed afterwards, even on failure.
 *
 * @param {string} dbPath - Path handed to `openKnowledgeDb`.
 * @param {{ runId: string, status: string, provider: string, model: string,
 *           metadata: object, now: string }} input
 */
function updateRun(dbPath, input) {
  const handle = openKnowledgeDb(dbPath);
  try {
    const values = [
      input.status,
      input.provider,
      input.model,
      JSON.stringify(input.metadata),
      input.now,
      input.runId
    ];
    handle.run(`UPDATE runs
      SET status = ?, provider = ?, model = ?, metadata_json = ?, updated_at = ?
      WHERE id = ?`, values);
  } finally {
    handle.close();
  }
}
|
|
15732
|
+
/**
 * Persists token/cost usage for a run by delegating to `recordProviderUsage`.
 *
 * The database handle is opened for this one call and closed afterwards,
 * even when recording throws.
 *
 * @param {string} dbPath - Path handed to `openKnowledgeDb`.
 * @param {string} runId - Run the usage belongs to.
 * @param {{ input_tokens: number, output_tokens: number, cost_usd: number }} usage
 * @param {string} provider
 * @param {string} model
 * @param {string} now - Timestamp stored as `created_at`.
 * @param {object} [metadata={}] - Extra metadata forwarded unchanged.
 */
function recordUsage(dbPath, runId, usage, provider, model, now, metadata = {}) {
  const handle = openKnowledgeDb(dbPath);
  try {
    const record = {
      run_id: runId,
      provider,
      model,
      input_tokens: usage.input_tokens,
      output_tokens: usage.output_tokens,
      cost_usd: usage.cost_usd,
      metadata,
      created_at: now
    };
    recordProviderUsage(handle, record);
  } finally {
    handle.close();
  }
}
|
|
15749
|
+
/**
 * Marks a run as failed: logs an error-level run event carrying the error
 * message, then sets the run row's status to "failed".
 *
 * @param {string} dbPath - Knowledge database path.
 * @param {{ runId: string, event: string, provider: string, model: string,
 *           error: unknown, now: string }} failure
 */
function recordRunFailure(dbPath, failure) {
  const { runId, event, provider, model, error, now } = failure;
  const message = error instanceof Error ? error.message : String(error);
  addRunEvent(dbPath, {
    runId,
    level: "error",
    event,
    metadata: { message },
    now
  });
  updateRun(dbPath, {
    runId,
    status: "failed",
    provider,
    model,
    metadata: {
      generated: false,
      error: message
    },
    now
  });
}

/**
 * Answers a knowledge-base prompt and records the attempt as a run.
 *
 * Steps: insert a run row, retrieve context for the prompt, draft a local
 * answer, optionally replace it with a model-generated (or fake) answer when
 * `options.generate` is set, then record events, usage and the final run
 * status. No durable wiki writes are performed here; proposed updates are
 * only returned.
 *
 * If context retrieval or answer generation throws, the run is marked
 * "failed" (with an error event) and the original error is rethrown, so no
 * run is left behind in its initial "running"/"dry_run" state.
 *
 * @param {object} options
 * @param {string} options.prompt - Prompt text; must be non-empty after trimming.
 * @param {string} options.dbPath - Knowledge database path.
 * @param {boolean} [options.generate] - Generate an answer instead of only drafting.
 * @param {boolean} [options.fake] - With `generate`, produce a fake answer without calling a model.
 * @param {boolean} [options.approveWrite] - Recorded in the run metadata and write policy.
 * @param {string} [options.modelRef] - Model reference; "default" is resolved when omitted.
 * @param {Date} [options.now] - Clock override; one timestamp is used for every record.
 * @returns {Promise<object>} Run id, answer, context, citations, proposed updates, write policy, usage and warnings.
 * @throws {Error} When the prompt is empty, or when retrieval/generation fails.
 */
async function runKnowledgePrompt(options) {
  const prompt = options.prompt.trim();
  if (!prompt)
    throw new Error("Knowledge prompt is required.");
  const now = (options.now ?? new Date()).toISOString();
  const runId = `run_${randomUUID3()}`;
  const modelRef = resolveModelRef(options.modelRef ?? "default", options.config);
  const parsed = parseModelRef(modelRef);
  // Provider/model as first recorded on the run row; reused if retrieval fails.
  const initialProvider = options.generate ? parsed.provider : "local";
  const initialModel = options.generate ? parsed.model : "context-draft";
  migrateKnowledgeDb(options.dbPath);
  insertRun(options.dbPath, {
    runId,
    prompt,
    status: options.generate ? "running" : "dry_run",
    provider: initialProvider,
    model: initialModel,
    metadata: {
      semantic: options.semantic === true || options.fake === true || Boolean(options.modelRef),
      approve_write: options.approveWrite === true,
      generated: options.generate === true
    },
    now
  });
  // Strip the options that only this function understands before forwarding
  // the rest to retrieval.
  const { prompt: _prompt, generate: _generate, approveWrite: _approveWrite, now: _now, ...retrievalOptions } = options;
  let context;
  try {
    context = await retrieveKnowledgeContext({
      ...retrievalOptions,
      query: prompt
    });
  } catch (error) {
    recordRunFailure(options.dbPath, {
      runId,
      event: "context_retrieval_failed",
      provider: initialProvider,
      model: initialModel,
      error,
      now
    });
    throw error;
  }
  addRunEvent(options.dbPath, {
    runId,
    level: "info",
    event: "context_retrieved",
    metadata: {
      results: context.results.length,
      citations: context.citations.length,
      warnings: context.warnings
    },
    now
  });
  let answer = localAnswer(prompt, context);
  let generated = false;
  let provider = "local";
  let model = "context-draft";
  // Estimated usage for the local draft; replaced by provider-reported usage
  // when a real model call succeeds.
  let usage = {
    input_tokens: estimateTokens(prompt) + context.excerpts.reduce((sum, excerpt) => sum + estimateTokens(excerpt.text), 0),
    output_tokens: estimateTokens(answer),
    cost_usd: 0
  };
  const warnings = [...context.warnings];
  if (options.generate) {
    try {
      if (options.fake) {
        generated = true;
        provider = parsed.provider;
        model = parsed.model;
        answer = `Fake generated answer for: ${prompt}\n\n${answer}`;
      } else {
        const { generateText } = await import("ai");
        const languageModel = await languageModelFor(modelRef, {
          config: options.config,
          env: options.env
        });
        const result = await generateText({
          model: languageModel,
          system: "You answer company knowledge-base prompts using only provided context and citation ids.",
          prompt: promptForModel(prompt, context)
        });
        generated = true;
        provider = parsed.provider;
        model = parsed.model;
        answer = result.text;
        const normalized = normalizeAiSdkUsage({
          provider,
          model,
          usage: result.usage,
          providerMetadata: result.providerMetadata
        });
        usage = {
          input_tokens: normalized.input_tokens,
          output_tokens: normalized.output_tokens,
          cost_usd: normalized.cost_usd
        };
      }
    } catch (error) {
      recordRunFailure(options.dbPath, {
        runId,
        event: "answer_generation_failed",
        provider: parsed.provider,
        model: parsed.model,
        error,
        now
      });
      throw error;
    }
  }
  const updates = proposedUpdates(prompt, context);
  const writePolicy = {
    approved: options.approveWrite === true,
    durable_writes_performed: false,
    reason: options.approveWrite ? "Approval flag recorded; durable wiki writing is deferred to the wiki compile task." : "Dry-run mode: proposed wiki updates require approval before durable writes."
  };
  addRunEvent(options.dbPath, {
    runId,
    level: "info",
    event: generated ? "answer_generated" : "answer_drafted",
    metadata: {
      provider,
      model,
      proposed_updates: updates.length,
      durable_writes_performed: false
    },
    now
  });
  recordUsage(options.dbPath, runId, usage, provider, model, now, {
    generated,
    citations: context.citations.length
  });
  updateRun(options.dbPath, {
    runId,
    status: generated ? "completed" : "dry_run",
    provider,
    model,
    metadata: {
      generated,
      citations: context.citations.length,
      proposed_updates: updates.length,
      approve_write: options.approveWrite === true
    },
    now
  });
  return {
    run_id: runId,
    prompt,
    generated,
    provider,
    model,
    answer,
    context,
    citations: context.citations,
    proposed_wiki_updates: updates,
    write_policy: writePolicy,
    usage,
    warnings
  };
}
|
|
15905
|
+
|
|
15906
|
+
// src/outbox-consume.ts
|
|
15907
|
+
import { createHash as createHash4, randomUUID as randomUUID5 } from "crypto";
|
|
15908
|
+
import { existsSync as existsSync4, readFileSync as readFileSync4 } from "fs";
|
|
15909
|
+
import { basename } from "path";
|
|
15910
|
+
|
|
15911
|
+
// src/safety.ts
|
|
15912
|
+
import { createHash as createHash3, randomUUID as randomUUID4 } from "crypto";
|
|
15913
|
+
import { isAbsolute as isAbsolutePath, relative as relative2, resolve as resolve2, sep as sep2 } from "path";
|
|
15914
|
+
/**
 * Reports whether an environment flag is switched on.
 *
 * A flag counts as enabled when its value is "1", "true" or "yes". The
 * comparison ignores surrounding whitespace and letter case, so values such
 * as "TRUE" or " Yes " are honoured as well. Unset variables and every other
 * value are treated as disabled.
 *
 * @param {string} name - Environment variable name.
 * @returns {boolean}
 */
function envEnabled(name) {
  const raw = process.env[name];
  if (typeof raw !== "string")
    return false;
  const normalized = raw.trim().toLowerCase();
  return normalized === "1" || normalized === "true" || normalized === "yes";
}
|
|
15918
|
+
/**
 * Derives the effective safety policy from configuration, workspace layout
 * and environment.
 *
 * Allowed S3 buckets are the union of `safety.network.allowed_s3_buckets`,
 * the configured storage bucket (when storage type is "s3") and the
 * comma-separated HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS variable, sorted.
 * Network switches come from config when set there, otherwise from the
 * HASNA_KNOWLEDGE_WEB_SEARCH / HASNA_KNOWLEDGE_ALLOW_S3_READS flags.
 * Redaction and approval-for-generated-writes default to enabled.
 *
 * @param {object} config2 - Loaded configuration (must have `storage` and `mode`).
 * @param {object} workspace - Workspace directory paths.
 * @returns {object} The resolved policy.
 */
function resolveSafetyPolicy(config2, workspace) {
  const safety = config2.safety;
  const network = safety?.network;
  const buckets = new Set(network?.allowed_s3_buckets ?? []);
  const { storage } = config2;
  if (storage.type === "s3" && storage.s3?.bucket)
    buckets.add(storage.s3.bucket);
  const envBuckets = process.env.HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS;
  if (envBuckets) {
    envBuckets
      .split(",")
      .map((entry) => entry.trim())
      .filter(Boolean)
      .forEach((bucket) => buckets.add(bucket));
  }
  // Every workspace directory the tool is allowed to write into.
  const writableKeys = [
    "home",
    "artifactsDir",
    "cacheDir",
    "exportsDir",
    "indexesDir",
    "logsDir",
    "runsDir",
    "schemasDir",
    "wikiDir"
  ];
  const allowWriteRoots = writableKeys.map((key) => resolve2(workspace[key]));
  return {
    mode: config2.mode,
    allowWriteRoots,
    readOnlySourceAccess: true,
    network: {
      // Config wins; the environment flag is only consulted when config is silent.
      webSearchEnabled: network?.web_search_enabled ?? envEnabled("HASNA_KNOWLEDGE_WEB_SEARCH"),
      s3ReadsEnabled: network?.s3_reads_enabled ?? envEnabled("HASNA_KNOWLEDGE_ALLOW_S3_READS"),
      allowedS3Buckets: [...buckets].sort()
    },
    redaction: {
      enabled: safety?.redaction?.enabled ?? true
    },
    approvals: {
      generatedWritesRequireApproval: safety?.approvals?.generated_writes_require_approval ?? true
    }
  };
}
|
|
15955
|
+
/**
 * Tells whether `target` is `root` itself or lies somewhere beneath it.
 *
 * The decision is made on the relative path from `root` to `target`:
 *  - an empty relative path means both are the same location;
 *  - an absolute result means there is no relative route at all (for example
 *    a different drive on Windows), so the target is outside;
 *  - ".." or a path beginning with "../" climbs out of the root.
 * Names that merely start with two dots (e.g. "..cache") are ordinary
 * children and are accepted.
 *
 * @param {string} root - Directory that bounds the allowed area.
 * @param {string} target - Path to classify.
 * @returns {boolean}
 */
function isInside(root, target) {
  const rel = relative2(root, target);
  if (rel === "")
    return true;
  if (isAbsolutePath(rel))
    return false;
  return rel !== ".." && !rel.startsWith(`..${sep2}`);
}
|
|
15959
|
+
/**
 * Throws unless `targetPath` resolves to a location inside one of the
 * policy's allowed write roots.
 *
 * @param {string} targetPath - Path about to be written.
 * @param {{ allowWriteRoots: string[] }} policy
 * @throws {Error} When no allowed root contains the resolved path.
 */
function assertWriteAllowed(targetPath, policy) {
  const absoluteTarget = resolve2(targetPath);
  const permitted = policy.allowWriteRoots.some((root) => isInside(root, absoluteTarget));
  if (permitted)
    return;
  throw new Error(`Safety policy denied write outside .hasna/apps/knowledge: ${targetPath}`);
}
|
|
15965
|
+
/**
 * Throws unless the policy permits reading from the bucket named in `uri`.
 *
 * The bucket is taken from the URI's host component. S3 reads must be
 * enabled and the bucket must appear in the policy's allow-list.
 *
 * @param {string} uri - An `s3://bucket/key` style URI.
 * @param {{ network: { s3ReadsEnabled: boolean, allowedS3Buckets: string[] } }} policy
 * @throws {TypeError} When `uri` cannot be parsed as a URL.
 * @throws {Error} When S3 reads are disabled or the bucket is not allowed.
 */
function assertS3ReadAllowed(uri, policy) {
  const { hostname: bucketName } = new URL(uri);
  const { network } = policy;
  if (!network.s3ReadsEnabled) {
    throw new Error("Safety policy denied S3 read. Set safety.network.s3_reads_enabled=true or HASNA_KNOWLEDGE_ALLOW_S3_READS=1.");
  }
  const bucketAllowed = network.allowedS3Buckets.includes(bucketName);
  if (!bucketAllowed) {
    throw new Error(`Safety policy denied S3 bucket "${bucketName}". Add it to safety.network.allowed_s3_buckets or HASNA_KNOWLEDGE_ALLOWED_S3_BUCKETS.`);
  }
}
|
|
15975
|
+
/**
 * Throws unless the policy has web search enabled.
 *
 * @param {{ network: { webSearchEnabled: boolean } }} policy
 * @throws {Error} When web search is disabled.
 */
function assertWebSearchAllowed(policy) {
  if (policy.network.webSearchEnabled)
    return;
  throw new Error("Safety policy denied web search. Set safety.network.web_search_enabled=true or HASNA_KNOWLEDGE_WEB_SEARCH=1.");
}
|
|
15980
|
+
// Secret detectors, applied in array order. Order matters: a more specific
// pattern must run before any broader pattern that would also match its
// input. In particular the Anthropic key pattern precedes the OpenAI one,
// because "sk-ant-..." also satisfies the generic "sk-..." expression and
// would otherwise be reported (and replaced) as an OpenAI key.
var REDACTION_PATTERNS = [
  { type: "private_key_block", severity: "high", regex: /-----BEGIN [A-Z ]*PRIVATE KEY-----[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----/g, replacement: "[REDACTED:private_key_block]" },
  { type: "secret_assignment", severity: "high", regex: /\b(?:api[_-]?key|secret|token|password)\s*[:=]\s*['"]?[^'"\s]{8,}/gi, replacement: "[REDACTED:secret_assignment]" },
  { type: "anthropic_api_key", severity: "high", regex: /\bsk-ant-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:anthropic_api_key]" },
  { type: "openai_api_key", severity: "high", regex: /\bsk-[A-Za-z0-9_-]{20,}\b/g, replacement: "[REDACTED:openai_api_key]" },
  { type: "aws_access_key_id", severity: "high", regex: /\bA(?:KIA|SIA)[A-Z0-9]{16}\b/g, replacement: "[REDACTED:aws_access_key_id]" }
];

/**
 * Replaces recognised secrets in `text` with `[REDACTED:<type>]` markers.
 *
 * Each pattern in REDACTION_PATTERNS is applied in turn to the output of the
 * previous one. One finding is recorded per replaced match. A finding's
 * `start`/`end` are offsets into the text as it stood when that pattern ran,
 * i.e. after the replacements made by earlier patterns, so they only equal
 * offsets in the caller's original text when no earlier pattern matched
 * before that position.
 *
 * @param {string} text - Text to scan.
 * @param {{ redaction: { enabled: boolean } }} [policy] - When given with
 *   redaction disabled, the text is returned untouched.
 * @returns {{ text: string, findings: Array<{ type: string, severity: string,
 *   start: number, end: number }> }}
 */
function redactSecrets(text, policy) {
  if (policy && !policy.redaction.enabled)
    return { text, findings: [] };
  let output = text;
  const findings = [];
  for (const pattern of REDACTION_PATTERNS) {
    output = output.replace(pattern.regex, (match, ...args) => {
      // Replacer arguments end with (offset, wholeString) when the pattern has
      // no named groups; fall back to a search if that position is not a number.
      const candidate = args.at(-2);
      const offset = typeof candidate === "number" ? candidate : output.indexOf(match);
      findings.push({
        type: pattern.type,
        severity: pattern.severity,
        start: Math.max(0, offset),
        end: Math.max(0, offset + match.length)
      });
      return pattern.replacement;
    });
  }
  return { text: output, findings };
}
|
|
16006
|
+
/**
 * Generates an identifier for an audit event: "audit_" followed by the first
 * 24 hex characters of a SHA-256 digest.
 *
 * The digest covers the event's type, action, target URI, creation time and
 * metadata, plus a fresh random UUID, so two calls with the same input yield
 * different ids.
 *
 * @param {{ event_type: string, action: string, target_uri?: string,
 *           created_at?: string, metadata?: object }} input
 * @returns {string}
 */
function auditId(input) {
  const target = input.target_uri ?? "";
  const createdAt = input.created_at ?? "";
  const metadataJson = JSON.stringify(input.metadata ?? {});
  const material = `${input.event_type}\x00${input.action}\x00${target}\x00${createdAt}\x00${metadataJson}\x00${randomUUID4()}`;
  const digest = createHash3("sha256").update(material).digest("hex");
  return `audit_${digest.slice(0, 24)}`;
}
|
|
16009
|
+
/**
 * Writes one row to `audit_events` and returns its generated id.
 *
 * `created_at` defaults to the current time; a missing target URI is stored
 * as NULL and missing metadata as an empty JSON object.
 *
 * @param {{ run: Function }} db - Open database handle.
 * @param {{ event_type: string, action: string, decision: string,
 *           target_uri?: string, metadata?: object, created_at?: string }} input
 * @returns {string} The id of the inserted audit event.
 */
function recordAuditEvent(db, input) {
  const createdAt = input.created_at ?? new Date().toISOString();
  const id = auditId({ ...input, created_at: createdAt });
  const row = [
    id,
    input.event_type,
    input.action,
    input.target_uri ?? null,
    input.decision,
    JSON.stringify(input.metadata ?? {}),
    createdAt
  ];
  db.run(`INSERT INTO audit_events (id, event_type, action, target_uri, decision, metadata_json, created_at)
      VALUES (?, ?, ?, ?, ?, ?, ?)`, row);
  return id;
}
|
|
16024
|
+
/**
 * Stores every redaction finding as a row in `redaction_findings`.
 *
 * Each row gets a fresh `redact_<uuid>` id. The finding's start/end offsets
 * are merged into the caller-supplied metadata (offsets win on key clashes).
 * All rows share one `created_at`, defaulting to the current time.
 *
 * @param {{ run: Function }} db - Open database handle.
 * @param {{ findings: Array<{ type: string, severity: string, start: number, end: number }>,
 *           source_uri?: string, run_id?: string, metadata?: object,
 *           created_at?: string }} input
 * @returns {number} Number of findings supplied.
 */
function recordRedactionFindings(db, input) {
  const createdAt = input.created_at ?? new Date().toISOString();
  for (const { severity, type, start, end } of input.findings) {
    const row = [
      `redact_${randomUUID4()}`,
      input.source_uri ?? null,
      input.run_id ?? null,
      severity,
      type,
      JSON.stringify({ ...(input.metadata ?? {}), start, end }),
      createdAt
    ];
    db.run(`INSERT INTO redaction_findings (id, source_uri, run_id, severity, finding_type, metadata_json, created_at)
        VALUES (?, ?, ?, ?, ?, ?, ?)`, row);
  }
  return input.findings.length;
}
|
|
16040
|
+
|
|
16041
|
+
// src/outbox-consume.ts
|
|
16042
|
+
/**
 * Derives a deterministic id: `<prefix>_` followed by the first 20 hex
 * characters of the SHA-256 digest of `value`.
 *
 * @param {string} prefix
 * @param {string} value
 * @returns {string}
 */
function stableId3(prefix, value) {
  const digest = createHash4("sha256").update(value).digest("hex");
  const shortDigest = digest.slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
|
|
16045
|
+
/**
 * Returns `value` when it is a non-null, non-array object; otherwise
 * `undefined`.
 *
 * @param {unknown} value
 * @returns {object | undefined}
 */
function asObject(value) {
  if (!value || typeof value !== "object" || Array.isArray(value))
    return undefined;
  return value;
}
|
|
16048
|
+
/**
 * Returns `value` when it is a non-empty string; otherwise `undefined`.
 *
 * @param {unknown} value
 * @returns {string | undefined}
 */
function asString(value) {
  if (typeof value !== "string")
    return undefined;
  return value.length > 0 ? value : undefined;
}
|
|
16051
|
+
/**
 * Works out the source reference for an outbox event.
 *
 * Precedence:
 *  1. an explicit `source_ref`, `source_uri` or `uri` string;
 *  2. `file_id`, rendered as `open-files://file/<id>` with an optional
 *     `/revision/<rev>` suffix taken from `revision_id` or `revision`;
 *  3. `source_id` together with `path`, rendered as
 *     `open-files://source/<id>/path/<path>`.
 * Identifier parts are percent-encoded.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string}
 * @throws {Error} When none of the identifying fields is present.
 */
function buildSourceRef(event) {
  for (const key of ["source_ref", "source_uri", "uri"]) {
    const explicit = asString(event[key]);
    if (explicit !== undefined)
      return explicit;
  }
  const fileId = asString(event.file_id);
  if (fileId) {
    const revision = asString(event.revision_id) ?? asString(event.revision);
    const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
    if (!revision)
      return fileRef;
    return `${fileRef}/revision/${encodeURIComponent(revision)}`;
  }
  const sourceId = asString(event.source_id);
  const path = asString(event.path);
  if (!sourceId || !path)
    throw new Error("Outbox event is missing source_ref, file_id, or source_id/path.");
  return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
}
|
|
16068
|
+
/**
 * Returns the revision-independent URI for a source reference.
 *
 * For an open-files file reference that carries a revision id, the trailing
 * `/revision/<id>` segment is removed; every other reference is returned
 * unchanged.
 *
 * @param {string} sourceRef - Full source reference.
 * @param {{ kind: string, entity?: string, revision_id?: string }} parsed -
 *   Result of parsing `sourceRef`.
 * @returns {string}
 */
function baseSourceUri(sourceRef, parsed) {
  const isPinnedFile = parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id;
  return isPinnedFile ? sourceRef.replace(/\/revision\/[^/]+$/, "") : sourceRef;
}
|
|
16074
|
+
/**
 * Picks the content hash from an outbox event, trying `hash`, `checksum`
 * and `sha256` in that order.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string | null} The first non-empty string found, else `null`.
 */
function hashFromEvent(event) {
  for (const key of ["hash", "checksum", "sha256"]) {
    const candidate = asString(event[key]);
    if (candidate !== undefined)
      return candidate;
  }
  return null;
}
|
|
16077
|
+
/**
 * Determines the revision label for an outbox event.
 *
 * Order of preference: the event's `revision_id`, `revision` or `version_id`;
 * then the revision id embedded in an open-files source reference; then the
 * content hash; finally `null`.
 *
 * @param {object} event - Raw outbox event.
 * @param {{ kind: string, revision_id?: string }} parsed - Parsed source reference.
 * @param {string | null} hash2 - Content hash of the event, if any.
 * @returns {string | null}
 */
function revisionFromEvent(event, parsed, hash2) {
  const declared = asString(event.revision_id) ?? asString(event.revision) ?? asString(event.version_id);
  if (declared !== undefined)
    return declared;
  const embedded = parsed.kind === "open-files" ? parsed.revision_id : undefined;
  return embedded ?? hash2 ?? null;
}
|
|
16080
|
+
/**
 * Reads the lower-cased event type from an outbox event, trying `event`,
 * `type`, `action` and `change_type` in that order and falling back to
 * "changed".
 *
 * @param {object} event - Raw outbox event.
 * @returns {string}
 */
function eventType(event) {
  for (const key of ["event", "type", "action", "change_type"]) {
    const label = asString(event[key]);
    if (label !== undefined)
      return label.toLowerCase();
  }
  return "changed";
}
|
|
16083
|
+
/**
 * Chooses a display title for an outbox event: its `title`, else its `name`,
 * else the last segment of its `path`, else `null`.
 *
 * @param {object} event - Raw outbox event.
 * @returns {string | null}
 */
function titleFromEvent(event) {
  const explicit = asString(event.title) ?? asString(event.name);
  if (explicit !== undefined)
    return explicit;
  const path = asString(event.path);
  return path ? basename(path) : null;
}
|
|
16087
|
+
/**
 * Converts a raw outbox event into the normalized shape used by the outbox
 * consumer.
 *
 * The source reference is built and parsed first (this throws when the event
 * cannot be identified); the remaining fields are derived from the event,
 * the parsed reference and the content hash. `status` is lower-cased when
 * present, `updatedAt` falls back to `now`, and `acl` prefers `permissions`
 * over `acl`.
 *
 * @param {object} event - Raw outbox event.
 * @param {string} now - Fallback timestamp for `updatedAt`.
 * @returns {object} Normalized event; the original is kept under `raw`.
 */
function normalizeEvent(event, now) {
  const sourceRef = buildSourceRef(event);
  const parsedRef = parseSourceRef(sourceRef);
  const contentHash = hashFromEvent(event);
  const normalized = { raw: event };
  normalized.eventType = eventType(event);
  normalized.sourceRef = sourceRef;
  normalized.sourceUri = baseSourceUri(sourceRef, parsedRef);
  normalized.kind = parsedRef.kind;
  normalized.title = titleFromEvent(event);
  normalized.revision = revisionFromEvent(event, parsedRef, contentHash);
  normalized.hash = contentHash;
  normalized.status = asString(event.status)?.toLowerCase() ?? null;
  normalized.updatedAt = asString(event.updated_at) ?? now;
  normalized.acl = event.permissions ?? event.acl ?? undefined;
  return normalized;
}
|
|
16105
|
+
/**
 * Parses the text of an outbox into a list of event objects.
 *
 * Accepted shapes:
 *  - empty / whitespace-only text: no events;
 *  - a JSON array of event objects;
 *  - a JSON object with an `events` array of event objects;
 *  - a single JSON event object (on one line or pretty-printed);
 *  - JSON Lines: one event object per non-blank line.
 *
 * Only the whole-document `JSON.parse` of text starting with "{" is allowed
 * to fall back to JSON Lines; validation errors about the document's content
 * are reported as-is instead of being retried as JSON Lines.
 *
 * @param {string} text - Raw outbox content.
 * @returns {object[]} Parsed events.
 * @throws {SyntaxError} When the text (or a JSON Lines entry) is not valid JSON.
 * @throws {Error} When an entry is not a JSON object.
 */
function parseOutboxText(text) {
  const trimmed = text.trim();
  if (!trimmed)
    return [];
  const requireObject = (entry, message) => {
    const event = asObject(entry);
    if (!event)
      throw new Error(message);
    return event;
  };
  const nonBlankLines = () => trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
  const parseJsonLines = (lines) => lines.map((line) => requireObject(JSON.parse(line), "Outbox JSONL entries must be objects."));
  if (trimmed.startsWith("[")) {
    const parsed = JSON.parse(trimmed);
    if (!Array.isArray(parsed))
      throw new Error("Outbox array parse failed.");
    return parsed.map((entry) => requireObject(entry, "Outbox array entries must be objects."));
  }
  if (trimmed.startsWith("{")) {
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch (error) {
      // Not one JSON document. With several lines it may still be JSON Lines;
      // with a single line there is nothing else to try.
      const lines = nonBlankLines();
      if (lines.length <= 1)
        throw error;
      return parseJsonLines(lines);
    }
    const document = requireObject(parsed, "Outbox object parse failed.");
    if (Array.isArray(document.events))
      return document.events.map((entry) => requireObject(entry, "Outbox events entries must be objects."));
    // A lone event object, however it is keyed or formatted.
    return [document];
  }
  return parseJsonLines(nonBlankLines());
}
|
|
16155
|
+
/**
 * Downloads an S3 object and returns its body as text.
 *
 * The bucket comes from the URI host and the key from the URI path (leading
 * slashes removed, percent-decoded). When a safety policy is supplied the
 * read must be allowed by it. Region, credentials profile and retry count
 * are taken from the configured S3 storage only when that storage targets
 * the same bucket; otherwise the SDK defaults apply. The AWS SDK modules are
 * loaded lazily.
 *
 * @param {string} uri - `s3://bucket/key` URI.
 * @param {object} [config2] - Loaded configuration.
 * @param {object} [safetyPolicy] - Resolved safety policy.
 * @returns {Promise<string>} Object body, or "" when the response has no body.
 * @throws {Error} When the URI lacks a bucket or key, or the policy denies the read.
 */
async function readS3Text(uri, config2, safetyPolicy) {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ""));
  if (!bucket || !key)
    throw new Error(`Invalid S3 outbox URI: ${uri}`);
  if (safetyPolicy)
    assertS3ReadAllowed(uri, safetyPolicy);
  const [s3Module, credentialsModule] = await Promise.all([
    import("@aws-sdk/client-s3"),
    import("@aws-sdk/credential-providers")
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialsModule;
  const usesConfiguredBucket = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket;
  const s3Config = usesConfiguredBucket ? config2.storage.s3 : undefined;
  const client = new S3Client({
    region: s3Config?.region,
    credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
    maxAttempts: s3Config?.max_attempts
  });
  const request = new GetObjectCommand({ Bucket: bucket, Key: key });
  const response = await client.send(request);
  if (!response.Body)
    return "";
  return await response.Body.transformToString();
}
|
|
16178
|
+
/**
 * Loads the raw outbox text from either S3 or the local filesystem.
 *
 * Inputs beginning with "s3://" are fetched through `readS3Text`; anything
 * else is treated as a local file path and read as UTF-8.
 *
 * @param {string} input - S3 URI or local path.
 * @param {object} [config2] - Loaded configuration (used for S3 reads).
 * @param {object} [safetyPolicy] - Resolved safety policy (used for S3 reads).
 * @returns {Promise<string>}
 * @throws {Error} When a local path does not exist.
 */
async function readOutboxInput(input, config2, safetyPolicy) {
  const isRemote = input.startsWith("s3://");
  if (isRemote)
    return readS3Text(input, config2, safetyPolicy);
  const present = existsSync4(input);
  if (!present)
    throw new Error(`Outbox not found: ${input}`);
  return readFileSync4(input, "utf8");
}
|
|
16185
|
+
/**
 * Shallow-merges `patch` over a JSON-encoded object and returns the result
 * as JSON text.
 *
 * The existing value is used as the base only when it parses to a plain
 * object; missing, unparseable or non-object values count as `{}`. Keys in
 * `patch` override keys in the base.
 *
 * @param {string | null | undefined} existing - Stored JSON text.
 * @param {object} patch - Keys to set.
 * @returns {string}
 */
function mergeJson(existing, patch) {
  const decodeBase = () => {
    try {
      return asObject(JSON.parse(existing)) ?? {};
    } catch {
      // Corrupt stored JSON is deliberately treated as an empty base.
      return {};
    }
  };
  const base = existing ? decodeBase() : {};
  return JSON.stringify({ ...base, ...patch });
}
|
|
16196
|
+
function ensureSource(db, event, now) {
|
|
16197
|
+
const id = stableId3("src", event.sourceUri);
|
|
15686
16198
|
db.run(`INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)
|
|
15687
16199
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
15688
16200
|
ON CONFLICT(uri) DO UPDATE SET
|
|
15689
16201
|
kind = excluded.kind,
|
|
15690
|
-
title = excluded.title,
|
|
15691
|
-
metadata_json = excluded.metadata_json,
|
|
15692
|
-
acl_json = excluded.acl_json,
|
|
16202
|
+
title = COALESCE(excluded.title, sources.title),
|
|
15693
16203
|
updated_at = excluded.updated_at`, [
|
|
15694
|
-
|
|
15695
|
-
|
|
15696
|
-
|
|
15697
|
-
|
|
15698
|
-
JSON.stringify(
|
|
15699
|
-
JSON.stringify(
|
|
16204
|
+
id,
|
|
16205
|
+
event.sourceUri,
|
|
16206
|
+
event.kind,
|
|
16207
|
+
event.title,
|
|
16208
|
+
JSON.stringify({ source_ref: event.sourceRef, source_uri: event.sourceUri, status: event.status, last_outbox_event: event.eventType }),
|
|
16209
|
+
JSON.stringify(event.acl ?? {}),
|
|
15700
16210
|
now,
|
|
15701
|
-
|
|
16211
|
+
event.updatedAt
|
|
15702
16212
|
]);
|
|
15703
|
-
const row = db.query("SELECT id FROM sources WHERE uri = ?").get(
|
|
16213
|
+
const row = db.query("SELECT id, metadata_json, acl_json FROM sources WHERE uri = ?").get(event.sourceUri);
|
|
15704
16214
|
if (!row)
|
|
15705
|
-
throw new Error(`Failed to upsert source: ${
|
|
15706
|
-
|
|
15707
|
-
|
|
15708
|
-
|
|
15709
|
-
|
|
15710
|
-
|
|
15711
|
-
|
|
15712
|
-
|
|
15713
|
-
|
|
15714
|
-
|
|
15715
|
-
|
|
15716
|
-
|
|
15717
|
-
|
|
15718
|
-
|
|
15719
|
-
|
|
15720
|
-
|
|
15721
|
-
|
|
15722
|
-
now
|
|
16215
|
+
throw new Error(`Failed to upsert source for outbox event: ${event.sourceUri}`);
|
|
16216
|
+
const patch = {
|
|
16217
|
+
source_ref: event.sourceRef,
|
|
16218
|
+
source_uri: event.sourceUri,
|
|
16219
|
+
last_outbox_event: event.eventType,
|
|
16220
|
+
last_outbox_at: event.updatedAt
|
|
16221
|
+
};
|
|
16222
|
+
if (event.status)
|
|
16223
|
+
patch.status = event.status;
|
|
16224
|
+
if (asString(event.raw.path))
|
|
16225
|
+
patch.path = event.raw.path;
|
|
16226
|
+
db.run("UPDATE sources SET metadata_json = ?, acl_json = CASE WHEN ? IS NULL THEN acl_json ELSE ? END, updated_at = ? WHERE id = ?", [
|
|
16227
|
+
mergeJson(row.metadata_json, patch),
|
|
16228
|
+
event.acl === undefined ? null : JSON.stringify(event.acl),
|
|
16229
|
+
event.acl === undefined ? null : JSON.stringify(event.acl),
|
|
16230
|
+
event.updatedAt,
|
|
16231
|
+
row.id
|
|
15723
16232
|
]);
|
|
15724
|
-
const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, item.revision);
|
|
15725
|
-
if (!row)
|
|
15726
|
-
throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
|
|
15727
16233
|
return row.id;
|
|
15728
16234
|
}
|
|
15729
|
-
function
|
|
15730
|
-
if (!
|
|
15731
|
-
return
|
|
15732
|
-
const
|
|
15733
|
-
|
|
15734
|
-
|
|
15735
|
-
|
|
15736
|
-
|
|
15737
|
-
|
|
15738
|
-
|
|
15739
|
-
|
|
15740
|
-
|
|
15741
|
-
|
|
15742
|
-
|
|
15743
|
-
|
|
15744
|
-
|
|
15745
|
-
|
|
15746
|
-
|
|
15747
|
-
});
|
|
15748
|
-
}
|
|
15749
|
-
const chunks = chunkText(redacted.text, maxChars, overlapChars);
|
|
15750
|
-
for (const chunk of chunks) {
|
|
15751
|
-
const chunkId = stableId3("chk", `${sourceRevisionId}\x00${chunk.ordinal}\x00${chunk.text}`);
|
|
15752
|
-
const provenance = sourceProvenance({
|
|
15753
|
-
source_ref: item.sourceRef,
|
|
15754
|
-
source_uri: item.sourceUri,
|
|
15755
|
-
source_kind: item.kind,
|
|
15756
|
-
source_revision_id: sourceRevisionId,
|
|
15757
|
-
revision: item.revision,
|
|
15758
|
-
hash: item.hash,
|
|
15759
|
-
chunk_id: chunkId,
|
|
15760
|
-
start_offset: chunk.startOffset,
|
|
15761
|
-
end_offset: chunk.endOffset,
|
|
15762
|
-
status: item.status,
|
|
15763
|
-
resolver: "open-files-read-only"
|
|
15764
|
-
});
|
|
15765
|
-
const metadata = withProvenance({
|
|
15766
|
-
source_ref: item.sourceRef,
|
|
15767
|
-
source_uri: item.sourceUri,
|
|
15768
|
-
source_kind: item.kind,
|
|
15769
|
-
source_revision_id: sourceRevisionId,
|
|
15770
|
-
revision: item.revision,
|
|
15771
|
-
hash: item.hash,
|
|
15772
|
-
status: item.status,
|
|
15773
|
-
path: asString2(item.raw.path) ?? null,
|
|
15774
|
-
mime: asString2(item.raw.mime) ?? asString2(item.raw.content_type) ?? null,
|
|
15775
|
-
size: asNumber(item.raw.size) ?? null
|
|
15776
|
-
}, provenance);
|
|
15777
|
-
db.run(`INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)
|
|
15778
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
|
|
15779
|
-
chunkId,
|
|
15780
|
-
sourceRevisionId,
|
|
15781
|
-
"source",
|
|
15782
|
-
chunk.ordinal,
|
|
15783
|
-
chunk.text,
|
|
15784
|
-
estimateTokenCount(chunk.text),
|
|
15785
|
-
chunk.startOffset,
|
|
15786
|
-
chunk.endOffset,
|
|
15787
|
-
JSON.stringify(metadata),
|
|
15788
|
-
now
|
|
15789
|
-
]);
|
|
15790
|
-
db.run("INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)", [chunkId, chunk.text, item.title ?? "", item.sourceUri]);
|
|
15791
|
-
}
|
|
15792
|
-
return { chunksInserted: chunks.length, redactions: redacted.findings.length };
|
|
15793
|
-
}
|
|
15794
|
-
async function ingestOpenFilesManifest(options) {
|
|
15795
|
-
const now = options.now ?? new Date;
|
|
15796
|
-
if (options.safetyPolicy)
|
|
15797
|
-
assertWriteAllowed(options.dbPath, options.safetyPolicy);
|
|
15798
|
-
migrateKnowledgeDb(options.dbPath);
|
|
15799
|
-
const text = await readManifestInput(options.input, options.config, options.safetyPolicy);
|
|
15800
|
-
const items = parseManifestText(text);
|
|
15801
|
-
return ingestOpenFilesManifestItems({
|
|
15802
|
-
dbPath: options.dbPath,
|
|
15803
|
-
items,
|
|
15804
|
-
sourceLabel: options.input,
|
|
15805
|
-
safetyPolicy: options.safetyPolicy,
|
|
15806
|
-
now,
|
|
15807
|
-
maxChunkChars: options.maxChunkChars,
|
|
15808
|
-
chunkOverlapChars: options.chunkOverlapChars
|
|
15809
|
-
});
|
|
15810
|
-
}
|
|
15811
|
-
async function ingestOpenFilesManifestItems(options) {
|
|
15812
|
-
const now = (options.now ?? new Date).toISOString();
|
|
15813
|
-
const maxChunkChars = options.maxChunkChars ?? 4000;
|
|
15814
|
-
const chunkOverlapChars = options.chunkOverlapChars ?? 200;
|
|
15815
|
-
if (maxChunkChars < 500)
|
|
15816
|
-
throw new Error("maxChunkChars must be at least 500.");
|
|
15817
|
-
if (chunkOverlapChars < 0 || chunkOverlapChars >= maxChunkChars)
|
|
15818
|
-
throw new Error("chunkOverlapChars must be less than maxChunkChars.");
|
|
15819
|
-
if (options.safetyPolicy)
|
|
15820
|
-
assertWriteAllowed(options.dbPath, options.safetyPolicy);
|
|
15821
|
-
migrateKnowledgeDb(options.dbPath);
|
|
15822
|
-
const db = openKnowledgeDb(options.dbPath);
|
|
15823
|
-
try {
|
|
15824
|
-
const result = db.transaction(() => {
|
|
15825
|
-
const seenSources = new Set;
|
|
15826
|
-
const seenRevisions = new Set;
|
|
15827
|
-
let chunksInserted = 0;
|
|
15828
|
-
let chunksDeleted = 0;
|
|
15829
|
-
let redactions = 0;
|
|
15830
|
-
let skipped = 0;
|
|
15831
|
-
recordAuditEvent(db, {
|
|
15832
|
-
event_type: "source_read",
|
|
15833
|
-
action: options.readAction ?? (options.sourceLabel.startsWith("s3://") ? "s3_manifest_read" : "local_manifest_read"),
|
|
15834
|
-
target_uri: options.sourceLabel,
|
|
15835
|
-
decision: "allow",
|
|
15836
|
-
metadata: { items: options.items.length, read_only: true },
|
|
15837
|
-
created_at: now
|
|
15838
|
-
});
|
|
15839
|
-
for (const raw of options.items) {
|
|
15840
|
-
const item = normalizeManifestItem(raw, now);
|
|
15841
|
-
const sourceId = upsertSource(db, item, now);
|
|
15842
|
-
const revisionId = upsertRevision(db, sourceId, item, now);
|
|
15843
|
-
seenSources.add(sourceId);
|
|
15844
|
-
seenRevisions.add(revisionId);
|
|
15845
|
-
if (item.text || item.status.toLowerCase() === "deleted") {
|
|
15846
|
-
chunksDeleted += deleteChunksForRevision(db, revisionId);
|
|
15847
|
-
}
|
|
15848
|
-
const inserted = insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars, options.safetyPolicy);
|
|
15849
|
-
chunksInserted += inserted.chunksInserted;
|
|
15850
|
-
redactions += inserted.redactions;
|
|
15851
|
-
}
|
|
15852
|
-
recordAuditEvent(db, {
|
|
15853
|
-
event_type: "write",
|
|
15854
|
-
action: "knowledge_manifest_ingest",
|
|
15855
|
-
target_uri: options.dbPath,
|
|
15856
|
-
decision: "allow",
|
|
15857
|
-
metadata: { items: options.items.length, sources: seenSources.size, revisions: seenRevisions.size, chunks_inserted: chunksInserted, redactions },
|
|
15858
|
-
created_at: now
|
|
15859
|
-
});
|
|
15860
|
-
return {
|
|
15861
|
-
path: options.sourceLabel,
|
|
15862
|
-
db_path: options.dbPath,
|
|
15863
|
-
items_seen: options.items.length,
|
|
15864
|
-
sources_upserted: seenSources.size,
|
|
15865
|
-
revisions_upserted: seenRevisions.size,
|
|
15866
|
-
chunks_inserted: chunksInserted,
|
|
15867
|
-
chunks_deleted: chunksDeleted,
|
|
15868
|
-
redactions,
|
|
15869
|
-
skipped
|
|
15870
|
-
};
|
|
15871
|
-
})();
|
|
15872
|
-
return result;
|
|
15873
|
-
} finally {
|
|
15874
|
-
db.close();
|
|
15875
|
-
}
|
|
15876
|
-
}
|
|
15877
|
-
|
|
15878
|
-
// src/source-ingest.ts
|
|
15879
|
-
import { createHash as createHash5 } from "crypto";
|
|
15880
|
-
import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
|
|
15881
|
-
import { basename as basename3 } from "path";
|
|
15882
|
-
|
|
15883
|
-
// src/source-resolver.ts
|
|
15884
|
-
function parseJsonObject2(value) {
|
|
15885
|
-
if (!value)
|
|
15886
|
-
return {};
|
|
15887
|
-
try {
|
|
15888
|
-
const parsed = JSON.parse(value);
|
|
15889
|
-
return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
|
|
15890
|
-
} catch {
|
|
15891
|
-
return {};
|
|
15892
|
-
}
|
|
15893
|
-
}
|
|
15894
|
-
function metadataString2(metadata, keys) {
|
|
15895
|
-
for (const key of keys) {
|
|
15896
|
-
const value = metadata[key];
|
|
15897
|
-
if (typeof value === "string" && value.length > 0)
|
|
15898
|
-
return value;
|
|
15899
|
-
}
|
|
15900
|
-
return null;
|
|
15901
|
-
}
|
|
15902
|
-
function metadataNumber2(metadata, keys) {
|
|
15903
|
-
for (const key of keys) {
|
|
15904
|
-
const value = metadata[key];
|
|
15905
|
-
if (typeof value === "number" && Number.isFinite(value))
|
|
15906
|
-
return value;
|
|
15907
|
-
}
|
|
15908
|
-
return null;
|
|
16235
|
+
function ensureRevision(db, sourceId, event, now) {
|
|
16236
|
+
if (!event.revision)
|
|
16237
|
+
return null;
|
|
16238
|
+
const id = stableId3("rev", `${sourceId}\x00${event.revision}`);
|
|
16239
|
+
const metadata = {
|
|
16240
|
+
source_ref: event.sourceRef,
|
|
16241
|
+
source_uri: event.sourceUri,
|
|
16242
|
+
status: event.status,
|
|
16243
|
+
last_outbox_event: event.eventType,
|
|
16244
|
+
reindex_required: true
|
|
16245
|
+
};
|
|
16246
|
+
db.run(`INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)
|
|
16247
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
16248
|
+
ON CONFLICT(source_id, revision) DO UPDATE SET
|
|
16249
|
+
hash = COALESCE(excluded.hash, source_revisions.hash),
|
|
16250
|
+
metadata_json = excluded.metadata_json`, [id, sourceId, event.revision, event.hash, asString(event.raw.extracted_text_ref) ?? null, JSON.stringify(metadata), now]);
|
|
16251
|
+
const row = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, event.revision);
|
|
16252
|
+
return row?.id ?? null;
|
|
15909
16253
|
}
|
|
15910
|
-
function
|
|
15911
|
-
|
|
15912
|
-
|
|
15913
|
-
throw new Error(`Source resolver denied ${purpose}. Permission mode is ${mode}, expected read_only.`);
|
|
15914
|
-
}
|
|
15915
|
-
const denied = permissions.denied_purposes;
|
|
15916
|
-
if (Array.isArray(denied) && denied.includes(purpose)) {
|
|
15917
|
-
throw new Error(`Source resolver denied ${purpose}. Purpose is explicitly denied.`);
|
|
16254
|
+
function revisionIdsForEvent(db, sourceId, event) {
|
|
16255
|
+
if (event.revision) {
|
|
16256
|
+
return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").all(sourceId, event.revision).map((row) => row.id);
|
|
15918
16257
|
}
|
|
15919
|
-
|
|
15920
|
-
|
|
15921
|
-
throw new Error(`Source resolver denied ${purpose}. Allowed purposes: ${allowed.join(", ")}`);
|
|
16258
|
+
if (event.hash) {
|
|
16259
|
+
return db.query("SELECT id FROM source_revisions WHERE source_id = ? AND hash = ?").all(sourceId, event.hash).map((row) => row.id);
|
|
15922
16260
|
}
|
|
16261
|
+
return db.query("SELECT id FROM source_revisions WHERE source_id = ?").all(sourceId).map((row) => row.id);
|
|
15923
16262
|
}
|
|
15924
|
-
function
|
|
15925
|
-
|
|
15926
|
-
|
|
15927
|
-
|
|
15928
|
-
|
|
15929
|
-
|
|
15930
|
-
|
|
15931
|
-
|
|
15932
|
-
|
|
15933
|
-
|
|
16263
|
+
function invalidateRevision(db, revisionId) {
|
|
16264
|
+
const chunks = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(revisionId);
|
|
16265
|
+
let embeddingsDeleted = 0;
|
|
16266
|
+
let vectorEntriesDeleted = 0;
|
|
16267
|
+
for (const chunk of chunks) {
|
|
16268
|
+
const row = db.query("SELECT COUNT(*) AS n FROM chunk_embeddings WHERE chunk_id = ?").get(chunk.id);
|
|
16269
|
+
embeddingsDeleted += row?.n ?? 0;
|
|
16270
|
+
const vectorRow = db.query("SELECT COUNT(*) AS n FROM vector_index_entries WHERE chunk_id = ?").get(chunk.id);
|
|
16271
|
+
vectorEntriesDeleted += vectorRow?.n ?? 0;
|
|
16272
|
+
db.run("DELETE FROM vector_index_entries WHERE chunk_id = ?", [chunk.id]);
|
|
16273
|
+
db.run("DELETE FROM chunk_embeddings WHERE chunk_id = ?", [chunk.id]);
|
|
16274
|
+
db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [chunk.id]);
|
|
15934
16275
|
}
|
|
15935
|
-
|
|
15936
|
-
|
|
15937
|
-
|
|
15938
|
-
return
|
|
15939
|
-
FROM sources
|
|
15940
|
-
WHERE uri = ? OR uri = ?
|
|
15941
|
-
ORDER BY CASE WHEN uri = ? THEN 0 ELSE 1 END
|
|
15942
|
-
LIMIT 1`).get(sourceUri, requestedRef, sourceUri) ?? null;
|
|
16276
|
+
db.run("DELETE FROM chunks WHERE source_revision_id = ?", [revisionId]);
|
|
16277
|
+
const revision = db.query("SELECT metadata_json FROM source_revisions WHERE id = ?").get(revisionId);
|
|
16278
|
+
db.run("UPDATE source_revisions SET metadata_json = ? WHERE id = ?", [mergeJson(revision?.metadata_json, { reindex_required: true, invalidated_at: new Date().toISOString() }), revisionId]);
|
|
16279
|
+
return { chunksDeleted: chunks.length, embeddingsDeleted, vectorEntriesDeleted };
|
|
15943
16280
|
}
|
|
15944
|
-
function
|
|
15945
|
-
|
|
15946
|
-
return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
|
|
15947
|
-
FROM source_revisions
|
|
15948
|
-
WHERE source_id = ? AND revision = ?
|
|
15949
|
-
LIMIT 1`).get(sourceId, revisionId) ?? null;
|
|
15950
|
-
}
|
|
15951
|
-
return db.query(`SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at
|
|
15952
|
-
FROM source_revisions
|
|
15953
|
-
WHERE source_id = ?
|
|
15954
|
-
ORDER BY created_at DESC, revision DESC
|
|
15955
|
-
LIMIT 1`).get(sourceId) ?? null;
|
|
16281
|
+
function isDeleteEvent(eventType2, status) {
|
|
16282
|
+
return status === "deleted" || ["delete", "deleted", "remove", "removed"].includes(eventType2);
|
|
15956
16283
|
}
|
|
15957
|
-
function
|
|
15958
|
-
|
|
15959
|
-
return 0;
|
|
15960
|
-
const row = db.query("SELECT COUNT(*) AS n FROM chunks WHERE source_revision_id = ?").get(revisionId);
|
|
15961
|
-
return row?.n ?? 0;
|
|
16284
|
+
function isMoveEvent(eventType2) {
|
|
16285
|
+
return ["move", "moved", "rename", "renamed", "path_changed"].includes(eventType2);
|
|
15962
16286
|
}
|
|
15963
|
-
function
|
|
15964
|
-
|
|
15965
|
-
return [];
|
|
15966
|
-
return db.query(`SELECT id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json
|
|
15967
|
-
FROM chunks
|
|
15968
|
-
WHERE source_revision_id = ?
|
|
15969
|
-
ORDER BY ordinal ASC
|
|
15970
|
-
LIMIT ?`).all(revisionId, limit);
|
|
16287
|
+
function isPermissionEvent(eventType2) {
|
|
16288
|
+
return ["permission", "permissions", "permission_changed", "acl_changed"].includes(eventType2);
|
|
15971
16289
|
}
|
|
15972
|
-
async function
|
|
15973
|
-
const
|
|
15974
|
-
|
|
15975
|
-
const resolvedAt = (options.now ?? new Date).toISOString();
|
|
15976
|
-
const parsed = parseSourceRef(options.sourceRef);
|
|
15977
|
-
const sourceUri = catalogSourceUriForRef(options.sourceRef, parsed);
|
|
15978
|
-
const requestedRevision = revisionIdForSourceRef(options.sourceRef);
|
|
15979
|
-
if (options.safetyPolicy) {
|
|
15980
|
-
if (!options.safetyPolicy.readOnlySourceAccess)
|
|
15981
|
-
throw new Error("Safety policy denied source resolution.");
|
|
16290
|
+
async function consumeOpenFilesOutbox(options) {
|
|
16291
|
+
const now = (options.now ?? new Date).toISOString();
|
|
16292
|
+
if (options.safetyPolicy)
|
|
15982
16293
|
assertWriteAllowed(options.dbPath, options.safetyPolicy);
|
|
15983
|
-
}
|
|
15984
16294
|
migrateKnowledgeDb(options.dbPath);
|
|
16295
|
+
const text = await readOutboxInput(options.input, options.config, options.safetyPolicy);
|
|
16296
|
+
const events = parseOutboxText(text);
|
|
15985
16297
|
const db = openKnowledgeDb(options.dbPath);
|
|
16298
|
+
const runId = `run_${randomUUID5()}`;
|
|
15986
16299
|
try {
|
|
15987
16300
|
return db.transaction(() => {
|
|
15988
|
-
|
|
15989
|
-
|
|
15990
|
-
|
|
15991
|
-
|
|
15992
|
-
|
|
15993
|
-
|
|
15994
|
-
|
|
15995
|
-
|
|
15996
|
-
|
|
15997
|
-
|
|
15998
|
-
|
|
15999
|
-
|
|
16000
|
-
|
|
16001
|
-
|
|
16002
|
-
|
|
16003
|
-
|
|
16004
|
-
|
|
16005
|
-
|
|
16006
|
-
|
|
16007
|
-
|
|
16008
|
-
|
|
16009
|
-
|
|
16010
|
-
|
|
16011
|
-
|
|
16012
|
-
|
|
16013
|
-
|
|
16014
|
-
|
|
16015
|
-
|
|
16016
|
-
|
|
16017
|
-
|
|
16018
|
-
|
|
16019
|
-
|
|
16020
|
-
|
|
16021
|
-
|
|
16022
|
-
|
|
16023
|
-
|
|
16024
|
-
|
|
16025
|
-
|
|
16026
|
-
|
|
16027
|
-
|
|
16028
|
-
|
|
16029
|
-
|
|
16030
|
-
|
|
16031
|
-
|
|
16032
|
-
|
|
16033
|
-
|
|
16034
|
-
|
|
16035
|
-
|
|
16036
|
-
|
|
16037
|
-
|
|
16038
|
-
|
|
16039
|
-
|
|
16040
|
-
|
|
16041
|
-
|
|
16042
|
-
|
|
16043
|
-
|
|
16044
|
-
|
|
16045
|
-
|
|
16046
|
-
|
|
16047
|
-
|
|
16048
|
-
|
|
16049
|
-
|
|
16050
|
-
|
|
16051
|
-
|
|
16052
|
-
|
|
16053
|
-
|
|
16054
|
-
|
|
16055
|
-
resolver: "open-files-read-only",
|
|
16056
|
-
mode: "local_catalog",
|
|
16057
|
-
purpose,
|
|
16058
|
-
read_only: true,
|
|
16059
|
-
source_ref: metadataString2(metadata, ["source_ref"]) ?? effectiveSourceRef,
|
|
16060
|
-
source_uri: source.uri,
|
|
16061
|
-
source_revision_id: revision?.id ?? null,
|
|
16062
|
-
revision: revision?.revision ?? null,
|
|
16063
|
-
hash: revision?.hash ?? metadataString2(metadata, ["hash"]),
|
|
16064
|
-
chunk_id: row.id,
|
|
16065
|
-
start_offset: row.start_offset,
|
|
16066
|
-
end_offset: row.end_offset,
|
|
16067
|
-
resolved_at: resolvedAt
|
|
16068
|
-
};
|
|
16069
|
-
const provenance = sourceProvenance({
|
|
16070
|
-
source_ref: evidence.source_ref,
|
|
16071
|
-
source_uri: evidence.source_uri,
|
|
16072
|
-
source_kind: source.kind,
|
|
16073
|
-
source_revision_id: evidence.source_revision_id,
|
|
16074
|
-
revision: evidence.revision,
|
|
16075
|
-
hash: evidence.hash,
|
|
16076
|
-
chunk_id: row.id,
|
|
16077
|
-
start_offset: row.start_offset,
|
|
16078
|
-
end_offset: row.end_offset,
|
|
16079
|
-
status: metadataString2(metadata, ["status"]),
|
|
16080
|
-
resolver: evidence.resolver
|
|
16081
|
-
});
|
|
16082
|
-
return {
|
|
16083
|
-
id: row.id,
|
|
16084
|
-
kind: row.kind,
|
|
16085
|
-
ordinal: row.ordinal,
|
|
16086
|
-
text: row.text,
|
|
16087
|
-
token_count: row.token_count,
|
|
16088
|
-
start_offset: row.start_offset,
|
|
16089
|
-
end_offset: row.end_offset,
|
|
16090
|
-
metadata,
|
|
16091
|
-
evidence,
|
|
16092
|
-
provenance
|
|
16093
|
-
};
|
|
16301
|
+
db.run(`INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
|
|
16302
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
|
|
16303
|
+
runId,
|
|
16304
|
+
"open-files-outbox",
|
|
16305
|
+
options.input,
|
|
16306
|
+
"completed",
|
|
16307
|
+
"local",
|
|
16308
|
+
"open-files-outbox",
|
|
16309
|
+
JSON.stringify({ path: options.input, events: events.length }),
|
|
16310
|
+
now,
|
|
16311
|
+
now
|
|
16312
|
+
]);
|
|
16313
|
+
const sourcesTouched = new Set;
|
|
16314
|
+
const revisionsTouched = new Set;
|
|
16315
|
+
let chunksDeleted = 0;
|
|
16316
|
+
let embeddingsDeleted = 0;
|
|
16317
|
+
let vectorEntriesDeleted = 0;
|
|
16318
|
+
let staleRevisions = 0;
|
|
16319
|
+
let deletedSources = 0;
|
|
16320
|
+
let movedSources = 0;
|
|
16321
|
+
let permissionUpdates = 0;
|
|
16322
|
+
recordAuditEvent(db, {
|
|
16323
|
+
event_type: "source_read",
|
|
16324
|
+
action: options.input.startsWith("s3://") ? "s3_outbox_read" : "local_outbox_read",
|
|
16325
|
+
target_uri: options.input,
|
|
16326
|
+
decision: "allow",
|
|
16327
|
+
metadata: { events: events.length, read_only: true },
|
|
16328
|
+
created_at: now
|
|
16329
|
+
});
|
|
16330
|
+
events.forEach((raw, index) => {
|
|
16331
|
+
const event = normalizeEvent(raw, now);
|
|
16332
|
+
const sourceId = ensureSource(db, event, now);
|
|
16333
|
+
sourcesTouched.add(sourceId);
|
|
16334
|
+
const createdRevisionId = ensureRevision(db, sourceId, event, now);
|
|
16335
|
+
if (createdRevisionId)
|
|
16336
|
+
revisionsTouched.add(createdRevisionId);
|
|
16337
|
+
const affectedRevisionIds = revisionIdsForEvent(db, sourceId, event);
|
|
16338
|
+
for (const revisionId of affectedRevisionIds) {
|
|
16339
|
+
revisionsTouched.add(revisionId);
|
|
16340
|
+
const invalidation = invalidateRevision(db, revisionId);
|
|
16341
|
+
chunksDeleted += invalidation.chunksDeleted;
|
|
16342
|
+
embeddingsDeleted += invalidation.embeddingsDeleted;
|
|
16343
|
+
vectorEntriesDeleted += invalidation.vectorEntriesDeleted;
|
|
16344
|
+
staleRevisions += 1;
|
|
16345
|
+
}
|
|
16346
|
+
if (isDeleteEvent(event.eventType, event.status))
|
|
16347
|
+
deletedSources += 1;
|
|
16348
|
+
if (isMoveEvent(event.eventType))
|
|
16349
|
+
movedSources += 1;
|
|
16350
|
+
if (isPermissionEvent(event.eventType) || event.acl !== undefined)
|
|
16351
|
+
permissionUpdates += 1;
|
|
16352
|
+
db.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
|
|
16353
|
+
VALUES (?, ?, ?, ?, ?, ?)`, [
|
|
16354
|
+
stableId3("evt", `${runId}\x00${index}\x00${event.sourceRef}\x00${event.eventType}`),
|
|
16355
|
+
runId,
|
|
16356
|
+
"info",
|
|
16357
|
+
event.eventType,
|
|
16358
|
+
JSON.stringify({
|
|
16359
|
+
source_ref: event.sourceRef,
|
|
16360
|
+
source_uri: event.sourceUri,
|
|
16361
|
+
revision: event.revision,
|
|
16362
|
+
hash: event.hash,
|
|
16363
|
+
status: event.status,
|
|
16364
|
+
affected_revisions: affectedRevisionIds.length
|
|
16365
|
+
}),
|
|
16366
|
+
event.updatedAt
|
|
16367
|
+
]);
|
|
16094
16368
|
});
|
|
16095
|
-
|
|
16096
|
-
|
|
16097
|
-
|
|
16098
|
-
|
|
16099
|
-
|
|
16100
|
-
|
|
16101
|
-
|
|
16102
|
-
|
|
16103
|
-
|
|
16104
|
-
}));
|
|
16369
|
+
db.run(`INSERT INTO provider_usage (id, run_id, provider, model, input_tokens, output_tokens, cost_usd, metadata_json, created_at)
|
|
16370
|
+
VALUES (?, ?, ?, ?, 0, 0, 0, ?, ?)`, [
|
|
16371
|
+
stableId3("usage", runId),
|
|
16372
|
+
runId,
|
|
16373
|
+
"local",
|
|
16374
|
+
"open-files-outbox",
|
|
16375
|
+
JSON.stringify({ note: "No model provider used for outbox invalidation." }),
|
|
16376
|
+
now
|
|
16377
|
+
]);
|
|
16105
16378
|
recordAuditEvent(db, {
|
|
16106
|
-
event_type: "
|
|
16107
|
-
action: "
|
|
16108
|
-
target_uri: options.
|
|
16379
|
+
event_type: "write",
|
|
16380
|
+
action: "knowledge_outbox_invalidation",
|
|
16381
|
+
target_uri: options.dbPath,
|
|
16109
16382
|
decision: "allow",
|
|
16110
16383
|
metadata: {
|
|
16111
|
-
|
|
16112
|
-
|
|
16113
|
-
|
|
16114
|
-
|
|
16115
|
-
|
|
16116
|
-
|
|
16384
|
+
run_id: runId,
|
|
16385
|
+
events: events.length,
|
|
16386
|
+
sources: sourcesTouched.size,
|
|
16387
|
+
revisions: revisionsTouched.size,
|
|
16388
|
+
chunks_deleted: chunksDeleted,
|
|
16389
|
+
embeddings_deleted: embeddingsDeleted,
|
|
16390
|
+
vector_entries_deleted: vectorEntriesDeleted
|
|
16117
16391
|
},
|
|
16118
|
-
created_at:
|
|
16392
|
+
created_at: now
|
|
16119
16393
|
});
|
|
16120
|
-
const mime = metadataString2(sourceMetadata, ["mime", "content_type"]) ?? metadataString2(revisionMetadata, ["mime", "content_type"]);
|
|
16121
|
-
const size = metadataNumber2(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber2(revisionMetadata, ["size", "size_bytes"]);
|
|
16122
16394
|
return {
|
|
16123
|
-
|
|
16124
|
-
|
|
16125
|
-
|
|
16126
|
-
|
|
16127
|
-
|
|
16128
|
-
|
|
16129
|
-
|
|
16130
|
-
|
|
16131
|
-
|
|
16132
|
-
|
|
16133
|
-
|
|
16134
|
-
|
|
16135
|
-
|
|
16136
|
-
kind: source.kind,
|
|
16137
|
-
title: source.title,
|
|
16138
|
-
metadata: sourceMetadata,
|
|
16139
|
-
permissions,
|
|
16140
|
-
updated_at: source.updated_at
|
|
16141
|
-
},
|
|
16142
|
-
revision: revision ? {
|
|
16143
|
-
id: revision.id,
|
|
16144
|
-
revision: revision.revision,
|
|
16145
|
-
hash: revision.hash,
|
|
16146
|
-
extracted_text_uri: revision.extracted_text_uri,
|
|
16147
|
-
metadata: revisionMetadata,
|
|
16148
|
-
created_at: revision.created_at,
|
|
16149
|
-
reindex_required: revisionMetadata.reindex_required === true
|
|
16150
|
-
} : null,
|
|
16151
|
-
content: {
|
|
16152
|
-
mime,
|
|
16153
|
-
size,
|
|
16154
|
-
hash: revision?.hash ?? metadataString2(sourceMetadata, ["hash", "checksum", "sha256"]),
|
|
16155
|
-
text_available: totalChunks > 0,
|
|
16156
|
-
chunks_total: totalChunks,
|
|
16157
|
-
chunks_returned: chunks.length,
|
|
16158
|
-
char_count_returned: chunks.reduce((sum, chunk) => sum + chunk.text.length, 0),
|
|
16159
|
-
extracted_text_ref: revision?.extracted_text_uri ?? metadataString2(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
|
|
16160
|
-
bytes_available: false,
|
|
16161
|
-
bytes_exposed: false
|
|
16162
|
-
},
|
|
16163
|
-
chunks,
|
|
16164
|
-
citations
|
|
16395
|
+
path: options.input,
|
|
16396
|
+
db_path: options.dbPath,
|
|
16397
|
+
run_id: runId,
|
|
16398
|
+
events_seen: events.length,
|
|
16399
|
+
sources_touched: sourcesTouched.size,
|
|
16400
|
+
revisions_touched: revisionsTouched.size,
|
|
16401
|
+
chunks_deleted: chunksDeleted,
|
|
16402
|
+
embeddings_deleted: embeddingsDeleted,
|
|
16403
|
+
vector_entries_deleted: vectorEntriesDeleted,
|
|
16404
|
+
stale_revisions: staleRevisions,
|
|
16405
|
+
deleted_sources: deletedSources,
|
|
16406
|
+
moved_sources: movedSources,
|
|
16407
|
+
permission_updates: permissionUpdates
|
|
16165
16408
|
};
|
|
16166
16409
|
})();
|
|
16167
16410
|
} finally {
|
|
@@ -16169,824 +16412,1195 @@ async function resolveOpenFilesSource(options) {
|
|
|
16169
16412
|
}
|
|
16170
16413
|
}
|
|
16171
16414
|
|
|
16172
|
-
// src/
|
|
16173
|
-
|
|
16174
|
-
|
|
16175
|
-
}
|
|
16176
|
-
function
|
|
16177
|
-
return
|
|
16178
|
-
`).replace(/\n\s+/g, `
|
|
16179
|
-
`).replace(/[ \t]{2,}/g, " ").trim();
|
|
16180
|
-
}
|
|
16181
|
-
async function readS3Text3(uri, config2, safetyPolicy) {
|
|
16182
|
-
const parsed = new URL(uri);
|
|
16183
|
-
const bucket = parsed.hostname;
|
|
16184
|
-
const key = decodeURIComponent(parsed.pathname.replace(/^\/+/, ""));
|
|
16185
|
-
if (!bucket || !key)
|
|
16186
|
-
throw new Error(`Invalid S3 source URI: ${uri}`);
|
|
16187
|
-
if (safetyPolicy)
|
|
16188
|
-
assertS3ReadAllowed(uri, safetyPolicy);
|
|
16189
|
-
const [{ S3Client, GetObjectCommand }, { fromIni }] = await Promise.all([
|
|
16190
|
-
import("@aws-sdk/client-s3"),
|
|
16191
|
-
import("@aws-sdk/credential-providers")
|
|
16192
|
-
]);
|
|
16193
|
-
const s3Config = config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket ? config2.storage.s3 : undefined;
|
|
16194
|
-
const client = new S3Client({
|
|
16195
|
-
region: s3Config?.region,
|
|
16196
|
-
credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
|
|
16197
|
-
maxAttempts: s3Config?.max_attempts
|
|
16198
|
-
});
|
|
16199
|
-
const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
|
|
16200
|
-
if (!response.Body)
|
|
16201
|
-
return "";
|
|
16202
|
-
return await response.Body.transformToString();
|
|
16203
|
-
}
|
|
16204
|
-
async function readWebText(uri, safetyPolicy) {
|
|
16205
|
-
if (safetyPolicy)
|
|
16206
|
-
assertWebSearchAllowed(safetyPolicy);
|
|
16207
|
-
const response = await fetch(uri, {
|
|
16208
|
-
headers: {
|
|
16209
|
-
accept: "text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5",
|
|
16210
|
-
"user-agent": "@hasna/knowledge source-ingest"
|
|
16211
|
-
}
|
|
16212
|
-
});
|
|
16213
|
-
if (!response.ok)
|
|
16214
|
-
throw new Error(`Web source read failed ${response.status}: ${uri}`);
|
|
16215
|
-
const mime = response.headers.get("content-type");
|
|
16216
|
-
const body = await response.text();
|
|
16217
|
-
return { text: mime?.includes("html") ? stripHtml(body) : body, mime };
|
|
16218
|
-
}
|
|
16219
|
-
function titleForRef(parsed) {
|
|
16220
|
-
if (parsed.kind === "file")
|
|
16221
|
-
return basename3(parsed.path);
|
|
16222
|
-
if (parsed.kind === "s3")
|
|
16223
|
-
return basename3(parsed.key);
|
|
16224
|
-
if (parsed.kind === "web")
|
|
16225
|
-
return basename3(new URL(parsed.url).pathname) || parsed.url;
|
|
16226
|
-
return parsed.path ? basename3(parsed.path) : parsed.id;
|
|
16227
|
-
}
|
|
16228
|
-
async function readDirectSourceText(parsed, config2, safetyPolicy) {
|
|
16229
|
-
if (parsed.kind === "file") {
|
|
16230
|
-
if (!existsSync6(parsed.path))
|
|
16231
|
-
throw new Error(`Source file not found: ${parsed.path}`);
|
|
16232
|
-
const text = readFileSync6(parsed.path, "utf8");
|
|
16233
|
-
return {
|
|
16234
|
-
text,
|
|
16235
|
-
contentSource: "file",
|
|
16236
|
-
title: titleForRef(parsed),
|
|
16237
|
-
mime: "text/plain",
|
|
16238
|
-
size: text.length,
|
|
16239
|
-
hash: sha256Text(text),
|
|
16240
|
-
revision: null,
|
|
16241
|
-
extractedTextRef: null,
|
|
16242
|
-
metadata: { path: parsed.path },
|
|
16243
|
-
permissions: { mode: "read_only" }
|
|
16244
|
-
};
|
|
16245
|
-
}
|
|
16246
|
-
if (parsed.kind === "s3") {
|
|
16247
|
-
const text = await readS3Text3(parsed.uri, config2, safetyPolicy);
|
|
16248
|
-
return {
|
|
16249
|
-
text,
|
|
16250
|
-
contentSource: "s3",
|
|
16251
|
-
title: titleForRef(parsed),
|
|
16252
|
-
mime: "text/plain",
|
|
16253
|
-
size: text.length,
|
|
16254
|
-
hash: sha256Text(text),
|
|
16255
|
-
revision: null,
|
|
16256
|
-
extractedTextRef: null,
|
|
16257
|
-
metadata: { bucket: parsed.bucket, key: parsed.key },
|
|
16258
|
-
permissions: { mode: "read_only" }
|
|
16259
|
-
};
|
|
16260
|
-
}
|
|
16261
|
-
if (parsed.kind === "web") {
|
|
16262
|
-
const web = await readWebText(parsed.url, safetyPolicy);
|
|
16263
|
-
return {
|
|
16264
|
-
text: web.text,
|
|
16265
|
-
contentSource: "web",
|
|
16266
|
-
title: titleForRef(parsed),
|
|
16267
|
-
mime: web.mime,
|
|
16268
|
-
size: web.text.length,
|
|
16269
|
-
hash: sha256Text(web.text),
|
|
16270
|
-
revision: null,
|
|
16271
|
-
extractedTextRef: null,
|
|
16272
|
-
metadata: { url: parsed.url },
|
|
16273
|
-
permissions: { mode: "read_only" }
|
|
16274
|
-
};
|
|
16275
|
-
}
|
|
16276
|
-
throw new Error(`Direct source reading is not available for ${parsed.uri}`);
|
|
16277
|
-
}
|
|
16278
|
-
async function readTextRef(uri, config2, safetyPolicy) {
|
|
16279
|
-
if (uri.startsWith("open-files://")) {
|
|
16280
|
-
throw new Error("Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.");
|
|
16281
|
-
}
|
|
16282
|
-
const parsed = parseSourceRef(uri);
|
|
16283
|
-
const direct = await readDirectSourceText(parsed, config2, safetyPolicy);
|
|
16284
|
-
return { text: direct.text, contentSource: "extracted_text_ref" };
|
|
16415
|
+
// src/manifest-ingest.ts
|
|
16416
|
+
import { createHash as createHash5 } from "crypto";
|
|
16417
|
+
import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
|
|
16418
|
+
import { basename as basename2 } from "path";
|
|
16419
|
+
function stableId4(prefix, value) {
|
|
16420
|
+
return `${prefix}_${createHash5("sha256").update(value).digest("hex").slice(0, 20)}`;
|
|
16285
16421
|
}
|
|
16286
|
-
|
|
16287
|
-
|
|
16288
|
-
|
|
16289
|
-
|
|
16290
|
-
|
|
16291
|
-
|
|
16292
|
-
|
|
16293
|
-
|
|
16294
|
-
|
|
16295
|
-
|
|
16296
|
-
|
|
16297
|
-
|
|
16298
|
-
|
|
16299
|
-
|
|
16300
|
-
|
|
16301
|
-
|
|
16302
|
-
|
|
16303
|
-
|
|
16304
|
-
mime: resolved.content.mime,
|
|
16305
|
-
size: textRef.text.length,
|
|
16306
|
-
hash: resolved.revision.hash ?? sha256Text(textRef.text),
|
|
16307
|
-
revision: resolved.revision.revision,
|
|
16308
|
-
extractedTextRef: resolved.revision.extracted_text_uri,
|
|
16309
|
-
metadata: resolved.source?.metadata ?? {},
|
|
16310
|
-
permissions: resolved.source?.permissions ?? { mode: "read_only" }
|
|
16311
|
-
};
|
|
16422
|
+
function asObject2(value) {
|
|
16423
|
+
return value && typeof value === "object" && !Array.isArray(value) ? value : undefined;
|
|
16424
|
+
}
|
|
16425
|
+
function asString2(value) {
|
|
16426
|
+
return typeof value === "string" && value.length > 0 ? value : undefined;
|
|
16427
|
+
}
|
|
16428
|
+
function asNumber(value) {
|
|
16429
|
+
return typeof value === "number" && Number.isFinite(value) ? value : undefined;
|
|
16430
|
+
}
|
|
16431
|
+
function buildSourceRefFromItem(item) {
|
|
16432
|
+
const explicit = asString2(item.source_ref) ?? asString2(item.source_uri) ?? asString2(item.uri);
|
|
16433
|
+
if (explicit)
|
|
16434
|
+
return explicit;
|
|
16435
|
+
const fileId = asString2(item.file_id);
|
|
16436
|
+
if (fileId) {
|
|
16437
|
+
const revision = asString2(item.revision_id) ?? asString2(item.revision);
|
|
16438
|
+
const fileRef = `open-files://file/${encodeURIComponent(fileId)}`;
|
|
16439
|
+
return revision ? `${fileRef}/revision/${encodeURIComponent(revision)}` : fileRef;
|
|
16312
16440
|
}
|
|
16313
|
-
|
|
16314
|
-
|
|
16441
|
+
const sourceId = asString2(item.source_id);
|
|
16442
|
+
const path = asString2(item.path);
|
|
16443
|
+
if (sourceId && path) {
|
|
16444
|
+
return `open-files://source/${encodeURIComponent(sourceId)}/path/${encodeURIComponent(path)}`;
|
|
16315
16445
|
}
|
|
16316
|
-
|
|
16317
|
-
|
|
16318
|
-
`);
|
|
16319
|
-
return {
|
|
16320
|
-
text,
|
|
16321
|
-
contentSource: "catalog_chunks",
|
|
16322
|
-
title: resolved.source?.title ?? null,
|
|
16323
|
-
mime: resolved.content.mime,
|
|
16324
|
-
size: text.length,
|
|
16325
|
-
hash: resolved.revision?.hash ?? sha256Text(text),
|
|
16326
|
-
revision: resolved.revision?.revision ?? null,
|
|
16327
|
-
extractedTextRef: resolved.revision?.extracted_text_uri ?? null,
|
|
16328
|
-
metadata: resolved.source?.metadata ?? {},
|
|
16329
|
-
permissions: resolved.source?.permissions ?? { mode: "read_only" }
|
|
16330
|
-
};
|
|
16446
|
+
throw new Error("Manifest item is missing source_ref, file_id, or source_id/path.");
|
|
16331
16447
|
}
|
|
16332
|
-
function
|
|
16333
|
-
|
|
16334
|
-
|
|
16335
|
-
...resolved.metadata,
|
|
16336
|
-
source_ref: sourceRef,
|
|
16337
|
-
content_source: resolved.contentSource,
|
|
16338
|
-
read_only: true
|
|
16339
|
-
};
|
|
16340
|
-
const item = {
|
|
16341
|
-
source_ref: sourceRef,
|
|
16342
|
-
name: resolved.title ?? titleForRef(parsed),
|
|
16343
|
-
mime: resolved.mime ?? "text/plain",
|
|
16344
|
-
size: resolved.size ?? resolved.text.length,
|
|
16345
|
-
hash: hash2,
|
|
16346
|
-
revision: resolved.revision ?? hash2,
|
|
16347
|
-
status: "active",
|
|
16348
|
-
updated_at: new Date().toISOString(),
|
|
16349
|
-
permissions: {
|
|
16350
|
-
mode: "read_only",
|
|
16351
|
-
allowed_purposes: [purpose],
|
|
16352
|
-
...resolved.permissions
|
|
16353
|
-
},
|
|
16354
|
-
metadata,
|
|
16355
|
-
extracted_text_ref: resolved.extractedTextRef,
|
|
16356
|
-
extracted_text: resolved.text
|
|
16357
|
-
};
|
|
16358
|
-
if (parsed.kind === "open-files") {
|
|
16359
|
-
if (parsed.entity === "file")
|
|
16360
|
-
item.file_id = parsed.id;
|
|
16361
|
-
if (parsed.entity === "source") {
|
|
16362
|
-
item.source_id = parsed.id;
|
|
16363
|
-
item.path = parsed.path;
|
|
16364
|
-
}
|
|
16448
|
+
function baseSourceUri2(sourceRef, parsed) {
|
|
16449
|
+
if (parsed.kind === "open-files" && parsed.entity === "file" && parsed.revision_id) {
|
|
16450
|
+
return sourceRef.replace(/\/revision\/[^/]+$/, "");
|
|
16365
16451
|
}
|
|
16366
|
-
|
|
16367
|
-
item.path = parsed.path;
|
|
16368
|
-
if (parsed.kind === "s3")
|
|
16369
|
-
item.path = parsed.key;
|
|
16370
|
-
if (parsed.kind === "web")
|
|
16371
|
-
item.url = parsed.url;
|
|
16372
|
-
return item;
|
|
16452
|
+
return sourceRef;
|
|
16373
16453
|
}
|
|
16374
|
-
|
|
16375
|
-
const
|
|
16376
|
-
|
|
16377
|
-
|
|
16378
|
-
const
|
|
16379
|
-
|
|
16380
|
-
dbPath: options.dbPath,
|
|
16381
|
-
items: [item],
|
|
16382
|
-
sourceLabel: options.sourceRef,
|
|
16383
|
-
readAction: "source_ref_ingest_read",
|
|
16384
|
-
safetyPolicy: options.safetyPolicy,
|
|
16385
|
-
now: options.now
|
|
16386
|
-
});
|
|
16387
|
-
return {
|
|
16388
|
-
...result,
|
|
16389
|
-
source_ref: options.sourceRef,
|
|
16390
|
-
content_source: resolved.contentSource,
|
|
16391
|
-
read_only: true,
|
|
16392
|
-
hash: String(item.hash)
|
|
16393
|
-
};
|
|
16454
|
+
function textFromItem(item) {
|
|
16455
|
+
const direct = asString2(item.extracted_text) ?? asString2(item.text) ?? asString2(item.content_text) ?? asString2(item.markdown);
|
|
16456
|
+
if (direct !== undefined)
|
|
16457
|
+
return direct;
|
|
16458
|
+
const content = item.content;
|
|
16459
|
+
return typeof content === "string" ? content : null;
|
|
16394
16460
|
}
|
|
16395
|
-
|
|
16396
|
-
|
|
16397
|
-
|
|
16398
|
-
|
|
16399
|
-
|
|
16400
|
-
|
|
16401
|
-
if (!value)
|
|
16402
|
-
return {};
|
|
16403
|
-
try {
|
|
16404
|
-
const parsed = JSON.parse(value);
|
|
16405
|
-
return parsed && typeof parsed === "object" && !Array.isArray(parsed) ? parsed : {};
|
|
16406
|
-
} catch {
|
|
16407
|
-
return {};
|
|
16408
|
-
}
|
|
16461
|
+
function extractedTextUriFromItem(item) {
|
|
16462
|
+
const direct = asString2(item.extracted_text_ref) ?? asString2(item.extracted_text_uri) ?? asString2(item.text_ref);
|
|
16463
|
+
if (direct)
|
|
16464
|
+
return direct;
|
|
16465
|
+
const content = asObject2(item.content);
|
|
16466
|
+
return asString2(content?.extracted_text_ref) ?? asString2(content?.extracted_text_uri) ?? null;
|
|
16409
16467
|
}
|
|
16410
|
-
function
|
|
16411
|
-
|
|
16412
|
-
|
|
16413
|
-
if (typeof value === "string" && value.length > 0)
|
|
16414
|
-
return value;
|
|
16415
|
-
}
|
|
16416
|
-
return null;
|
|
16468
|
+
function titleFromItem(item) {
|
|
16469
|
+
const path = asString2(item.path);
|
|
16470
|
+
return asString2(item.title) ?? asString2(item.name) ?? (path ? basename2(path) : null);
|
|
16417
16471
|
}
|
|
16418
|
-
function
|
|
16419
|
-
|
|
16420
|
-
const value = metadata[key];
|
|
16421
|
-
if (typeof value === "number" && Number.isFinite(value))
|
|
16422
|
-
return value;
|
|
16423
|
-
}
|
|
16424
|
-
return null;
|
|
16472
|
+
function hashFromItem(item) {
|
|
16473
|
+
return asString2(item.hash) ?? asString2(item.checksum) ?? asString2(item.sha256) ?? null;
|
|
16425
16474
|
}
|
|
16426
|
-
function
|
|
16427
|
-
|
|
16475
|
+
function revisionFromItem(item, parsed, hash2) {
|
|
16476
|
+
const revision = asString2(item.revision_id) ?? asString2(item.revision) ?? asString2(item.version_id) ?? (parsed.kind === "open-files" ? parsed.revision_id : undefined) ?? hash2 ?? asString2(item.updated_at);
|
|
16477
|
+
return revision ?? "current";
|
|
16428
16478
|
}
|
|
16429
|
-
function
|
|
16430
|
-
const
|
|
16431
|
-
|
|
16479
|
+
function metadataFromItem(item, normalized) {
|
|
16480
|
+
const metadata = {};
|
|
16481
|
+
for (const [key, value] of Object.entries(item)) {
|
|
16482
|
+
if (["text", "content", "content_text", "extracted_text", "markdown"].includes(key))
|
|
16483
|
+
continue;
|
|
16484
|
+
metadata[key] = value;
|
|
16485
|
+
}
|
|
16486
|
+
metadata.source_ref = normalized.sourceRef;
|
|
16487
|
+
metadata.source_uri = normalized.sourceUri;
|
|
16488
|
+
metadata.status = normalized.status;
|
|
16489
|
+
return metadata;
|
|
16432
16490
|
}
|
|
16433
|
-
function
|
|
16434
|
-
|
|
16435
|
-
|
|
16436
|
-
|
|
16491
|
+
/**
 * Converts a raw manifest entry into the normalized record used by the
 * ingest pipeline.
 *
 * @param {object} item - Raw manifest entry.
 * @param {string} now - Timestamp used as `updatedAt` when the item has no
 *   usable `updated_at`.
 * @returns {object} Normalized item. `raw` keeps the original entry, `status`
 *   defaults to "active", and `acl` is taken from `permissions`, then `acl`,
 *   then an empty object.
 */
function normalizeManifestItem(item, now) {
  const sourceRef = buildSourceRefFromItem(item);
  const parsed = parseSourceRef(sourceRef);
  const sourceUri = baseSourceUri2(sourceRef, parsed);
  const hash2 = hashFromItem(item);
  const status = asString2(item.status) ?? "active";
  const kind = parsed.kind;
  const title = titleFromItem(item);
  const revision = revisionFromItem(item, parsed, hash2);
  const extractedTextUri = extractedTextUriFromItem(item);
  const text = textFromItem(item);
  const metadata = metadataFromItem(item, { sourceRef, sourceUri, status });
  const acl = item.permissions ?? item.acl ?? {};
  const updatedAt = asString2(item.updated_at) ?? now;
  return {
    raw: item,
    sourceRef,
    sourceUri,
    kind,
    title,
    revision,
    hash: hash2,
    extractedTextUri,
    text,
    metadata,
    acl,
    status,
    updatedAt
  };
}
|
|
16438
|
-
function
|
|
16439
|
-
|
|
16513
|
+
/**
 * Parses manifest text into an array of item objects.
 *
 * Accepted shapes:
 *  - a JSON array of objects;
 *  - a JSON object with an `items` array of objects;
 *  - a single JSON object, which becomes a one-element manifest;
 *  - JSON Lines, one object per non-empty line.
 *
 * Only a failure of `JSON.parse` on the whole text triggers the JSON Lines
 * fallback. Validation errors ("... entries must be objects.") propagate as-is
 * instead of being replaced by an unrelated per-line syntax error, and a
 * pretty-printed single object is accepted the same way as a one-line one.
 *
 * @param {string} text - Raw manifest contents.
 * @returns {object[]} Manifest items; empty for blank input.
 * @throws {SyntaxError} When the text is not valid JSON or JSON Lines.
 * @throws {Error} When an entry is not an object.
 */
function parseManifestText(text) {
  const trimmed = text.trim();
  if (!trimmed) {
    return [];
  }
  const requireObject = (entry, message) => {
    const item = asObject2(entry);
    if (!item) {
      throw new Error(message);
    }
    return item;
  };
  const nonEmptyLines = () => trimmed.split(/\r?\n/).filter((line) => line.trim().length > 0);
  const parseJsonLines = (lines) => lines.map((line) => requireObject(JSON.parse(line), "Manifest JSONL entries must be objects."));

  if (trimmed.startsWith("[")) {
    const parsed = JSON.parse(trimmed);
    if (!Array.isArray(parsed)) {
      throw new Error("Manifest array parse failed.");
    }
    return parsed.map((entry) => requireObject(entry, "Manifest array entries must be objects."));
  }

  if (trimmed.startsWith("{")) {
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch (error48) {
      // Not one JSON document: treat it as JSON Lines when there are several
      // lines, otherwise surface the original syntax error.
      const lines = nonEmptyLines();
      if (lines.length <= 1) {
        throw error48;
      }
      return parseJsonLines(lines);
    }
    const object2 = requireObject(parsed, "Manifest object parse failed.");
    if (Array.isArray(object2.items)) {
      return object2.items.map((entry) => requireObject(entry, "Manifest items entries must be objects."));
    }
    // A lone object is a one-item manifest whether or not it is pretty-printed.
    return [object2];
  }

  return parseJsonLines(nonEmptyLines());
}
|
|
16563
|
+
/**
 * Downloads an `s3://bucket/key` object and returns its body as text.
 *
 * The bucket is the URI host and the key is the URL-decoded path without
 * leading slashes. When a safety policy is supplied the read is checked with
 * `assertS3ReadAllowed` before the AWS SDK is loaded. Region, profile and
 * retry settings come from `config2.storage.s3` only when the configured
 * storage type is "s3" and its bucket matches the requested bucket.
 *
 * @param {string} uri - S3 URI of the object.
 * @param {object} [config2] - Application config.
 * @param {object} [safetyPolicy] - Optional safety policy.
 * @returns {Promise<string>} Object contents, or "" when the response has no body.
 * @throws {Error} When the URI has no bucket or key.
 */
async function readS3Text2(uri, config2, safetyPolicy) {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ""));
  if (!bucket || !key) {
    throw new Error(`Invalid S3 manifest URI: ${uri}`);
  }
  if (safetyPolicy) {
    assertS3ReadAllowed(uri, safetyPolicy);
  }
  const [s3Module, credentialModule] = await Promise.all([
    import("@aws-sdk/client-s3"),
    import("@aws-sdk/credential-providers")
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialModule;
  let s3Config;
  if (config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket) {
    s3Config = config2.storage.s3;
  }
  const clientOptions = {
    region: s3Config?.region,
    credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
    maxAttempts: s3Config?.max_attempts
  };
  const client = new S3Client(clientOptions);
  const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
  const body = response.Body;
  if (body) {
    return await body.transformToString();
  }
  return "";
}
|
|
16441
|
-
function
|
|
16442
|
-
|
|
16586
|
+
/**
 * Loads manifest text from an `s3://` URI or a local file path.
 *
 * @param {string} input - S3 URI or filesystem path.
 * @param {object} [config2] - Application config, forwarded to the S3 reader.
 * @param {object} [safetyPolicy] - Optional safety policy, forwarded to the S3 reader.
 * @returns {Promise<string>} Manifest contents (local files are read as UTF-8).
 * @throws {Error} "Manifest not found: <input>" when a local path does not exist.
 */
async function readManifestInput(input, config2, safetyPolicy) {
  const isRemote = input.startsWith("s3://");
  if (!isRemote && !existsSync5(input)) {
    throw new Error(`Manifest not found: ${input}`);
  }
  return isRemote
    ? readS3Text2(input, config2, safetyPolicy)
    : readFileSync5(input, "utf8");
}
|
|
16444
|
-
function
|
|
16445
|
-
const
|
|
16446
|
-
|
|
16447
|
-
|
|
16593
|
+
/**
 * Splits text into overlapping chunks of at most roughly `maxChars` characters.
 *
 * CRLF is normalized to LF first. Each chunk ends at the last paragraph break
 * ("\n\n") or sentence break (". ") found in the second half of the window,
 * or at the hard limit when no such break exists. Consecutive chunks overlap
 * by up to `overlapChars` characters. `startOffset`/`endOffset` index the
 * normalized text before the chunk is trimmed.
 *
 * The cursor always moves forward. When the overlap is at least as long as the
 * chunk just produced (possible when a break shortens the chunk), the next
 * chunk starts at the previous end rather than at or before the previous
 * start, which previously caused an endless loop.
 *
 * @param {string} text - Text to split.
 * @param {number} maxChars - Window size in characters; must be >= 1.
 * @param {number} overlapChars - Desired overlap; missing, NaN or negative
 *   values are treated as 0.
 * @returns {{ordinal: number, text: string, startOffset: number, endOffset: number}[]}
 *   Chunks in order; empty for blank input.
 * @throws {RangeError} When `maxChars` is not a number >= 1 and the text is not blank.
 */
function chunkText(text, maxChars, overlapChars) {
  const normalized = text.replace(/\r\n/g, "\n");
  if (!normalized.trim()) {
    return [];
  }
  // The negated form also rejects NaN and undefined.
  if (!(maxChars >= 1)) {
    throw new RangeError("maxChars must be a number greater than or equal to 1.");
  }
  const overlap = overlapChars > 0 ? overlapChars : 0;
  const chunks = [];
  let start = 0;
  while (start < normalized.length) {
    const hardEnd = Math.min(normalized.length, start + maxChars);
    let end = hardEnd;
    if (hardEnd < normalized.length) {
      const paragraphBreak = normalized.lastIndexOf("\n\n", hardEnd);
      const sentenceBreak = normalized.lastIndexOf(". ", hardEnd);
      const candidate = Math.max(paragraphBreak, sentenceBreak);
      // Only accept a break in the second half so chunks do not get tiny.
      if (candidate > start + Math.floor(maxChars * 0.5)) {
        end = candidate + (candidate === paragraphBreak ? 2 : 1);
      }
    }
    const chunk = normalized.slice(start, end).trim();
    if (chunk) {
      chunks.push({
        ordinal: chunks.length,
        text: chunk,
        startOffset: start,
        endOffset: end
      });
    }
    if (end >= normalized.length) {
      break;
    }
    // Step back by the overlap, but never to or before the current start.
    const resume = end - overlap;
    start = resume > start ? resume : end;
  }
  return chunks;
}
|
|
16449
|
-
function
|
|
16450
|
-
|
|
16451
|
-
|
|
16452
|
-
const matched = terms.filter((term) => haystack.includes(term)).length;
|
|
16453
|
-
if (matched === 0)
|
|
16454
|
-
return 0;
|
|
16455
|
-
return roundScore(Math.min(0.85, 0.35 + matched / terms.length * 0.5));
|
|
16627
|
+
/**
 * Rough token estimate: 1.25 tokens per whitespace-separated word, rounded up,
 * with a minimum of 1.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (>= 1).
 */
function estimateTokenCount(text) {
  const wordRuns = text.match(/\S+/g) ?? [];
  const estimate = Math.ceil(wordRuns.length * 1.25);
  return estimate < 1 ? 1 : estimate;
}
|
|
16457
|
-
function
|
|
16458
|
-
|
|
16631
|
+
/**
 * Removes every chunk of a source revision, together with its full-text
 * index row.
 *
 * FTS rows are deleted per chunk id first, then the chunk rows themselves.
 *
 * @param {object} db - Database handle exposing `query(sql).all(...)` and `run(sql, params)`.
 * @param {string} sourceRevisionId - Revision whose chunks are removed.
 * @returns {number} Number of chunk rows that existed before deletion.
 */
function deleteChunksForRevision(db, sourceRevisionId) {
  const existing = db.query("SELECT id FROM chunks WHERE source_revision_id = ?").all(sourceRevisionId);
  existing.forEach(({ id }) => {
    db.run("DELETE FROM chunks_fts WHERE chunk_id = ?", [id]);
  });
  db.run("DELETE FROM chunks WHERE source_revision_id = ?", [sourceRevisionId]);
  return existing.length;
}
|
|
16460
|
-
function
|
|
16461
|
-
|
|
16639
|
+
/**
 * Inserts a source row, or refreshes the existing row with the same URI.
 *
 * On a URI conflict the kind, title, metadata, ACL and `updated_at` columns
 * are overwritten; the id and `created_at` of the existing row are kept. The
 * id is then read back, so the returned value is the stored id.
 *
 * @param {object} db - Database handle.
 * @param {object} item - Normalized manifest item.
 * @param {string} now - Timestamp written to `created_at` for new rows.
 * @returns {string} Id of the source row.
 * @throws {Error} When the row cannot be read back after the upsert.
 */
function upsertSource(db, item, now) {
  const upsertSql = [
    "INSERT INTO sources (id, uri, kind, title, metadata_json, acl_json, created_at, updated_at)",
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
    "ON CONFLICT(uri) DO UPDATE SET",
    "kind = excluded.kind,",
    "title = excluded.title,",
    "metadata_json = excluded.metadata_json,",
    "acl_json = excluded.acl_json,",
    "updated_at = excluded.updated_at"
  ].join("\n");
  const params = [
    stableId4("src", item.sourceUri),
    item.sourceUri,
    item.kind,
    item.title,
    JSON.stringify(item.metadata),
    JSON.stringify(item.acl ?? {}),
    now,
    item.updatedAt
  ];
  db.run(upsertSql, params);
  const stored = db.query("SELECT id FROM sources WHERE uri = ?").get(item.sourceUri);
  if (stored) {
    return stored.id;
  }
  throw new Error(`Failed to upsert source: ${item.sourceUri}`);
}
|
|
16463
|
-
function
|
|
16464
|
-
const
|
|
16465
|
-
|
|
16466
|
-
|
|
16467
|
-
|
|
16468
|
-
|
|
16663
|
+
/**
 * Inserts a source revision row, or refreshes the existing row for the same
 * (source, revision) pair.
 *
 * On conflict the hash, extracted-text URI and metadata are overwritten; the
 * id and `created_at` of the existing row are kept. The id is then read back.
 *
 * @param {object} db - Database handle.
 * @param {string} sourceId - Id of the owning source row.
 * @param {object} item - Normalized manifest item.
 * @param {string} now - Timestamp written to `created_at` for new rows.
 * @returns {string} Id of the revision row.
 * @throws {Error} When the row cannot be read back after the upsert.
 */
function upsertRevision(db, sourceId, item, now) {
  const upsertSql = [
    "INSERT INTO source_revisions (id, source_id, revision, hash, extracted_text_uri, metadata_json, created_at)",
    "VALUES (?, ?, ?, ?, ?, ?, ?)",
    "ON CONFLICT(source_id, revision) DO UPDATE SET",
    "hash = excluded.hash,",
    "extracted_text_uri = excluded.extracted_text_uri,",
    "metadata_json = excluded.metadata_json"
  ].join("\n");
  const params = [
    stableId4("rev", `${sourceId}\x00${item.revision}`),
    sourceId,
    item.revision,
    item.hash,
    item.extractedTextUri,
    JSON.stringify(item.metadata),
    now
  ];
  db.run(upsertSql, params);
  const stored = db.query("SELECT id FROM source_revisions WHERE source_id = ? AND revision = ?").get(sourceId, item.revision);
  if (stored) {
    return stored.id;
  }
  throw new Error(`Failed to upsert source revision: ${item.sourceRef}`);
}
|
|
16470
|
-
function
|
|
16471
|
-
|
|
16472
|
-
|
|
16684
|
+
/**
 * Redacts, chunks and stores the text of one manifest item under a revision.
 *
 * Nothing is written when the item has no text or its status is "deleted"
 * (case-insensitive). When redaction reports findings, they are recorded and
 * an audit event is written before chunking. Every chunk is inserted into
 * `chunks` and mirrored into `chunks_fts`.
 *
 * @param {object} db - Database handle.
 * @param {string} sourceRevisionId - Revision row that owns the chunks.
 * @param {object} item - Normalized manifest item.
 * @param {string} now - Timestamp written to the created rows.
 * @param {number} maxChars - Chunk window size.
 * @param {number} overlapChars - Overlap between consecutive chunks.
 * @param {object} [safetyPolicy] - Policy forwarded to `redactSecrets`.
 * @returns {{chunksInserted: number, redactions: number}} Counts for this item.
 */
function insertChunks(db, sourceRevisionId, item, now, maxChars, overlapChars, safetyPolicy) {
  if (!item.text || item.status.toLowerCase() === "deleted") {
    return { chunksInserted: 0, redactions: 0 };
  }
  const redacted = redactSecrets(item.text, safetyPolicy);
  if (redacted.findings.length > 0) {
    recordRedactionFindings(db, {
      source_uri: item.sourceUri,
      findings: redacted.findings,
      metadata: { source_ref: item.sourceRef, revision: item.revision },
      created_at: now
    });
    recordAuditEvent(db, {
      event_type: "redaction",
      action: "source_text_redact",
      target_uri: item.sourceUri,
      decision: "redacted",
      metadata: { findings: redacted.findings.length, source_ref: item.sourceRef, revision: item.revision },
      created_at: now
    });
  }
  const insertChunkSql = [
    "INSERT INTO chunks (id, source_revision_id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json, created_at)",
    "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
  ].join("\n");
  const pieces = chunkText(redacted.text, maxChars, overlapChars);
  pieces.forEach((piece) => {
    const chunkId = stableId4("chk", `${sourceRevisionId}\x00${piece.ordinal}\x00${piece.text}`);
    // Fields shared by the provenance record and the stored metadata.
    const sourceFields = {
      source_ref: item.sourceRef,
      source_uri: item.sourceUri,
      source_kind: item.kind,
      source_revision_id: sourceRevisionId,
      revision: item.revision,
      hash: item.hash
    };
    const provenance = sourceProvenance({
      ...sourceFields,
      chunk_id: chunkId,
      start_offset: piece.startOffset,
      end_offset: piece.endOffset,
      status: item.status,
      resolver: "open-files-read-only"
    });
    const metadata = withProvenance({
      ...sourceFields,
      status: item.status,
      path: asString2(item.raw.path) ?? null,
      mime: asString2(item.raw.mime) ?? asString2(item.raw.content_type) ?? null,
      size: asNumber(item.raw.size) ?? null
    }, provenance);
    db.run(insertChunkSql, [
      chunkId,
      sourceRevisionId,
      "source",
      piece.ordinal,
      piece.text,
      estimateTokenCount(piece.text),
      piece.startOffset,
      piece.endOffset,
      JSON.stringify(metadata),
      now
    ]);
    db.run("INSERT INTO chunks_fts (chunk_id, text, title, source_uri) VALUES (?, ?, ?, ?)", [chunkId, piece.text, item.title ?? "", item.sourceUri]);
  });
  return { chunksInserted: pieces.length, redactions: redacted.findings.length };
}
|
|
16474
|
-
function
|
|
16475
|
-
const
|
|
16476
|
-
|
|
16477
|
-
|
|
16478
|
-
|
|
16479
|
-
|
|
16480
|
-
|
|
16481
|
-
return
|
|
16482
|
-
|
|
16483
|
-
|
|
16484
|
-
|
|
16485
|
-
|
|
16486
|
-
|
|
16487
|
-
|
|
16488
|
-
|
|
16489
|
-
start_offset: row.start_offset ?? metadataNumber3(metadata, ["start_offset"]),
|
|
16490
|
-
end_offset: row.end_offset ?? metadataNumber3(metadata, ["end_offset"]),
|
|
16491
|
-
status: metadataString3(metadata, ["status"]),
|
|
16492
|
-
resolver: "open-files-read-only"
|
|
16749
|
+
/**
 * Reads a manifest from a local path or S3 URI and ingests its items.
 *
 * The database write check and migration run before the manifest is read.
 * Parsed items are then handed to `ingestOpenFilesManifestItems`, with the
 * input location used as the source label.
 *
 * @param {object} options - `dbPath`, `input`, and optional `config`,
 *   `safetyPolicy`, `now`, `maxChunkChars`, `chunkOverlapChars`.
 * @returns {Promise<object>} The summary returned by `ingestOpenFilesManifestItems`.
 */
async function ingestOpenFilesManifest(options) {
  const now = options.now ?? new Date();
  const { dbPath, input, safetyPolicy } = options;
  if (safetyPolicy) {
    assertWriteAllowed(dbPath, safetyPolicy);
  }
  migrateKnowledgeDb(dbPath);
  const manifestText = await readManifestInput(input, options.config, safetyPolicy);
  const items = parseManifestText(manifestText);
  const request = {
    dbPath,
    items,
    sourceLabel: input,
    safetyPolicy,
    now,
    maxChunkChars: options.maxChunkChars,
    chunkOverlapChars: options.chunkOverlapChars
  };
  return ingestOpenFilesManifestItems(request);
}
|
|
16495
|
-
function
|
|
16496
|
-
|
|
16497
|
-
|
|
16498
|
-
|
|
16499
|
-
|
|
16500
|
-
|
|
16501
|
-
|
|
16502
|
-
|
|
16503
|
-
|
|
16504
|
-
|
|
16505
|
-
|
|
16506
|
-
|
|
16507
|
-
|
|
16508
|
-
|
|
16509
|
-
|
|
16510
|
-
|
|
16511
|
-
|
|
16512
|
-
|
|
16513
|
-
|
|
16514
|
-
|
|
16515
|
-
|
|
16516
|
-
|
|
16517
|
-
|
|
16518
|
-
|
|
16519
|
-
|
|
16520
|
-
|
|
16521
|
-
|
|
16522
|
-
|
|
16523
|
-
|
|
16524
|
-
|
|
16525
|
-
|
|
16526
|
-
|
|
16527
|
-
|
|
16766
|
+
/**
 * Ingests already-parsed manifest items into the knowledge database inside a
 * single transaction.
 *
 * For each item the source and revision rows are upserted. Existing chunks of
 * the revision are removed when the item carries text or is marked "deleted",
 * and new chunks are inserted from the item's text. A "source_read" audit
 * event is recorded before the items and a "write" event after them. The
 * database handle is always closed.
 *
 * @param {object} options - `dbPath`, `items`, `sourceLabel`, and optional
 *   `safetyPolicy`, `now`, `readAction`, `maxChunkChars` (default 4000,
 *   minimum 500), `chunkOverlapChars` (default 200, below `maxChunkChars`).
 * @returns {Promise<object>} Summary counters for the run. `skipped` is
 *   always 0 because nothing in this function increments it.
 * @throws {Error} When the chunk size or overlap is out of range.
 */
async function ingestOpenFilesManifestItems(options) {
  const now = (options.now ?? new Date()).toISOString();
  const maxChunkChars = options.maxChunkChars ?? 4000;
  const chunkOverlapChars = options.chunkOverlapChars ?? 200;
  if (maxChunkChars < 500) {
    throw new Error("maxChunkChars must be at least 500.");
  }
  if (chunkOverlapChars < 0 || chunkOverlapChars >= maxChunkChars) {
    throw new Error("chunkOverlapChars must be less than maxChunkChars.");
  }
  if (options.safetyPolicy) {
    assertWriteAllowed(options.dbPath, options.safetyPolicy);
  }
  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    const runIngest = db.transaction(() => {
      const sourceIds = new Set();
      const revisionIds = new Set();
      let insertedTotal = 0;
      let deletedTotal = 0;
      let redactionTotal = 0;
      const defaultReadAction = () => (options.sourceLabel.startsWith("s3://") ? "s3_manifest_read" : "local_manifest_read");
      recordAuditEvent(db, {
        event_type: "source_read",
        action: options.readAction ?? defaultReadAction(),
        target_uri: options.sourceLabel,
        decision: "allow",
        metadata: { items: options.items.length, read_only: true },
        created_at: now
      });
      for (const raw of options.items) {
        const item = normalizeManifestItem(raw, now);
        const sourceId = upsertSource(db, item, now);
        const revisionId = upsertRevision(db, sourceId, item, now);
        sourceIds.add(sourceId);
        revisionIds.add(revisionId);
        // Items without text keep their existing chunks unless marked deleted.
        if (item.text || item.status.toLowerCase() === "deleted") {
          deletedTotal += deleteChunksForRevision(db, revisionId);
        }
        const outcome = insertChunks(db, revisionId, item, now, maxChunkChars, chunkOverlapChars, options.safetyPolicy);
        insertedTotal += outcome.chunksInserted;
        redactionTotal += outcome.redactions;
      }
      recordAuditEvent(db, {
        event_type: "write",
        action: "knowledge_manifest_ingest",
        target_uri: options.dbPath,
        decision: "allow",
        metadata: {
          items: options.items.length,
          sources: sourceIds.size,
          revisions: revisionIds.size,
          chunks_inserted: insertedTotal,
          redactions: redactionTotal
        },
        created_at: now
      });
      return {
        path: options.sourceLabel,
        db_path: options.dbPath,
        items_seen: options.items.length,
        sources_upserted: sourceIds.size,
        revisions_upserted: revisionIds.size,
        chunks_inserted: insertedTotal,
        chunks_deleted: deletedTotal,
        redactions: redactionTotal,
        skipped: 0
      };
    });
    return runIngest();
  } finally {
    db.close();
  }
}
|
|
16529
|
-
|
|
16530
|
-
|
|
16531
|
-
|
|
16532
|
-
|
|
16533
|
-
|
|
16832
|
+
|
|
16833
|
+
// src/source-ingest.ts
|
|
16834
|
+
import { createHash as createHash6 } from "crypto";
|
|
16835
|
+
import { existsSync as existsSync6, readFileSync as readFileSync6 } from "fs";
|
|
16836
|
+
import { basename as basename3 } from "path";
|
|
16837
|
+
|
|
16838
|
+
// src/source-resolver.ts
|
|
16839
|
+
/**
 * Parses a JSON string, tolerating bad input.
 *
 * @param {string|null|undefined} value - JSON text.
 * @returns {object} The parsed value when it is a non-array object. An empty
 *   object is returned for falsy input, invalid JSON, arrays, primitives and null.
 */
function parseJsonObject3(value) {
  let decoded = null;
  if (value) {
    try {
      decoded = JSON.parse(value);
    } catch {
      decoded = null;
    }
  }
  const isRecord = decoded !== null && typeof decoded === "object" && !Array.isArray(decoded);
  return isRecord ? decoded : {};
}
|
|
16535
|
-
function
|
|
16536
|
-
const
|
|
16537
|
-
|
|
16538
|
-
|
|
16539
|
-
|
|
16540
|
-
|
|
16541
|
-
|
|
16849
|
+
/**
 * Returns the first non-empty string found in `metadata` under the given keys.
 *
 * @param {object} metadata - Metadata record.
 * @param {string[]} keys - Keys to try, in priority order.
 * @returns {string|null} The first non-empty string value, or null.
 */
function metadataString3(metadata, keys) {
  const match = keys
    .map((key) => metadata[key])
    .find((value) => typeof value === "string" && value.length > 0);
  return match ?? null;
}
|
|
16543
|
-
function
|
|
16544
|
-
const
|
|
16545
|
-
|
|
16546
|
-
|
|
16547
|
-
|
|
16548
|
-
|
|
16549
|
-
|
|
16857
|
+
/**
 * Returns the first finite number found in `metadata` under the given keys.
 *
 * @param {object} metadata - Metadata record.
 * @param {string[]} keys - Keys to try, in priority order.
 * @returns {number|null} The first finite numeric value (0 included), or null.
 */
function metadataNumber3(metadata, keys) {
  const match = keys
    .map((key) => metadata[key])
    .find((value) => typeof value === "number" && Number.isFinite(value));
  return match ?? null;
}
|
|
16551
|
-
function
|
|
16552
|
-
const
|
|
16553
|
-
|
|
16554
|
-
|
|
16555
|
-
|
|
16556
|
-
const
|
|
16557
|
-
|
|
16558
|
-
|
|
16559
|
-
|
|
16560
|
-
|
|
16561
|
-
|
|
16562
|
-
|
|
16563
|
-
|
|
16564
|
-
source: sourceUri || sourceRef ? {
|
|
16565
|
-
uri: sourceUri,
|
|
16566
|
-
ref: sourceRef,
|
|
16567
|
-
kind: row.source_kind ?? metadataString3(metadata, ["source_kind"]),
|
|
16568
|
-
revision: row.revision ?? metadataString3(metadata, ["revision"]),
|
|
16569
|
-
hash: row.hash ?? metadataString3(metadata, ["hash"])
|
|
16570
|
-
} : null,
|
|
16571
|
-
citation: {
|
|
16572
|
-
chunk_id: row.chunk_id,
|
|
16573
|
-
start_offset: row.start_offset,
|
|
16574
|
-
end_offset: row.end_offset
|
|
16575
|
-
},
|
|
16576
|
-
artifact: isWiki ? {
|
|
16577
|
-
uri: row.wiki_artifact_uri,
|
|
16578
|
-
path: row.wiki_path,
|
|
16579
|
-
hash: row.wiki_content_hash,
|
|
16580
|
-
shard_key: row.wiki_path
|
|
16581
|
-
} : null,
|
|
16582
|
-
provenance,
|
|
16583
|
-
reasons: ["keyword_match"]
|
|
16584
|
-
};
|
|
16585
|
-
result.score = combinedScore(result.scores, result.citation);
|
|
16586
|
-
return result;
|
|
16865
|
+
/**
 * Enforces a source's permission record for a requested purpose.
 *
 * Checks run in this order:
 *  1. a string `mode` other than "read_only" is rejected;
 *  2. a purpose listed in `denied_purposes` is rejected;
 *  3. a non-empty `allowed_purposes` list must contain the purpose.
 * Missing or non-array fields impose no restriction.
 *
 * @param {object} permissions - Parsed ACL of the source.
 * @param {string} purpose - Purpose the caller is resolving the source for.
 * @throws {Error} When any check fails.
 */
function assertPurposeAllowed(permissions, purpose) {
  const { mode, denied_purposes: denied, allowed_purposes: allowed } = permissions;
  const modeIsWrong = typeof mode === "string" && mode !== "read_only";
  if (modeIsWrong) {
    throw new Error(`Source resolver denied ${purpose}. Permission mode is ${mode}, expected read_only.`);
  }
  const explicitlyDenied = Array.isArray(denied) && denied.includes(purpose);
  if (explicitlyDenied) {
    throw new Error(`Source resolver denied ${purpose}. Purpose is explicitly denied.`);
  }
  const hasAllowList = Array.isArray(allowed) && allowed.length > 0;
  if (hasAllowList && !allowed.includes(purpose)) {
    throw new Error(`Source resolver denied ${purpose}. Allowed purposes: ${allowed.join(", ")}`);
  }
}
|
|
16588
|
-
function
|
|
16589
|
-
|
|
16590
|
-
|
|
16591
|
-
|
|
16592
|
-
|
|
16593
|
-
|
|
16594
|
-
|
|
16595
|
-
|
|
16596
|
-
|
|
16597
|
-
|
|
16598
|
-
|
|
16599
|
-
|
|
16600
|
-
artifact: {
|
|
16601
|
-
uri: row.artifact_uri,
|
|
16602
|
-
path: row.path,
|
|
16603
|
-
hash: row.content_hash,
|
|
16604
|
-
shard_key: row.path
|
|
16605
|
-
},
|
|
16606
|
-
provenance: existingProvenance(metadata),
|
|
16607
|
-
reasons: ["wiki_catalog_match"]
|
|
16608
|
-
};
|
|
16609
|
-
result.score = combinedScore(result.scores, result.citation);
|
|
16610
|
-
return result;
|
|
16879
|
+
/**
 * Builds a revision-qualified source reference when possible.
 *
 * For a source URI that parses as an open-files "file" entity, the result is
 * `<sourceUri>/revision/<url-encoded revision label>`. In every other case,
 * including a missing revision row or any error while parsing or encoding,
 * the supplied fallback is returned unchanged.
 *
 * @param {string} sourceUri - Catalog URI of the source.
 * @param {{revision: string}|null} revision - Revision row, if one was found.
 * @param {string} fallback - Reference to use when no revision ref can be built.
 * @returns {string} The revision-qualified reference or the fallback.
 */
function sourceRevisionRef(sourceUri, revision, fallback) {
  let resolved = fallback;
  if (revision) {
    try {
      const parsed = parseSourceRef(sourceUri);
      const isOpenFilesFile = parsed.kind === "open-files" && parsed.entity === "file";
      if (isOpenFilesFile) {
        resolved = `${sourceUri}/revision/${encodeURIComponent(revision.revision)}`;
      }
    } catch {
      resolved = fallback;
    }
  }
  return resolved;
}
|
|
16612
|
-
function
|
|
16613
|
-
|
|
16614
|
-
|
|
16615
|
-
|
|
16616
|
-
|
|
16617
|
-
|
|
16618
|
-
title: row.name,
|
|
16619
|
-
text: null,
|
|
16620
|
-
score: 0,
|
|
16621
|
-
scores: { catalog: score },
|
|
16622
|
-
source: null,
|
|
16623
|
-
citation: null,
|
|
16624
|
-
artifact: {
|
|
16625
|
-
uri: row.artifact_uri,
|
|
16626
|
-
path: metadataString3(metadata, ["artifact_key"]),
|
|
16627
|
-
hash: metadataString3(metadata, ["content_hash"]),
|
|
16628
|
-
shard_key: row.shard_key
|
|
16629
|
-
},
|
|
16630
|
-
provenance: existingProvenance(metadata),
|
|
16631
|
-
reasons: ["index_catalog_match"]
|
|
16632
|
-
};
|
|
16633
|
-
result.score = combinedScore(result.scores, result.citation);
|
|
16634
|
-
return result;
|
|
16892
|
+
/**
 * Looks up a source row by its catalog URI or by the reference as requested.
 *
 * When both values match different rows, the row whose URI equals
 * `sourceUri` is preferred.
 *
 * @param {object} db - Database handle.
 * @param {string} sourceUri - Catalog URI derived from the reference.
 * @param {string} requestedRef - Reference exactly as supplied by the caller.
 * @returns {object|null} The source row, or null when nothing matches.
 */
function selectSource(db, sourceUri, requestedRef) {
  const lookupSql = [
    "SELECT id, uri, kind, title, metadata_json, acl_json, updated_at",
    "FROM sources",
    "WHERE uri = ? OR uri = ?",
    "ORDER BY CASE WHEN uri = ? THEN 0 ELSE 1 END",
    "LIMIT 1"
  ].join("\n");
  const match = db.query(lookupSql).get(sourceUri, requestedRef, sourceUri);
  return match ?? null;
}
|
|
16636
|
-
function
|
|
16637
|
-
|
|
16638
|
-
|
|
16639
|
-
|
|
16640
|
-
|
|
16641
|
-
|
|
16899
|
+
/**
 * Fetches one revision row of a source.
 *
 * With a revision label, the row for exactly that label is returned. Without
 * one, the row that sorts first by `created_at` descending, then `revision`
 * descending, is returned.
 *
 * @param {object} db - Database handle.
 * @param {string} sourceId - Id of the source row.
 * @param {string|null} revisionId - Requested revision label, if any.
 * @returns {object|null} The revision row, or null when none exists.
 */
function selectRevision(db, sourceId, revisionId) {
  const projection = "SELECT id, revision, hash, extracted_text_uri, metadata_json, created_at";
  let row;
  if (revisionId) {
    const exactSql = [
      projection,
      "FROM source_revisions",
      "WHERE source_id = ? AND revision = ?",
      "LIMIT 1"
    ].join("\n");
    row = db.query(exactSql).get(sourceId, revisionId);
  } else {
    const latestSql = [
      projection,
      "FROM source_revisions",
      "WHERE source_id = ?",
      "ORDER BY created_at DESC, revision DESC",
      "LIMIT 1"
    ].join("\n");
    row = db.query(latestSql).get(sourceId);
  }
  return row ?? null;
}
|
|
16657
|
-
function
|
|
16658
|
-
|
|
16659
|
-
|
|
16660
|
-
|
|
16661
|
-
|
|
16662
|
-
|
|
16663
|
-
|
|
16664
|
-
|
|
16665
|
-
|
|
16666
|
-
|
|
16667
|
-
|
|
16668
|
-
|
|
16912
|
+
/**
 * Counts the chunks stored for a source revision.
 *
 * @param {object} db - Database handle.
 * @param {string|null} revisionId - Revision row id; falsy means "no revision".
 * @returns {number} Chunk count, or 0 when there is no revision or no result row.
 */
function countChunks(db, revisionId) {
  if (revisionId) {
    const tally = db.query("SELECT COUNT(*) AS n FROM chunks WHERE source_revision_id = ?").get(revisionId);
    return tally?.n ?? 0;
  }
  return 0;
}
|
|
16918
|
+
/**
 * Reads the first `limit` chunks of a source revision in ordinal order.
 *
 * @param {object} db - Database handle.
 * @param {string|null} revisionId - Revision row id; falsy means "no revision".
 * @param {number} limit - Maximum rows to return; values <= 0 return nothing.
 * @returns {object[]} Chunk rows, or an empty array when there is nothing to query.
 */
function selectChunks(db, revisionId, limit) {
  const nothingToRead = !revisionId || limit <= 0;
  if (nothingToRead) {
    return [];
  }
  const chunkSql = [
    "SELECT id, kind, ordinal, text, token_count, start_offset, end_offset, metadata_json",
    "FROM chunks",
    "WHERE source_revision_id = ?",
    "ORDER BY ordinal ASC",
    "LIMIT ?"
  ].join("\n");
  return db.query(chunkSql).all(revisionId, limit);
}
|
|
16670
|
-
async function
|
|
16671
|
-
const
|
|
16672
|
-
|
|
16673
|
-
|
|
16674
|
-
const
|
|
16675
|
-
const
|
|
16676
|
-
const
|
|
16677
|
-
|
|
16678
|
-
|
|
16679
|
-
|
|
16680
|
-
|
|
16681
|
-
|
|
16682
|
-
let keywordCount = 0;
|
|
16683
|
-
let catalogCount = 0;
|
|
16684
|
-
let semanticCount = 0;
|
|
16685
|
-
const merged = new Map;
|
|
16927
|
+
async function resolveOpenFilesSource(options) {
|
|
16928
|
+
const purpose = options.purpose ?? "knowledge_answer";
|
|
16929
|
+
const limit = Math.max(0, Math.min(options.limit ?? 10, 100));
|
|
16930
|
+
const resolvedAt = (options.now ?? new Date).toISOString();
|
|
16931
|
+
const parsed = parseSourceRef(options.sourceRef);
|
|
16932
|
+
const sourceUri = catalogSourceUriForRef(options.sourceRef, parsed);
|
|
16933
|
+
const requestedRevision = revisionIdForSourceRef(options.sourceRef);
|
|
16934
|
+
if (options.safetyPolicy) {
|
|
16935
|
+
if (!options.safetyPolicy.readOnlySourceAccess)
|
|
16936
|
+
throw new Error("Safety policy denied source resolution.");
|
|
16937
|
+
assertWriteAllowed(options.dbPath, options.safetyPolicy);
|
|
16938
|
+
}
|
|
16686
16939
|
migrateKnowledgeDb(options.dbPath);
|
|
16687
16940
|
const db = openKnowledgeDb(options.dbPath);
|
|
16688
16941
|
try {
|
|
16689
|
-
|
|
16690
|
-
|
|
16691
|
-
|
|
16692
|
-
|
|
16693
|
-
|
|
16694
|
-
|
|
16695
|
-
|
|
16696
|
-
|
|
16697
|
-
|
|
16698
|
-
|
|
16699
|
-
|
|
16700
|
-
|
|
16701
|
-
|
|
16702
|
-
|
|
16703
|
-
|
|
16704
|
-
|
|
16705
|
-
|
|
16706
|
-
|
|
16707
|
-
|
|
16708
|
-
|
|
16709
|
-
|
|
16710
|
-
fake: options.fake,
|
|
16711
|
-
batchSize: options.batchSize,
|
|
16712
|
-
maxParallelCalls: options.maxParallelCalls
|
|
16713
|
-
});
|
|
16714
|
-
semanticProvider = semantic.provider;
|
|
16715
|
-
semanticModel = semantic.model;
|
|
16716
|
-
semanticDimensions = semantic.dimensions;
|
|
16717
|
-
semanticCount = semantic.results.length;
|
|
16718
|
-
for (const row of semantic.results) {
|
|
16719
|
-
const result = {
|
|
16720
|
-
kind: "source_chunk",
|
|
16721
|
-
id: row.chunk_id,
|
|
16722
|
-
title: null,
|
|
16723
|
-
text: row.text,
|
|
16724
|
-
score: 0,
|
|
16725
|
-
scores: { semantic: semanticScore(row.score) },
|
|
16726
|
-
source: {
|
|
16727
|
-
uri: row.source_uri,
|
|
16728
|
-
ref: row.source_ref,
|
|
16729
|
-
kind: row.provenance?.source_kind ?? null,
|
|
16730
|
-
revision: row.revision,
|
|
16731
|
-
hash: row.hash
|
|
16942
|
+
return db.transaction(() => {
|
|
16943
|
+
const source = selectSource(db, sourceUri, options.sourceRef);
|
|
16944
|
+
if (!source) {
|
|
16945
|
+
recordAuditEvent(db, {
|
|
16946
|
+
event_type: "source_read",
|
|
16947
|
+
action: "open_files_resolve_missing",
|
|
16948
|
+
target_uri: options.sourceRef,
|
|
16949
|
+
decision: "allow",
|
|
16950
|
+
metadata: { purpose, read_only: true, source_uri: sourceUri },
|
|
16951
|
+
created_at: resolvedAt
|
|
16952
|
+
});
|
|
16953
|
+
return {
|
|
16954
|
+
source_ref: options.sourceRef,
|
|
16955
|
+
source_uri: sourceUri,
|
|
16956
|
+
purpose,
|
|
16957
|
+
read_only: true,
|
|
16958
|
+
resolved: false,
|
|
16959
|
+
resolver: {
|
|
16960
|
+
name: "open-files-read-only",
|
|
16961
|
+
mode: "local_catalog",
|
|
16962
|
+
contract: "open-files-knowledge-source-v1"
|
|
16732
16963
|
},
|
|
16733
|
-
|
|
16734
|
-
|
|
16735
|
-
|
|
16736
|
-
|
|
16964
|
+
source: null,
|
|
16965
|
+
revision: null,
|
|
16966
|
+
content: {
|
|
16967
|
+
mime: null,
|
|
16968
|
+
size: null,
|
|
16969
|
+
hash: null,
|
|
16970
|
+
text_available: false,
|
|
16971
|
+
chunks_total: 0,
|
|
16972
|
+
chunks_returned: 0,
|
|
16973
|
+
char_count_returned: 0,
|
|
16974
|
+
extracted_text_ref: null,
|
|
16975
|
+
bytes_available: false,
|
|
16976
|
+
bytes_exposed: false
|
|
16737
16977
|
},
|
|
16738
|
-
|
|
16739
|
-
|
|
16740
|
-
reasons: ["semantic_match"]
|
|
16978
|
+
chunks: [],
|
|
16979
|
+
citations: []
|
|
16741
16980
|
};
|
|
16742
|
-
result.score = combinedScore(result.scores, result.citation);
|
|
16743
|
-
mergeResult(merged, result);
|
|
16744
16981
|
}
|
|
16745
|
-
|
|
16746
|
-
|
|
16747
|
-
|
|
16982
|
+
const sourceMetadata = parseJsonObject3(source.metadata_json);
|
|
16983
|
+
const permissions = parseJsonObject3(source.acl_json);
|
|
16984
|
+
try {
|
|
16985
|
+
assertPurposeAllowed(permissions, purpose);
|
|
16986
|
+
} catch (error48) {
|
|
16987
|
+
recordAuditEvent(db, {
|
|
16988
|
+
event_type: "source_read",
|
|
16989
|
+
action: "open_files_resolve",
|
|
16990
|
+
target_uri: options.sourceRef,
|
|
16991
|
+
decision: "deny",
|
|
16992
|
+
metadata: {
|
|
16993
|
+
purpose,
|
|
16994
|
+
read_only: true,
|
|
16995
|
+
source_uri: source.uri,
|
|
16996
|
+
error: error48 instanceof Error ? error48.message : String(error48)
|
|
16997
|
+
},
|
|
16998
|
+
created_at: resolvedAt
|
|
16999
|
+
});
|
|
17000
|
+
throw error48;
|
|
17001
|
+
}
|
|
17002
|
+
const revision = selectRevision(db, source.id, requestedRevision);
|
|
17003
|
+
const revisionMetadata = parseJsonObject3(revision?.metadata_json);
|
|
17004
|
+
const totalChunks = countChunks(db, revision?.id ?? null);
|
|
17005
|
+
const rows = selectChunks(db, revision?.id ?? null, limit);
|
|
17006
|
+
const effectiveSourceRef = sourceRevisionRef(source.uri, revision, options.sourceRef);
|
|
17007
|
+
const chunks = rows.map((row) => {
|
|
17008
|
+
const metadata = parseJsonObject3(row.metadata_json);
|
|
17009
|
+
const evidence = {
|
|
17010
|
+
resolver: "open-files-read-only",
|
|
17011
|
+
mode: "local_catalog",
|
|
17012
|
+
purpose,
|
|
17013
|
+
read_only: true,
|
|
17014
|
+
source_ref: metadataString3(metadata, ["source_ref"]) ?? effectiveSourceRef,
|
|
17015
|
+
source_uri: source.uri,
|
|
17016
|
+
source_revision_id: revision?.id ?? null,
|
|
17017
|
+
revision: revision?.revision ?? null,
|
|
17018
|
+
hash: revision?.hash ?? metadataString3(metadata, ["hash"]),
|
|
17019
|
+
chunk_id: row.id,
|
|
17020
|
+
start_offset: row.start_offset,
|
|
17021
|
+
end_offset: row.end_offset,
|
|
17022
|
+
resolved_at: resolvedAt
|
|
17023
|
+
};
|
|
17024
|
+
const provenance = sourceProvenance({
|
|
17025
|
+
source_ref: evidence.source_ref,
|
|
17026
|
+
source_uri: evidence.source_uri,
|
|
17027
|
+
source_kind: source.kind,
|
|
17028
|
+
source_revision_id: evidence.source_revision_id,
|
|
17029
|
+
revision: evidence.revision,
|
|
17030
|
+
hash: evidence.hash,
|
|
17031
|
+
chunk_id: row.id,
|
|
17032
|
+
start_offset: row.start_offset,
|
|
17033
|
+
end_offset: row.end_offset,
|
|
17034
|
+
status: metadataString3(metadata, ["status"]),
|
|
17035
|
+
resolver: evidence.resolver
|
|
17036
|
+
});
|
|
17037
|
+
return {
|
|
17038
|
+
id: row.id,
|
|
17039
|
+
kind: row.kind,
|
|
17040
|
+
ordinal: row.ordinal,
|
|
17041
|
+
text: row.text,
|
|
17042
|
+
token_count: row.token_count,
|
|
17043
|
+
start_offset: row.start_offset,
|
|
17044
|
+
end_offset: row.end_offset,
|
|
17045
|
+
metadata,
|
|
17046
|
+
evidence,
|
|
17047
|
+
provenance
|
|
17048
|
+
};
|
|
17049
|
+
});
|
|
17050
|
+
const citations = chunks.map((chunk) => ({
|
|
17051
|
+
source_ref: chunk.evidence.source_ref,
|
|
17052
|
+
source_uri: source.uri,
|
|
17053
|
+
chunk_id: chunk.id,
|
|
17054
|
+
quote: chunk.text.slice(0, 500),
|
|
17055
|
+
start_offset: chunk.start_offset,
|
|
17056
|
+
end_offset: chunk.end_offset,
|
|
17057
|
+
evidence: chunk.evidence,
|
|
17058
|
+
provenance: chunk.provenance
|
|
17059
|
+
}));
|
|
17060
|
+
recordAuditEvent(db, {
|
|
17061
|
+
event_type: "source_read",
|
|
17062
|
+
action: "open_files_resolve",
|
|
17063
|
+
target_uri: options.sourceRef,
|
|
17064
|
+
decision: "allow",
|
|
17065
|
+
metadata: {
|
|
17066
|
+
purpose,
|
|
17067
|
+
read_only: true,
|
|
17068
|
+
source_uri: source.uri,
|
|
17069
|
+
revision: revision?.revision ?? null,
|
|
17070
|
+
chunks_returned: chunks.length,
|
|
17071
|
+
chunks_total: totalChunks
|
|
17072
|
+
},
|
|
17073
|
+
created_at: resolvedAt
|
|
17074
|
+
});
|
|
17075
|
+
const mime = metadataString3(sourceMetadata, ["mime", "content_type"]) ?? metadataString3(revisionMetadata, ["mime", "content_type"]);
|
|
17076
|
+
const size = metadataNumber3(sourceMetadata, ["size", "size_bytes"]) ?? metadataNumber3(revisionMetadata, ["size", "size_bytes"]);
|
|
17077
|
+
return {
|
|
17078
|
+
source_ref: effectiveSourceRef,
|
|
17079
|
+
source_uri: source.uri,
|
|
17080
|
+
purpose,
|
|
17081
|
+
read_only: true,
|
|
17082
|
+
resolved: true,
|
|
17083
|
+
resolver: {
|
|
17084
|
+
name: "open-files-read-only",
|
|
17085
|
+
mode: "local_catalog",
|
|
17086
|
+
contract: "open-files-knowledge-source-v1"
|
|
17087
|
+
},
|
|
17088
|
+
source: {
|
|
17089
|
+
id: source.id,
|
|
17090
|
+
uri: source.uri,
|
|
17091
|
+
kind: source.kind,
|
|
17092
|
+
title: source.title,
|
|
17093
|
+
metadata: sourceMetadata,
|
|
17094
|
+
permissions,
|
|
17095
|
+
updated_at: source.updated_at
|
|
17096
|
+
},
|
|
17097
|
+
revision: revision ? {
|
|
17098
|
+
id: revision.id,
|
|
17099
|
+
revision: revision.revision,
|
|
17100
|
+
hash: revision.hash,
|
|
17101
|
+
extracted_text_uri: revision.extracted_text_uri,
|
|
17102
|
+
metadata: revisionMetadata,
|
|
17103
|
+
created_at: revision.created_at,
|
|
17104
|
+
reindex_required: revisionMetadata.reindex_required === true
|
|
17105
|
+
} : null,
|
|
17106
|
+
content: {
|
|
17107
|
+
mime,
|
|
17108
|
+
size,
|
|
17109
|
+
hash: revision?.hash ?? metadataString3(sourceMetadata, ["hash", "checksum", "sha256"]),
|
|
17110
|
+
text_available: totalChunks > 0,
|
|
17111
|
+
chunks_total: totalChunks,
|
|
17112
|
+
chunks_returned: chunks.length,
|
|
17113
|
+
char_count_returned: chunks.reduce((sum, chunk) => sum + chunk.text.length, 0),
|
|
17114
|
+
extracted_text_ref: revision?.extracted_text_uri ?? metadataString3(revisionMetadata, ["extracted_text_ref", "extracted_text_uri"]),
|
|
17115
|
+
bytes_available: false,
|
|
17116
|
+
bytes_exposed: false
|
|
17117
|
+
},
|
|
17118
|
+
chunks,
|
|
17119
|
+
citations
|
|
17120
|
+
};
|
|
17121
|
+
})();
|
|
17122
|
+
} finally {
|
|
17123
|
+
db.close();
|
|
16748
17124
|
}
|
|
16749
|
-
const results = sortResults(Array.from(merged.values())).slice(0, limit);
|
|
16750
|
-
return {
|
|
16751
|
-
query,
|
|
16752
|
-
limit,
|
|
16753
|
-
mode: {
|
|
16754
|
-
keyword: true,
|
|
16755
|
-
catalog: true,
|
|
16756
|
-
semantic: semanticEnabled
|
|
16757
|
-
},
|
|
16758
|
-
semantic_provider: semanticProvider,
|
|
16759
|
-
semantic_model: semanticModel,
|
|
16760
|
-
semantic_dimensions: semanticDimensions,
|
|
16761
|
-
counts: {
|
|
16762
|
-
keyword_results: keywordCount,
|
|
16763
|
-
catalog_results: catalogCount,
|
|
16764
|
-
semantic_results: semanticCount,
|
|
16765
|
-
merged_results: results.length
|
|
16766
|
-
},
|
|
16767
|
-
warnings,
|
|
16768
|
-
results
|
|
16769
|
-
};
|
|
16770
17125
|
}
|
|
16771
17126
|
|
|
16772
|
-
// src/
|
|
16773
|
-
function
|
|
16774
|
-
return
|
|
16775
|
-
}
|
|
16776
|
-
function normalizeQuery(query) {
|
|
16777
|
-
return query.normalize("NFKC").trim().replace(/\s+/g, " ").toLowerCase();
|
|
16778
|
-
}
|
|
16779
|
-
function queryTerms2(query) {
|
|
16780
|
-
return Array.from(new Set(normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu) ?? [])).slice(0, 16);
|
|
17127
|
+
// src/source-ingest.ts
|
|
17128
|
+
/**
 * Hash a text value with SHA-256 and return it in `sha256:<hex>` form.
 *
 * @param {string} text - Content to fingerprint.
 * @returns {string} The hex digest prefixed with `sha256:`.
 */
function sha256Text(text) {
  const hasher = createHash6("sha256");
  hasher.update(text);
  const hex = hasher.digest("hex");
  return `sha256:${hex}`;
}
|
|
16782
|
-
function
|
|
16783
|
-
return [
|
|
17131
|
+
/**
 * Reduce an HTML document to readable plain text.
 *
 * Steps, in order:
 *   1. drop `<script>` and `<style>` elements together with their content;
 *   2. replace every remaining tag with a space;
 *   3. decode the four entities handled here (`&nbsp;`, `&lt;`, `&gt;`, `&amp;`);
 *   4. normalise whitespace and trim.
 *
 * `&amp;` is decoded LAST on purpose: decoding it first would turn an escaped
 * entity such as `&amp;lt;` into `&lt;` and then into `<` (double unescaping).
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Plain text with tags removed and whitespace collapsed.
 */
function stripHtml(html) {
  const withoutMarkup = html
    .replace(/<script[\s\S]*?<\/script>/gi, " ")
    .replace(/<style[\s\S]*?<\/style>/gi, " ")
    .replace(/<[^>]+>/g, " ");
  const decoded = withoutMarkup
    // Accept both the named entity and a literal no-break space.
    .replace(/&nbsp;|\u00a0/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&");
  return decoded
    .replace(/\s+\n/g, "\n")
    .replace(/\n\s+/g, "\n")
    .replace(/[ \t]{2,}/g, " ")
    .trim();
}
|
|
16785
|
-
function
|
|
16786
|
-
|
|
16787
|
-
|
|
16788
|
-
const
|
|
16789
|
-
|
|
16790
|
-
|
|
17136
|
+
/**
 * Download an S3 object and return its body as a string.
 *
 * The bucket is taken from the URI host and the key from the URI path
 * (leading slashes removed, percent-decoding applied). When the configured
 * storage is S3 and targets the same bucket, its region, profile and
 * max_attempts settings are passed to the client; otherwise they are left
 * undefined.
 *
 * @param {string} uri - Source URI in `s3://bucket/key` form.
 * @param {object} [config2] - Knowledge configuration (optional).
 * @param {object} [safetyPolicy] - When provided, checked via assertS3ReadAllowed.
 * @returns {Promise<string>} Object body, or "" when the response has no body.
 * @throws {Error} When the URI has no bucket or no key.
 */
async function readS3Text3(uri, config2, safetyPolicy) {
  const location = new URL(uri);
  const bucket = location.hostname;
  const key = decodeURIComponent(location.pathname.replace(/^\/+/, ""));
  if (!(bucket && key)) {
    throw new Error(`Invalid S3 source URI: ${uri}`);
  }
  if (safetyPolicy) {
    assertS3ReadAllowed(uri, safetyPolicy);
  }
  // The AWS SDK is loaded lazily so that non-S3 code paths never import it.
  const [s3Module, credentialModule] = await Promise.all([
    import("@aws-sdk/client-s3"),
    import("@aws-sdk/credential-providers")
  ]);
  const { S3Client, GetObjectCommand } = s3Module;
  const { fromIni } = credentialModule;
  let s3Config;
  if (config2?.storage.type === "s3" && config2.storage.s3?.bucket === bucket) {
    s3Config = config2.storage.s3;
  }
  const clientOptions = {
    region: s3Config?.region,
    credentials: s3Config?.profile ? fromIni({ profile: s3Config.profile }) : undefined,
    maxAttempts: s3Config?.max_attempts
  };
  const client = new S3Client(clientOptions);
  const response = await client.send(new GetObjectCommand({ Bucket: bucket, Key: key }));
  return response.Body ? await response.Body.transformToString() : "";
}
|
|
16792
|
-
function
|
|
16793
|
-
if (
|
|
16794
|
-
|
|
16795
|
-
|
|
16796
|
-
|
|
16797
|
-
|
|
16798
|
-
|
|
16799
|
-
|
|
17159
|
+
/**
 * Fetch a web resource and return its text together with the reported
 * content type. Responses whose content type mentions "html" are passed
 * through stripHtml; everything else is returned verbatim.
 *
 * @param {string} uri - URL to fetch.
 * @param {object} [safetyPolicy] - When provided, checked via assertWebSearchAllowed.
 * @returns {Promise<{text: string, mime: (string|null)}>} Text and content-type header.
 * @throws {Error} When the response status is not OK.
 */
async function readWebText(uri, safetyPolicy) {
  if (safetyPolicy) {
    assertWebSearchAllowed(safetyPolicy);
  }
  const requestHeaders = {
    accept: "text/markdown,text/plain,text/html,application/json;q=0.8,*/*;q=0.5",
    "user-agent": "@hasna/knowledge source-ingest"
  };
  const response = await fetch(uri, { headers: requestHeaders });
  if (!response.ok) {
    throw new Error(`Web source read failed ${response.status}: ${uri}`);
  }
  const mime = response.headers.get("content-type");
  const body = await response.text();
  const looksLikeHtml = mime?.includes("html");
  const text = looksLikeHtml ? stripHtml(body) : body;
  return { text, mime };
}
|
|
16801
|
-
function
|
|
16802
|
-
if (
|
|
16803
|
-
return
|
|
16804
|
-
if ("
|
|
16805
|
-
return
|
|
16806
|
-
if ("
|
|
16807
|
-
return
|
|
16808
|
-
return
|
|
17174
|
+
/**
 * Derive a display title for a parsed source reference.
 *
 * - "file": base name of the path
 * - "s3": base name of the object key
 * - "web": base name of the URL path, or the full URL when that is empty
 * - anything else: base name of `path` when present, otherwise `id`
 *
 * @param {object} parsed - Parsed source reference (reads kind, path, key, url, id).
 * @returns {string} Title for the reference.
 */
function titleForRef(parsed) {
  switch (parsed.kind) {
    case "file":
      return basename3(parsed.path);
    case "s3":
      return basename3(parsed.key);
    case "web": {
      const lastSegment = basename3(new URL(parsed.url).pathname);
      return lastSegment || parsed.url;
    }
    default:
      return parsed.path ? basename3(parsed.path) : parsed.id;
  }
}
|
|
16810
|
-
function
|
|
16811
|
-
if (
|
|
16812
|
-
|
|
16813
|
-
|
|
16814
|
-
|
|
16815
|
-
|
|
16816
|
-
|
|
16817
|
-
|
|
16818
|
-
|
|
16819
|
-
|
|
17183
|
+
/**
 * Read the text of a directly addressable source (local file, S3 object or
 * web page) and describe it in the shape expected by the ingest pipeline.
 *
 * @param {object} parsed - Parsed source reference (kind "file", "s3" or "web").
 * @param {object} [config2] - Knowledge configuration, forwarded to the S3 reader.
 * @param {object} [safetyPolicy] - Safety policy, forwarded to the S3 and web readers.
 * @returns {Promise<object>} Text plus contentSource, title, mime, size, hash,
 *   revision, extractedTextRef, metadata and permissions.
 * @throws {Error} When a local file does not exist or the kind is not supported.
 */
async function readDirectSourceText(parsed, config2, safetyPolicy) {
  // All three kinds produce the same record shape; only these four inputs vary.
  const describe = (text, contentSource, mime, metadata) => ({
    text,
    contentSource,
    title: titleForRef(parsed),
    mime,
    size: text.length,
    hash: sha256Text(text),
    revision: null,
    extractedTextRef: null,
    metadata,
    permissions: { mode: "read_only" }
  });
  switch (parsed.kind) {
    case "file": {
      if (!existsSync6(parsed.path)) {
        throw new Error(`Source file not found: ${parsed.path}`);
      }
      const fileText = readFileSync6(parsed.path, "utf8");
      return describe(fileText, "file", "text/plain", { path: parsed.path });
    }
    case "s3": {
      const objectText = await readS3Text3(parsed.uri, config2, safetyPolicy);
      return describe(objectText, "s3", "text/plain", { bucket: parsed.bucket, key: parsed.key });
    }
    case "web": {
      const page = await readWebText(parsed.url, safetyPolicy);
      return describe(page.text, "web", page.mime, { url: parsed.url });
    }
    default:
      throw new Error(`Direct source reading is not available for ${parsed.uri}`);
  }
}
|
|
16821
|
-
function
|
|
16822
|
-
if (
|
|
16823
|
-
|
|
16824
|
-
|
|
16825
|
-
|
|
16826
|
-
|
|
16827
|
-
|
|
16828
|
-
return 0.35;
|
|
17233
|
+
/**
 * Load the text behind an extracted-text reference.
 *
 * `open-files://` references are rejected because they cannot be resolved
 * here; every other URI is parsed and read as a direct source.
 *
 * @param {string} uri - Extracted-text reference URI.
 * @param {object} [config2] - Knowledge configuration, forwarded to the reader.
 * @param {object} [safetyPolicy] - Safety policy, forwarded to the reader.
 * @returns {Promise<{text: string, contentSource: string}>} The text, tagged
 *   with contentSource "extracted_text_ref".
 * @throws {Error} When the URI uses the `open-files://` scheme.
 */
async function readTextRef(uri, config2, safetyPolicy) {
  const isOpenFilesRef = uri.startsWith("open-files://");
  if (isOpenFilesRef) {
    throw new Error("Open-files extracted text refs require an open-files resolver API. Ingest an open-files manifest with extracted_text or an extracted_text_ref using file://, s3://, or https://.");
  }
  const { text } = await readDirectSourceText(parseSourceRef(uri), config2, safetyPolicy);
  return { text, contentSource: "extracted_text_ref" };
}
|
|
16830
|
-
function
|
|
16831
|
-
|
|
16832
|
-
|
|
16833
|
-
|
|
16834
|
-
|
|
16835
|
-
|
|
16836
|
-
|
|
16837
|
-
|
|
17241
|
+
/**
 * Obtain the text of an open-files source from the local knowledge catalog.
 *
 * When the resolved revision points at an extracted-text URI and the catalog
 * reports no text available, the text is read from that URI. Otherwise the
 * returned catalog chunks are joined with a blank line between them.
 *
 * NOTE(review): the resolver is called with a fixed limit of 100, so a source
 * with more chunks than that presumably yields only part of its text here —
 * confirm against resolveOpenFilesSource.
 *
 * @param {object} options - Reads dbPath, sourceRef, purpose, safetyPolicy, now, config.
 * @returns {Promise<object>} Text plus contentSource, title, mime, size, hash,
 *   revision, extractedTextRef, metadata and permissions.
 * @throws {Error} When the source is not resolved or has no text chunks.
 */
async function readOpenFilesSourceText(options) {
  const resolved = await resolveOpenFilesSource({
    dbPath: options.dbPath,
    sourceRef: options.sourceRef,
    purpose: options.purpose ?? "knowledge_index",
    limit: 100,
    safetyPolicy: options.safetyPolicy,
    now: options.now
  });
  if (!resolved.resolved) {
    throw new Error("Open-files source is not in the local knowledge catalog. Ingest an open-files manifest first or use the open-files resolver API.");
  }
  const { revision, content, source, chunks } = resolved;
  // Both outcomes share everything except the text origin and revision fields.
  const describe = (text, contentSource, revisionLabel, extractedTextRef) => ({
    text,
    contentSource,
    title: source?.title ?? null,
    mime: content.mime,
    size: text.length,
    hash: revision?.hash ?? sha256Text(text),
    revision: revisionLabel,
    extractedTextRef,
    metadata: source?.metadata ?? {},
    permissions: source?.permissions ?? { mode: "read_only" }
  });
  const textRefUri = revision?.extracted_text_uri;
  if (textRefUri && !content.text_available) {
    const textRef = await readTextRef(textRefUri, options.config, options.safetyPolicy);
    return describe(textRef.text, textRef.contentSource, revision.revision, textRefUri);
  }
  if (chunks.length === 0) {
    throw new Error("Open-files source has no extracted text chunks yet. Ingest an open-files manifest with extracted_text or extracted_text_ref first.");
  }
  const joined = chunks.map((chunk) => chunk.text).join("\n\n");
  return describe(joined, "catalog_chunks", revision?.revision ?? null, revision?.extracted_text_uri ?? null);
}
|
|
16839
|
-
function
|
|
16840
|
-
const
|
|
16841
|
-
|
|
16842
|
-
|
|
16843
|
-
|
|
16844
|
-
|
|
16845
|
-
|
|
17287
|
+
/**
 * Build the manifest item that represents one resolved source.
 *
 * Missing fields fall back as follows: hash → SHA-256 of the text, name →
 * title derived from the reference, mime → "text/plain", size → text length,
 * revision → the hash. Permissions from the resolved source are spread last
 * and therefore override the read-only defaults.
 *
 * @param {string} sourceRef - Source reference as given by the caller.
 * @param {object} parsed - Parsed form of the reference.
 * @param {object} resolved - Resolved text and descriptive fields.
 * @param {string} purpose - Purpose recorded in `permissions.allowed_purposes`.
 * @returns {object} Manifest item, with locator fields (file_id, source_id,
 *   path, url) set according to the reference kind.
 */
function manifestItemForSource(sourceRef, parsed, resolved, purpose) {
  const contentHash = resolved.hash ?? sha256Text(resolved.text);
  const item = {
    source_ref: sourceRef,
    name: resolved.title ?? titleForRef(parsed),
    mime: resolved.mime ?? "text/plain",
    size: resolved.size ?? resolved.text.length,
    hash: contentHash,
    revision: resolved.revision ?? contentHash,
    status: "active",
    updated_at: new Date().toISOString(),
    permissions: {
      mode: "read_only",
      allowed_purposes: [purpose],
      ...resolved.permissions
    },
    metadata: {
      ...resolved.metadata,
      source_ref: sourceRef,
      content_source: resolved.contentSource,
      read_only: true
    },
    extracted_text_ref: resolved.extractedTextRef,
    extracted_text: resolved.text
  };
  switch (parsed.kind) {
    case "open-files":
      if (parsed.entity === "file") {
        item.file_id = parsed.id;
      }
      if (parsed.entity === "source") {
        item.source_id = parsed.id;
        item.path = parsed.path;
      }
      break;
    case "file":
      item.path = parsed.path;
      break;
    case "s3":
      item.path = parsed.key;
      break;
    case "web":
      item.url = parsed.url;
      break;
    default:
      break;
  }
  return item;
}
|
|
17329
|
+
/**
 * Ingest a single source reference into the knowledge database.
 *
 * The reference is parsed, its text is resolved (via the local catalog for
 * open-files references, directly otherwise), converted to a manifest item
 * and handed to the manifest ingester.
 *
 * @param {object} options - Reads sourceRef, dbPath, purpose, config, safetyPolicy, now.
 * @returns {Promise<object>} The ingest result extended with source_ref,
 *   content_source, read_only and hash.
 */
async function ingestSourceRef(options) {
  const { sourceRef, dbPath, safetyPolicy } = options;
  const purpose = options.purpose ?? "knowledge_index";
  const parsed = parseSourceRef(sourceRef);
  let resolved;
  if (parsed.kind === "open-files") {
    resolved = await readOpenFilesSourceText(options);
  } else {
    resolved = await readDirectSourceText(parsed, options.config, safetyPolicy);
  }
  const item = manifestItemForSource(sourceRef, parsed, resolved, purpose);
  const ingestResult = await ingestOpenFilesManifestItems({
    dbPath,
    items: [item],
    sourceLabel: sourceRef,
    readAction: "source_ref_ingest_read",
    safetyPolicy,
    now: options.now
  });
  return {
    ...ingestResult,
    source_ref: sourceRef,
    content_source: resolved.contentSource,
    read_only: true,
    hash: String(item.hash)
  };
}
|
|
16865
|
-
|
|
16866
|
-
|
|
16867
|
-
|
|
16868
|
-
|
|
16869
|
-
|
|
16870
|
-
return normalized.length <= maxChars ? normalized : `${normalized.slice(0, Math.max(0, maxChars - 1)).trim()}...`;
|
|
17350
|
+
|
|
17351
|
+
// src/web-search.ts
|
|
17352
|
+
import { createHash as createHash7, randomUUID as randomUUID6 } from "crypto";
|
|
17353
|
+
/**
 * Produce a deterministic `sha256:<hex>` fingerprint for a value.
 *
 * @param {string} value - Content to hash.
 * @returns {string} The hex digest prefixed with `sha256:`.
 */
function stableHash(value) {
  const digest = createHash7("sha256").update(value).digest("hex");
  return `sha256:${digest}`;
}
|
|
16872
|
-
function
|
|
16873
|
-
const
|
|
16874
|
-
return
|
|
16875
|
-
id,
|
|
16876
|
-
result_id: result.id,
|
|
16877
|
-
kind: result.kind,
|
|
16878
|
-
source_uri: result.source?.uri ?? null,
|
|
16879
|
-
source_ref: result.source?.ref ?? null,
|
|
16880
|
-
artifact_uri: result.artifact?.uri ?? null,
|
|
16881
|
-
artifact_path: result.artifact?.path ?? null,
|
|
16882
|
-
revision: result.source?.revision ?? null,
|
|
16883
|
-
hash: result.source?.hash ?? result.artifact?.hash ?? null,
|
|
16884
|
-
chunk_id: result.citation?.chunk_id ?? null,
|
|
16885
|
-
start_offset: result.citation?.start_offset ?? null,
|
|
16886
|
-
end_offset: result.citation?.end_offset ?? null,
|
|
16887
|
-
quote: quoteFor(result, 500),
|
|
16888
|
-
provenance: result.provenance
|
|
16889
|
-
};
|
|
17356
|
+
/**
 * Roughly estimate the token count of a text: 1.25 tokens per
 * whitespace-separated word, rounded up, and never less than 1.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (at least 1).
 */
function estimateTokens2(text) {
  const wordCount = (text.match(/\S+/g) ?? []).length;
  const estimate = Math.ceil(wordCount * 1.25);
  return estimate < 1 ? 1 : estimate;
}
|
|
16891
|
-
function
|
|
16892
|
-
|
|
16893
|
-
|
|
17360
|
+
/**
 * Treat a value as a plain record: non-array objects are returned unchanged,
 * everything else (null, primitives, arrays) becomes a new empty object.
 *
 * @param {*} value - Any value.
 * @returns {object} The value itself or `{}`.
 */
function asRecord(value) {
  if (!value || typeof value !== "object" || Array.isArray(value)) {
    return {};
  }
  return value;
}
|
|
17363
|
+
/**
 * Return the value when it is a non-empty string, otherwise null.
 *
 * @param {*} value - Any value.
 * @returns {(string|null)} The string, or null for empty strings and non-strings.
 */
function asString3(value) {
  if (typeof value !== "string") {
    return null;
  }
  return value.length > 0 ? value : null;
}
|
|
17366
|
+
/**
 * Interpret a provider result entry as a web source.
 *
 * The URL is the first non-empty string among `url`, `uri` and `sourceUrl`;
 * without one the entry is not a source. The title comes from `title` or
 * `name`, the snippet from `snippet`, `text` or `description`.
 *
 * @param {*} value - Candidate entry from a provider response.
 * @returns {(object|null)} `{ url, title, snippet, provider_metadata }`, or
 *   null when no URL is present.
 */
function sourceFromRecord(value) {
  const record2 = asRecord(value);
  // First non-empty string among the given keys, checked in order.
  const firstString = (...keys) => {
    for (const key of keys) {
      const candidate = asString3(record2[key]);
      if (candidate !== null) {
        return candidate;
      }
    }
    return null;
  };
  const url2 = firstString("url", "uri", "sourceUrl");
  if (url2 === null) {
    return null;
  }
  return {
    url: url2,
    title: firstString("title", "name"),
    snippet: firstString("snippet", "text", "description"),
    provider_metadata: record2
  };
}
|
|
16904
|
-
function
|
|
16905
|
-
|
|
17378
|
+
/**
 * Walk a provider response and gather every web source it contains.
 *
 * Arrays are traversed element by element. For any other value, the value
 * itself is recorded when it is a source, and the nested `sources`, `results`,
 * `citations`, `annotations` and `output` properties are then searched.
 * Sources are keyed by URL, so a later entry replaces an earlier one with
 * the same URL.
 *
 * @param {*} value - Response fragment to inspect.
 * @param {Map<string, object>} output - Map that receives sources keyed by URL.
 * @returns {void}
 */
function collectSources(value, output) {
  if (Array.isArray(value)) {
    value.forEach((entry) => collectSources(entry, output));
    return;
  }
  const found = sourceFromRecord(value);
  if (found) {
    output.set(found.url, found);
  }
  const container = asRecord(value);
  const nestedKeys = ["sources", "results", "citations", "annotations", "output"];
  for (const key of nestedKeys) {
    if (!container[key]) {
      continue;
    }
    collectSources(container[key], output);
  }
}
|
|
17393
|
+
/**
 * Produce deterministic placeholder web sources for fake (offline) searches.
 *
 * @param {string} query - Query text, echoed in each snippet.
 * @param {number} limit - Requested number of sources; at most 3 are returned.
 * @returns {object[]} Fixture sources ranked from 1 upward.
 */
function fakeSources(query, limit) {
  const count = Math.min(limit, 3);
  const fixtures = [];
  for (let rank = 1; rank <= count; rank += 1) {
    fixtures.push({
      url: `https://example.com/knowledge-web-${rank}`,
      title: `Fake web source ${rank}`,
      snippet: `Deterministic web-search fixture for "${query}"`,
      provider_metadata: { fake: true, rank }
    });
  }
  return fixtures;
}
|
|
16907
|
-
function
|
|
16908
|
-
const
|
|
16909
|
-
const
|
|
16910
|
-
const
|
|
16911
|
-
const
|
|
16912
|
-
|
|
16913
|
-
|
|
16914
|
-
|
|
16915
|
-
|
|
16916
|
-
|
|
16917
|
-
|
|
16918
|
-
|
|
16919
|
-
|
|
16920
|
-
|
|
16921
|
-
|
|
16922
|
-
|
|
16923
|
-
|
|
16924
|
-
|
|
16925
|
-
|
|
16926
|
-
|
|
16927
|
-
|
|
16928
|
-
|
|
16929
|
-
|
|
16930
|
-
|
|
16931
|
-
|
|
16932
|
-
|
|
17401
|
+
/**
 * Run a web search through the OpenAI provider's web search tool.
 *
 * The tool is configured with external web access and a medium search
 * context; when domains are given they are passed as allowedDomains. The
 * model is forced to call the tool via toolChoice.
 *
 * @param {object} input - Reads config, env, model, query and domains.
 * @returns {Promise<object>} The result of the AI SDK generateText call.
 * @throws {Error} When the provider does not expose `tools.webSearch`.
 */
async function openAiWebSearch(input) {
  const { generateText } = await import("ai");
  const { createOpenAI } = await import("@ai-sdk/openai");
  const settings = providerSettings(input.config, "openai");
  const openai = createOpenAI({
    apiKey: input.env[settings.api_key_env],
    baseURL: settings.base_url
  });
  const webSearch = openai.tools?.webSearch;
  if (!webSearch) {
    throw new Error("OpenAI provider does not expose tools.webSearch.");
  }
  const searchOptions = {
    externalWebAccess: true,
    searchContextSize: "medium"
  };
  if (input.domains.length > 0) {
    searchOptions.allowedDomains = input.domains;
  }
  return generateText({
    model: openai(input.model),
    prompt: input.query,
    tools: { web_search: webSearch(searchOptions) },
    toolChoice: { type: "tool", toolName: "web_search" }
  });
}
|
|
17425
|
+
/**
 * Run a web search through the Anthropic provider's web search tool.
 *
 * The `webSearch_20250305` tool factory is preferred, with `webSearch` as the
 * fallback. The tool receives maxUses and, when domains are given,
 * allowedDomains.
 *
 * @param {object} input - Reads config, env, model, query, maxUses and domains.
 * @returns {Promise<object>} The result of the AI SDK generateText call.
 * @throws {Error} When the provider exposes no web search tool.
 */
async function anthropicWebSearch(input) {
  const { generateText } = await import("ai");
  const { createAnthropic } = await import("@ai-sdk/anthropic");
  const settings = providerSettings(input.config, "anthropic");
  const anthropic = createAnthropic({
    apiKey: input.env[settings.api_key_env],
    baseURL: settings.base_url
  });
  const factory = anthropic.tools?.webSearch_20250305 ?? anthropic.tools?.webSearch;
  if (!factory) {
    throw new Error("Anthropic provider does not expose a web search tool.");
  }
  const searchOptions = { maxUses: input.maxUses };
  if (input.domains.length > 0) {
    searchOptions.allowedDomains = input.domains;
  }
  return generateText({
    model: anthropic(input.model),
    prompt: input.query,
    tools: { web_search: factory(searchOptions) }
  });
}
|
|
17447
|
+
/**
 * Optionally file web search sources into the knowledge database.
 *
 * Nothing is filed unless `options.fileResults` is truthy and at least one
 * source exists. Each source becomes a manifest item whose text is its title,
 * snippet and URL (empty parts skipped) joined by newlines; the hash of that
 * text doubles as the revision.
 *
 * @param {object} options - Reads fileResults, dbPath, query and safetyPolicy.
 * @param {object[]} sources - Sources with url, title, snippet, provider_metadata.
 * @param {string} now - Timestamp string used for `updated_at` and the ingest clock.
 * @returns {Promise<number>} `sources_upserted` from the ingester, or 0 when
 *   nothing was filed.
 */
async function fileWebSources(options, sources, now) {
  const shouldFile = Boolean(options.fileResults) && sources.length > 0;
  if (!shouldFile) {
    return 0;
  }
  const toManifestItem = (source) => {
    const parts = [source.title, source.snippet, source.url];
    const text = parts.filter((part) => Boolean(part)).join("\n");
    const fingerprint = stableHash(text);
    return {
      source_ref: source.url,
      name: source.title ?? source.url,
      url: source.url,
      mime: "text/plain",
      hash: fingerprint,
      revision: fingerprint,
      status: "active",
      updated_at: now,
      permissions: { mode: "read_only", allowed_purposes: ["knowledge_answer", "knowledge_index"] },
      metadata: {
        source_ref: source.url,
        content_source: "provider_web_search",
        provider_metadata: source.provider_metadata
      },
      extracted_text: text
    };
  };
  const items = sources.map(toManifestItem);
  const { sources_upserted: upserted } = await ingestOpenFilesManifestItems({
    dbPath: options.dbPath,
    items,
    sourceLabel: `web-search:${options.query}`,
    readAction: "provider_web_search_file_results",
    safetyPolicy: options.safetyPolicy,
    now: new Date(now)
  });
  return upserted;
}
|
|
17482
|
+
/**
 * Execute a provider-native web search and record it as a run.
 *
 * Flow:
 *   1. validate the query and normalise limits (limit 1..20, maxUses 1..10);
 *   2. resolve provider and model from modelRef / provider / defaults;
 *   3. for real (non-fake) searches: check the safety policy, require an
 *      openai or anthropic provider, and assert provider credentials;
 *   4. insert a "running" run row and an audit event;
 *   5. perform the search (or build fake fixtures) and collect sources;
 *   6. optionally file the sources, then mark the run "completed", add a
 *      run event and record provider usage.
 *
 * NOTE(review): if the provider call or filing throws, nothing here updates
 * the run row, so it presumably stays in "running" — confirm whether a
 * failure status should be written.
 *
 * @param {object} options - Reads query, env, now, limit, maxUses, domains,
 *   modelRef, provider, config, fake, safetyPolicy, dbPath (and fileResults
 *   via fileWebSources).
 * @returns {Promise<object>} run_id, query, provider, model, answer, sources,
 *   filed_sources, usage and warnings.
 * @throws {Error} When the query is blank or the provider has no native web search.
 */
async function runProviderWebSearch(options) {
  const query = options.query.trim();
  if (query.length === 0) {
    throw new Error("Web search query is required.");
  }
  const env = options.env ?? process.env;
  const now = (options.now ?? new Date()).toISOString();
  const limit = Math.max(1, Math.min(options.limit ?? 5, 20));
  const maxUses = Math.max(1, Math.min(options.maxUses ?? 3, 10));
  const domains = options.domains ?? [];
  const requestedRef = options.modelRef ?? (options.provider ? `${options.provider}:${providerSettings(options.config, options.provider).default_model}` : "default");
  const modelRef = resolveModelRef(requestedRef, options.config);
  const parsed = parseModelRef(modelRef);
  const provider = options.provider ?? parsed.provider;
  const model = parsed.provider === provider ? parsed.model : providerSettings(options.config, provider).default_model;
  const runId = `run_${randomUUID6()}`;
  const isLive = !options.fake;
  const fakeFlag = options.fake === true;
  if (isLive) {
    if (options.safetyPolicy) {
      assertWebSearchAllowed(options.safetyPolicy);
    }
    if (provider !== "openai" && provider !== "anthropic") {
      throw new Error(`Provider ${provider} does not expose native web search yet.`);
    }
    assertProviderCredentials(provider, options.config, env);
  }
  migrateKnowledgeDb(options.dbPath);
  // Phase 1: register the run before any provider call is made.
  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(`INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`, [
      runId,
      "provider-web-search",
      query,
      "running",
      provider,
      model,
      JSON.stringify({ domains, max_uses: maxUses, fake: fakeFlag }),
      now,
      now
    ]);
    recordAuditEvent(db, {
      event_type: "source_read",
      action: options.fake ? "fake_provider_web_search" : "provider_web_search",
      target_uri: query,
      decision: "allow",
      metadata: { provider, model, domains, max_uses: maxUses },
      created_at: now
    });
  } finally {
    db.close();
  }
  // Phase 2: obtain the answer, sources and usage.
  let answer = "";
  let sources = [];
  let usage = { input_tokens: estimateTokens2(query), output_tokens: 0, cost_usd: 0 };
  const warnings = [];
  if (isLive) {
    const searchInput = { query, model, config: options.config, env, maxUses, domains };
    const result = provider === "openai" ? await openAiWebSearch(searchInput) : await anthropicWebSearch(searchInput);
    answer = result.text;
    const collected = new Map();
    collectSources(result.sources, collected);
    collectSources(result.toolResults, collected);
    sources = Array.from(collected.values()).slice(0, limit);
    const normalized = normalizeAiSdkUsage({
      provider,
      model,
      usage: result.usage,
      providerMetadata: result.providerMetadata
    });
    usage = {
      input_tokens: normalized.input_tokens,
      output_tokens: normalized.output_tokens,
      cost_usd: normalized.cost_usd
    };
  } else {
    sources = fakeSources(query, limit);
    answer = `Fake web search answer for: ${query}`;
    usage.output_tokens = estimateTokens2(answer);
  }
  // Phase 3: file sources (optional) and close out the run.
  const filedSources = await fileWebSources(options, sources, now);
  const sourceCount = sources.length;
  const writeDb = openKnowledgeDb(options.dbPath);
  try {
    writeDb.run(`UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`, [
      "completed",
      JSON.stringify({ domains, max_uses: maxUses, sources: sourceCount, filed_sources: filedSources, fake: fakeFlag }),
      now,
      runId
    ]);
    writeDb.run(`INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
       VALUES (?, ?, ?, ?, ?, ?)`, [
      `evt_${randomUUID6()}`,
      runId,
      "info",
      "provider_web_search_completed",
      JSON.stringify({ sources: sourceCount, filed_sources: filedSources }),
      now
    ]);
    recordProviderUsage(writeDb, {
      run_id: runId,
      provider,
      model,
      input_tokens: usage.input_tokens,
      output_tokens: usage.output_tokens,
      cost_usd: usage.cost_usd,
      metadata: { web_search: true, sources: sourceCount, filed_sources: filedSources },
      created_at: now
    });
  } finally {
    writeDb.close();
  }
  if (sourceCount === 0) {
    warnings.push("no_web_sources_returned");
  }
  return {
    run_id: runId,
    query,
    provider,
    model,
    answer,
    sources,
    filed_sources: filedSources,
    usage,
    warnings
  };
}
|
|
16987
17601
|
|
|
16988
17602
|
// src/storage-contract.ts
|
|
16989
|
-
import { createHash as
|
|
17603
|
+
import { createHash as createHash8, randomUUID as randomUUID7 } from "crypto";
|
|
16990
17604
|
var GENERATED_ARTIFACTS = [
|
|
16991
17605
|
{
|
|
16992
17606
|
kind: "schema",
|
|
@@ -17022,7 +17636,7 @@ var GENERATED_ARTIFACTS = [
|
|
|
17022
17636
|
/**
 * Compute the content hash and byte size of an artifact body.
 *
 * @param {(string|Uint8Array|ArrayBuffer|Buffer)} body - Artifact content;
 *   converted with `Buffer.from` whether or not it is a string.
 * @returns {{hash: string, size_bytes: number}} `sha256:<hex>` digest and
 *   byte length of the body.
 */
function hashArtifactBody(body) {
  const bytes = Buffer.from(body);
  const digest = createHash8("sha256").update(bytes).digest("hex");
  return {
    hash: `sha256:${digest}`,
    size_bytes: bytes.byteLength
  };
}
|
|
@@ -17147,7 +17761,7 @@ function recordStorageObjects(db, objects, now = new Date) {
|
|
|
17147
17761
|
`);
|
|
17148
17762
|
const insert = db.transaction((entries) => {
|
|
17149
17763
|
for (const entry of entries) {
|
|
17150
|
-
statement.run(
|
|
17764
|
+
statement.run(randomUUID7(), entry.uri, entry.kind, entry.content_type ?? null, entry.hash ?? null, entry.size_bytes ?? null, JSON.stringify({
|
|
17151
17765
|
key: entry.key,
|
|
17152
17766
|
...entry.metadata ?? {}
|
|
17153
17767
|
}), timestamp, timestamp);
|
|
@@ -17157,7 +17771,7 @@ function recordStorageObjects(db, objects, now = new Date) {
|
|
|
17157
17771
|
}
|
|
17158
17772
|
|
|
17159
17773
|
// src/wiki-layout.ts
|
|
17160
|
-
import { createHash as
|
|
17774
|
+
import { createHash as createHash9 } from "crypto";
|
|
17161
17775
|
function todayParts(now) {
|
|
17162
17776
|
const year = String(now.getUTCFullYear());
|
|
17163
17777
|
const month = String(now.getUTCMonth() + 1).padStart(2, "0");
|
|
@@ -17165,7 +17779,7 @@ function todayParts(now) {
|
|
|
17165
17779
|
return { year, month, day };
|
|
17166
17780
|
}
|
|
17167
17781
|
/**
 * Derive a deterministic identifier from a value: the prefix, an underscore,
 * and the first 20 hex characters of the value's SHA-256 digest.
 *
 * @param {string} prefix - Identifier prefix.
 * @param {string} value - Value the identifier is derived from.
 * @returns {string} Identifier of the form `<prefix>_<20 hex chars>`.
 */
function stableId5(prefix, value) {
  const digest = createHash9("sha256").update(value).digest("hex");
  const shortDigest = digest.slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
|
|
17170
17784
|
function estimateTokenCount2(text) {
|
|
17171
17785
|
const words = text.trim().split(/\s+/).filter(Boolean).length;
|
|
@@ -17522,6 +18136,23 @@ class KnowledgeService {
|
|
|
17522
18136
|
config: this.config()
|
|
17523
18137
|
});
|
|
17524
18138
|
}
|
|
18139
|
+
async runPrompt(options) {
|
|
18140
|
+
const workspace = this.ensureWorkspace();
|
|
18141
|
+
return runKnowledgePrompt({
|
|
18142
|
+
...options,
|
|
18143
|
+
dbPath: workspace.knowledgeDbPath,
|
|
18144
|
+
config: this.config()
|
|
18145
|
+
});
|
|
18146
|
+
}
|
|
18147
|
+
async webSearch(options) {
|
|
18148
|
+
const workspace = this.ensureWorkspace();
|
|
18149
|
+
return runProviderWebSearch({
|
|
18150
|
+
...options,
|
|
18151
|
+
dbPath: workspace.knowledgeDbPath,
|
|
18152
|
+
config: this.config(),
|
|
18153
|
+
safetyPolicy: this.safetyPolicy()
|
|
18154
|
+
});
|
|
18155
|
+
}
|
|
17525
18156
|
}
|
|
17526
18157
|
function createKnowledgeService(options = {}) {
|
|
17527
18158
|
return new KnowledgeService(options);
|
|
@@ -17703,6 +18334,41 @@ function buildServer() {
|
|
|
17703
18334
|
return errorText(error48 instanceof Error ? error48.message : String(error48));
|
|
17704
18335
|
}
|
|
17705
18336
|
});
|
|
18337
|
+
registerTool(server, "knowledge_ask", "Knowledge prompt answer", "Answer a prompt using read-only knowledge context and optional AI SDK generation", {
|
|
18338
|
+
scope: scopeField,
|
|
18339
|
+
prompt: exports_external.string().describe("Prompt to answer with the knowledge base"),
|
|
18340
|
+
limit: exports_external.number().optional().describe("Maximum context results"),
|
|
18341
|
+
semantic: exports_external.boolean().optional().describe("Include vector semantic results"),
|
|
18342
|
+
generate: exports_external.boolean().optional().describe("Call AI SDK text generation; omitted returns a local citation draft"),
|
|
18343
|
+
approve_write: exports_external.boolean().optional().describe("Record approval intent for future durable wiki writes"),
|
|
18344
|
+
model: exports_external.string().optional().describe("Model alias/ref, default configured provider default"),
|
|
18345
|
+
dimensions: exports_external.number().optional().describe("Embedding dimensions for deterministic fake mode"),
|
|
18346
|
+
fake: exports_external.boolean().optional().describe("Use deterministic fake embeddings/generation for local tests")
|
|
18347
|
+
}, async ({ scope, prompt, limit, semantic, generate, approve_write, model, dimensions, fake }) => {
|
|
18348
|
+
const service = createKnowledgeService({ scope });
|
|
18349
|
+
try {
|
|
18350
|
+
return jsonText({ ok: true, ...await service.runPrompt({ prompt, limit, semantic, generate, approveWrite: approve_write, modelRef: model, dimensions, fake }) });
|
|
18351
|
+
} catch (error48) {
|
|
18352
|
+
return errorText(error48 instanceof Error ? error48.message : String(error48));
|
|
18353
|
+
}
|
|
18354
|
+
});
|
|
18355
|
+
registerTool(server, "ok_web_search", "Provider web search", "Run safety-gated provider-native web search and return citations/sources", {
|
|
18356
|
+
scope: scopeField,
|
|
18357
|
+
query: exports_external.string().describe("Web search query"),
|
|
18358
|
+
limit: exports_external.number().optional().describe("Maximum sources"),
|
|
18359
|
+
provider: exports_external.enum(["openai", "anthropic", "deepseek"]).optional().describe("Provider override"),
|
|
18360
|
+
model: exports_external.string().optional().describe("Model alias/ref"),
|
|
18361
|
+
domains: exports_external.array(exports_external.string()).optional().describe("Allowed domains"),
|
|
18362
|
+
fake: exports_external.boolean().optional().describe("Use deterministic fake web results"),
|
|
18363
|
+
file_results: exports_external.boolean().optional().describe("File web snippets as web source refs")
|
|
18364
|
+
}, async ({ scope, query, limit, provider, model, domains, fake, file_results }) => {
|
|
18365
|
+
const service = createKnowledgeService({ scope });
|
|
18366
|
+
try {
|
|
18367
|
+
return jsonText({ ok: true, ...await service.webSearch({ query, limit, provider, modelRef: model, domains, fake, fileResults: file_results }) });
|
|
18368
|
+
} catch (error48) {
|
|
18369
|
+
return errorText(error48 instanceof Error ? error48.message : String(error48));
|
|
18370
|
+
}
|
|
18371
|
+
});
|
|
17706
18372
|
registerTool(server, "ok_add", "Add a knowledge item", "Add a new item to the knowledge store", {
|
|
17707
18373
|
title: exports_external.string().describe("Item title"),
|
|
17708
18374
|
content: exports_external.string().describe("Item content/body"),
|