akm-cli 0.7.4 → 0.8.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +224 -1
- package/README.md +22 -6
- package/SECURITY.md +93 -0
- package/dist/cli/config-migrate.js +144 -0
- package/dist/cli/config-validate.js +39 -0
- package/dist/cli/confirm.js +73 -0
- package/dist/cli/parse-args.js +133 -0
- package/dist/cli/shared.js +129 -0
- package/dist/cli.js +2631 -1440
- package/dist/commands/add-cli.js +279 -0
- package/dist/commands/agent-dispatch.js +110 -0
- package/dist/commands/agent-support.js +68 -0
- package/dist/commands/completions.js +3 -0
- package/dist/commands/config-cli.js +130 -534
- package/dist/commands/consolidate.js +2122 -0
- package/dist/commands/curate.js +45 -3
- package/dist/commands/db-cli.js +23 -0
- package/dist/commands/distill-promotion-policy.js +660 -0
- package/dist/commands/distill.js +1081 -73
- package/dist/commands/env.js +213 -0
- package/dist/commands/eval-cases.js +43 -0
- package/dist/commands/events.js +15 -24
- package/dist/commands/extract-cli.js +127 -0
- package/dist/commands/extract-prompt.js +204 -0
- package/dist/commands/extract.js +477 -0
- package/dist/commands/feedback-cli.js +331 -0
- package/dist/commands/graph.js +477 -0
- package/dist/commands/health.js +1302 -0
- package/dist/commands/help/help-accept.md +12 -0
- package/dist/commands/help/help-improve.md +69 -0
- package/dist/commands/help/help-proposals.md +18 -0
- package/dist/commands/help/help-propose.md +17 -0
- package/dist/commands/help/help-reject.md +11 -0
- package/dist/commands/history.js +54 -46
- package/dist/commands/improve-auto-accept.js +97 -0
- package/dist/commands/improve-cli.js +217 -0
- package/dist/commands/improve-profiles.js +166 -0
- package/dist/commands/improve-result-file.js +167 -0
- package/dist/commands/improve.js +2373 -0
- package/dist/commands/info.js +5 -2
- package/dist/commands/init.js +50 -2
- package/dist/commands/installed-stashes.js +102 -139
- package/dist/commands/knowledge.js +136 -0
- package/dist/commands/lint/agent-linter.js +49 -0
- package/dist/commands/lint/base-linter.js +479 -0
- package/dist/commands/lint/command-linter.js +49 -0
- package/dist/commands/lint/default-linter.js +16 -0
- package/dist/commands/lint/env-key-rules.js +154 -0
- package/dist/commands/lint/index.js +196 -0
- package/dist/commands/lint/knowledge-linter.js +16 -0
- package/dist/commands/lint/markdown-insertion.js +343 -0
- package/dist/commands/lint/memory-linter.js +61 -0
- package/dist/commands/lint/registry.js +36 -0
- package/dist/commands/lint/skill-linter.js +45 -0
- package/dist/commands/lint/task-linter.js +50 -0
- package/dist/commands/lint/types.js +4 -0
- package/dist/commands/lint/workflow-linter.js +56 -0
- package/dist/commands/lint.js +4 -0
- package/dist/commands/migration-help.js +3 -0
- package/dist/commands/proposal.js +67 -12
- package/dist/commands/propose.js +120 -45
- package/dist/commands/reflect.js +1104 -60
- package/dist/commands/registry-cli.js +150 -0
- package/dist/commands/registry-search.js +5 -2
- package/dist/commands/remember-cli.js +257 -0
- package/dist/commands/remember.js +70 -7
- package/dist/commands/schema-repair.js +203 -0
- package/dist/commands/search.js +115 -14
- package/dist/commands/secret.js +173 -0
- package/dist/commands/self-update.js +3 -0
- package/dist/commands/show.js +158 -60
- package/dist/commands/source-add.js +17 -45
- package/dist/commands/source-clone.js +3 -0
- package/dist/commands/source-manage.js +14 -19
- package/dist/commands/tasks.js +437 -0
- package/dist/commands/url-checker.js +42 -0
- package/dist/core/action-contributors.js +28 -0
- package/dist/core/asset-ref.js +17 -2
- package/dist/core/asset-registry.js +12 -17
- package/dist/core/asset-serialize.js +88 -0
- package/dist/core/asset-spec.js +67 -1
- package/dist/core/common.js +182 -0
- package/dist/core/concurrent.js +25 -0
- package/dist/core/config-io.js +347 -0
- package/dist/core/config-migration.js +622 -0
- package/dist/core/config-schema.js +534 -0
- package/dist/core/config-sources.js +108 -0
- package/dist/core/config-types.js +4 -0
- package/dist/core/config-walker.js +337 -0
- package/dist/core/config.js +364 -968
- package/dist/core/errors.js +42 -20
- package/dist/core/events.js +105 -135
- package/dist/core/file-lock.js +104 -0
- package/dist/core/frontmatter.js +75 -8
- package/dist/core/lesson-lint.js +3 -0
- package/dist/core/markdown.js +20 -0
- package/dist/core/memory-belief.js +62 -0
- package/dist/core/memory-contradiction-detect.js +274 -0
- package/dist/core/memory-improve.js +806 -0
- package/dist/core/parse.js +158 -0
- package/dist/core/paths.js +280 -14
- package/dist/core/proposal-quality-validators.js +380 -0
- package/dist/core/proposal-validators.js +69 -0
- package/dist/core/proposals.js +512 -42
- package/dist/core/state-db.js +1068 -0
- package/dist/core/text-truncation.js +107 -0
- package/dist/core/time.js +54 -0
- package/dist/core/tty.js +59 -0
- package/dist/core/warn.js +64 -1
- package/dist/core/write-source.js +3 -0
- package/dist/indexer/db-backup.js +391 -0
- package/dist/indexer/db-search.js +198 -489
- package/dist/indexer/db.js +990 -108
- package/dist/indexer/ensure-index.js +136 -0
- package/dist/indexer/file-context.js +3 -0
- package/dist/indexer/graph-boost.js +376 -101
- package/dist/indexer/graph-db.js +391 -0
- package/dist/indexer/graph-dedup.js +95 -0
- package/dist/indexer/graph-extraction.js +550 -114
- package/dist/indexer/index-context.js +4 -0
- package/dist/indexer/indexer.js +547 -309
- package/dist/indexer/llm-cache.js +52 -0
- package/dist/indexer/manifest.js +3 -0
- package/dist/indexer/matchers.js +167 -160
- package/dist/indexer/memory-inference.js +152 -74
- package/dist/indexer/metadata-contributors.js +29 -0
- package/dist/indexer/metadata.js +275 -196
- package/dist/indexer/path-resolver.js +92 -0
- package/dist/indexer/project-context.js +192 -0
- package/dist/indexer/ranking-contributors.js +331 -0
- package/dist/indexer/ranking.js +81 -0
- package/dist/indexer/search-fields.js +5 -9
- package/dist/indexer/search-hit-enrichers.js +111 -0
- package/dist/indexer/search-source.js +44 -10
- package/dist/indexer/semantic-status.js +6 -17
- package/dist/indexer/staleness-detect.js +447 -0
- package/dist/indexer/usage-events.js +12 -9
- package/dist/indexer/walker.js +28 -0
- package/dist/integrations/agent/builders.js +135 -0
- package/dist/integrations/agent/config.js +122 -230
- package/dist/integrations/agent/detect.js +3 -0
- package/dist/integrations/agent/index.js +7 -13
- package/dist/integrations/agent/model-aliases.js +55 -0
- package/dist/integrations/agent/profiles.js +70 -5
- package/dist/integrations/agent/prompts.js +250 -36
- package/dist/integrations/agent/runner.js +151 -0
- package/dist/integrations/agent/sdk-runner.js +126 -0
- package/dist/integrations/agent/spawn.js +183 -35
- package/dist/integrations/github.js +3 -0
- package/dist/integrations/lockfile.js +32 -69
- package/dist/integrations/session-logs/index.js +69 -0
- package/dist/integrations/session-logs/inline-refs.js +35 -0
- package/dist/integrations/session-logs/pre-filter.js +152 -0
- package/dist/integrations/session-logs/providers/claude-code.js +282 -0
- package/dist/integrations/session-logs/providers/opencode.js +258 -0
- package/dist/integrations/session-logs/types.js +4 -0
- package/dist/llm/call-ai.js +62 -0
- package/dist/llm/client.js +79 -88
- package/dist/llm/embedder.js +20 -29
- package/dist/llm/embedders/cache.js +3 -7
- package/dist/llm/embedders/local.js +42 -1
- package/dist/llm/embedders/remote.js +20 -8
- package/dist/llm/embedders/types.js +3 -7
- package/dist/llm/feature-gate.js +95 -48
- package/dist/llm/graph-extract.js +676 -72
- package/dist/llm/index-passes.js +44 -29
- package/dist/llm/memory-infer.js +80 -71
- package/dist/llm/metadata-enhance.js +42 -29
- package/dist/llm/prompts/extract-session.md +80 -0
- package/dist/llm/prompts/graph-extract-user-prompt.md +35 -0
- package/dist/output/cli-hints-full.md +292 -0
- package/dist/output/cli-hints-short.md +66 -0
- package/dist/output/cli-hints.js +7 -311
- package/dist/output/context.js +60 -8
- package/dist/output/renderers.js +306 -258
- package/dist/output/shapes/curate.js +56 -0
- package/dist/output/shapes/distill.js +10 -0
- package/dist/output/shapes/env-list.js +19 -0
- package/dist/output/shapes/events.js +11 -0
- package/dist/output/shapes/helpers.js +424 -0
- package/dist/output/shapes/history.js +7 -0
- package/dist/output/shapes/passthrough.js +102 -0
- package/dist/output/shapes/proposal-accept.js +7 -0
- package/dist/output/shapes/proposal-diff.js +7 -0
- package/dist/output/shapes/proposal-list.js +7 -0
- package/dist/output/shapes/proposal-producer.js +11 -0
- package/dist/output/shapes/proposal-reject.js +7 -0
- package/dist/output/shapes/proposal-show.js +7 -0
- package/dist/output/shapes/registry-search.js +6 -0
- package/dist/output/shapes/registry.js +30 -0
- package/dist/output/shapes/search.js +6 -0
- package/dist/output/shapes/secret-list.js +19 -0
- package/dist/output/shapes/show.js +6 -0
- package/dist/output/shapes/vault-list.js +19 -0
- package/dist/output/shapes.js +51 -511
- package/dist/output/text/add.js +6 -0
- package/dist/output/text/clone.js +6 -0
- package/dist/output/text/config.js +6 -0
- package/dist/output/text/curate.js +6 -0
- package/dist/output/text/distill.js +7 -0
- package/dist/output/text/enable-disable.js +7 -0
- package/dist/output/text/events.js +10 -0
- package/dist/output/text/feedback.js +6 -0
- package/dist/output/text/helpers.js +1039 -0
- package/dist/output/text/history.js +7 -0
- package/dist/output/text/import.js +6 -0
- package/dist/output/text/index.js +6 -0
- package/dist/output/text/info.js +6 -0
- package/dist/output/text/init.js +6 -0
- package/dist/output/text/list.js +6 -0
- package/dist/output/text/proposal-producer.js +8 -0
- package/dist/output/text/proposal.js +11 -0
- package/dist/output/text/registry-commands.js +11 -0
- package/dist/output/text/registry.js +30 -0
- package/dist/output/text/remember.js +6 -0
- package/dist/output/text/remove.js +6 -0
- package/dist/output/text/save.js +6 -0
- package/dist/output/text/search.js +6 -0
- package/dist/output/text/show.js +6 -0
- package/dist/output/text/update.js +6 -0
- package/dist/output/text/upgrade.js +6 -0
- package/dist/output/text/vault.js +16 -0
- package/dist/output/text/wiki.js +15 -0
- package/dist/output/text/workflow.js +14 -0
- package/dist/output/text.js +44 -1093
- package/dist/registry/build-index.js +3 -0
- package/dist/registry/create-provider-registry.js +3 -0
- package/dist/registry/factory.js +4 -1
- package/dist/registry/origin-resolve.js +3 -0
- package/dist/registry/providers/index.js +3 -0
- package/dist/registry/providers/skills-sh.js +71 -50
- package/dist/registry/providers/static-index.js +53 -48
- package/dist/registry/providers/types.js +3 -24
- package/dist/registry/resolve.js +11 -16
- package/dist/registry/types.js +3 -0
- package/dist/scripts/migrate-storage.js +17750 -0
- package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +9031 -0
- package/dist/scripts/migrations/v16-to-v17.js +141 -0
- package/dist/setup/detect.js +3 -0
- package/dist/setup/ripgrep-install.js +3 -0
- package/dist/setup/ripgrep-resolve.js +3 -0
- package/dist/setup/setup.js +775 -37
- package/dist/setup/steps.js +3 -15
- package/dist/sources/include.js +3 -0
- package/dist/sources/provider-factory.js +5 -12
- package/dist/sources/provider.js +3 -20
- package/dist/sources/providers/filesystem.js +19 -23
- package/dist/sources/providers/git.js +179 -20
- package/dist/sources/providers/index.js +3 -0
- package/dist/sources/providers/install-types.js +3 -13
- package/dist/sources/providers/npm.js +3 -4
- package/dist/sources/providers/provider-utils.js +3 -0
- package/dist/sources/providers/sync-from-ref.js +3 -11
- package/dist/sources/providers/tar-utils.js +3 -0
- package/dist/sources/providers/website.js +18 -22
- package/dist/sources/resolve.js +3 -0
- package/dist/sources/types.js +3 -0
- package/dist/sources/website-ingest.js +7 -0
- package/dist/tasks/backends/cron.js +203 -0
- package/dist/tasks/backends/exec-utils.js +28 -0
- package/dist/tasks/backends/index.js +24 -0
- package/dist/tasks/backends/launchd-template.xml +19 -0
- package/dist/tasks/backends/launchd.js +187 -0
- package/dist/tasks/backends/schtasks-template.xml +29 -0
- package/dist/tasks/backends/schtasks.js +215 -0
- package/dist/tasks/parser.js +211 -0
- package/dist/tasks/resolveAkmBin.js +87 -0
- package/dist/tasks/runner.js +458 -0
- package/dist/tasks/schedule.js +227 -0
- package/dist/tasks/schema.js +15 -0
- package/dist/tasks/validator.js +62 -0
- package/dist/version.js +3 -0
- package/dist/wiki/index-template.md +12 -0
- package/dist/wiki/ingest-workflow-template.md +54 -0
- package/dist/wiki/log-template.md +8 -0
- package/dist/wiki/schema-template.md +61 -0
- package/dist/wiki/wiki-templates.js +15 -0
- package/dist/wiki/wiki.js +13 -61
- package/dist/workflows/authoring.js +8 -25
- package/dist/workflows/cli.js +3 -0
- package/dist/workflows/db.js +141 -2
- package/dist/workflows/document-cache.js +3 -10
- package/dist/workflows/parser.js +3 -0
- package/dist/workflows/renderer.js +11 -3
- package/dist/workflows/runs.js +91 -89
- package/dist/workflows/schema.js +3 -0
- package/dist/workflows/scope-key.js +79 -0
- package/dist/workflows/validator.js +4 -8
- package/dist/workflows/workflow-template.md +24 -0
- package/docs/README.md +10 -2
- package/docs/data-and-telemetry.md +225 -0
- package/docs/migration/release-notes/0.7.0.md +1 -1
- package/docs/migration/release-notes/0.7.4.md +1 -1
- package/docs/migration/release-notes/0.7.5.md +20 -0
- package/docs/migration/release-notes/0.8.0.md +48 -0
- package/docs/migration/v0.7-to-v0.8.md +1307 -0
- package/package.json +29 -11
- package/dist/commands/install-audit.js +0 -381
- package/dist/commands/vault.js +0 -333
- package/dist/templates/wiki-templates.js +0 -100
|
@@ -1,133 +1,589 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
* Walks the primary stash for `memory:` and `knowledge:` assets, asks the
|
|
5
|
-
* configured LLM to extract entities and relations from each one, and
|
|
6
|
-
* persists the result to a single stash-local artifact at
|
|
7
|
-
* `<stashRoot>/.akm/graph.json`. The artifact is consumed by the search
|
|
8
|
-
* pipeline (see `src/indexer/graph-boost.ts`) as a single boost component
|
|
9
|
-
* inside the existing FTS5+boosts loop — there is NO second SearchHit
|
|
10
|
-
* scorer and no parallel ranking track.
|
|
11
|
-
*
|
|
12
|
-
* Disabling — three preconditions must ALL hold for the pass to run:
|
|
13
|
-
* 1. `akm.llm` must be configured (no provider = no extraction). When
|
|
14
|
-
* absent, `resolveIndexPassLLM("graph", config)` returns `undefined`
|
|
15
|
-
* and the pass short-circuits.
|
|
16
|
-
* 2. `llm.features.graph_extraction !== false` — the locked v1 spec §14
|
|
17
|
-
* feature-flag layer. Set to `false` to block the pass at the
|
|
18
|
-
* feature-gate layer (no network call may ever issue).
|
|
19
|
-
* 3. `index.graph.llm !== false` — the per-pass opt-out layer (#208).
|
|
20
|
-
* Set to `false` to skip just this pass while leaving other passes
|
|
21
|
-
* that share the same `llm` block enabled.
|
|
22
|
-
* Toggling any one off does NOT delete the existing `graph.json` — the
|
|
23
|
-
* user keeps the boost component they already have, it just stops
|
|
24
|
-
* refreshing.
|
|
25
|
-
*
|
|
26
|
-
* Locked v1 contract:
|
|
27
|
-
* - LLM access is exclusively via `resolveIndexPassLLM("graph", config)`.
|
|
28
|
-
* - The `graph.json` file is an indexer artifact, NOT a user-visible
|
|
29
|
-
* asset. It does not have an asset ref, does not appear in search
|
|
30
|
-
* hits, and is not addressable via `akm show`. Direct `fs.writeFile`
|
|
31
|
-
* is therefore the correct primitive — `writeAssetToSource` is
|
|
32
|
-
* reserved for asset writes (CLAUDE.md / spec §10 step 5).
|
|
33
|
-
*/
|
|
1
|
+
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
|
+
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
|
+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
34
4
|
import fs from "node:fs";
|
|
35
5
|
import path from "node:path";
|
|
6
|
+
import { TYPE_DIRS } from "../core/asset-spec";
|
|
7
|
+
import { concurrentMap } from "../core/concurrent";
|
|
8
|
+
import { getIndexPassConfig, resolveBatchSize } from "../core/config";
|
|
36
9
|
import { parseFrontmatter } from "../core/frontmatter";
|
|
37
|
-
import { warn } from "../core/warn";
|
|
38
|
-
import {
|
|
10
|
+
import { warn, warnVerbose } from "../core/warn";
|
|
11
|
+
import { isProcessEnabled } from "../llm/feature-gate";
|
|
12
|
+
import * as graphExtract from "../llm/graph-extract";
|
|
39
13
|
import { resolveIndexPassLLM } from "../llm/index-passes";
|
|
14
|
+
import { computeBodyHash, GRAPH_SCHEMA_VERSION, getLlmCacheEntriesByRefs, getLlmCacheEntry, upsertLlmCacheEntry, } from "./db";
|
|
15
|
+
import { loadStoredGraphSnapshot, replaceStoredGraph } from "./graph-db";
|
|
16
|
+
import { deduplicateGraph } from "./graph-dedup";
|
|
17
|
+
import { walkMarkdownFiles } from "./walker";
|
|
40
18
|
/** Schema version for the persisted artifact — bumps trigger a full rebuild. */
|
|
41
|
-
export const GRAPH_FILE_SCHEMA_VERSION =
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
19
|
+
export const GRAPH_FILE_SCHEMA_VERSION = GRAPH_SCHEMA_VERSION;
|
|
20
|
+
/**
 * Zeroed quality metrics. Has the same keys as the object produced by
 * `computeGraphQualityTelemetry`, with every counter and ratio set to 0.
 */
const EMPTY_QUALITY = {
    // File counters.
    consideredFiles: 0,
    extractedFiles: 0,
    // Graph size counters.
    entityCount: 0,
    relationCount: 0,
    // Derived ratios.
    extractionCoverage: 0,
    density: 0,
};
|
|
48
28
|
/**
 * Template for the no-op result returned when the graph-extraction pass is
 * skipped.
 *
 * NOTE: the pass returns `{ ...EMPTY_RESULT }`, which is a shallow copy. The
 * nested `quality`, `telemetry` and `warnings` members are therefore shared
 * between every no-op result and must be treated as read-only by callers.
 */
const EMPTY_RESULT = {
    considered: 0,
    extracted: 0,
    totalEntities: 0,
    totalRelations: 0,
    written: false,
    quality: { ...EMPTY_QUALITY },
    telemetry: {
        cacheHits: 0,
        cacheMisses: 0,
        truncationCount: 0,
        failureCount: 0,
    },
    warnings: [],
};
|
|
43
|
+
/**
 * Round a numeric metric to at most four decimal places.
 *
 * @param {number} value - Metric to round.
 * @returns {number} The value formatted with `toFixed(4)` and parsed back to a number.
 */
function roundMetric(value) {
    const fixedText = value.toFixed(4);
    return Number(fixedText);
}
|
|
46
|
+
/**
 * Derive the quality summary for one extraction run.
 *
 * `extractionCoverage` is `extractedFiles / consideredFiles` and `density` is
 * `relationCount / (entityCount * (entityCount - 1) / 2)`. Either ratio is 0
 * when its denominator is not positive. Both are rounded via `roundMetric`.
 *
 * @param {number} consideredFiles - Files that were candidates for extraction.
 * @param {number} extractedFiles - Files counted as extracted.
 * @param {number} entityCount - Total entity count.
 * @param {number} relationCount - Total relation count.
 * @returns {{consideredFiles: number, extractedFiles: number, entityCount: number,
 *   relationCount: number, extractionCoverage: number, density: number}}
 */
function computeGraphQualityTelemetry(consideredFiles, extractedFiles, entityCount, relationCount) {
    let coverageRatio = 0;
    if (consideredFiles > 0) {
        coverageRatio = extractedFiles / consideredFiles;
    }

    let densityRatio = 0;
    if (entityCount > 1) {
        // Number of unordered entity pairs.
        const possiblePairs = (entityCount * (entityCount - 1)) / 2;
        if (possiblePairs > 0) {
            densityRatio = relationCount / possiblePairs;
        }
    }

    return {
        consideredFiles,
        extractedFiles,
        entityCount,
        relationCount,
        extractionCoverage: roundMetric(coverageRatio),
        density: roundMetric(densityRatio),
    };
}
|
|
59
|
+
/**
 * Asset types used for graph extraction when no include-types are configured,
 * or when none of the configured values is supported.
 * `getGraphExtractionIncludeTypes` returns a copy of this list in those cases.
 */
export const DEFAULT_GRAPH_EXTRACTION_INCLUDE_TYPES = ["memory", "knowledge"];
|
|
60
|
+
/**
 * Asset types that may be named in the graph-extraction include-types setting.
 * `getGraphExtractionIncludeTypes` drops any configured value not in this set.
 */
const SUPPORTED_GRAPH_EXTRACTION_INCLUDE_TYPES = new Set([
    "memory", "knowledge", "skill",
    "command", "agent", "workflow",
    "lesson", "task", "wiki",
]);
|
|
71
|
+
const GRAPH_CACHE_VARIANT_PREFIX = "graph-extraction";
|
|
72
|
+
/**
 * Clamp a confidence value into the closed range [0, 1].
 *
 * @param {unknown} raw - Candidate confidence value.
 * @returns {number | undefined} The clamped number, or `undefined` when `raw`
 *   is not a finite number (non-numbers, NaN and ±Infinity are all rejected).
 */
function normalizeConfidence(raw) {
    // Number.isFinite never coerces, so it also rejects every non-number input.
    if (!Number.isFinite(raw)) {
        return undefined;
    }
    const atLeastZero = Math.max(0, raw);
    return Math.min(1, atLeastZero);
}
|
|
77
|
+
/**
 * Build the identifier for a graph-extractor configuration.
 *
 * The id has the form `<prefix>:<promptVersion>:<model>:<fingerprint>`, where
 * the fingerprint is the first 16 characters of `computeBodyHash` applied to a
 * JSON document holding the prompt version, model, batch size, include types
 * and the two fixed body-size limits.
 *
 * @param {{model: unknown, batchSize: unknown, includeTypes: unknown}} config
 * @returns {string} The extractor id.
 */
function getGraphExtractorId(config) {
    const { model, batchSize, includeTypes } = config;
    const promptVersion = graphExtract.GRAPH_EXTRACT_PROMPT_VERSION;

    // Key order is part of the serialized text and therefore of the hash; keep it stable.
    const fingerprintSource = JSON.stringify({
        promptVersion,
        model,
        batchSize,
        includeTypes,
        maxChunkBodyChars: 1600,
        maxBatchBodyChars: 1600,
    });
    const fingerprint = computeBodyHash(fingerprintSource).slice(0, 16);

    return `${GRAPH_CACHE_VARIANT_PREFIX}:${promptVersion}:${model}:${fingerprint}`;
}
|
|
88
|
+
/**
 * Turn quality metrics and run telemetry into human-readable warnings.
 *
 * A warning is produced when:
 *   - at least 5 files were considered and coverage is below 0.3;
 *   - at least 8 entities were found but there are no relations;
 *   - one or more file extractions failed.
 *
 * @param {{consideredFiles: number, extractedFiles: number, extractionCoverage: number,
 *   entityCount: number, relationCount: number}} quality
 * @param {{failureCount: number}} telemetry
 * @returns {string[]} Warning messages in the order listed above (possibly empty).
 */
function buildLowQualityWarnings(quality, telemetry) {
    const { consideredFiles, extractedFiles, extractionCoverage, entityCount, relationCount } = quality;
    const { failureCount } = telemetry;

    const coverageTooLow = consideredFiles >= 5 && extractionCoverage < 0.3;
    const entitiesWithoutRelations = entityCount >= 8 && relationCount === 0;
    const hadFailures = failureCount > 0;

    return [
        ...(coverageTooLow
            ? [`Low graph extraction coverage (${extractedFiles}/${consideredFiles}, ${extractionCoverage}).`]
            : []),
        ...(entitiesWithoutRelations ? ["Graph extraction produced many entities but no relations."] : []),
        ...(hadFailures ? [`Graph extraction encountered ${failureCount} failed file extraction(s).`] : []),
    ];
}
|
|
101
|
+
/**
 * Resolve which asset types the graph-extraction pass should consider.
 *
 * Reads `graphExtractionIncludeTypes` from the "graph" index-pass config.
 * Each entry is trimmed and lower-cased; unsupported values and duplicates
 * are dropped, keeping first-seen order. Malformed config is tolerated rather
 * than allowed to crash the pass: a value that is not an array is treated as
 * "not configured", and non-string entries are skipped.
 *
 * @param {object} config - Config object; `config.index` is passed to `getIndexPassConfig`.
 * @returns {string[]} A fresh array of supported types, or a copy of
 *   `DEFAULT_GRAPH_EXTRACTION_INCLUDE_TYPES` when nothing usable is configured.
 */
export function getGraphExtractionIncludeTypes(config) {
    const configured = getIndexPassConfig(config.index, "graph")?.graphExtractionIncludeTypes;
    if (!Array.isArray(configured) || configured.length === 0) {
        return [...DEFAULT_GRAPH_EXTRACTION_INCLUDE_TYPES];
    }

    // A Set both de-duplicates and preserves first-seen order.
    const accepted = new Set();
    for (const rawType of configured) {
        // Guard: a non-string entry (e.g. a number or null) would throw on `.trim()`.
        if (typeof rawType !== "string") {
            continue;
        }
        const type = rawType.trim().toLowerCase();
        if (SUPPORTED_GRAPH_EXTRACTION_INCLUDE_TYPES.has(type)) {
            accepted.add(type);
        }
    }

    return accepted.size > 0 ? [...accepted] : [...DEFAULT_GRAPH_EXTRACTION_INCLUDE_TYPES];
}
|
|
118
|
+
/**
 * Validate an untrusted value as a graph extraction result.
 *
 * Accepted shape:
 *   - `entities`: array of strings (required);
 *   - `relations`: optional array of objects with string `from` and `to`, an
 *     optional string `type` and an optional finite-number `confidence`;
 *   - `confidence`, `status`, `reason`: optional; carried over when well-typed.
 *
 * @param {unknown} raw - Value to validate (e.g. parsed cache JSON).
 * @returns {{entities: string[], relations: object[], confidence: (number|undefined),
 *   status?: string, reason?: string} | undefined} The normalized shape, or
 *   `undefined` when `raw` does not match.
 */
function validateGraphCacheShape(raw) {
    if (raw === null || typeof raw !== "object") {
        return undefined;
    }

    const { entities, relations } = raw;

    const isWellFormedRelation = (candidate) => {
        if (!candidate || typeof candidate !== "object") {
            return false;
        }
        const { from, to, type, confidence } = candidate;
        if (typeof from !== "string" || typeof to !== "string") {
            return false;
        }
        if (type !== undefined && typeof type !== "string") {
            return false;
        }
        return confidence === undefined || (typeof confidence === "number" && Number.isFinite(confidence));
    };

    if (!Array.isArray(entities) || entities.some((entity) => typeof entity !== "string")) {
        return undefined;
    }

    // `relations` may be omitted entirely, but when present it must be fully valid.
    if (relations !== undefined) {
        const relationsValid = Array.isArray(relations) && relations.every((rel) => isWellFormedRelation(rel));
        if (!relationsValid) {
            return undefined;
        }
    }

    const shape = {
        entities,
        relations: Array.isArray(relations) ? relations : [],
        confidence: normalizeConfidence(raw.confidence),
    };
    if (typeof raw.status === "string") {
        shape.status = raw.status;
    }
    if (typeof raw.reason === "string") {
        shape.reason = raw.reason;
    }
    return shape;
}
|
|
149
|
+
/**
 * Load the previously stored graph for a stash and keep only well-formed nodes.
 *
 * @param {string} stashRoot - Stash root passed through to `loadStoredGraphSnapshot`.
 * @param {object | undefined} db - Database handle; when falsy nothing is loaded.
 * @returns {{files: object[], telemetry?: object}} Validated file nodes, plus
 *   the snapshot's telemetry when it has any. `files` is empty when there is
 *   no db or no stored snapshot.
 */
function loadGraphFile(stashRoot, db) {
    const snapshot = db ? loadStoredGraphSnapshot(stashRoot, db) : undefined;
    if (!snapshot) {
        return { files: [] };
    }

    const files = [];
    for (const stored of snapshot.files) {
        const shape = validateGraphCacheShape({ entities: stored.entities, relations: stored.relations });
        if (shape) {
            const entry = {
                path: stored.path,
                type: stored.type,
                bodyHash: stored.bodyHash,
                entities: shape.entities,
                relations: shape.relations,
                confidence: normalizeConfidence(stored.confidence),
            };
            // Optional fields are copied only when truthy.
            if (stored.status) {
                entry.status = stored.status;
            }
            if (stored.reason) {
                entry.reason = stored.reason;
            }
            if (stored.extractionRunId) {
                entry.extractionRunId = stored.extractionRunId;
            }
            files.push(entry);
        }
    }

    const loaded = { files };
    if (snapshot.telemetry) {
        loaded.telemetry = snapshot.telemetry;
    }
    return loaded;
}
|
|
177
|
+
/**
 * Merge freshly extracted nodes into the previously stored node list.
 *
 * Without `candidatePaths` the run was not scoped, so `refreshedNodes` is
 * returned as-is (same array instance). With `candidatePaths`, previous nodes
 * are kept unless their path was a candidate of this run or was re-extracted;
 * the refreshed nodes are appended after them.
 *
 * The result holds at most one node per refreshed path. If `refreshedNodes`
 * contains the same path more than once, the last occurrence wins and sits at
 * the position of the first occurrence. Previously the path lookup was a
 * no-op and such duplicates were emitted once per occurrence.
 *
 * @param {Iterable<{path: string}>} previousNodes - Nodes from the stored graph.
 * @param {Array<{path: string}>} refreshedNodes - Nodes produced by this run.
 * @param {{has(path: string): boolean} | undefined} candidatePaths - Paths in
 *   scope for this run, or a falsy value for a full (unscoped) run.
 * @returns {Array<{path: string}>} The merged node list.
 */
function mergeGraphNodes(previousNodes, refreshedNodes, candidatePaths) {
    if (!candidatePaths) {
        return refreshedNodes;
    }

    // Map keeps first-insertion order while a later duplicate overwrites the value.
    const refreshedByPath = new Map(refreshedNodes.map((node) => [node.path, node]));

    const merged = [];
    for (const node of previousNodes) {
        // Drop anything in scope for this run, and anything superseded by a
        // fresh node, so a stale copy never sits next to its replacement.
        if (candidatePaths.has(node.path) || refreshedByPath.has(node.path)) {
            continue;
        }
        merged.push(node);
    }
    merged.push(...refreshedByPath.values());
    return merged;
}
|
|
191
|
+
/**
 * Try to reuse a previously stored node for a candidate file.
 *
 * The stored node is reusable only when it exists under the candidate's
 * absolute path, has the same asset type, carries a non-empty string
 * `bodyHash` equal to the supplied hash, and its entities/relations still
 * pass `validateGraphCacheShape`.
 *
 * @param {Map<string, object>} previousNodes - Stored nodes keyed by path.
 * @param {{absPath: string, type: string}} candidate - File being processed.
 * @param {string} bodyHash - Hash of the candidate's current body.
 * @returns {{entities: string[], relations: object[], confidence: (number|undefined),
 *   status?: string, reason?: string} | undefined} The reusable result, or
 *   `undefined` when the file must be re-extracted.
 */
function reuseGraphNode(previousNodes, candidate, bodyHash) {
    const prior = previousNodes.get(candidate.absPath);
    if (!prior || prior.type !== candidate.type) {
        return undefined;
    }

    const storedHash = prior.bodyHash;
    const hashMatches = typeof storedHash === "string" && storedHash.length > 0 && storedHash === bodyHash;
    if (!hashMatches) {
        return undefined;
    }

    const shape = validateGraphCacheShape({ entities: prior.entities, relations: prior.relations });
    if (!shape) {
        return undefined;
    }

    const reused = {
        entities: shape.entities,
        relations: shape.relations,
        confidence: normalizeConfidence(prior.confidence),
    };
    if (prior.status) {
        reused.status = prior.status;
    }
    if (prior.reason) {
        reused.reason = prior.reason;
    }
    return reused;
}
|
|
55
212
|
/**
|
|
56
213
|
* Top-level entry point. Returns a no-op result when the pass is disabled.
|
|
57
214
|
*
|
|
58
215
|
* Three preconditions — ALL must hold for the pass to run:
|
|
59
216
|
*
|
|
60
|
-
* 1. **Provider configured** —
|
|
217
|
+
* 1. **Provider configured** — an LLM profile must be selectable. Without a
|
|
61
218
|
* configured provider, `resolveIndexPassLLM("graph", config)` returns
|
|
62
219
|
* `undefined` (the pass cannot run because there is no model to call).
|
|
63
|
-
* 2. **Feature gate** — `
|
|
64
|
-
* `true`). When `false`, no network call may issue regardless
|
|
65
|
-
* per-pass settings.
|
|
220
|
+
* 2. **Feature gate** — `profiles.improve.default.processes.graphExtraction.enabled`
|
|
221
|
+
* (defaults to `true`). When `false`, no network call may issue regardless
|
|
222
|
+
* of per-pass settings.
|
|
66
223
|
* 3. **Per-pass gate** — `index.graph.llm` (defaults to `true`). When
|
|
67
224
|
* `false`, the indexer simply skips this pass for the current run.
|
|
68
225
|
*
|
|
69
226
|
* If any of the three is missing or `false`, this function short-circuits
|
|
70
|
-
* to an empty no-op result, leaving any existing
|
|
71
|
-
*
|
|
227
|
+
* to an empty no-op result, leaving any existing persisted graph untouched.
|
|
228
|
+
*
|
|
229
|
+
* When `config.index.graph.graphExtractionBatchSize > 1`, eligible files are
|
|
230
|
+
* chunked into batches and each chunk is processed with a single LLM call via
|
|
231
|
+
* `extractGraphFromBodies`. Default batch size is 1 (one call per asset —
|
|
232
|
+
* preserves existing behaviour, fully opt-in).
|
|
72
233
|
*/
|
|
73
|
-
export async function runGraphExtractionPass(config, sources, signal) {
|
|
74
|
-
// Gate 1 —
|
|
75
|
-
//
|
|
76
|
-
|
|
234
|
+
export async function runGraphExtractionPass(config, sources, signal, db, reEnrich, onProgress, options = {}) {
|
|
235
|
+
// Gate 1 — feature gate via isProcessEnabled, which reads the 0.8.0 path
|
|
236
|
+
// (profiles.improve.default.processes.graphExtraction.enabled). Defaults to
|
|
237
|
+
// enabled when the key is absent.
|
|
238
|
+
if (!isProcessEnabled("index", "graph_extraction", config))
|
|
77
239
|
return { ...EMPTY_RESULT };
|
|
78
240
|
// Gate 2 — per-pass opt-out (#208). Returns the resolved llm config or
|
|
79
241
|
// `undefined` when the pass should not run.
|
|
80
242
|
const llmConfig = resolveIndexPassLLM("graph", config);
|
|
81
|
-
if (!llmConfig)
|
|
243
|
+
if (!llmConfig) {
|
|
244
|
+
const reason = getIndexPassConfig(config.index, "graph")?.llm === false
|
|
245
|
+
? "index.graph.llm is false"
|
|
246
|
+
: "no default LLM profile is configured";
|
|
247
|
+
warnVerbose(`graph extraction: skipped because ${reason}.`);
|
|
82
248
|
return { ...EMPTY_RESULT };
|
|
249
|
+
}
|
|
83
250
|
// The pass only writes to the primary (working) stash. Read-only caches
|
|
84
251
|
// (git, npm, website) are deliberately untouched — the graph artifact for
|
|
85
252
|
// those sources would be clobbered by the next sync().
|
|
86
253
|
const primary = sources[0];
|
|
87
|
-
if (!primary)
|
|
254
|
+
if (!primary) {
|
|
255
|
+
warnVerbose("graph extraction: skipped because no primary stash source is available.");
|
|
88
256
|
return { ...EMPTY_RESULT };
|
|
89
|
-
|
|
257
|
+
}
|
|
258
|
+
const includeTypes = getGraphExtractionIncludeTypes(config);
|
|
259
|
+
const eligible = collectEligibleFiles(primary.path, includeTypes).filter((candidate) => !options.candidatePaths || options.candidatePaths.has(candidate.absPath));
|
|
90
260
|
const considered = eligible.length;
|
|
91
|
-
if (considered === 0)
|
|
261
|
+
if (considered === 0) {
|
|
262
|
+
const scoped = options.candidatePaths ? ` matching ${options.candidatePaths.size} candidate path(s)` : "";
|
|
263
|
+
warnVerbose(`graph extraction: skipped because no eligible files${scoped} were found under ${primary.path}. ` +
|
|
264
|
+
`includeTypes=${includeTypes.join(",")}`);
|
|
92
265
|
return { ...EMPTY_RESULT };
|
|
266
|
+
}
|
|
267
|
+
const previousGraph = loadGraphFile(primary.path, db);
|
|
268
|
+
const previousNodes = new Map(previousGraph.files.map((node) => [node.path, node]));
|
|
93
269
|
const nodes = [];
|
|
94
270
|
let totalEntities = 0;
|
|
95
271
|
let totalRelations = 0;
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
272
|
+
let processed = 0;
|
|
273
|
+
let extracted = 0;
|
|
274
|
+
onProgress?.({ processed, total: considered, extracted, totalEntities, totalRelations });
|
|
275
|
+
const reportProgress = (currentPath, result) => {
|
|
276
|
+
processed += 1;
|
|
277
|
+
if (result) {
|
|
278
|
+
if (result.entities.length > 0)
|
|
279
|
+
extracted += 1;
|
|
280
|
+
totalEntities += result.entities.length;
|
|
281
|
+
totalRelations += result.relations.length;
|
|
282
|
+
}
|
|
283
|
+
onProgress?.({
|
|
284
|
+
processed,
|
|
285
|
+
total: considered,
|
|
286
|
+
extracted,
|
|
287
|
+
totalEntities,
|
|
288
|
+
totalRelations,
|
|
289
|
+
currentPath,
|
|
290
|
+
});
|
|
291
|
+
};
|
|
292
|
+
// Resolve the effective batch size. Falls back to
|
|
293
|
+
// DEFAULT_GRAPH_EXTRACTION_BATCH_SIZE (4) when unset, and clamps against
|
|
294
|
+
// `llm.contextLength` if the model's context window is configured.
|
|
295
|
+
const batchSize = resolveBatchSize(getIndexPassConfig(config.index, "graph")?.graphExtractionBatchSize, llmConfig.contextLength);
|
|
296
|
+
const extractionRunId = crypto.randomUUID();
|
|
297
|
+
const extractorId = getGraphExtractorId({ model: llmConfig.model, batchSize, includeTypes });
|
|
298
|
+
const cacheVariant = extractorId;
|
|
299
|
+
const telemetry = {
|
|
300
|
+
extractorId,
|
|
301
|
+
extractionRunId,
|
|
302
|
+
model: llmConfig.model,
|
|
303
|
+
promptVersion: graphExtract.GRAPH_EXTRACT_PROMPT_VERSION,
|
|
304
|
+
batchSize,
|
|
305
|
+
cacheHits: 0,
|
|
306
|
+
cacheMisses: 0,
|
|
307
|
+
truncationCount: 0,
|
|
308
|
+
failureCount: 0,
|
|
309
|
+
};
|
|
310
|
+
const canReusePreviousGraph = previousGraph.telemetry?.extractorId === extractorId;
|
|
311
|
+
const runtimeTelemetry = {
|
|
312
|
+
truncationCount: 0,
|
|
313
|
+
failureCount: 0,
|
|
314
|
+
filteredGenericEntities: 0,
|
|
315
|
+
filteredInvalidRelations: 0,
|
|
316
|
+
filteredLowConfidenceRelations: 0,
|
|
317
|
+
contextBatchRetries: 0,
|
|
318
|
+
nonArrayBatchFailures: 0,
|
|
319
|
+
};
|
|
320
|
+
const batchState = {
|
|
321
|
+
batchingDisabled: false,
|
|
322
|
+
nonArrayBatchFailures: 0,
|
|
323
|
+
};
|
|
324
|
+
warnVerbose(`graph extraction: starting for ${considered} eligible file(s) under ${primary.path}; ` +
|
|
325
|
+
`includeTypes=${includeTypes.join(",")}, batchSize=${batchSize}, concurrency=${llmConfig.concurrency ?? 1}, ` +
|
|
326
|
+
`reEnrich=${reEnrich === true}, candidateScoped=${options.candidatePaths ? "true" : "false"}.`);
|
|
327
|
+
const onFallback = (evt) => {
|
|
328
|
+
warn(`[akm] LLM fallback for ${evt.feature}: ${evt.reason}`);
|
|
329
|
+
};
|
|
330
|
+
let extractionResults;
|
|
331
|
+
if (batchSize <= 1) {
|
|
332
|
+
// ── Original per-asset path (with incremental cache) ─────────────────
|
|
333
|
+
extractionResults = await concurrentMap(eligible, async (candidate) => {
|
|
334
|
+
if (signal?.aborted) {
|
|
335
|
+
reportProgress(candidate.absPath, undefined);
|
|
336
|
+
return undefined;
|
|
337
|
+
}
|
|
338
|
+
const bodyHash = computeBodyHash(candidate.body);
|
|
339
|
+
let cached;
|
|
340
|
+
if (db) {
|
|
341
|
+
if (!(reEnrich ?? false)) {
|
|
342
|
+
const cacheEntry = getLlmCacheEntry(db, candidate.absPath, bodyHash, cacheVariant);
|
|
343
|
+
if (cacheEntry) {
|
|
344
|
+
try {
|
|
345
|
+
cached = validateGraphCacheShape(JSON.parse(cacheEntry.resultJson));
|
|
346
|
+
if (cached)
|
|
347
|
+
telemetry.cacheHits += 1;
|
|
348
|
+
}
|
|
349
|
+
catch {
|
|
350
|
+
cached = undefined;
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
else if (!(reEnrich ?? false)) {
|
|
356
|
+
// No DB — best-effort reuse from the previous in-memory graph.
|
|
357
|
+
cached = reuseGraphNode(previousNodes, candidate, bodyHash);
|
|
358
|
+
}
|
|
359
|
+
if (!cached && !(reEnrich ?? false) && canReusePreviousGraph) {
|
|
360
|
+
const reused = reuseGraphNode(previousNodes, candidate, bodyHash);
|
|
361
|
+
if (reused) {
|
|
362
|
+
cached = reused;
|
|
363
|
+
if (db) {
|
|
364
|
+
upsertLlmCacheEntry(db, candidate.absPath, bodyHash, JSON.stringify(reused), cacheVariant);
|
|
365
|
+
}
|
|
366
|
+
telemetry.cacheHits += 1;
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
if (!cached) {
|
|
370
|
+
telemetry.cacheMisses += 1;
|
|
371
|
+
const extraction = await graphExtract.extractGraphFromBody(llmConfig, candidate.body, signal, config, onFallback, { batchState, telemetry: runtimeTelemetry });
|
|
372
|
+
cached = {
|
|
373
|
+
entities: extraction.entities,
|
|
374
|
+
relations: extraction.relations,
|
|
375
|
+
...(extraction.confidence !== undefined ? { confidence: extraction.confidence } : {}),
|
|
376
|
+
...(extraction.status ? { status: extraction.status } : {}),
|
|
377
|
+
...(extraction.reason ? { reason: extraction.reason } : {}),
|
|
378
|
+
};
|
|
379
|
+
if (db) {
|
|
380
|
+
upsertLlmCacheEntry(db, candidate.absPath, bodyHash, JSON.stringify(cached), cacheVariant);
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
const result = {
|
|
384
|
+
absPath: candidate.absPath,
|
|
385
|
+
type: candidate.type,
|
|
386
|
+
bodyHash,
|
|
387
|
+
entities: cached.entities,
|
|
388
|
+
relations: cached.relations,
|
|
389
|
+
...(cached.confidence !== undefined ? { confidence: cached.confidence } : {}),
|
|
390
|
+
...(cached.status ? { status: cached.status } : {}),
|
|
391
|
+
...(cached.reason ? { reason: cached.reason } : {}),
|
|
392
|
+
};
|
|
393
|
+
reportProgress(candidate.absPath, result);
|
|
394
|
+
return result;
|
|
395
|
+
},
|
|
396
|
+
// Default concurrency of 4 for cloud APIs. Set `llm.concurrency: 1`
|
|
397
|
+
// in config.json for local model servers (LM Studio, Ollama).
|
|
398
|
+
llmConfig.concurrency ?? 1);
|
|
399
|
+
}
|
|
400
|
+
else {
|
|
401
|
+
// ── Batched path (with incremental cache) ────────────────────────────
|
|
402
|
+
// Chunk eligible files into groups of `batchSize` and call
|
|
403
|
+
// `extractGraphFromBodies` once per chunk. Cache hits are resolved
|
|
404
|
+
// before chunking so they don't consume LLM tokens in the batch call.
|
|
405
|
+
const rawResults = new Array(eligible.length).fill(undefined);
|
|
406
|
+
const chunkStarts = [];
|
|
407
|
+
for (let start = 0; start < eligible.length; start += batchSize)
|
|
408
|
+
chunkStarts.push(start);
|
|
409
|
+
await concurrentMap(chunkStarts, async (start) => {
|
|
410
|
+
if (signal?.aborted)
|
|
411
|
+
return;
|
|
412
|
+
const chunk = eligible.slice(start, start + batchSize);
|
|
413
|
+
const reportChunkProgress = () => {
|
|
414
|
+
for (let j = 0; j < chunk.length; j++) {
|
|
415
|
+
const candidate = chunk[j];
|
|
416
|
+
if (!candidate)
|
|
417
|
+
continue;
|
|
418
|
+
reportProgress(candidate.absPath, rawResults[start + j]);
|
|
419
|
+
}
|
|
420
|
+
};
|
|
421
|
+
// Pre-resolve cache hits for this chunk; track which positions need LLM.
|
|
422
|
+
const bodyHashes = chunk.map((c) => computeBodyHash(c.body));
|
|
423
|
+
// Batch the cache lookup: one IN(...) query for the whole chunk instead
|
|
424
|
+
// of N individual SELECTs. The map covers every ref in this chunk that
|
|
425
|
+
// has any cached row; the per-position hash check happens below.
|
|
426
|
+
const chunkCache = db && !reEnrich
|
|
427
|
+
? getLlmCacheEntriesByRefs(db, chunk.map((c) => c.absPath), cacheVariant)
|
|
428
|
+
: new Map();
|
|
429
|
+
const needsLlm = chunk.map((c, j) => {
|
|
430
|
+
if (!db || reEnrich)
|
|
431
|
+
return true;
|
|
432
|
+
const cached = chunkCache.get(c.absPath);
|
|
433
|
+
// Hash mismatch → body changed, treat as cache miss.
|
|
434
|
+
if (!cached || cached.bodyHash !== (bodyHashes[j] ?? ""))
|
|
435
|
+
return true;
|
|
436
|
+
try {
|
|
437
|
+
const parsed = validateGraphCacheShape(JSON.parse(cached.resultJson));
|
|
438
|
+
if (!parsed)
|
|
439
|
+
return true;
|
|
440
|
+
telemetry.cacheHits += 1;
|
|
441
|
+
rawResults[start + j] = {
|
|
442
|
+
absPath: c.absPath,
|
|
443
|
+
type: c.type,
|
|
444
|
+
bodyHash: bodyHashes[j] ?? "",
|
|
445
|
+
entities: parsed.entities,
|
|
446
|
+
relations: parsed.relations,
|
|
447
|
+
...(parsed.confidence !== undefined ? { confidence: parsed.confidence } : {}),
|
|
448
|
+
...(parsed.status ? { status: parsed.status } : {}),
|
|
449
|
+
...(parsed.reason ? { reason: parsed.reason } : {}),
|
|
450
|
+
};
|
|
451
|
+
return false;
|
|
452
|
+
}
|
|
453
|
+
catch {
|
|
454
|
+
return true;
|
|
455
|
+
}
|
|
456
|
+
});
|
|
457
|
+
// Secondary incremental path: reuse previous graph nodes when the body hash
|
|
458
|
+
// still matches and DB cache is missing/stale/unavailable.
|
|
459
|
+
if (!(reEnrich ?? false) && canReusePreviousGraph) {
|
|
460
|
+
for (let j = 0; j < chunk.length; j++) {
|
|
461
|
+
if (!needsLlm[j])
|
|
462
|
+
continue;
|
|
463
|
+
const candidate = chunk[j];
|
|
464
|
+
if (!candidate)
|
|
465
|
+
continue;
|
|
466
|
+
const reused = reuseGraphNode(previousNodes, candidate, bodyHashes[j] ?? "");
|
|
467
|
+
if (!reused)
|
|
468
|
+
continue;
|
|
469
|
+
telemetry.cacheHits += 1;
|
|
470
|
+
rawResults[start + j] = {
|
|
471
|
+
absPath: candidate.absPath,
|
|
472
|
+
type: candidate.type,
|
|
473
|
+
bodyHash: bodyHashes[j] ?? "",
|
|
474
|
+
entities: reused.entities,
|
|
475
|
+
relations: reused.relations,
|
|
476
|
+
...(reused.confidence !== undefined ? { confidence: reused.confidence } : {}),
|
|
477
|
+
...(reused.status ? { status: reused.status } : {}),
|
|
478
|
+
...(reused.reason ? { reason: reused.reason } : {}),
|
|
479
|
+
};
|
|
480
|
+
if (db) {
|
|
481
|
+
upsertLlmCacheEntry(db, candidate.absPath, bodyHashes[j] ?? "", JSON.stringify(reused), cacheVariant);
|
|
482
|
+
}
|
|
483
|
+
needsLlm[j] = false;
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
const uncachedChunk = chunk.filter((_, j) => needsLlm[j]);
|
|
487
|
+
if (uncachedChunk.length === 0) {
|
|
488
|
+
reportChunkProgress();
|
|
489
|
+
return;
|
|
490
|
+
}
|
|
491
|
+
const bodies = uncachedChunk.map((c) => c.body);
|
|
492
|
+
telemetry.cacheMisses += uncachedChunk.length;
|
|
493
|
+
// extractGraphFromBodies always returns an array of the same length
|
|
494
|
+
// as bodies (it falls back per-asset for any missing indices).
|
|
495
|
+
const batchExtractions = await graphExtract.extractGraphFromBodies(llmConfig, bodies, signal, config, onFallback, { batchState, telemetry: runtimeTelemetry });
|
|
496
|
+
// Map LLM results back to original positions and write cache entries.
|
|
497
|
+
let llmIdx = 0;
|
|
498
|
+
for (let j = 0; j < chunk.length; j++) {
|
|
499
|
+
if (!needsLlm[j])
|
|
500
|
+
continue;
|
|
501
|
+
const candidate = chunk[j];
|
|
502
|
+
const extraction = batchExtractions[llmIdx++];
|
|
503
|
+
if (!candidate || !extraction)
|
|
504
|
+
continue;
|
|
505
|
+
if (db) {
|
|
506
|
+
upsertLlmCacheEntry(db, candidate.absPath, bodyHashes[j] ?? "", JSON.stringify({
|
|
507
|
+
entities: extraction.entities,
|
|
508
|
+
relations: extraction.relations,
|
|
509
|
+
...(extraction.confidence !== undefined ? { confidence: extraction.confidence } : {}),
|
|
510
|
+
...(extraction.status ? { status: extraction.status } : {}),
|
|
511
|
+
...(extraction.reason ? { reason: extraction.reason } : {}),
|
|
512
|
+
}), cacheVariant);
|
|
513
|
+
}
|
|
514
|
+
rawResults[start + j] = {
|
|
515
|
+
absPath: candidate.absPath,
|
|
516
|
+
type: candidate.type,
|
|
517
|
+
bodyHash: bodyHashes[j] ?? "",
|
|
518
|
+
entities: extraction.entities,
|
|
519
|
+
relations: extraction.relations,
|
|
520
|
+
...(extraction.confidence !== undefined ? { confidence: extraction.confidence } : {}),
|
|
521
|
+
...(extraction.status ? { status: extraction.status } : {}),
|
|
522
|
+
...(extraction.reason ? { reason: extraction.reason } : {}),
|
|
523
|
+
};
|
|
524
|
+
}
|
|
525
|
+
reportChunkProgress();
|
|
526
|
+
}, llmConfig.concurrency ?? 1);
|
|
527
|
+
extractionResults = rawResults;
|
|
528
|
+
}
|
|
529
|
+
for (const result of extractionResults) {
|
|
530
|
+
if (!result)
|
|
101
531
|
continue;
|
|
102
532
|
nodes.push({
|
|
103
|
-
path:
|
|
104
|
-
type:
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
...(r.
|
|
113
|
-
}))
|
|
533
|
+
path: result.absPath,
|
|
534
|
+
type: result.type,
|
|
535
|
+
bodyHash: result.bodyHash,
|
|
536
|
+
entities: [...new Set(result.entities.map((entity) => entity.trim()).filter(Boolean))],
|
|
537
|
+
relations: result.relations
|
|
538
|
+
.map((r) => ({
|
|
539
|
+
from: r.from.trim(),
|
|
540
|
+
to: r.to.trim(),
|
|
541
|
+
...(r.type ? { type: r.type.trim() } : {}),
|
|
542
|
+
...(normalizeConfidence(r.confidence) !== undefined ? { confidence: normalizeConfidence(r.confidence) } : {}),
|
|
543
|
+
}))
|
|
544
|
+
.filter((relation) => relation.from && relation.to),
|
|
545
|
+
...(normalizeConfidence(result.confidence) !== undefined
|
|
546
|
+
? { confidence: normalizeConfidence(result.confidence) }
|
|
547
|
+
: {}),
|
|
548
|
+
status: result.status ?? (result.entities.length > 0 ? "extracted" : "empty"),
|
|
549
|
+
reason: result.reason ?? (result.entities.length > 0 ? "none" : "no_graph_content"),
|
|
550
|
+
extractionRunId,
|
|
114
551
|
});
|
|
115
|
-
totalEntities += extraction.entities.length;
|
|
116
|
-
totalRelations += extraction.relations.length;
|
|
117
552
|
}
|
|
553
|
+
const mergedNodes = mergeGraphNodes(previousGraph.files, nodes, options.candidatePaths);
|
|
554
|
+
const assetRefs = mergedNodes.map((node) => node.path);
|
|
555
|
+
const deduped = deduplicateGraph(mergedNodes.map((node) => ({ entities: node.entities, relations: node.relations })), assetRefs);
|
|
556
|
+
telemetry.truncationCount = runtimeTelemetry.truncationCount ?? 0;
|
|
557
|
+
telemetry.failureCount = runtimeTelemetry.failureCount ?? 0;
|
|
558
|
+
const qualityConsidered = mergedNodes.length;
|
|
559
|
+
const qualityExtracted = mergedNodes.filter((node) => node.status === "extracted" && node.entities.length > 0).length;
|
|
560
|
+
const quality = computeGraphQualityTelemetry(qualityConsidered, qualityExtracted, deduped.entities.length, deduped.relations.length);
|
|
561
|
+
const warnings = buildLowQualityWarnings(quality, telemetry);
|
|
562
|
+
for (const warning of warnings)
|
|
563
|
+
warnVerbose(`graph extraction quality: ${warning}`);
|
|
118
564
|
const graph = {
|
|
119
565
|
schemaVersion: GRAPH_FILE_SCHEMA_VERSION,
|
|
120
566
|
generatedAt: new Date().toISOString(),
|
|
121
567
|
stashRoot: primary.path,
|
|
122
|
-
files:
|
|
568
|
+
files: mergedNodes,
|
|
569
|
+
entities: deduped.entities,
|
|
570
|
+
relations: deduped.relations,
|
|
571
|
+
quality,
|
|
572
|
+
telemetry,
|
|
123
573
|
};
|
|
124
|
-
const written = writeGraphFile(primary.path, graph);
|
|
574
|
+
const written = writeGraphFile(primary.path, graph, db);
|
|
575
|
+
warnVerbose(`graph extraction: ${written ? "persisted" : "did not persist"} graph for ${primary.path}; ` +
|
|
576
|
+
`considered=${considered}, extractedThisRun=${extracted}, storedFiles=${mergedNodes.length}, ` +
|
|
577
|
+
`entities=${deduped.entities.length}, relations=${deduped.relations.length}, coverage=${quality.extractionCoverage}.`);
|
|
125
578
|
return {
|
|
126
579
|
considered,
|
|
127
|
-
extracted
|
|
580
|
+
extracted,
|
|
128
581
|
totalEntities,
|
|
129
582
|
totalRelations,
|
|
130
583
|
written,
|
|
584
|
+
quality,
|
|
585
|
+
telemetry,
|
|
586
|
+
warnings,
|
|
131
587
|
};
|
|
132
588
|
}
|
|
133
589
|
/**
|
|
@@ -141,10 +597,16 @@ export async function runGraphExtractionPass(config, sources, signal) {
|
|
|
141
597
|
*
|
|
142
598
|
* Exported for direct unit testing.
|
|
143
599
|
*/
|
|
144
|
-
export function collectEligibleFiles(stashRoot) {
|
|
600
|
+
export function collectEligibleFiles(stashRoot, includeTypes = [...DEFAULT_GRAPH_EXTRACTION_INCLUDE_TYPES]) {
|
|
145
601
|
const out = [];
|
|
146
|
-
for (const
|
|
147
|
-
const
|
|
602
|
+
for (const rawType of includeTypes) {
|
|
603
|
+
const type = rawType.trim().toLowerCase();
|
|
604
|
+
if (!SUPPORTED_GRAPH_EXTRACTION_INCLUDE_TYPES.has(type))
|
|
605
|
+
continue;
|
|
606
|
+
const stashDir = TYPE_DIRS[type];
|
|
607
|
+
if (!stashDir)
|
|
608
|
+
continue;
|
|
609
|
+
const dir = path.join(stashRoot, stashDir);
|
|
148
610
|
if (!fs.existsSync(dir))
|
|
149
611
|
continue;
|
|
150
612
|
for (const filePath of walkMarkdownFiles(dir)) {
|
|
@@ -168,47 +630,21 @@ export function collectEligibleFiles(stashRoot) {
|
|
|
168
630
|
}
|
|
169
631
|
return out;
|
|
170
632
|
}
|
|
171
|
-
function* walkMarkdownFiles(root) {
|
|
172
|
-
let entries;
|
|
173
|
-
try {
|
|
174
|
-
entries = fs.readdirSync(root, { withFileTypes: true });
|
|
175
|
-
}
|
|
176
|
-
catch {
|
|
177
|
-
return;
|
|
178
|
-
}
|
|
179
|
-
for (const entry of entries) {
|
|
180
|
-
const full = path.join(root, entry.name);
|
|
181
|
-
if (entry.isDirectory()) {
|
|
182
|
-
yield* walkMarkdownFiles(full);
|
|
183
|
-
}
|
|
184
|
-
else if (entry.isFile() && entry.name.toLowerCase().endsWith(".md")) {
|
|
185
|
-
yield full;
|
|
186
|
-
}
|
|
187
|
-
}
|
|
188
|
-
}
|
|
189
633
|
// ── Persistence ─────────────────────────────────────────────────────────────
|
|
190
634
|
/**
 * Store the assembled graph by handing it to `replaceStoredGraph` together
 * with the supplied database handle.
 *
 * @param stashRoot - Stash root path; only used in the failure warning text.
 * @param graph - Graph object passed through to `replaceStoredGraph`.
 * @param db - Database handle; when falsy nothing is stored and a warning is emitted.
 * @returns `true` when `replaceStoredGraph` completed without throwing,
 *   `false` when no handle was supplied or the call threw.
 */
function writeGraphFile(stashRoot, graph, db) {
    if (db) {
        let persisted = false;
        try {
            replaceStoredGraph(db, graph);
            persisted = true;
        }
        catch (err) {
            // Persistence is best-effort: report the failure and signal it via the return value.
            warn(`graph extraction: failed to persist graph for ${stashRoot}: ${err instanceof Error ? err.message : String(err)}`);
        }
        return persisted;
    }
    // Without a handle there is nowhere to store the graph.
    warn("graph extraction: no database handle available; skipping graph persistence.");
    return false;
}
|