akm-cli 0.8.0-rc2 → 0.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (313) hide show
  1. package/{.github/CHANGELOG.md → CHANGELOG.md} +238 -3
  2. package/README.md +22 -6
  3. package/SECURITY.md +93 -0
  4. package/dist/assets/help/help-accept.md +12 -0
  5. package/dist/assets/help/help-improve.md +81 -0
  6. package/dist/{commands → assets}/help/help-proposals.md +7 -4
  7. package/dist/assets/help/help-reject.md +11 -0
  8. package/dist/{output → assets/hints}/cli-hints-full.md +60 -32
  9. package/dist/{output → assets/hints}/cli-hints-short.md +10 -7
  10. package/dist/assets/profiles/default.json +15 -0
  11. package/dist/assets/profiles/graph-refresh.json +13 -0
  12. package/dist/assets/profiles/memory-focus.json +12 -0
  13. package/dist/assets/profiles/quick.json +15 -0
  14. package/dist/assets/profiles/thorough.json +15 -0
  15. package/dist/assets/prompts/extract-session.md +80 -0
  16. package/dist/assets/prompts/graph-extract-user-prompt.md +35 -0
  17. package/dist/assets/tasks/graph-refresh-weekly.yml +10 -0
  18. package/dist/cli/config-migrate.js +144 -0
  19. package/dist/cli/config-validate.js +39 -0
  20. package/dist/cli/confirm.js +73 -0
  21. package/dist/cli/parse-args.js +93 -3
  22. package/dist/cli/shared.js +129 -0
  23. package/dist/cli.js +2141 -1268
  24. package/dist/commands/add-cli.js +279 -0
  25. package/dist/commands/agent-dispatch.js +20 -12
  26. package/dist/commands/agent-support.js +11 -5
  27. package/dist/commands/completions.js +3 -0
  28. package/dist/commands/config-cli.js +129 -517
  29. package/dist/commands/consolidate.js +1557 -147
  30. package/dist/commands/curate.js +44 -3
  31. package/dist/commands/db-cli.js +23 -0
  32. package/dist/commands/distill-promotion-policy.js +5 -3
  33. package/dist/commands/distill.js +906 -100
  34. package/dist/commands/env.js +213 -0
  35. package/dist/commands/eval-cases.js +3 -0
  36. package/dist/commands/events.js +3 -0
  37. package/dist/commands/extract-cli.js +127 -0
  38. package/dist/commands/extract-prompt.js +217 -0
  39. package/dist/commands/extract.js +477 -0
  40. package/dist/commands/feedback-cli.js +331 -0
  41. package/dist/commands/graph.js +260 -5
  42. package/dist/commands/health.js +1042 -55
  43. package/dist/commands/history.js +51 -16
  44. package/dist/commands/improve-auto-accept.js +97 -0
  45. package/dist/commands/improve-cli.js +236 -0
  46. package/dist/commands/improve-profiles.js +138 -0
  47. package/dist/commands/improve-result-file.js +167 -0
  48. package/dist/commands/improve.js +1736 -346
  49. package/dist/commands/info.js +26 -28
  50. package/dist/commands/init.js +49 -1
  51. package/dist/commands/installed-stashes.js +6 -23
  52. package/dist/commands/knowledge.js +3 -0
  53. package/dist/commands/lint/agent-linter.js +3 -0
  54. package/dist/commands/lint/base-linter.js +199 -5
  55. package/dist/commands/lint/command-linter.js +3 -0
  56. package/dist/commands/lint/default-linter.js +3 -0
  57. package/dist/commands/lint/env-key-rules.js +154 -0
  58. package/dist/commands/lint/index.js +92 -3
  59. package/dist/commands/lint/knowledge-linter.js +3 -0
  60. package/dist/commands/lint/markdown-insertion.js +343 -0
  61. package/dist/commands/lint/memory-linter.js +3 -0
  62. package/dist/commands/lint/registry.js +3 -0
  63. package/dist/commands/lint/skill-linter.js +3 -0
  64. package/dist/commands/lint/task-linter.js +15 -12
  65. package/dist/commands/lint/types.js +3 -0
  66. package/dist/commands/lint/workflow-linter.js +3 -0
  67. package/dist/commands/lint.js +3 -0
  68. package/dist/commands/migration-help.js +5 -2
  69. package/dist/commands/proposal-drain-policies.js +128 -0
  70. package/dist/commands/proposal-drain.js +477 -0
  71. package/dist/commands/proposal.js +60 -6
  72. package/dist/commands/propose.js +24 -19
  73. package/dist/commands/reflect.js +1004 -94
  74. package/dist/commands/registry-cli.js +150 -0
  75. package/dist/commands/registry-search.js +3 -0
  76. package/dist/commands/remember-cli.js +257 -0
  77. package/dist/commands/remember.js +15 -6
  78. package/dist/commands/schema-repair.js +88 -15
  79. package/dist/commands/search.js +99 -14
  80. package/dist/commands/secret.js +173 -0
  81. package/dist/commands/self-update.js +3 -0
  82. package/dist/commands/show.js +32 -13
  83. package/dist/commands/source-add.js +7 -35
  84. package/dist/commands/source-clone.js +3 -0
  85. package/dist/commands/source-manage.js +3 -0
  86. package/dist/commands/tasks.js +161 -95
  87. package/dist/commands/url-checker.js +3 -0
  88. package/dist/core/action-contributors.js +3 -0
  89. package/dist/core/asset-ref.js +13 -2
  90. package/dist/core/asset-registry.js +9 -2
  91. package/dist/core/asset-serialize.js +88 -0
  92. package/dist/core/asset-spec.js +61 -5
  93. package/dist/core/common.js +93 -5
  94. package/dist/core/concurrent.js +3 -0
  95. package/dist/core/config-io.js +347 -0
  96. package/dist/core/config-migration.js +622 -0
  97. package/dist/core/config-schema.js +558 -0
  98. package/dist/core/config-sources.js +108 -0
  99. package/dist/core/config-types.js +4 -0
  100. package/dist/core/config-walker.js +337 -0
  101. package/dist/core/config.js +366 -1077
  102. package/dist/core/errors.js +42 -20
  103. package/dist/core/events.js +31 -25
  104. package/dist/core/file-lock.js +104 -0
  105. package/dist/core/frontmatter.js +75 -10
  106. package/dist/core/lesson-lint.js +3 -0
  107. package/dist/core/markdown.js +3 -0
  108. package/dist/core/memory-belief.js +62 -0
  109. package/dist/core/memory-contradiction-detect.js +274 -0
  110. package/dist/core/memory-improve.js +142 -14
  111. package/dist/core/parse.js +3 -0
  112. package/dist/core/paths.js +218 -50
  113. package/dist/core/proposal-quality-validators.js +380 -0
  114. package/dist/core/proposal-validators.js +11 -3
  115. package/dist/core/proposals.js +464 -5
  116. package/dist/core/state-db.js +349 -56
  117. package/dist/core/text-truncation.js +107 -0
  118. package/dist/core/time.js +3 -0
  119. package/dist/core/tty.js +59 -0
  120. package/dist/core/warn.js +7 -2
  121. package/dist/core/write-source.js +12 -0
  122. package/dist/indexer/db-backup.js +391 -0
  123. package/dist/indexer/db-search.js +136 -28
  124. package/dist/indexer/db.js +661 -166
  125. package/dist/indexer/ensure-index.js +3 -0
  126. package/dist/indexer/file-context.js +3 -0
  127. package/dist/indexer/graph-boost.js +162 -40
  128. package/dist/indexer/graph-db.js +241 -51
  129. package/dist/indexer/graph-dedup.js +3 -7
  130. package/dist/indexer/graph-extraction.js +242 -149
  131. package/dist/indexer/index-context.js +3 -9
  132. package/dist/indexer/indexer.js +86 -16
  133. package/dist/indexer/llm-cache.js +24 -19
  134. package/dist/indexer/manifest.js +3 -0
  135. package/dist/indexer/matchers.js +184 -11
  136. package/dist/indexer/memory-inference.js +94 -50
  137. package/dist/indexer/metadata-contributors.js +3 -0
  138. package/dist/indexer/metadata.js +110 -50
  139. package/dist/indexer/path-resolver.js +3 -0
  140. package/dist/indexer/project-context.js +192 -0
  141. package/dist/indexer/ranking-contributors.js +134 -7
  142. package/dist/indexer/ranking.js +8 -1
  143. package/dist/indexer/search-fields.js +5 -9
  144. package/dist/indexer/search-hit-enrichers.js +91 -2
  145. package/dist/indexer/search-source.js +20 -1
  146. package/dist/indexer/semantic-status.js +4 -1
  147. package/dist/indexer/staleness-detect.js +447 -0
  148. package/dist/indexer/usage-events.js +12 -9
  149. package/dist/indexer/walker.js +3 -0
  150. package/dist/integrations/agent/builders.js +135 -0
  151. package/dist/integrations/agent/config.js +121 -401
  152. package/dist/integrations/agent/detect.js +3 -0
  153. package/dist/integrations/agent/index.js +6 -14
  154. package/dist/integrations/agent/model-aliases.js +55 -0
  155. package/dist/integrations/agent/profiles.js +3 -0
  156. package/dist/integrations/agent/prompts.js +137 -8
  157. package/dist/integrations/agent/runner.js +208 -0
  158. package/dist/integrations/agent/sdk-runner.js +8 -2
  159. package/dist/integrations/agent/spawn.js +54 -14
  160. package/dist/integrations/github.js +3 -0
  161. package/dist/integrations/lockfile.js +22 -51
  162. package/dist/integrations/session-logs/index.js +4 -0
  163. package/dist/integrations/session-logs/inline-refs.js +35 -0
  164. package/dist/integrations/session-logs/pre-filter.js +152 -0
  165. package/dist/integrations/session-logs/providers/claude-code.js +226 -0
  166. package/dist/integrations/session-logs/providers/opencode.js +231 -25
  167. package/dist/integrations/session-logs/types.js +3 -0
  168. package/dist/llm/call-ai.js +14 -26
  169. package/dist/llm/client.js +16 -2
  170. package/dist/llm/embedder.js +20 -29
  171. package/dist/llm/embedders/cache.js +3 -7
  172. package/dist/llm/embedders/local.js +42 -1
  173. package/dist/llm/embedders/remote.js +20 -8
  174. package/dist/llm/embedders/types.js +3 -7
  175. package/dist/llm/feature-gate.js +92 -56
  176. package/dist/llm/graph-extract.js +402 -31
  177. package/dist/llm/index-passes.js +44 -29
  178. package/dist/llm/memory-infer.js +30 -2
  179. package/dist/llm/metadata-enhance.js +3 -7
  180. package/dist/output/cli-hints.js +7 -4
  181. package/dist/output/context.js +60 -8
  182. package/dist/output/renderers.js +170 -194
  183. package/dist/output/shapes/curate.js +56 -0
  184. package/dist/output/shapes/distill.js +10 -0
  185. package/dist/output/shapes/env-list.js +19 -0
  186. package/dist/output/shapes/events.js +11 -0
  187. package/dist/output/shapes/helpers.js +424 -0
  188. package/dist/output/shapes/history.js +7 -0
  189. package/dist/output/shapes/passthrough.js +105 -0
  190. package/dist/output/shapes/proposal-accept.js +7 -0
  191. package/dist/output/shapes/proposal-diff.js +7 -0
  192. package/dist/output/shapes/proposal-list.js +7 -0
  193. package/dist/output/shapes/proposal-producer.js +11 -0
  194. package/dist/output/shapes/proposal-reject.js +7 -0
  195. package/dist/output/shapes/proposal-show.js +7 -0
  196. package/dist/output/shapes/registry-search.js +6 -0
  197. package/dist/output/shapes/registry.js +30 -0
  198. package/dist/output/shapes/search.js +6 -0
  199. package/dist/output/shapes/secret-list.js +19 -0
  200. package/dist/output/shapes/show.js +6 -0
  201. package/dist/output/shapes/vault-list.js +19 -0
  202. package/dist/output/shapes.js +51 -549
  203. package/dist/output/text/add.js +6 -0
  204. package/dist/output/text/clone.js +6 -0
  205. package/dist/output/text/config.js +6 -0
  206. package/dist/output/text/curate.js +6 -0
  207. package/dist/output/text/distill.js +7 -0
  208. package/dist/output/text/enable-disable.js +7 -0
  209. package/dist/output/text/events.js +10 -0
  210. package/dist/output/text/feedback.js +6 -0
  211. package/dist/output/text/helpers.js +1059 -0
  212. package/dist/output/text/history.js +7 -0
  213. package/dist/output/text/import.js +6 -0
  214. package/dist/output/text/index.js +6 -0
  215. package/dist/output/text/info.js +6 -0
  216. package/dist/output/text/init.js +6 -0
  217. package/dist/output/text/list.js +6 -0
  218. package/dist/output/text/proposal-producer.js +8 -0
  219. package/dist/output/text/proposal.js +12 -0
  220. package/dist/output/text/registry-commands.js +11 -0
  221. package/dist/output/text/registry.js +30 -0
  222. package/dist/output/text/remember.js +6 -0
  223. package/dist/output/text/remove.js +6 -0
  224. package/dist/output/text/save.js +6 -0
  225. package/dist/output/text/search.js +6 -0
  226. package/dist/output/text/show.js +6 -0
  227. package/dist/output/text/update.js +6 -0
  228. package/dist/output/text/upgrade.js +6 -0
  229. package/dist/output/text/vault.js +16 -0
  230. package/dist/output/text/wiki.js +15 -0
  231. package/dist/output/text/workflow.js +14 -0
  232. package/dist/output/text.js +44 -1329
  233. package/dist/registry/build-index.js +3 -0
  234. package/dist/registry/create-provider-registry.js +3 -0
  235. package/dist/registry/factory.js +4 -1
  236. package/dist/registry/origin-resolve.js +3 -0
  237. package/dist/registry/providers/index.js +3 -0
  238. package/dist/registry/providers/skills-sh.js +11 -2
  239. package/dist/registry/providers/static-index.js +10 -1
  240. package/dist/registry/providers/types.js +3 -24
  241. package/dist/registry/resolve.js +11 -16
  242. package/dist/registry/types.js +3 -0
  243. package/dist/scripts/migrate-storage.js +17767 -0
  244. package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +9031 -0
  245. package/dist/scripts/migrations/v16-to-v17.js +141 -0
  246. package/dist/setup/detect.js +3 -0
  247. package/dist/setup/ripgrep-install.js +3 -0
  248. package/dist/setup/ripgrep-resolve.js +3 -0
  249. package/dist/setup/setup.js +306 -67
  250. package/dist/setup/steps.js +3 -15
  251. package/dist/sources/include.js +3 -0
  252. package/dist/sources/provider-factory.js +3 -11
  253. package/dist/sources/provider.js +3 -20
  254. package/dist/sources/providers/filesystem.js +19 -23
  255. package/dist/sources/providers/git.js +171 -21
  256. package/dist/sources/providers/index.js +3 -0
  257. package/dist/sources/providers/install-types.js +3 -13
  258. package/dist/sources/providers/npm.js +3 -4
  259. package/dist/sources/providers/provider-utils.js +3 -0
  260. package/dist/sources/providers/sync-from-ref.js +3 -11
  261. package/dist/sources/providers/tar-utils.js +3 -0
  262. package/dist/sources/providers/website.js +18 -22
  263. package/dist/sources/resolve.js +3 -0
  264. package/dist/sources/types.js +3 -0
  265. package/dist/sources/website-ingest.js +3 -0
  266. package/dist/tasks/backends/cron.js +3 -0
  267. package/dist/tasks/backends/exec-utils.js +3 -0
  268. package/dist/tasks/backends/index.js +3 -11
  269. package/dist/tasks/backends/launchd.js +4 -1
  270. package/dist/tasks/backends/schtasks.js +4 -1
  271. package/dist/tasks/parser.js +51 -38
  272. package/dist/tasks/resolveAkmBin.js +3 -0
  273. package/dist/tasks/runner.js +35 -9
  274. package/dist/tasks/schedule.js +20 -1
  275. package/dist/tasks/schema.js +5 -3
  276. package/dist/tasks/validator.js +6 -3
  277. package/dist/version.js +3 -0
  278. package/dist/wiki/wiki-templates.js +6 -3
  279. package/dist/wiki/wiki.js +4 -1
  280. package/dist/workflows/authoring.js +4 -1
  281. package/dist/workflows/cli.js +3 -0
  282. package/dist/workflows/db.js +140 -10
  283. package/dist/workflows/document-cache.js +3 -10
  284. package/dist/workflows/parser.js +3 -0
  285. package/dist/workflows/renderer.js +3 -0
  286. package/dist/workflows/runs.js +18 -1
  287. package/dist/workflows/schema.js +3 -0
  288. package/dist/workflows/scope-key.js +3 -0
  289. package/dist/workflows/validator.js +5 -9
  290. package/docs/README.md +7 -2
  291. package/docs/data-and-telemetry.md +225 -0
  292. package/docs/migration/release-notes/0.7.5.md +2 -2
  293. package/docs/migration/release-notes/0.8.0.md +57 -5
  294. package/docs/migration/v0.7-to-v0.8.md +1378 -0
  295. package/package.json +28 -11
  296. package/.github/LICENSE +0 -374
  297. package/dist/commands/help/help-accept.md +0 -9
  298. package/dist/commands/help/help-improve.md +0 -53
  299. package/dist/commands/help/help-reject.md +0 -8
  300. package/dist/commands/install-audit.js +0 -385
  301. package/dist/commands/vault.js +0 -310
  302. package/dist/indexer/match-contributors.js +0 -141
  303. package/dist/integrations/agent/pipeline.js +0 -39
  304. package/dist/integrations/agent/runners.js +0 -31
  305. package/dist/llm/prompts/graph-extract-user-prompt.md +0 -12
  306. /package/dist/{tasks → assets}/backends/launchd-template.xml +0 -0
  307. /package/dist/{tasks → assets}/backends/schtasks-template.xml +0 -0
  308. /package/dist/{commands → assets}/help/help-propose.md +0 -0
  309. /package/dist/{wiki → assets/wiki}/index-template.md +0 -0
  310. /package/dist/{wiki → assets/wiki}/ingest-workflow-template.md +0 -0
  311. /package/dist/{wiki → assets/wiki}/log-template.md +0 -0
  312. /package/dist/{wiki → assets/wiki}/schema-template.md +0 -0
  313. /package/dist/{workflows → assets/workflows}/workflow-template.md +0 -0
@@ -1,3 +1,6 @@
1
+ // This Source Code Form is subject to the terms of the Mozilla Public
2
+ // License, v. 2.0. If a copy of the MPL was not distributed with this
3
+ // file, You can obtain one at https://mozilla.org/MPL/2.0/.
1
4
  /**
2
5
  * LLM helper for the `akm index` graph-extraction pass (#207).
3
6
  *
@@ -17,18 +20,23 @@
17
20
  * the connection via `resolveIndexPassLLM("graph", config)` and pass it
18
21
  * straight through.
19
22
  */
23
+ import userPromptTemplate from "../assets/prompts/graph-extract-user-prompt.md" with { type: "text" };
20
24
  import { toErrorMessage } from "../core/common";
21
- import { warn } from "../core/warn";
25
+ import { warn, warnVerbose } from "../core/warn";
22
26
  import { chatCompletion, parseEmbeddedJsonResponse } from "./client";
23
27
  import { tryLlmFeature } from "./feature-gate";
24
- import userPromptTemplate from "./prompts/graph-extract-user-prompt.md" with { type: "text" };
25
28
  /**
26
29
  * Separator token used between assets in a batch prompt.
27
30
  * Chosen to be visually clear and unlikely to appear verbatim in asset bodies.
28
31
  */
29
32
  const BATCH_ASSET_SEPARATOR = "=== ASSET";
30
- /** Hard cap on body chars sent to the model. */
31
- const MAX_BODY_CHARS = 4000;
33
+ export const GRAPH_EXTRACT_PROMPT_VERSION = "v2";
34
+ /** Asset bodies longer than this are chunked instead of truncated. */
35
+ const MAX_CHUNK_BODY_CHARS = 1600;
36
+ /** Bodies longer than this are excluded from multi-asset batch prompts. */
37
+ const MAX_BATCH_BODY_CHARS = 1600;
38
+ const MIN_RELATION_CONFIDENCE = 0.5;
39
+ const NON_ARRAY_BATCH_DISABLE_THRESHOLD = 2;
32
40
  /** Hard cap on entities returned per asset — guards against runaway LLM output. */
33
41
  const MAX_ENTITIES_PER_ASSET = 32;
34
42
  /** Hard cap on relations returned per asset. */
@@ -37,6 +45,42 @@ const SYSTEM_PROMPT = "You extract a knowledge graph from developer notes. Retur
37
45
  const USER_PROMPT_PREFIX = userPromptTemplate
38
46
  .replace("{{MAX_ENTITIES}}", String(MAX_ENTITIES_PER_ASSET))
39
47
  .replace("{{MAX_RELATIONS}}", String(MAX_RELATIONS_PER_ASSET));
48
/**
 * Detect whether an error message indicates a context size exceeded condition.
 * Covers common patterns from OpenAI-compatible APIs (LM Studio, Ollama, etc).
 *
 * Matching is case-insensitive substring matching. Besides the spaced
 * phrases, the underscore form `context_length` (as in the
 * `context_length_exceeded` error code) and the wording "prompt is too long"
 * are recognised.
 *
 * @param {unknown} message - Error text to classify. Anything that is not a
 *   non-empty string is classified as "not a context-size error" instead of
 *   throwing.
 * @returns {boolean} `true` when the message looks like a context-limit error.
 */
function isContextSizeError(message) {
    // Guard: a missing or non-string message must never crash error handling.
    if (typeof message !== "string" || message.length === 0)
        return false;
    const lower = message.toLowerCase();
    const markers = [
        "context size",
        "context length",
        "context_length",
        "context_window",
        "prompt too long",
        "prompt is too long",
    ];
    if (markers.some((marker) => lower.includes(marker)))
        return true;
    // Generic phrasing such as "request exceeds the available context".
    return lower.includes("exceeds") && lower.includes("context");
}
60
/**
 * Lower-cased entity names that are too generic to be useful graph nodes.
 * `parseGraphExtraction` drops any entity whose lower-cased normalized name
 * is in this set and counts it as a filtered generic entity.
 */
const GENERIC_ENTITIES = new Set([
    // actors
    "agent", "assistant", "developer", "team", "user",
    // software nouns
    "application", "code", "project", "service", "system", "task",
    // content nouns
    "content", "data", "document", "file", "knowledge", "memory",
    "note", "notes", "text", "thing",
]);
/**
 * Normalized relation types that carry no real information.
 * `parseGraphExtraction` drops relations whose normalized type is in this set.
 */
const GENERIC_RELATION_TYPES = new Set([
    "has",
    "is",
    "mentions",
    "references",
    "related to",
]);
40
84
  function parseConfidence(raw) {
41
85
  if (typeof raw !== "number" || !Number.isFinite(raw))
42
86
  return undefined;
@@ -68,12 +112,205 @@ function normalizeRelationType(raw) {
68
112
  return "integrates with";
69
113
  return normalized;
70
114
  }
115
/**
 * Build the case-insensitive lookup key for an entity name: the name as
 * normalized by `normalizeEntityName`, lower-cased.
 *
 * @param raw - Entity name as produced by an extraction.
 * @returns Lower-cased normalized name.
 */
function normalizeEntityKey(raw) {
    const canonicalName = normalizeEntityName(raw);
    return canonicalName.toLowerCase();
}
118
/**
 * Add `amount` to the counter stored under `key` on an optional telemetry
 * object. A missing (nullish) counter starts from zero; when no telemetry
 * object is supplied the call does nothing.
 *
 * @param telemetry - Mutable counter bag, or a falsy value to skip counting.
 * @param key - Counter name.
 * @param amount - Increment to apply (defaults to 1).
 */
function bumpTelemetry(telemetry, key, amount = 1) {
    if (telemetry) {
        const previous = telemetry[key] ?? 0;
        telemetry[key] = previous + amount;
    }
}
123
/**
 * Normalize the optional shared batch-state object in place.
 *
 * - `batchingDisabled` becomes a strict boolean (`true` only when it was
 *   exactly `true`).
 * - `nonArrayBatchFailures` becomes a non-negative number; missing, negative
 *   or non-numeric (NaN) values collapse to 0 so later comparisons against
 *   the counter cannot be poisoned by NaN.
 *
 * @param state - Caller-owned state object, or a falsy value.
 * @returns The same (mutated) object, or `undefined` when no state was given.
 */
function normalizeBatchState(state) {
    if (!state)
        return undefined;
    state.batchingDisabled = state.batchingDisabled === true;
    const failures = Math.max(0, state.nonArrayBatchFailures ?? 0);
    // Math.max(0, NaN) is NaN: reset a corrupt counter instead of keeping it.
    state.nonArrayBatchFailures = Number.isNaN(failures) ? 0 : failures;
    return state;
}
130
/**
 * Split one paragraph into pieces of at most `maxChars` UTF-16 code units.
 *
 * Each cut prefers the last space at or before `maxChars`; when that space
 * lies in the first 60% of the window (or there is none) the text is cut
 * hard at `maxChars`. A hard cut never separates a surrogate pair, so pieces
 * do not start or end with half of an astral character (e.g. an emoji).
 *
 * @param {string} text - Paragraph to split.
 * @param {number} maxChars - Maximum piece length; must be >= 1 whenever the
 *   text does not already fit.
 * @returns {{ chunks: string[], truncationCount: number }} The trimmed,
 *   non-empty pieces in order, and the number of cuts that were made.
 * @throws {RangeError} When the text needs splitting but `maxChars < 1`
 *   (the loop could otherwise never make progress).
 */
function splitParagraph(text, maxChars) {
    if (text.length <= maxChars)
        return { chunks: [text], truncationCount: 0 };
    if (maxChars < 1)
        throw new RangeError(`splitParagraph: maxChars must be >= 1 (received ${maxChars})`);
    const chunks = [];
    let truncationCount = 0;
    let remaining = text;
    while (remaining.length > maxChars) {
        let splitAt = remaining.lastIndexOf(" ", maxChars);
        // A space too early in the window would waste most of the budget.
        if (splitAt < Math.floor(maxChars * 0.6))
            splitAt = maxChars;
        // Do not cut between a high and a low surrogate. `splitAt > 1` keeps
        // the cut position positive so every iteration still consumes text.
        if (splitAt > 1) {
            const before = remaining.charCodeAt(splitAt - 1);
            const after = remaining.charCodeAt(splitAt);
            const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
            if (splitsPair)
                splitAt -= 1;
        }
        const piece = remaining.slice(0, splitAt).trim();
        if (piece)
            chunks.push(piece);
        remaining = remaining.slice(splitAt).trim();
        truncationCount += 1;
    }
    if (remaining)
        chunks.push(remaining);
    return { chunks, truncationCount };
}
150
/**
 * Break an asset body into chunks of at most `maxChars` characters.
 *
 * The body is first cut before every Markdown heading line (`#` .. `######`
 * followed by whitespace). Sections that fit are packed greedily, joined by a
 * blank line. An oversized section is cut into blank-line separated
 * paragraphs which are packed the same way; a paragraph that is still too
 * long is handed to `splitParagraph`.
 *
 * @param body - Raw asset body.
 * @param maxChars - Chunk size limit (defaults to `MAX_CHUNK_BODY_CHARS`).
 * @returns `{ chunks, truncationCount }` where `truncationCount` sums the
 *   counts reported by `splitParagraph`.
 */
function splitBodyIntoChunks(body, maxChars = MAX_CHUNK_BODY_CHARS) {
    const headingSections = [];
    for (const rawSection of body.split(/\n(?=#{1,6}\s)/)) {
        const section = rawSection.trim();
        if (section)
            headingSections.push(section);
    }
    if (headingSections.length === 0) {
        const whole = body.trim();
        return { chunks: whole ? [whole] : [], truncationCount: 0 };
    }
    const chunks = [];
    let truncationCount = 0;
    let pending = "";
    // Emit whatever has been accumulated so far as one chunk.
    const flushPending = () => {
        const text = pending.trim();
        if (text)
            chunks.push(text);
        pending = "";
    };
    // Append a unit that fits on its own, starting a new chunk on overflow.
    const pack = (unit) => {
        const joined = pending ? `${pending}\n\n${unit}` : unit;
        if (joined.length <= maxChars) {
            pending = joined;
        }
        else {
            flushPending();
            pending = unit;
        }
    };
    for (const section of headingSections) {
        if (section.length <= maxChars) {
            pack(section);
            continue;
        }
        const paragraphs = section
            .split(/\n\s*\n/)
            .map((part) => part.trim())
            .filter((part) => part.length > 0);
        for (const paragraph of paragraphs) {
            if (paragraph.length <= maxChars) {
                pack(paragraph);
                continue;
            }
            // Oversized paragraph: close the open chunk, then emit its pieces directly.
            flushPending();
            const { chunks: pieces, truncationCount: cuts } = splitParagraph(paragraph, maxChars);
            truncationCount += cuts;
            for (const piece of pieces) {
                if (piece.length <= maxChars)
                    chunks.push(piece);
            }
        }
    }
    flushPending();
    return { chunks, truncationCount };
}
204
/** Consistency weight for blending chunk-agreement with LLM confidence. */
const CONSISTENCY_WEIGHT = 0.4;
/**
 * Merge the per-chunk extractions of one asset into a single extraction.
 *
 * - Entities are de-duplicated by `normalizeEntityKey`; the first spelling
 *   seen wins and the list is capped at `MAX_ENTITIES_PER_ASSET`.
 * - Relations are de-duplicated by (from, to, type). Endpoints may come from
 *   any chunk, but a relation is kept only when BOTH endpoints survive the
 *   entity cap, so the returned relations never reference an entity that is
 *   missing from the returned entity list. Relations dropped for that reason
 *   are added to `filteredInvalidRelations`.
 * - Each relation keeps the highest confidence reported for it, blended with
 *   the fraction of chunks that reported it (`CONSISTENCY_WEIGHT`).
 * - Filter/truncation counters are summed across chunks.
 *
 * @param extractions - Per-chunk extraction results, in chunk order.
 * @returns Merged extraction with `status`, `reason`, `chunkCount` and the
 *   summed counters; `confidence` is present only when it can be derived.
 */
function mergeGraphExtractions(extractions) {
    const totalChunks = extractions.length;
    const entityCanonical = new Map();
    let confidence;
    let truncationCount = 0;
    let filteredGenericEntities = 0;
    let filteredInvalidRelations = 0;
    let filteredLowConfidenceRelations = 0;
    let firstFailureReason;
    // Pass 1: counters, best extraction-level confidence, canonical entities.
    for (const extraction of extractions) {
        truncationCount += extraction.truncationCount ?? 0;
        filteredGenericEntities += extraction.filteredGenericEntities ?? 0;
        filteredInvalidRelations += extraction.filteredInvalidRelations ?? 0;
        filteredLowConfidenceRelations += extraction.filteredLowConfidenceRelations ?? 0;
        if (extraction.status === "failed" && !firstFailureReason)
            firstFailureReason = extraction.reason;
        const nextConfidence = parseConfidence(extraction.confidence);
        if (nextConfidence !== undefined)
            confidence = confidence === undefined ? nextConfidence : Math.max(confidence, nextConfidence);
        for (const entity of extraction.entities ?? []) {
            const key = normalizeEntityKey(entity);
            if (!key)
                continue;
            if (!entityCanonical.has(key))
                entityCanonical.set(key, entity);
        }
    }
    // Cap entities BEFORE resolving relations so relations cannot dangle.
    // Map iteration follows insertion order, so this keeps the first N entities.
    const keptEntityKeys = new Set([...entityCanonical.keys()].slice(0, MAX_ENTITIES_PER_ASSET));
    const entities = [...keptEntityKeys].map((key) => entityCanonical.get(key));
    // Pass 2: relations. Runs after pass 1 so a relation in an early chunk may
    // reference an entity that was only listed by a later chunk.
    const relationByKey = new Map();
    const relationChunkCounts = new Map();
    for (const extraction of extractions) {
        for (const relation of extraction.relations ?? []) {
            const fromKey = normalizeEntityKey(relation.from);
            const toKey = normalizeEntityKey(relation.to);
            const type = normalizeRelationType(relation.type ?? "");
            if (!fromKey || !toKey || !type)
                continue;
            const from = entityCanonical.get(fromKey);
            const to = entityCanonical.get(toKey);
            if (!from || !to)
                continue;
            if (!keptEntityKeys.has(fromKey) || !keptEntityKeys.has(toKey)) {
                // Endpoint exists but fell off the entity cap.
                filteredInvalidRelations += 1;
                continue;
            }
            const key = `${fromKey}\u0000${toKey}\u0000${type}`;
            let merged = relationByKey.get(key);
            if (!merged) {
                merged = { from, to, type };
                relationByKey.set(key, merged);
            }
            relationChunkCounts.set(key, (relationChunkCounts.get(key) ?? 0) + 1);
            const nextConfidence = parseConfidence(relation.confidence);
            if (nextConfidence !== undefined) {
                const current = parseConfidence(merged.confidence) ?? 0;
                if (nextConfidence > current)
                    merged.confidence = nextConfidence;
            }
        }
    }
    // Blend the model's confidence with how many chunks agreed on the item.
    const blendConsistency = (llmConfidence, chunkCount) => {
        const consistency = totalChunks > 1 ? chunkCount / totalChunks : 1;
        if (llmConfidence === undefined)
            return consistency;
        return (1 - CONSISTENCY_WEIGHT) * llmConfidence + CONSISTENCY_WEIGHT * consistency;
    };
    const relations = [];
    for (const [key, relation] of relationByKey) {
        if (relations.length >= MAX_RELATIONS_PER_ASSET)
            break;
        relation.confidence = blendConsistency(relation.confidence, relationChunkCounts.get(key) ?? 1);
        relations.push(relation);
    }
    const status = entities.length > 0 ? "extracted" : firstFailureReason ? "failed" : "empty";
    const reason = status === "extracted" ? "none" : (firstFailureReason ?? "no_graph_content");
    const mergedConfidence = confidence !== undefined ? blendConsistency(confidence, totalChunks) : totalChunks > 1 ? 1 : undefined;
    return {
        entities,
        relations,
        ...(mergedConfidence !== undefined ? { confidence: mergedConfidence } : {}),
        status,
        reason,
        chunkCount: extractions.length,
        truncationCount,
        filteredGenericEntities,
        filteredInvalidRelations,
        filteredLowConfidenceRelations,
    };
}
71
301
  function parseGraphExtraction(raw) {
72
- const empty = { entities: [], relations: [] };
302
+ const empty = (reason = "no_graph_content") => ({
303
+ entities: [],
304
+ relations: [],
305
+ status: reason === "llm_error" || reason === "invalid_json" || reason === "context_limit" ? "failed" : "empty",
306
+ reason,
307
+ });
73
308
  if (typeof raw !== "object" || raw === null || Array.isArray(raw))
74
- return empty;
309
+ return empty();
75
310
  const item = raw;
311
+ const extractionConfidence = parseConfidence(item.confidence);
76
312
  const entityCanonical = new Map();
313
+ let filteredGenericEntities = 0;
77
314
  if (Array.isArray(item.entities)) {
78
315
  for (const value of item.entities) {
79
316
  if (typeof value !== "string")
@@ -81,6 +318,11 @@ function parseGraphExtraction(raw) {
81
318
  const normalized = normalizeEntityName(value);
82
319
  if (!normalized)
83
320
  continue;
321
+ const normalizedKey = normalized.toLowerCase();
322
+ if (!/[a-z0-9]/i.test(normalized) || GENERIC_ENTITIES.has(normalizedKey)) {
323
+ filteredGenericEntities += 1;
324
+ continue;
325
+ }
84
326
  const key = normalized.toLowerCase();
85
327
  if (!entityCanonical.has(key))
86
328
  entityCanonical.set(key, normalized);
@@ -90,21 +332,37 @@ function parseGraphExtraction(raw) {
90
332
  }
91
333
  const entities = Array.from(entityCanonical.values());
92
334
  const relations = [];
335
+ let filteredInvalidRelations = 0;
336
+ let filteredLowConfidenceRelations = 0;
93
337
  if (Array.isArray(item.relations)) {
94
338
  for (const relation of item.relations) {
95
- if (typeof relation !== "object" || relation === null || Array.isArray(relation))
339
+ if (typeof relation !== "object" || relation === null || Array.isArray(relation)) {
340
+ filteredInvalidRelations += 1;
96
341
  continue;
342
+ }
97
343
  const rel = relation;
98
344
  const fromRaw = typeof rel.from === "string" ? normalizeEntityName(rel.from) : "";
99
345
  const toRaw = typeof rel.to === "string" ? normalizeEntityName(rel.to) : "";
100
- if (!fromRaw || !toRaw)
346
+ if (!fromRaw || !toRaw) {
347
+ filteredInvalidRelations += 1;
101
348
  continue;
349
+ }
102
350
  const from = entityCanonical.get(fromRaw.toLowerCase());
103
351
  const to = entityCanonical.get(toRaw.toLowerCase());
104
- if (!from || !to)
352
+ if (!from || !to || from.toLowerCase() === to.toLowerCase()) {
353
+ filteredInvalidRelations += 1;
105
354
  continue;
355
+ }
106
356
  const type = typeof rel.type === "string" ? normalizeRelationType(rel.type) : undefined;
357
+ if (type !== undefined && GENERIC_RELATION_TYPES.has(type)) {
358
+ filteredInvalidRelations += 1;
359
+ continue;
360
+ }
107
361
  const confidence = parseConfidence(rel.confidence);
362
+ if (confidence !== undefined && confidence < MIN_RELATION_CONFIDENCE) {
363
+ filteredLowConfidenceRelations += 1;
364
+ continue;
365
+ }
108
366
  relations.push({
109
367
  from,
110
368
  to,
@@ -115,10 +373,17 @@ function parseGraphExtraction(raw) {
115
373
  break;
116
374
  }
117
375
  }
118
- const confidence = parseConfidence(item.confidence);
376
+ const confidence = extractionConfidence;
377
+ const status = entities.length > 0 ? "extracted" : "empty";
378
+ const reason = entities.length > 0 ? "none" : filteredGenericEntities > 0 ? "generic_entities_only" : "no_graph_content";
119
379
  return {
120
380
  entities,
121
381
  relations,
382
+ status,
383
+ reason,
384
+ filteredGenericEntities,
385
+ filteredInvalidRelations,
386
+ filteredLowConfidenceRelations,
122
387
  ...(confidence !== undefined ? { confidence } : {}),
123
388
  };
124
389
  }
@@ -161,9 +426,7 @@ function buildBatchSystemPrompt() {
161
426
  }
162
427
  function buildBatchUserPrompt(bodies) {
163
428
  const count = bodies.length;
164
- const assetBlocks = bodies
165
- .map((body, i) => `${BATCH_ASSET_SEPARATOR} ${i + 1} ===\n${body.trim().slice(0, MAX_BODY_CHARS)}`)
166
- .join("\n\n");
429
+ const assetBlocks = bodies.map((body, i) => `${BATCH_ASSET_SEPARATOR} ${i + 1} ===\n${body.trim()}`).join("\n\n");
167
430
  return (`Extract entities and relations from the N=${count} assets below.\n\n` +
168
431
  `Rules:\n` +
169
432
  `- Output ONLY a JSON array of exactly ${count} objects, one per asset, preserving input order.\n` +
@@ -177,6 +440,9 @@ function buildBatchUserPrompt(bodies) {
177
440
  `- The array MUST have exactly ${count} elements — one placeholder per asset even if empty.\n\n` +
178
441
  assetBlocks);
179
442
  }
443
/**
 * Build the optional ", configured contextLength=N" suffix for diagnostics.
 *
 * @param llmConfig - LLM connection config; only `contextLength` is read.
 * @returns The suffix, or an empty string when `contextLength` is falsy.
 */
function formatContextHint(llmConfig) {
    const { contextLength } = llmConfig;
    if (!contextLength)
        return "";
    return `, configured contextLength=${contextLength}`;
}
180
446
  /**
181
447
  * Parse and validate a single item from the batch response array.
182
448
  * Mirrors the validation logic in `extractGraphFromBody`.
@@ -207,14 +473,15 @@ function parseBatchItem(raw) {
207
473
  * @param akmConfig - Full AKM config (for feature-gate checks).
208
474
  * @param onFallback - Optional fallback event sink.
209
475
  */
210
- export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfig, onFallback) {
476
+ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfig, onFallback, options = {}) {
211
477
  const empty = () => ({ entities: [], relations: [] });
478
+ const batchState = normalizeBatchState(options.batchState);
212
479
  // Degenerate case: no bodies → empty array (not an error).
213
480
  if (bodies.length === 0)
214
481
  return [];
215
482
  // Single body: delegate to the single-asset path for identical behaviour.
216
483
  if (bodies.length === 1) {
217
- const result = await extractGraphFromBody(llmConfig, bodies[0] ?? "", signal, akmConfig, onFallback);
484
+ const result = await extractGraphFromBody(llmConfig, bodies[0] ?? "", signal, akmConfig, onFallback, options);
218
485
  return [result];
219
486
  }
220
487
  // Filter out bodies that are empty so we don't waste tokens, but keep
@@ -222,17 +489,37 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
222
489
  const results = bodies.map(empty);
223
490
  const nonEmptyIndices = [];
224
491
  const nonEmptyBodies = [];
492
+ const oversizedIndices = [];
225
493
  for (let i = 0; i < bodies.length; i++) {
226
494
  const trimmed = (bodies[i] ?? "").trim();
227
495
  if (trimmed) {
228
- nonEmptyIndices.push(i);
229
- nonEmptyBodies.push(trimmed);
496
+ if (trimmed.length > MAX_BATCH_BODY_CHARS) {
497
+ oversizedIndices.push(i);
498
+ }
499
+ else {
500
+ nonEmptyIndices.push(i);
501
+ nonEmptyBodies.push(trimmed);
502
+ }
230
503
  }
231
504
  }
505
+ if (oversizedIndices.length > 0) {
506
+ await Promise.all(oversizedIndices.map(async (index) => {
507
+ results[index] = await extractGraphFromBody(llmConfig, bodies[index] ?? "", signal, akmConfig, onFallback, options);
508
+ }));
509
+ }
232
510
  if (nonEmptyBodies.length === 0)
233
511
  return results;
512
+ if (batchState?.batchingDisabled) {
513
+ return Promise.all(bodies.map((body) => extractGraphFromBody(llmConfig, body, signal, akmConfig, onFallback, options)));
514
+ }
234
515
  const systemPrompt = buildBatchSystemPrompt();
235
516
  const userPrompt = buildBatchUserPrompt(nonEmptyBodies);
517
+ const truncatedBodies = nonEmptyBodies.filter((body) => body.length > MAX_BATCH_BODY_CHARS).length;
518
+ if (truncatedBodies > 0) {
519
+ warnVerbose(`graph extraction (batch): ${truncatedBodies}/${nonEmptyBodies.length} asset body/bodies exceed the batch body threshold of ${MAX_BATCH_BODY_CHARS} chars.`);
520
+ }
521
+ let batchContextError = false;
522
+ let nonArrayResponse = false;
236
523
  const batchResult = await tryLlmFeature("graph_extraction", akmConfig, async () => {
237
524
  try {
238
525
  const raw = await chatCompletion(llmConfig, [
@@ -247,13 +534,32 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
247
534
  return null;
248
535
  const parsed = parseEmbeddedJsonResponse(raw);
249
536
  if (!Array.isArray(parsed)) {
250
- warn("graph extraction (batch): LLM response was not a JSON array; will fall back per-asset.");
537
+ nonArrayResponse = true;
538
+ bumpTelemetry(options.telemetry, "nonArrayBatchFailures");
539
+ if (batchState) {
540
+ batchState.nonArrayBatchFailures += 1;
541
+ if (batchState.nonArrayBatchFailures >= NON_ARRAY_BATCH_DISABLE_THRESHOLD) {
542
+ batchState.batchingDisabled = true;
543
+ }
544
+ }
545
+ warn(`graph extraction (batch): LLM response was not a JSON array for ${nonEmptyBodies.length} asset(s); ` +
546
+ `will fall back per-asset. promptChars=${userPrompt.length}${formatContextHint(llmConfig)}`);
251
547
  return null;
252
548
  }
253
549
  return parsed;
254
550
  }
255
551
  catch (err) {
256
- warn(`graph extraction (batch) failed: ${toErrorMessage(err)}`);
552
+ const errMsg = toErrorMessage(err);
553
+ if (isContextSizeError(errMsg)) {
554
+ batchContextError = true;
555
+ bumpTelemetry(options.telemetry, "contextBatchRetries");
556
+ warn(`graph extraction (batch): context size exceeded for ${nonEmptyBodies.length} asset(s); ` +
557
+ `skipping batch. promptChars=${userPrompt.length}${formatContextHint(llmConfig)}`);
558
+ }
559
+ else {
560
+ warn(`graph extraction (batch) failed for ${nonEmptyBodies.length} asset(s); ` +
561
+ `promptChars=${userPrompt.length}${formatContextHint(llmConfig)}: ${errMsg}`);
562
+ }
257
563
  return null;
258
564
  }
259
565
  }, null, {
@@ -262,6 +568,12 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
262
568
  });
263
569
  // Map successful batch results back to their original indices.
264
570
  if (batchResult !== null) {
571
+ if (batchState)
572
+ batchState.nonArrayBatchFailures = 0;
573
+ if (batchResult.length > nonEmptyBodies.length) {
574
+ warn(`graph extraction (batch): response had ${batchResult.length} items for ${nonEmptyBodies.length} assets; ` +
575
+ `ignoring ${batchResult.length - nonEmptyBodies.length} extra item(s).`);
576
+ }
265
577
  for (let j = 0; j < nonEmptyBodies.length; j++) {
266
578
  const originalIndex = nonEmptyIndices[j];
267
579
  if (originalIndex === undefined)
@@ -272,10 +584,26 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
272
584
  // j >= batchResult.length → partial failure; handled below.
273
585
  }
274
586
  }
587
+ if (batchContextError && nonEmptyBodies.length > 1) {
588
+ const splitAt = Math.ceil(nonEmptyBodies.length / 2);
589
+ const left = await extractGraphFromBodies(llmConfig, nonEmptyBodies.slice(0, splitAt), signal, akmConfig, onFallback, options);
590
+ const right = await extractGraphFromBodies(llmConfig, nonEmptyBodies.slice(splitAt), signal, akmConfig, onFallback, options);
591
+ const combined = [...left, ...right];
592
+ for (let j = 0; j < nonEmptyIndices.length; j++) {
593
+ const origIdx = nonEmptyIndices[j];
594
+ if (origIdx === undefined)
595
+ continue;
596
+ results[origIdx] = combined[j] ?? empty();
597
+ }
598
+ return results;
599
+ }
275
600
  // Partial-failure fallback: any non-empty body whose result is still the
276
601
  // empty placeholder (either because batchResult was null or the array was
277
- // shorter than expected) gets an individual retry.
602
+ // shorter than expected) gets an individual retry — unless the batch failed
603
+ // due to context size, in which case individual calls would also fail.
278
604
  const fallbackIndices = nonEmptyIndices.filter((_origIdx, j) => {
605
+ if (batchContextError)
606
+ return false; // skip individual retries on context error
279
607
  // Result is still empty → needs a fallback call.
280
608
  if (batchResult === null)
281
609
  return true;
@@ -291,9 +619,16 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
291
619
  }
292
620
  await Promise.all(fallbackIndices.map(async (origIdx) => {
293
621
  const body = bodies[origIdx] ?? "";
294
- results[origIdx] = await extractGraphFromBody(llmConfig, body, signal, akmConfig, onFallback);
622
+ results[origIdx] = await extractGraphFromBody(llmConfig, body, signal, akmConfig, onFallback, options);
295
623
  }));
296
624
  }
625
+ else if (batchContextError) {
626
+ warn(`graph extraction (batch): skipped ${nonEmptyBodies.length} asset(s) due to context size error; ` +
627
+ `consider increasing llm.contextLength or reducing index.graph.graphExtractionBatchSize to 1.`);
628
+ }
629
+ else if (nonArrayResponse && batchState?.batchingDisabled) {
630
+ warn("graph extraction (batch): disabling batching for the rest of this run after repeated non-array responses.");
631
+ }
297
632
  return results;
298
633
  }
299
634
  /**
@@ -306,12 +641,31 @@ export async function extractGraphFromBodies(llmConfig, bodies, signal, akmConfi
306
641
  * Routes through `tryLlmFeature("graph_extraction", ...)` so the feature gate
307
642
  * and onFallback hook are honoured uniformly (Fix C5).
308
643
  */
309
- export async function extractGraphFromBody(llmConfig, body, signal, akmConfig, onFallback) {
310
- const empty = { entities: [], relations: [] };
644
+ export async function extractGraphFromBody(llmConfig, body, signal, akmConfig, onFallback, options = {}) {
645
+ const empty = (reason, status) => ({
646
+ entities: [],
647
+ relations: [],
648
+ ...(status ? { status } : {}),
649
+ ...(reason ? { reason } : {}),
650
+ });
311
651
  const trimmedBody = body.trim();
312
652
  if (!trimmedBody)
313
- return empty;
314
- const userPrompt = `${USER_PROMPT_PREFIX}${trimmedBody.slice(0, MAX_BODY_CHARS)}`;
653
+ return empty();
654
+ const chunked = splitBodyIntoChunks(trimmedBody, MAX_CHUNK_BODY_CHARS);
655
+ if (chunked.truncationCount > 0) {
656
+ bumpTelemetry(options.telemetry, "truncationCount", chunked.truncationCount);
657
+ warnVerbose(`graph extraction: split a long asset into ${chunked.chunks.length} chunk(s) with ${chunked.truncationCount} hard split(s).`);
658
+ }
659
+ if (chunked.chunks.length > 1) {
660
+ const chunkResults = [];
661
+ for (const chunk of chunked.chunks) {
662
+ chunkResults.push(await extractGraphFromBody(llmConfig, chunk, signal, akmConfig, onFallback, options));
663
+ }
664
+ const merged = mergeGraphExtractions(chunkResults);
665
+ merged.truncationCount = (merged.truncationCount ?? 0) + chunked.truncationCount;
666
+ return merged;
667
+ }
668
+ const userPrompt = `${USER_PROMPT_PREFIX}${trimmedBody}`;
315
669
  return tryLlmFeature("graph_extraction", akmConfig, async () => {
316
670
  try {
317
671
  const raw = await chatCompletion(llmConfig, [
@@ -319,19 +673,36 @@ export async function extractGraphFromBody(llmConfig, body, signal, akmConfig, o
319
673
  { role: "user", content: userPrompt },
320
674
  ], { temperature: 0.1, timeoutMs: llmConfig.timeoutMs, signal });
321
675
  if (!raw)
322
- return empty;
676
+ return empty();
323
677
  const parsed = parseEmbeddedJsonResponse(raw);
324
678
  if (!parsed) {
325
679
  warn("graph extraction: invalid JSON response from LLM; skipping asset.");
326
- return empty;
680
+ bumpTelemetry(options.telemetry, "failureCount");
681
+ return empty("invalid_json", "failed");
327
682
  }
328
- return parseGraphExtraction(parsed);
683
+ const extraction = parseGraphExtraction(parsed);
684
+ bumpTelemetry(options.telemetry, "filteredGenericEntities", extraction.filteredGenericEntities ?? 0);
685
+ bumpTelemetry(options.telemetry, "filteredInvalidRelations", extraction.filteredInvalidRelations ?? 0);
686
+ bumpTelemetry(options.telemetry, "filteredLowConfidenceRelations", extraction.filteredLowConfidenceRelations ?? 0);
687
+ if (extraction.status === "failed")
688
+ bumpTelemetry(options.telemetry, "failureCount");
689
+ return extraction;
329
690
  }
330
691
  catch (err) {
331
- warn(`graph extraction failed: ${toErrorMessage(err)}`);
332
- return empty;
692
+ const errMsg = toErrorMessage(err);
693
+ if (isContextSizeError(errMsg)) {
694
+ bumpTelemetry(options.telemetry, "failureCount");
695
+ warn(`graph extraction: context size exceeded for asset; promptChars=${userPrompt.length}${formatContextHint(llmConfig)}. ` +
696
+ `Consider increasing llm.contextLength in config.json.`);
697
+ return empty("context_limit", "failed");
698
+ }
699
+ else {
700
+ bumpTelemetry(options.telemetry, "failureCount");
701
+ warn(`graph extraction failed for asset; promptChars=${userPrompt.length}${formatContextHint(llmConfig)}: ${errMsg}`);
702
+ return empty("llm_error", "failed");
703
+ }
333
704
  }
334
- }, empty, {
705
+ }, empty(), {
335
706
  timeoutMs: llmConfig.timeoutMs,
336
707
  onFallback,
337
708
  });