npm - @c4a/server-cli - Versions diffs - 0.4.15-alpha.4 → 0.4.15-alpha.5 - Mend

@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

package/README.md +10 -5
package/index.js +954 -85
package/package.json +1 -1
package/serve.js +2016 -216
package/web/assets/ContentDetail--oZBzWh0.js +1 -0
package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
package/web/assets/ContentDetail-D-2xyerw.js +1 -0
package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
package/web/assets/ContentDetail-y0yi2qln.js +1 -0
package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
package/web/assets/EntityDetail-BI3etmj4.js +1 -0
package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
package/web/assets/EntityDetail-DiJPemDY.js +1 -0
package/web/assets/EntityDetail-DihnDvhA.js +1 -0
package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
package/web/assets/RelationDetail-B2gHrceI.js +1 -0
package/web/assets/RelationDetail-CEq9vopD.js +1 -0
package/web/assets/RelationDetail-CaYrspaS.js +1 -0
package/web/assets/RelationDetail-CpoGdy25.js +1 -0
package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
package/web/assets/index-BPMqeFze.js +111 -0
package/web/assets/index-BgRuvBL5.js +111 -0
package/web/assets/index-CcrkBEZl.js +111 -0
package/web/assets/index-DGDx8sCs.js +111 -0
package/web/assets/index-DIyAwnqE.js +111 -0
package/web/assets/index-DW1cCA8v.js +111 -0
package/web/assets/index-DiAYi5t8.css +1 -0
package/web/assets/index-FOCWvgW_.css +1 -0
package/web/assets/index-daOjyLzy.css +1 -0
package/web/assets/index-moF8uSEi.js +111 -0
package/web/assets/index-sPNyENFN.js +111 -0
package/web/assets/index-uGqDxUnx.css +1 -0
package/web/index.html +2 -2

package/serve.js CHANGED Viewed

@@ -281,6 +281,10 @@ var init_serverConfig = __esm(() => {
         default_model: "gemini-3-pro-preview"
       }
     },
+    indexing: {
+      task_timeout_ms: 150 * 60 * 1000,
+      file_timeout_ms: 15 * 60 * 1000
+    },
     embedding: {
       provider: "huggingface",
       huggingface: {
@@ -4340,7 +4344,7 @@ var init_atomsSchema = __esm(() => {
   init_zod();
   init_base();
   init_baseSchema();
-  confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
+  confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
   entityAtomSchema = exports_external.object({
     name: exports_external.string(),
     kind: kindSchema.optional().catch(undefined),
@@ -186001,6 +186005,10 @@ function mergeServerConfig(parsed) {
         ...isPlainObject3(input.llm?.google) ? input.llm?.google : {}
       }
     },
+    indexing: {
+      ...defaults2.indexing,
+      ...isPlainObject3(input.indexing) ? input.indexing : {}
+    },
     embedding: {
       ...defaults2.embedding,
       ...isPlainObject3(input.embedding) ? input.embedding : {},
@@ -194956,14 +194964,21 @@ function isRetryableStatus(status) {
 function isAuthStatus(status) {
   return status === 401 || status === 403;
 }
-function isBadRequest(status) {
-  return status === 400;
+function throwLlmError(error40, status) { // Always throws: converts an LLM failure into a C4AError and never returns normally.
+  const detail = toErrorMessage(error40); // toErrorMessage is defined outside this view; its result is reused as message text and as the third argument
+  const statusTag = status ? ` [HTTP ${status}]` : ""; // the tag is omitted when status is falsy (undefined, null or 0)
+  if (isAuthStatus(status)) { // 401 and 403 are reported with the auth-failure code
+    throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
+  }
+  throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail); // every other status, or no status at all
 }
 class LlmServiceImpl {
   options;
+  supportsTemperature;
   constructor(options) {
     this.options = options;
+    this.supportsTemperature = options.provider !== "openai";
   }
   async generateText(prompt, options) {
     if (this.options.forceStream) {
@@ -194975,7 +194990,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0
       };
       if (options?.systemPrompt) {
@@ -195012,13 +195027,7 @@ class LlmServiceImpl {
         durationMs,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
   async generateTextViaStream(prompt, options) {
@@ -195028,7 +195037,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0
       };
       if (options?.systemPrompt) {
@@ -195066,13 +195075,7 @@ class LlmServiceImpl {
         durationMs: Date.now() - startedAt,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
   streamText(prompt, options) {
@@ -195095,7 +195098,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0,
         onFinish: (event) => {
           const finishEvent = event;
@@ -195141,13 +195144,7 @@ class LlmServiceImpl {
         durationMs: Date.now() - startedAt,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
 }
@@ -195858,7 +195855,12 @@ function parseExtractionOutput(raw5, schema2) {
       return { success: false, error: new Error("Empty output") };
     }
     const protocolParsed = tryParseProtocol(trimmed);
-    const parsed = protocolParsed ?? tryParseJson(trimmed);
+    let parsed = protocolParsed ?? tryParseJson(trimmed);
+    if (Array.isArray(parsed)) {
+      parsed = { paragraphs: parsed };
+    }
+    parsed = normalizeFlatOutput(parsed);
+    parsed = stripNulls(parsed);
     const result = schema2.safeParse(parsed);
     if (!result.success) {
       return { success: false, error: result.error };
@@ -195928,6 +195930,37 @@ function tryParseJson(raw5) {
 function repairAndParse(raw5) { // Passes raw5 through jsonrepair (defined outside this view) and parses the result.
   return JSON.parse(jsonrepair(raw5)); // JSON.parse throws SyntaxError if the repaired text is still not valid JSON
 }
+var PARAGRAPH_TAG_RE = /^P\d+$/; // matches a whole key made of P followed by one or more digits, e.g. P0 or P12
+function normalizeFlatOutput(parsed) { // Converts a flat object keyed by paragraph tags into { paragraphs: [{ tag, atoms }] }; any other input is returned unchanged.
+  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) // falsy values, non-objects and arrays pass through
+    return parsed;
+  const obj = parsed;
+  if ("paragraphs" in obj) // already in the paragraphs shape
+    return parsed;
+  const keys = Object.keys(obj);
+  if (keys.length === 0) // an empty object becomes an empty paragraph list
+    return { paragraphs: [] };
+  const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
+  if (!allTags) // a single key that is not a paragraph tag leaves the input untouched
+    return parsed;
+  const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: obj[tag2] })); // ordered by the numeric part of the tag; the regex guarantees digits only, so parseInt without a radix reads base 10
+  return { paragraphs };
+}
+function stripNulls(value) { // Returns a copy of value with null-valued object properties removed, recursing through arrays and objects.
+  if (value === null) // a top-level null becomes undefined
+    return;
+  if (Array.isArray(value)) // array length is preserved: null elements turn into undefined rather than being dropped
+    return value.map(stripNulls);
+  if (typeof value === "object" && value !== null) { // the null test is redundant here because null was handled above
+    const out2 = {}; // always a fresh plain object built from own enumerable properties
+    for (const [k, v] of Object.entries(value)) {
+      if (v !== null) // null properties are omitted entirely
+        out2[k] = stripNulls(v);
+    }
+    return out2;
+  }
+  return value; // primitives, undefined and functions are returned as is
+}
 function isRecord(value) { // True for a non-null object exposing key and value properties (own or inherited) whose key is a string.
   return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
 }
@@ -196220,30 +196253,32 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
 ${ATOM_TYPES_BLOCK}
 ## Output Format
-Return a single JSON object with this structure:
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
 {
-  "paragraphs": [
-    {
-      "tag": "P0",
-      "atoms": {
-        "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
-        "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }],
-        "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
-      }
-    }
-  ]
+  "P0": {
+    "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
+    "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
+  },
+  "P3": {
+    "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
+  }
 }
 ## Rules
-- Each paragraph tag (P0, P1, ...) corresponds to the tagged paragraph in the input.
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
 - Only include atom types that are actually found in a paragraph (all types are optional).
 - Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
 - **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
 - Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
 - **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
+- **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
 - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
 - JSON structure keys (tag, atom type names, field names) must always be in English.
 - Be thorough: extract ALL relevant atoms from each paragraph.
+- **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
+- **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
+- **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
 - Do NOT include "claims" — they are system-generated and not part of document extraction.`;
 function buildDocAtomAnnotationPrompt(chunkText) {
   return `Extract all semantic atoms from the following document text.
@@ -196255,6 +196290,13 @@ ${chunkText}
 Return ONLY a valid JSON object. No markdown fences, no explanation.`;
 }
+function toFlatFormat(result) { // Converts { paragraphs: [{ tag, atoms }] } into a plain object keyed by paragraph tag.
+  const flat = {};
+  for (const p4 of result.paragraphs) { // throws if result.paragraphs is missing or not iterable
+    flat[p4.tag] = p4.atoms; // a repeated tag overwrites the earlier entry; atoms are shared by reference, not copied
+  }
+  return flat;
+}
 function buildDocGleaningPrompt(chunkText, previousResult) {
   return `Review the following document text and the previously extracted atoms.
 Check for any MISSING atoms that were not captured in the first pass.
@@ -196263,66 +196305,483 @@ Check for any MISSING atoms that were not captured in the first pass.
 ${chunkText}
 ## Previously Extracted Atoms
-${JSON.stringify(previousResult, null, 2)}
+${JSON.stringify(toFlatFormat(previousResult), null, 2)}
 ## Instructions
-- If you find missing atoms, output them in the same JSON format (with paragraph tags).
+- If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
 - Only include NEW atoms not already in the previous extraction.
 - Every atom MUST include a "confidence" field (0.0-1.0).
-- If nothing is missing, return: {"paragraphs": []}
+- **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
+- If nothing is missing, return: {}
 - Respond in the same language as the input text.
 Return ONLY a valid JSON object. No markdown fences, no explanation.`;
 }
 var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
+// ../llm/src/prompts/entityResolution.ts
+var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
+## Task 1: Merge Duplicates
+- Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
+- Prefer the LONGER, more descriptive name as the canonical name
+- Do NOT merge names that share a substring but refer to different things
+- When uncertain, do NOT merge — add to "ambiguous" instead
+- Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
+- Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
+## Task 2: Remove Noise
+- Remove names that are NOT meaningful named entities — they are generic words, actions, or descriptions
+- Examples of noise: common verbs/nouns (登录, 路由, 直连), generic technical terms (Env, query), action descriptions (Kill 3001 进程)
+- Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台)
+- When uncertain, KEEP the name — only remove if clearly not a named entity
+## Output
+Valid JSON only. No markdown fences, no explanation.`;
+function buildEntityResolutionPrompt(input) { // Builds the entity-resolution prompt text: numbered name list, optional review sections, then the output-format instructions.
+  const parts = []; // sections are collected here and joined with newlines at the end
+  parts.push(`## All Entity Names (${input.allNames.length} total)`);
+  parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
+`));
+  if (input.candidates.length > 0) { // input.candidates is required (its length is read unguarded); the section is skipped when empty
+    parts.push("");
+    parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
+    parts.push("Review each pair and decide whether to merge:");
+    for (const c of input.candidates) {
+      parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
+    }
+  }
+  if (input.noiseCandidates && input.noiseCandidates.length > 0) { // noiseCandidates is optional
+    parts.push("");
+    parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
+    parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
+    for (const n of input.noiseCandidates) {
+      parts.push(`- "${n}"`);
+    }
+  }
+  if (input.contextSnippets && input.contextSnippets.length > 0) { // contextSnippets is optional
+    parts.push("");
+    parts.push("## Context Snippets");
+    for (const s of input.contextSnippets) {
+      parts.push(`- **${s.name}**: ${s.snippet}`);
+    }
+  }
+  parts.push(""); // blank line before the output-format block
+  parts.push(`## Output Format
+Return a JSON object:
+{
+  "merges": [
+    { "from": "alias name", "to": "canonical name" }
+  ],
+  "remove": ["noise_name_1", "noise_name_2"],
+  "ambiguous": ["name1", "name2"]
+}
+- "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
+- "remove": names confirmed as noise. They will be deleted from entity list.
+- "ambiguous": names you're unsure about (optional, for logging).
+Return ONLY valid JSON. No markdown fences, no explanation.`);
+  return parts.join(`
+`);
+}
+// ../llm/src/prompts/docTableAnnotation.ts
+init_src();
+var entityFields = zodObjectToPromptFields(entityAtomSchema); // zodObjectToPromptFields is defined outside this view; each result below is interpolated into TABLE_SYSTEM_PROMPT as a schema description
+var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
+var relationFields = zodObjectToPromptFields(relationAtomSchema);
+var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
+var metricFields = zodObjectToPromptFields(metricAtomSchema);
+var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
+var eventFields = zodObjectToPromptFields(eventAtomSchema);
+var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
+var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
+var stateFields = zodObjectToPromptFields(stateAtomSchema);
+var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
+var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
+Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
+## Step 1: Classify the Table
+Determine the table type by examining the relationship between rows:
+### Type A: Collection / Record Table
+**Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
+- Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
+- Key signal: removing one row does not affect the meaning of other rows
+### Type B: Single-Object Property Table
+**Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
+- Examples: API field definitions, configuration schema, entity attribute lists
+- Key signal: all rows refer to the same parent entity
+### Type C: Comparison / Evaluation Table
+**Rows or columns represent different subjects being compared** across the same dimensions.
+- Examples: technology selection, vendor evaluation, feature comparison
+- Key signal: multiple named subjects evaluated on shared criteria
+### Type D: Matrix / Cross-Reference Table
+**Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
+- Examples: permission matrices (role × operation), compatibility matrices, dependency tables
+- Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
+### Type E: Metrics / KPI Table
+**Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
+- Examples: SLA tables, performance baselines, capacity planning tables
+- Key signal: columns include target/threshold/unit/SLA-style values
+### Type F: Timeline / Process Table
+**Rows represent ordered steps or phases** in a sequence.
+- Examples: deployment steps, approval workflows, version changelog, migration plans
+- Key signal: rows have implicit ordering, may have phase/step/date columns
+## Step 2: Extract Atoms by Table Type
+### Type A → Single attribute with row-object array
+1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
+   Entity schema: ${entityFields}
+2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
+   Attribute schema: ${attributeFields}
+   Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
+3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
+4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
+   State schema: ${stateFields}
+   Rule schema: ${ruleFields}
+### Type B → Multiple attribute atoms
+1. Create ONE entity for the parent structure.
+   Entity schema: ${entityFields}
+2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
+   Attribute schema: ${attributeFields}
+3. Extract constraints from "required" or "validation" columns.
+   Constraint schema: ${constraintFields}
+### Type C → Comparison atom
+1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
+   Comparison schema: ${comparisonFields}
+2. Extract "decisions" atoms if the table leads to a conclusion.
+### Type D → Relations or table attribute
+1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
+   Relation schema: ${relationFields}
+   Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
+2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
+3. Create entities for both row headers and column headers if they are named concepts.
+### Type E → Metrics atoms
+1. Create one "metrics" atom per row.
+   Metric schema: ${metricFields}
+2. Also create the parent entity if named (e.g., "SLA Requirements").
+### Type F → Behaviors/Events/Transitions
+1. Create one "behaviors" atom per step/phase.
+   Behavior schema: ${behaviorFields}
+2. If there are triggers: extract "events" atoms.
+   Event schema: ${eventFields}
+3. If there are state changes: extract "transitions" atoms.
+   Transition schema: ${transitionFields}
+4. Create the parent entity for the process/workflow.
+## Output Format
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
+{
+  "P0": {
+    "tableType": "A",
+    "entities": [...],
+    "attributes": [...]
+  },
+  "P3": {
+    "tableType": "C",
+    "comparisons": [...]
+  }
+}
+## Rules
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
+- Every atom MUST include a "confidence" field (0.0-1.0).
+- The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
+- Only include atom types that are actually extracted.
+- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
+- JSON structure keys must always be in English.
+- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
+- Do NOT include "claims" — they are system-generated.`;
+function buildDocTableAnnotationPrompt(tableText) { // Returns the table-extraction prompt text with tableText interpolated verbatim between two --- separator lines.
+  return `Classify and extract atoms from the following table paragraphs.
+Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.
+---
+${tableText}
+---
+Return ONLY a valid JSON object. No markdown fences, no explanation.`;
+}
+var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
+// ../llm/src/prompts/docDiagramAnnotation.ts
+init_src();
+var entityFields2 = zodObjectToPromptFields(entityAtomSchema); // zodObjectToPromptFields is defined outside this view; each result below is interpolated into DIAGRAM_SYSTEM_PROMPT as a schema description
+var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
+var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
+var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
+var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
+var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
+var roleFields = zodObjectToPromptFields(roleAtomSchema);
+var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
+var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
+var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
+var DIAGRAM_FENCE_TAGS = [ // fence info strings recognised as text-based diagrams; DIAGRAM_FENCE_REGEX below matches a string that is exactly three backticks, one of these tags and optional trailing whitespace, ignoring case
+  "mermaid",
+  "plantuml",
+  "puml",
+  "dot",
+  "graphviz",
+  "viz",
+  "d2",
+  "c4plantuml",
+  "ditaa",
+  "nomnoml",
+  "wavedrom",
+  "vega",
+  "vega-lite"
+];
+var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
+var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
+Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
+## Step 1: Identify the Diagram Format and Type
+### Formats
+- **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
+- **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
+- **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
+- **D2**: modern declarative diagrams with shape/connection syntax
+- **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
+### Diagram Types (by semantic content)
+- **Flowchart / Process**: decision trees, algorithms, business process flows
+- **Sequence**: interaction between participants over time (API calls, protocols)
+- **State Machine**: states and transitions triggered by events/guards
+- **Class / ER**: data models, entity relationships, inheritance hierarchies
+- **Architecture**: system components, containers, deployment topology
+- **Gantt / Timeline**: project schedules, milestones, phases
+- **Pie / Data Viz**: statistical distributions, metrics visualization
+## Step 2: Extract Atoms by Diagram Type
+### Flowchart / Process → entities + relations + behaviors + decisions
+1. Extract each node as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
+   Relation schema: ${relationFields2}
+3. Extract action nodes as behaviors (what the process does at each step).
+   Behavior schema: ${behaviorFields2}
+4. Extract diamond/decision nodes as decisions.
+   Decision schema: ${decisionFields}
+### Sequence → entities + relations + behaviors + events
+1. Extract each participant/actor as an entity (or role if it's a person/team).
+   Entity schema: ${entityFields2}
+   Role schema: ${roleFields}
+2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
+   Relation schema: ${relationFields2}
+3. Extract significant interactions as behaviors.
+   Behavior schema: ${behaviorFields2}
+4. Extract triggers, responses, and async messages as events.
+   Event schema: ${eventFields2}
+### State Machine → entities + states + transitions + events
+1. Extract the state machine subject as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each state as a state atom.
+   State schema: ${stateFields2}
+3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
+   Transition schema: ${transitionFields2}
+4. Extract triggers as events.
+   Event schema: ${eventFields2}
+### Class / ER → entities + attributes + relations
+1. Extract each class/entity as an entity.
+   Entity schema: ${entityFields2}
+2. Extract fields/properties as attributes.
+   Attribute schema: ${attributeFields2}
+3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
+   Relation schema: ${relationFields2}
+### Architecture → entities + relations + constraints
+1. Extract each system/service/container/component as an entity.
+   Entity schema: ${entityFields2}
+2. Extract connections between components as relations.
+   Relation schema: ${relationFields2}
+3. Extract deployment constraints, technology choices.
+   Constraint schema: ${constraintFields2}
+### Gantt / Timeline → behaviors + events + constraints
+1. Extract each task/phase as a behavior.
+   Behavior schema: ${behaviorFields2}
+2. Extract milestones and deadlines as events.
+   Event schema: ${eventFields2}
+3. Extract dependencies and critical path constraints.
+   Constraint schema: ${constraintFields2}
+### Pie / Data Viz → attributes (summary only)
+1. Extract the chart title as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
+   Attribute schema: ${attributeFields2}
+## Additional Extraction: Diagram Description
+For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
+- \`name\`: "diagram_description"
+- \`type\`: "description"
+- \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
+This description is critical for downstream AI consumers who cannot render the diagram.
+## Output Format
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
+{
+  "P0": {
+    "diagramFormat": "mermaid",
+    "diagramType": "sequence",
+    "entities": [...],
+    "relations": [...]
+  }
+}
+## Rules
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
+- Every atom MUST include a "confidence" field (0.0-1.0).
+- The "diagramFormat" and "diagramType" fields are required for each paragraph.
+- Only include atom types that are actually extracted.
+- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
+- JSON structure keys must always be in English.
+- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
+- **Extract ALL nodes and edges** — do not sample or skip.
+- Do NOT include "claims" — they are system-generated.`;
+function buildDocDiagramAnnotationPrompt(diagramText) { // Returns the diagram-extraction prompt text with diagramText interpolated verbatim between two --- separator lines.
+  return `Analyze and extract atoms from the following diagram paragraphs.
+Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.
+---
+${diagramText}
+---
+Return ONLY a valid JSON object. No markdown fences, no explanation.`;
+}
+var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
 // ../llm/src/chunking/markdownChunker.ts
+// Fallback token budget per chunk when chunkMarkdown is called without options.maxTokens.
 var DEFAULT_MAX_TOKENS2 = 4000;
+// Fallback token budget per coarse paragraph when options.paragraphMaxTokens is not given.
+var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
+/**
+ * Rough token estimate: one token per four UTF-16 code units of `text2`, rounded up.
+ * NOTE(review): this is a pure length heuristic; it presumably underestimates for CJK text — confirm
+ * whether that matters for the budgets derived from it.
+ */
 function estimateTokens(text2) {
   return Math.ceil(text2.length / 4);
 }
+/**
+ * Locate fenced code blocks (``` or ~~~) in markdown text.
+ *
+ * Only fences that start at column 0 of a line are recognised. A block is closed by a
+ * fence of the same character, at least as long as the opening fence, and with nothing
+ * but whitespace after it — a line such as "```js" inside an open block is content (or
+ * a nested opener), not a closer. This matches the closing rule used by handleCodeFence
+ * and mergeAtomicBlocks. An unclosed block extends to the end of the content.
+ *
+ * @param {string} content - Markdown source.
+ * @returns {{start: number, end: number}[]} Half-open character ranges, in document order.
+ */
+function findCodeBlockRanges(content) {
+  const ranges = [];
+  // Group 1: the fence run; group 2: the rest of the line (info string or trailing text).
+  const fenceRe = /^(`{3,}|~{3,})(.*)$/gm;
+  let openStart = -1;
+  let openFence = "";
+  let match2;
+  while ((match2 = fenceRe.exec(content)) !== null) {
+    const fence = match2[1];
+    if (openStart === -1) {
+      openStart = match2.index;
+      openFence = fence;
+      continue;
+    }
+    const sameKind = fence[0] === openFence[0] && fence.length >= openFence.length;
+    const bareFence = match2[2].trim() === "";
+    if (sameKind && bareFence) {
+      // The range ends right after the fence characters themselves.
+      ranges.push({ start: openStart, end: match2.index + fence.length });
+      openStart = -1;
+      openFence = "";
+    }
+  }
+  if (openStart !== -1) {
+    ranges.push({ start: openStart, end: content.length });
+  }
+  return ranges;
+}
+/**
+ * Report whether character offset `pos` falls inside any of the given ranges.
+ *
+ * The scan stops at the first range that starts after `pos`, so `ranges` is expected
+ * to be ordered by `start` (as findCodeBlockRanges produces them).
+ *
+ * @param {number} pos - Character offset to test.
+ * @param {{start: number, end: number}[]} ranges - Half-open [start, end) ranges.
+ * @returns {boolean} True when start <= pos < end for some range.
+ */
+function isInsideCodeBlock(pos, ranges) {
+  for (let k = 0; k < ranges.length; k++) {
+    const { start, end } = ranges[k];
+    if (start > pos) {
+      return false;
+    }
+    if (pos < end) {
+      return true;
+    }
+  }
+  return false;
+}
+/**
+ * Find the headings in `content` and hand them to buildSectionsFromMatches.
+ *
+ * Two heading syntaxes are collected: ATX ("# Title") and setext (a text line underlined
+ * with "==" or "--"). Candidates located inside fenced code blocks are ignored.
+ * Each match records `index` (start of the heading), `endIndex` (end of the heading
+ * syntax), `level` and `heading` text.
+ */
 function parseSections(content) {
-  const headingRe = /^(#{1,6})\s+(.*)$/gm;
-  const sections = [];
+  const codeRanges = findCodeBlockRanges(content);
   const matches = [];
-  let match2;
-  while ((match2 = headingRe.exec(content)) !== null) {
-    matches.push({
-      index: match2.index,
-      level: match2[1].length,
-      heading: match2[2].trim()
-    });
+  // Pass 1: ATX headings (1-6 '#' followed by whitespace) outside code blocks.
+  const atxRe = /^(#{1,6})\s+(.*)$/gm;
+  let m;
+  while ((m = atxRe.exec(content)) !== null) {
+    if (!isInsideCodeBlock(m.index, codeRanges)) {
+      matches.push({
+        index: m.index,
+        endIndex: m.index + m[0].length,
+        level: m[1].length,
+        heading: m[2].trim()
+      });
+    }
+  }
+  // Pass 2: setext headings. `offset` is the character offset of the start of line i
+  // (lines are assumed to be separated by a single "\n").
+  const lines = content.split(`
+`);
+  let offset = 0;
+  for (let i = 0;i < lines.length; i++) {
+    const line = lines[i];
+    if (i > 0) {
+      const prevLine = lines[i - 1].trim();
+      const prevLineStart = offset - lines[i - 1].length - 1;
+      // Only the start of the text line is tested against the code-block ranges.
+      if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
+        if (/^={2,}\s*$/.test(line)) {
+          matches.push({
+            index: prevLineStart < 0 ? 0 : prevLineStart,
+            endIndex: offset + line.length,
+            level: 1,
+            heading: prevLine
+          });
+        // A dash underline does not count when the line above is itself a run of 3+ dashes.
+        } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
+          matches.push({
+            index: prevLineStart < 0 ? 0 : prevLineStart,
+            endIndex: offset + line.length,
+            level: 2,
+            heading: prevLine
+          });
+        }
+      }
+    }
+    offset += line.length + 1;
+  }
+  // Order by position, then drop any match that starts before the previous kept match ends
+  // (e.g. an ATX heading line that is also followed by a dash underline).
+  matches.sort((a, b) => a.index - b.index);
+  const deduped = [];
+  for (const match2 of matches) {
+    const last = deduped[deduped.length - 1];
+    if (last && match2.index < last.endIndex)
+      continue;
+    deduped.push(match2);
   }
+  return buildSectionsFromMatches(content, deduped);
+}
+function buildSectionsFromMatches(content, matches) {
+  const sections = [];
   if (matches.length === 0) {
     const body2 = content.trim();
     if (body2) {
-      sections.push({
-        heading: "",
-        level: 0,
-        body: body2,
-        paragraphs: splitParagraphs(body2)
-      });
+      sections.push({ heading: "", level: 0, body: body2, paragraphs: splitParagraphs(body2) });
     }
     return sections;
   }
   if (matches[0].index > 0) {
     const preBody = content.slice(0, matches[0].index).trim();
     if (preBody) {
-      sections.push({
-        heading: "",
-        level: 0,
-        body: preBody,
-        paragraphs: splitParagraphs(preBody)
-      });
+      sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
     }
   }
   for (let i = 0;i < matches.length; i++) {
     const m = matches[i];
-    const start2 = m.index;
-    const end = i + 1 < matches.length ? matches[i + 1].index : content.length;
-    const fullText = content.slice(start2, end).trim();
-    const headingLineEnd = fullText.indexOf(`
-`);
-    const body2 = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
+    const bodyStart = m.endIndex;
+    const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
+    const body2 = content.slice(bodyStart, bodyEnd).trim();
     sections.push({
       heading: m.heading,
       level: m.level,
@@ -196337,6 +196796,128 @@ function splitParagraphs(text2) {
     return [];
   return text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
 }
+/**
+ * Split text that exceeds `maxTokens` into smaller pieces.
+ *
+ * Strategy, in order:
+ *   1. If the text has several blank-line-separated parts, pack those parts greedily;
+ *      a part that is itself too large is split recursively.
+ *   2. Otherwise, if it has several lines, pack the blocks returned by mergeAtomicBlocks;
+ *      a block that is itself too large is emitted whole.
+ *   3. Otherwise fall back to forceBreakText.
+ * Packed units are joined with a single "\n".
+ *
+ * @param {string} text2 - Text to split.
+ * @param {number} maxTokens - Token budget per piece (measured with estimateTokens).
+ * @returns {string[]} The resulting pieces, in order.
+ */
+function splitOversizedText(text2, maxTokens) {
+  // Greedy packer shared by both strategies; `expandOversized` decides what an
+  // over-budget unit turns into.
+  const pack = (units, expandOversized) => {
+    const packed = [];
+    let buffer = "";
+    let bufferTokens = 0;
+    const flush = () => {
+      if (buffer) {
+        packed.push(buffer);
+        buffer = "";
+        bufferTokens = 0;
+      }
+    };
+    for (const unit of units) {
+      const unitTokens = estimateTokens(unit);
+      if (unitTokens > maxTokens) {
+        flush();
+        packed.push(...expandOversized(unit));
+        continue;
+      }
+      if (buffer && bufferTokens + unitTokens > maxTokens) {
+        flush();
+      }
+      buffer = buffer ? buffer + "\n" + unit : unit;
+      bufferTokens += unitTokens;
+    }
+    flush();
+    return packed;
+  };
+  const paragraphParts = text2.split(/\n\n+/).map((part) => part.trim()).filter(Boolean);
+  if (paragraphParts.length > 1) {
+    return pack(paragraphParts, (part) => splitOversizedText(part, maxTokens));
+  }
+  const lines = text2.split("\n");
+  if (lines.length > 1) {
+    return pack(mergeAtomicBlocks(lines), (block) => [block]);
+  }
+  return forceBreakText(text2, maxTokens);
+}
+/**
+ * Group lines so that fenced code blocks and pipe tables stay in one piece.
+ *
+ * - A line whose trimmed form starts with a ``` / ~~~ fence begins a block that runs
+ *   through the first later line consisting only of a same-character fence at least as
+ *   long (or to the end of input when no such line exists).
+ * - Consecutive lines whose trimmed form starts with "|" form one table block.
+ * - Every other line is its own block.
+ *
+ * @param {string[]} lines - Input lines.
+ * @returns {string[]} Blocks, each the original lines joined with "\n".
+ */
+function mergeAtomicBlocks(lines) {
+  const blocks = [];
+  let cursor = 0;
+  while (cursor < lines.length) {
+    const first = lines[cursor];
+    const head = first.trimStart();
+    const opening = head.match(/^(`{3,}|~{3,})/);
+    if (opening) {
+      const marker = opening[1];
+      const closingRe = new RegExp("^" + marker[0] + "{" + marker.length + ",}\\s*$");
+      let end = cursor + 1;
+      while (end < lines.length && !closingRe.test(lines[end].trimStart())) {
+        end++;
+      }
+      // Include the closing fence line when one was found.
+      const stop = Math.min(end + 1, lines.length);
+      blocks.push(lines.slice(cursor, stop).join("\n"));
+      cursor = stop;
+      continue;
+    }
+    if (head.startsWith("|")) {
+      let end = cursor + 1;
+      while (end < lines.length && lines[end].trimStart().startsWith("|")) {
+        end++;
+      }
+      blocks.push(lines.slice(cursor, end).join("\n"));
+      cursor = end;
+      continue;
+    }
+    blocks.push(first);
+    cursor++;
+  }
+  return blocks;
+}
+/**
+ * Hard-split a text into pieces of at most `maxTokens * 4` characters.
+ *
+ * A piece is cut at the last space before the limit when that space lies in the final
+ * 30% of the window; otherwise it is cut exactly at the limit. Pieces are trimmed and
+ * empty pieces are dropped.
+ *
+ * @param {string} text2 - Text to split.
+ * @param {number} maxTokens - Token budget per piece (4 characters per token).
+ * @returns {string[]} Non-empty pieces, in order.
+ */
+function forceBreakText(text2, maxTokens) {
+  const requestedChars = maxTokens * 4;
+  // A window shorter than one character would never consume input and loop forever.
+  const maxChars = requestedChars < 1 ? 1 : requestedChars;
+  const results = [];
+  let remaining = text2;
+  while (remaining.length > maxChars) {
+    let breakAt = maxChars;
+    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
+    if (spaceIdx > maxChars * 0.7) {
+      breakAt = spaceIdx;
+    }
+    const piece = remaining.slice(0, breakAt).trim();
+    // A window made only of whitespace trims to "", which is not a useful piece.
+    if (piece) {
+      results.push(piece);
+    }
+    remaining = remaining.slice(breakAt).trim();
+  }
+  if (remaining)
+    results.push(remaining);
+  return results;
+}
 function buildBreadcrumb(sections, sectionIndex) {
   const current = sections[sectionIndex];
   if (current.level <= 0)
@@ -196365,11 +196946,53 @@ function sectionHeadingLine(section) {
     return "";
   return `${"#".repeat(section.level)} ${section.heading}`;
 }
+/**
+ * Turn section bodies into a flat, globally numbered list of coarse paragraphs.
+ *
+ * Each non-blank section body becomes one entry, or several entries (via
+ * splitOversizedText) when it exceeds `paragraphMaxTokens`. Adjacent entries that are
+ * both under 150 tokens are then merged while the combined size stays within the
+ * budget; a merged entry keeps the sectionIndex of its first member.
+ *
+ * @param {{body: string}[]} sections - Parsed sections.
+ * @param {number} paragraphMaxTokens - Token budget per paragraph.
+ * @returns {{sectionIndex: number, paragraphIndex: number, text: string}[]}
+ */
+function buildCoarseParagraphs(sections, paragraphMaxTokens) {
+  const MERGE_THRESHOLD = 150;
+  const rawEntries = [];
+  for (const [sectionIndex, section] of sections.entries()) {
+    if (!section.body.trim()) {
+      continue;
+    }
+    const bodyTokens = estimateTokens(section.body);
+    if (bodyTokens > paragraphMaxTokens) {
+      for (const text of splitOversizedText(section.body, paragraphMaxTokens)) {
+        rawEntries.push({ sectionIndex, text, tokens: estimateTokens(text) });
+      }
+    } else {
+      rawEntries.push({ sectionIndex, text: section.body, tokens: bodyTokens });
+    }
+  }
+  const merged = [];
+  for (const entry of rawEntries) {
+    const tail = merged[merged.length - 1];
+    const mergeable = Boolean(tail) && tail.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD && tail.tokens + entry.tokens <= paragraphMaxTokens;
+    if (!mergeable) {
+      // Copy so later merges never mutate the raw entry.
+      merged.push({ ...entry });
+      continue;
+    }
+    tail.text = tail.text + "\n" + entry.text;
+    tail.tokens += entry.tokens;
+  }
+  return merged.map((entry, paragraphIndex) => ({
+    sectionIndex: entry.sectionIndex,
+    paragraphIndex,
+    text: entry.text
+  }));
+}
 function chunkMarkdown(content, options = {}) {
   const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
+  const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
   const sections = parseSections(content);
   if (sections.length === 0)
     return [];
+  const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
   const chunks = [];
   let pendingSections = [];
   let pendingTokens = 0;
@@ -196387,14 +197010,16 @@ function chunkMarkdown(content, options = {}) {
       const heading = sectionHeadingLine(entry.section);
       if (heading)
         textParts.push(heading);
-      for (let pIdx = 0;pIdx < entry.section.paragraphs.length; pIdx++) {
-        const pText = entry.section.paragraphs[pIdx];
-        textParts.push(pText);
-        paragraphs.push({
-          sectionIndex: entry.sectionIndex,
-          paragraphIndex: pIdx,
-          text: pText
-        });
+      const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === entry.sectionIndex);
+      for (const p4 of sectionParas) {
+        if (!paragraphs.some((existing) => existing.paragraphIndex === p4.paragraphIndex && existing.text === p4.text)) {
+          textParts.push(p4.text);
+          paragraphs.push({
+            sectionIndex: p4.sectionIndex,
+            paragraphIndex: p4.paragraphIndex,
+            text: p4.text
+          });
+        }
       }
     }
     chunks.push({
@@ -196417,7 +197042,7 @@ function chunkMarkdown(content, options = {}) {
 ` : "") + section.body);
     if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
       flushPending();
-      splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
+      splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
       continue;
     }
     const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
@@ -196430,9 +197055,10 @@ function chunkMarkdown(content, options = {}) {
   flushPending();
   return chunks;
 }
-function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
+function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
   const headingLine = sectionHeadingLine(section);
   const prefix = breadcrumbPrefix(breadcrumb);
+  const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === sectionIndex);
   let accParagraphs = [];
   let accTextParts = [];
   let accTokens = 0;
@@ -196459,18 +197085,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
     accTokens = baseOverhead;
   }
   accTokens = baseOverhead;
-  for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
-    const pText = section.paragraphs[pIdx];
-    const pTokens = estimateTokens(pText);
+  for (const p4 of sectionParas) {
+    const pTokens = estimateTokens(p4.text);
     if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
       flushAcc();
     }
-    accParagraphs.push({ sectionIndex, paragraphIndex: pIdx, text: pText });
-    accTextParts.push(pText);
+    accParagraphs.push({ sectionIndex, paragraphIndex: p4.paragraphIndex, text: p4.text });
+    accTextParts.push(p4.text);
     accTokens += pTokens;
   }
   flushAcc();
 }
+// ../llm/src/chunking/normalizeMarkdown.ts
+/**
+ * Repair common markdown defects before chunking.
+ *
+ * Runs, in order: invisible-character stripping, line-ending normalization, then the
+ * line-based block repairs in processBlocks. Every repair increments a per-category
+ * counter.
+ *
+ * @param {string} content - Raw markdown.
+ * @returns {{content: string, stats: {repairs: Object<string, number>}}}
+ */
+function normalizeMarkdown(content) {
+  const repairs = {};
+  const stats = { repairs };
+  const count = (category) => {
+    repairs[category] = (repairs[category] ?? 0) + 1;
+  };
+  const cleaned = normalizeLineEndings(stripBomAndInvisible(content, count), count);
+  const repairedLines = processBlocks(cleaned.split("\n"), count);
+  return { content: repairedLines.join("\n"), stats };
+}
+/**
+ * Remove U+FEFF, U+200B, U+200C and U+200D from the text.
+ *
+ * @param {string} text2 - Input text.
+ * @param {(category: string) => void} count - Called once with "invisible_chars" when anything was removed.
+ * @returns {string} The cleaned text.
+ */
+function stripBomAndInvisible(text2, count) {
+  const cleaned = text2.replace(/[\uFEFF\u200B\u200C\u200D]/g, "");
+  const removedAny = cleaned.length !== text2.length;
+  if (removedAny) {
+    count("invisible_chars");
+  }
+  return cleaned;
+}
+/**
+ * Convert CRLF and lone CR line endings to LF.
+ *
+ * @param {string} text2 - Input text.
+ * @param {(category: string) => void} count - Called once with "line_endings" when the text contains a CR.
+ * @returns {string} Text using "\n" line endings only.
+ */
+function normalizeLineEndings(text2, count) {
+  if (!text2.includes("\r")) {
+    return text2;
+  }
+  count("line_endings");
+  return text2.replace(/\r\n?/g, "\n");
+}
+/**
+ * Walk the lines once and route each block to its repair handler.
+ *
+ * Handlers are tried in a fixed order: code fence, table row, blank-line run, HTML
+ * comment, unfenced JSON. A handler consumes one or more lines and reports where to
+ * resume; lines that match nothing (or whose JSON candidate is rejected) are copied as-is.
+ *
+ * @param {string[]} inputLines - Document lines.
+ * @param {(category: string) => void} count - Repair counter callback.
+ * @returns {string[]} Repaired lines.
+ */
+function processBlocks(inputLines, count) {
+  const lines = splitInlineFences(inputLines, count);
+  const output = [];
+  let i = 0;
+  // Append a handler's output and jump to the line it stopped at.
+  const consume = (handled) => {
+    output.push(...handled.lines);
+    i = handled.nextIndex;
+  };
+  while (i < lines.length) {
+    const line = lines[i];
+    const trimmed = line.trimStart();
+    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
+    if (fenceMatch) {
+      consume(handleCodeFence(lines, i, fenceMatch[1], count));
+    } else if (looksLikeTableRow(trimmed)) {
+      consume(handleTableBlock(lines, i, count));
+    } else if (trimmed === "") {
+      consume(handleBlankLines(lines, i, count));
+    } else if (trimmed.startsWith("<!--")) {
+      consume(handleHtmlComment(lines, i, count));
+    } else {
+      const jsonBlock = looksLikeJsonBlockStart(trimmed) ? handleUnfencedJson(lines, i, count) : null;
+      if (jsonBlock) {
+        consume(jsonBlock);
+      } else {
+        output.push(line);
+        i++;
+      }
+    }
+  }
+  return output;
+}
+/**
+ * Copy a fenced code block verbatim, closing it if the document never does.
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {number} startIdx - Index of the opening fence line.
+ * @param {string} fence - The opening fence run (e.g. "```" or "~~~~").
+ * @param {(category: string) => void} count - Called with "unclosed_code_fence" when a closer had to be appended.
+ * @returns {{lines: string[], nextIndex: number}} The block's lines and the index to resume at.
+ */
+function handleCodeFence(lines, startIdx, fence, count) {
+  const fenceChar = fence[0] === "`" ? "`" : "~";
+  // A closer is a line made only of the same fence character, at least as long as the opener.
+  const closingRe = new RegExp("^" + fenceChar + "{" + fence.length + ",}\\s*$");
+  const collected = [lines[startIdx]];
+  for (let idx = startIdx + 1; idx < lines.length; idx++) {
+    collected.push(lines[idx]);
+    if (closingRe.test(lines[idx].trimStart())) {
+      return { lines: collected, nextIndex: idx + 1 };
+    }
+  }
+  count("unclosed_code_fence");
+  collected.push(fence);
+  return { lines: collected, nextIndex: Math.max(startIdx + 1, lines.length) };
+}
+/**
+ * Normalize a run of consecutive table-like lines starting at `startIdx`.
+ *
+ * Repairs applied when the run has at least two lines:
+ *   - rows that contain a pipe but do not start with one get leading/trailing pipes;
+ *   - when no separator row exists, one is inserted after the first row, sized to the
+ *     first row's column count (only for two or more columns).
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {number} startIdx - Index of the first table-like line.
+ * @param {(category: string) => void} count - Repair counter callback.
+ * @returns {{lines: string[], nextIndex: number}} The repaired rows and the index to resume at.
+ */
+function handleTableBlock(lines, startIdx, count) {
+  const tableLines = [];
+  let i = startIdx;
+  while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
+    tableLines.push(lines[i]);
+    i++;
+  }
+  // A single pipe-bearing line is not treated as a table; pass it through untouched.
+  if (tableLines.length < 2) {
+    return { lines: tableLines, nextIndex: i };
+  }
+  const normalized = tableLines.map((line) => {
+    const trimmed = line.trimStart();
+    if (!trimmed.startsWith("|") && trimmed.includes("|")) {
+      count("table_leading_pipe");
+      return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
+    }
+    return line;
+  });
+  const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
+  if (hasSeparator) {
+    return { lines: normalized, nextIndex: i };
+  }
+  const firstRow = normalized[0].trim();
+  // After normalization the row starts with a pipe. N cells take N+1 pipes when the
+  // row is closed by a trailing (unescaped) pipe, but only N when it is left open.
+  const closedByPipe = firstRow.endsWith("|") && !firstRow.endsWith("\\|");
+  const colCount = countPipes(firstRow) - (closedByPipe ? 1 : 0);
+  if (colCount < 2) {
+    return { lines: normalized, nextIndex: i };
+  }
+  const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
+  count("table_missing_separator");
+  return { lines: [normalized[0], separator, ...normalized.slice(1)], nextIndex: i };
+}
+/**
+ * Collapse a run of three or more blank lines into a single empty line.
+ * Runs of one or two blank lines are returned unchanged.
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {number} startIdx - Index of the first blank line of the run.
+ * @param {(category: string) => void} count - Called with "excessive_blank_lines" when a run is collapsed.
+ * @returns {{lines: string[], nextIndex: number}} Replacement lines and the index to resume at.
+ */
+function handleBlankLines(lines, startIdx, count) {
+  let end = startIdx;
+  while (end < lines.length && lines[end].trim() === "") {
+    end++;
+  }
+  if (end - startIdx > 2) {
+    count("excessive_blank_lines");
+    return { lines: [""], nextIndex: end };
+  }
+  return { lines: lines.slice(startIdx, end), nextIndex: end };
+}
+/**
+ * Strip an HTML comment that begins on `lines[startIdx]`.
+ *
+ * The comment may close on the same line or on a later one. Everything up to and
+ * including the closing "-->" is removed; non-blank text that follows the closer on
+ * that line is preserved rather than discarded with the comment. If no closer exists
+ * anywhere below, the opening line is kept unchanged and nothing is counted.
+ *
+ * Limitation: preserved trailing text is emitted as-is, so a second comment on the
+ * same line is not stripped.
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {number} startIdx - Index of the line that starts with "<!--".
+ * @param {(category: string) => void} count - Called with "html_comment" when a comment is removed.
+ * @returns {{lines: string[], nextIndex: number}} Replacement lines and the index to resume at.
+ */
+function handleHtmlComment(lines, startIdx, count) {
+  // Text after the closing marker, or nothing when it is only whitespace.
+  const tailAfter = (line, closeIdx) => {
+    const tail = line.slice(closeIdx + 3);
+    return tail.trim() ? [tail.trimStart()] : [];
+  };
+  const firstLine = lines[startIdx];
+  const firstClose = firstLine.indexOf("-->");
+  if (firstClose !== -1) {
+    count("html_comment");
+    return { lines: tailAfter(firstLine, firstClose), nextIndex: startIdx + 1 };
+  }
+  for (let i = startIdx + 1; i < lines.length; i++) {
+    const closeIdx = lines[i].indexOf("-->");
+    if (closeIdx !== -1) {
+      count("html_comment");
+      return { lines: tailAfter(lines[i], closeIdx), nextIndex: i + 1 };
+    }
+  }
+  // Unterminated comment: leave the line alone instead of eating the rest of the document.
+  return { lines: [firstLine], nextIndex: startIdx + 1 };
+}
+/**
+ * A line can open an unfenced JSON block only when it is exactly "{" or "[".
+ *
+ * @param {string} trimmed - Line with leading whitespace removed.
+ * @returns {boolean}
+ */
+function looksLikeJsonBlockStart(trimmed) {
+  return ["{", "["].includes(trimmed);
+}
+// Balanced blocks shorter than this many lines are left unfenced.
+var MIN_JSON_BLOCK_LINES = 5;
+/**
+ * Wrap a bare multi-line JSON object/array in a ```json fence.
+ *
+ * Scans forward from `startIdx`, tracking bracket depth while ignoring brackets inside
+ * double-quoted strings and after a `//` on a line. The block ends on the first line
+ * after which the depth is back to zero.
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {number} startIdx - Index of the line holding the opening "{" or "[".
+ * @param {(category: string) => void} count - Called with "unfenced_json_block" when a fence is added.
+ * @returns {{lines: string[], nextIndex: number} | null} The fenced block, or null when the
+ *   brackets never balance, go negative, the block is too short, or its last line does
+ *   not end with the matching closer.
+ */
+function handleUnfencedJson(lines, startIdx, count) {
+  const closeChar = lines[startIdx].trimStart()[0] === "{" ? "}" : "]";
+  let depth = 0;
+  // String state deliberately carries over from one line to the next.
+  let inString = false;
+  for (let lineNo = startIdx; lineNo < lines.length; lineNo++) {
+    const text = lines[lineNo];
+    let col = 0;
+    while (col < text.length) {
+      const ch = text[col];
+      if (inString) {
+        if (ch === "\\") {
+          // Skip the escaped character as well.
+          col++;
+        } else if (ch === '"') {
+          inString = false;
+        }
+      } else if (ch === '"') {
+        inString = true;
+      } else if (ch === "/" && text[col + 1] === "/") {
+        // Rest of the line is a comment.
+        break;
+      } else if (ch === "{" || ch === "[") {
+        depth++;
+      } else if (ch === "}" || ch === "]") {
+        depth--;
+      }
+      col++;
+    }
+    if (depth < 0) {
+      return null;
+    }
+    if (depth !== 0) {
+      continue;
+    }
+    const nextIndex = lineNo + 1;
+    if (nextIndex - startIdx < MIN_JSON_BLOCK_LINES) {
+      return null;
+    }
+    if (!text.trimEnd().endsWith(closeChar)) {
+      return null;
+    }
+    count("unfenced_json_block");
+    return { lines: ["```json", ...lines.slice(startIdx, nextIndex), "```"], nextIndex };
+  }
+  return null;
+}
+/**
+ * Move a code fence that trails other text onto its own line.
+ *
+ * A line that already starts with a fence is left alone. Otherwise, when the line ends
+ * with a fence (optionally followed by an info string), it is split in two: the text
+ * before the fence (with the original indentation) and the fence itself.
+ *
+ * @param {string[]} lines - Document lines.
+ * @param {(category: string) => void} count - Called with "inline_code_fence" for every split.
+ * @returns {string[]} Lines with trailing fences separated out.
+ */
+function splitInlineFences(lines, count) {
+  const leadingFenceRe = /^(`{3,}|~{3,})/;
+  const trailingFenceRe = /(`{3,}|~{3,})(\S*)\s*$/;
+  const result = [];
+  for (const line of lines) {
+    const trimmed = line.trimStart();
+    const inlineMatch = leadingFenceRe.test(trimmed) ? null : trimmed.match(trailingFenceRe);
+    if (!inlineMatch) {
+      result.push(line);
+      continue;
+    }
+    const fenceIdx = trimmed.lastIndexOf(inlineMatch[1]);
+    const beforeFence = trimmed.substring(0, fenceIdx);
+    if (beforeFence.trim().length === 0) {
+      result.push(line);
+      continue;
+    }
+    const indent = line.substring(0, line.length - trimmed.length);
+    count("inline_code_fence");
+    result.push(indent + beforeFence.trimEnd(), trimmed.substring(fenceIdx));
+  }
+  return result;
+}
+/**
+ * Heuristic: a line is table-like when it contains at least one unescaped pipe and is
+ * neither a heading ("#") nor a code fence ("```" / "~~~").
+ *
+ * @param {string} trimmed - Line with leading whitespace removed.
+ * @returns {boolean}
+ */
+function looksLikeTableRow(trimmed) {
+  const excluded = ["#", "```", "~~~"].some((prefix) => trimmed.startsWith(prefix));
+  return !excluded && countPipes(trimmed) >= 1;
+}
+/**
+ * Count "|" characters that are not immediately preceded by a backslash.
+ *
+ * @param {string} text2 - Text to scan.
+ * @returns {number} Number of unescaped pipes.
+ */
+function countPipes(text2) {
+  let pipes = 0;
+  let previous = "";
+  for (const ch of text2) {
+    if (ch === "|" && previous !== "\\") {
+      pipes++;
+    }
+    previous = ch;
+  }
+  return pipes;
+}
 // ../llm/src/utils/mapConcurrent.ts
 async function mapConcurrent(items, concurrency, fn) {
   const results = [];
@@ -196499,9 +197372,760 @@ async function mapConcurrent(items, concurrency, fn) {
 }
 // ../api/src/services/docIndexer.ts
 init_src();
+// ../api/src/services/docEmbedding.ts
+// Number of paragraph texts sent to embeddingService.embedBatch per call.
+var EMBEDDING_BATCH_SIZE = 20;
+// Token budget handed to truncateForEmbedding for each paragraph (4 characters per token there).
+var EMBEDDING_MAX_TOKENS = 480;
+/**
+ * Heuristic check for paragraphs that are code rather than prose.
+ *
+ * True when the trimmed text starts and ends with a ``` fence, or (for texts of three
+ * or more non-empty lines) when more than 80% of lines are indented by 2+ whitespace
+ * characters, or when more than 60% are indented and code punctuation makes up more
+ * than 15% of the characters.
+ *
+ * @param {string} text2 - Paragraph text.
+ * @returns {boolean}
+ */
+function isPureCodeBlock(text2) {
+  const trimmed = text2.trim();
+  if (/^```[\s\S]*```\s*$/.test(trimmed)) {
+    return true;
+  }
+  const lines = trimmed.split("\n").filter(Boolean);
+  if (lines.length < 3) {
+    return false;
+  }
+  const indentedCount = lines.reduce((total, l) => (/^\s{2,}/.test(l) ? total + 1 : total), 0);
+  const indentRatio = indentedCount / lines.length;
+  if (indentRatio > 0.8) {
+    return true;
+  }
+  const punctuation = trimmed.match(/[{}();=><|&![\]]/g) || [];
+  const punctuationRatio = punctuation.length / trimmed.length;
+  return punctuationRatio > 0.15 && indentRatio > 0.6;
+}
+// Upper bounds for the skeleton produced by skeletonizeCodeBlock.
+var CODE_SKELETON_MAX_LINES = 20;
+var CODE_SKELETON_MAX_CHARS = 800;
+/**
+ * Reduce a code block to its outline by dropping deeply indented lines.
+ *
+ * The indent unit is taken from the first indented line (default 2; a tab counts as
+ * two spaces). Lines indented more than two units are replaced — per run — by a single
+ * "    ..." marker; blank lines are dropped. The result is capped at
+ * CODE_SKELETON_MAX_LINES lines and CODE_SKELETON_MAX_CHARS characters, with "[...]"
+ * appended whenever a cap truncates it.
+ *
+ * @param {string} text2 - Code text.
+ * @returns {string} The skeletonized text.
+ */
+function skeletonizeCodeBlock(text2) {
+  const lines = text2.split("\n");
+  // Width of the leading whitespace only; tabs later in the line must not count as indent.
+  const indentWidth = (line) => {
+    const leading = line.slice(0, line.length - line.trimStart().length);
+    return leading.replace(/\t/g, "  ").length;
+  };
+  let indentUnit = 2;
+  for (const line of lines) {
+    if (/^\s+\S/.test(line)) {
+      indentUnit = indentWidth(line);
+      break;
+    }
+  }
+  const maxIndent = indentUnit * 2;
+  const kept = [];
+  let lastWasElided = false;
+  for (const line of lines) {
+    if (line.trimStart() === "")
+      continue;
+    if (indentWidth(line) <= maxIndent) {
+      if (lastWasElided) {
+        kept.push("    ...");
+        lastWasElided = false;
+      }
+      kept.push(line);
+    } else {
+      lastWasElided = true;
+    }
+  }
+  if (lastWasElided)
+    kept.push("    ...");
+  let result = kept;
+  if (result.length > CODE_SKELETON_MAX_LINES) {
+    result = result.slice(0, CODE_SKELETON_MAX_LINES);
+    result.push("[...]");
+  }
+  let joined = result.join("\n");
+  if (joined.length > CODE_SKELETON_MAX_CHARS) {
+    joined = joined.slice(0, CODE_SKELETON_MAX_CHARS) + "\n[...]";
+  }
+  return joined;
+}
+/**
+ * Cut text down to at most `maxTokens * 4` characters for the embedding model.
+ *
+ * The cut lands on the last space before the limit when that space lies in the final
+ * 20% of the window; otherwise it lands exactly on the limit.
+ *
+ * @param {string} text2 - Paragraph text.
+ * @param {number} maxTokens - Token budget (4 characters per token).
+ * @returns {string} The original text when it fits, else its truncated prefix.
+ */
+function truncateForEmbedding(text2, maxTokens) {
+  const limit = maxTokens * 4;
+  if (text2.length <= limit) {
+    return text2;
+  }
+  const lastSpace = text2.lastIndexOf(" ", limit);
+  let cut = limit;
+  if (lastSpace > limit * 0.8) {
+    cut = lastSpace;
+  }
+  return text2.slice(0, cut);
+}
+/**
+ * Embed every non-code paragraph of a digest and store the vectors on `digest.embeddings`.
+ *
+ * Paragraphs detected as pure code are skipped; the rest are truncated to
+ * EMBEDDING_MAX_TOKENS and embedded in batches of EMBEDDING_BATCH_SIZE. When a batch
+ * fails (or returns the wrong number of vectors) its paragraphs are retried one by one;
+ * a paragraph that still fails is logged and left without an embedding.
+ * Progress is reported in the 85-95 range under phase "embedding".
+ *
+ * @param {{sections: {paragraphs: {text: string}[]}[], embeddings?: object[]}} digest - Digest to embed; mutated.
+ * @param {{getDimension: Function, embedBatch: Function, embed: Function}} embeddingService - Embedding backend.
+ * @param {(update: {phase: string, progress: number, message: string}) => void} [onProgress] - Optional progress callback.
+ * @returns {Promise<number>} Number of embeddings produced (0 when there is nothing to embed,
+ *   in which case `digest.embeddings` is left untouched).
+ */
+async function generateEmbeddings(digest, embeddingService, onProgress) {
+  const paragraphs = [];
+  let skippedCode = 0;
+  for (let sIdx = 0;sIdx < digest.sections.length; sIdx++) {
+    const section = digest.sections[sIdx];
+    for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
+      const text2 = section.paragraphs[pIdx].text;
+      if (isPureCodeBlock(text2)) {
+        skippedCode++;
+        continue;
+      }
+      paragraphs.push({
+        sectionIndex: sIdx,
+        paragraphIndex: pIdx,
+        text: truncateForEmbedding(text2, EMBEDDING_MAX_TOKENS)
+      });
+    }
+  }
+  if (paragraphs.length === 0)
+    return 0;
+  if (skippedCode > 0) {
+    onProgress?.({ phase: "embedding", progress: 85, message: `Skipped ${skippedCode} code-only paragraphs` });
+  }
+  const embeddings = [];
+  const totalParagraphs = paragraphs.length;
+  onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
+  // The result of getDimension() is unused; the call is only timed.
+  // NOTE(review): presumably this forces the model to load — confirm against the service.
+  const warmupStart = Date.now();
+  await embeddingService.getDimension();
+  const warmupMs = Date.now() - warmupStart;
+  if (warmupMs > 500) {
+    onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
+  }
+  const totalBatches = Math.ceil(totalParagraphs / EMBEDDING_BATCH_SIZE);
+  for (let i = 0;i < totalParagraphs; i += EMBEDDING_BATCH_SIZE) {
+    const batchIndex = Math.floor(i / EMBEDDING_BATCH_SIZE) + 1;
+    const batch2 = paragraphs.slice(i, i + EMBEDDING_BATCH_SIZE);
+    const texts = batch2.map((p4) => p4.text);
+    const batchStart = Date.now();
+    try {
+      const vectors = await embeddingService.embedBatch(texts);
+      // A short or missing result would otherwise store `undefined` vectors.
+      if (vectors == null || vectors.length !== batch2.length) {
+        throw new Error(`embedBatch returned ${vectors == null ? "no" : vectors.length} vectors for ${batch2.length} texts`);
+      }
+      for (let j = 0;j < batch2.length; j++) {
+        embeddings.push({
+          sectionIndex: batch2[j].sectionIndex,
+          paragraphIndex: batch2[j].paragraphIndex,
+          vector: vectors[j]
+        });
+      }
+    } catch (batchErr) {
+      // Nothing from this batch has been stored yet, so retrying per paragraph cannot duplicate entries.
+      console.warn(`[docIndexer] embedding batch ${batchIndex}/${totalBatches} failed, retrying per paragraph:`, batchErr);
+      for (let fi = 0;fi < batch2.length; fi++) {
+        const p4 = batch2[fi];
+        try {
+          const vector = await embeddingService.embed(p4.text);
+          embeddings.push({
+            sectionIndex: p4.sectionIndex,
+            paragraphIndex: p4.paragraphIndex,
+            vector
+          });
+        } catch (err2) {
+          console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`, err2);
+        }
+        const embedded2 = i + fi + 1;
+        const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
+        onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
+      }
+      continue;
+    }
+    const embedded = Math.min(i + EMBEDDING_BATCH_SIZE, totalParagraphs);
+    const batchMs = Date.now() - batchStart;
+    const progress = 86 + Math.round(embedded / totalParagraphs * 9);
+    onProgress?.({ phase: "embedding", progress, message: `Batch ${batchIndex}/${totalBatches} (${embedded}/${totalParagraphs}, ${(batchMs / 1000).toFixed(1)}s)` });
+  }
+  digest.embeddings = embeddings;
+  return embeddings.length;
+}
+/**
+ * Replace this document's vectors in the vector store with the digest's embeddings.
+ *
+ * Existing entries whose id starts with `${hashId}:` are deleted first, then one
+ * record per embedding is added under the id `${hashId}:${sectionIndex}:${paragraphIndex}`.
+ * Failures are logged and swallowed so indexing can continue. Does nothing when the
+ * digest has no embeddings.
+ *
+ * @param {{embeddings: {sectionIndex: number, paragraphIndex: number, vector: *}[]}} digest
+ * @param {{deleteByPrefix: Function, add: Function}} vectorStore
+ * @param {string} hashId - Id prefix for the document's records.
+ * @param {string} sourceId - Stored in each record's metadata.
+ * @param {string} sourcePath - Stored in each record's metadata.
+ * @returns {Promise<void>}
+ */
+async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
+  if (digest.embeddings.length === 0) {
+    return;
+  }
+  const toRecord = ({ sectionIndex, paragraphIndex, vector }) => ({
+    id: `${hashId}:${sectionIndex}:${paragraphIndex}`,
+    embedding: vector,
+    metadata: {
+      layer: "digest",
+      sourceId,
+      hashId,
+      sourcePath,
+      sectionIndex,
+      paragraphIndex
+    }
+  });
+  try {
+    await vectorStore.deleteByPrefix(`${hashId}:`);
+    const records = digest.embeddings.map(toRecord);
+    await vectorStore.add(records);
+  } catch (err2) {
+    console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
+  }
+}
+// ../api/src/services/docTableExtractor.ts
+init_src();
+/**
+ * Count the columns of the first markdown table found in the text.
+ *
+ * The table is recognised by its separator row (e.g. "|---|:--:|"), whose trailing pipe
+ * is optional. Columns are the cells between pipes, so the closing pipe — when present —
+ * is not counted as a column boundary of its own.
+ *
+ * @param {string} text2 - Paragraph text.
+ * @returns {number} Number of columns, or 0 when no separator row is present.
+ */
+function detectTableColumnCount(text2) {
+  const sepMatch = text2.match(/^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/m);
+  if (!sepMatch)
+    return 0;
+  const row = sepMatch[0].trim();
+  const pipeCount = row.match(/\|/g)?.length ?? 1;
+  // The row always opens with a pipe: N columns use N+1 pipes when closed, N when left open.
+  return row.endsWith("|") ? pipeCount - 1 : pipeCount;
+}
+/**
+ * Run a dedicated LLM pass over the table paragraphs of a chunk and merge the extracted
+ * atoms into `result`.
+ *
+ * @param chunk - Chunk whose `paragraphs` (each with `text` and `sectionIndex`) are scanned.
+ * @param sections - Sections indexed by `sectionIndex`; used for the heading shown to the model.
+ * @param result - Extraction result for the chunk; its `paragraphs` array is mutated in place.
+ * @param llmService - Service exposing `generateText(prompt, { systemPrompt })`.
+ * @returns {Promise<{extracted: number, llmCalls: number, totalTokens: number}>} Counts for this pass.
+ *   Errors are logged and reported as `extracted: 0`, never thrown.
+ */
+async function extractTableAtoms(chunk, sections, result, llmService) {
+  // Step 1: pick paragraphs that contain a table with at least two columns.
+  const tableParagraphs = [];
+  for (let i = 0;i < chunk.paragraphs.length; i++) {
+    const p4 = chunk.paragraphs[i];
+    const colCount = detectTableColumnCount(p4.text);
+    if (colCount < 2)
+      continue;
+    const section = sections[p4.sectionIndex];
+    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
+    tableParagraphs.push({
+      chunkParaIndex: i,
+      colCount,
+      text: p4.text,
+      sectionHeading
+    });
+  }
+  if (tableParagraphs.length === 0) {
+    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
+  }
+  // Step 2: build the prompt body. Tables are re-tagged P0..Pn for this call; the map
+  // translates those tags back to paragraph positions within the chunk.
+  const parts = [];
+  const tagToChunkIndex = new Map;
+  for (let ti = 0;ti < tableParagraphs.length; ti++) {
+    const tp = tableParagraphs[ti];
+    const tag2 = `P${ti}`;
+    tagToChunkIndex.set(tag2, tp.chunkParaIndex);
+    if (tp.sectionHeading) {
+      parts.push(tp.sectionHeading);
+    }
+    // Include the preceding paragraph as context, unless it is itself a table.
+    if (tp.chunkParaIndex > 0) {
+      const prevPara = chunk.paragraphs[tp.chunkParaIndex - 1];
+      if (prevPara && detectTableColumnCount(prevPara.text) === 0) {
+        parts.push(prevPara.text);
+      }
+    }
+    parts.push(`[${tag2}] ${tp.text}`);
+    parts.push("");
+  }
+  const tableText = parts.join(`
+`);
+  const prompt = buildDocTableAnnotationPrompt(tableText);
+  // Step 3: a single LLM call for all tables of the chunk.
+  try {
+    const res = await llmService.generateText(prompt, {
+      systemPrompt: DOC_TABLE_ANNOTATION_SYSTEM_PROMPT
+    });
+    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
+    if (!parsed.success) {
+      console.warn(`[docIndexer] table extraction: parse failed: ${parsed.error.message.slice(0, 200)}`);
+      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
+    }
+    let extracted = 0;
+    for (const tableP of parsed.data.paragraphs) {
+      // Ignore tags the model invented.
+      const chunkParaIndex = tagToChunkIndex.get(tableP.tag);
+      if (chunkParaIndex === undefined)
+        continue;
+      // NOTE(review): assumes result.paragraphs are tagged "P<index within chunk>" — confirm against the main extraction pass.
+      const originalTag = `P${chunkParaIndex}`;
+      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
+      const tableAtomCount = Object.values(tableP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
+      if (tableAtomCount === 0)
+        continue;
+      if (existing) {
+        // Each non-empty atom type from the table pass replaces the existing array of that type.
+        for (const [atomType, atoms2] of Object.entries(tableP.atoms)) {
+          if (Array.isArray(atoms2) && atoms2.length > 0) {
+            existing.atoms[atomType] = atoms2;
+          }
+        }
+      } else {
+        result.paragraphs.push({ ...tableP, tag: originalTag });
+      }
+      extracted++;
+      const tp = tableParagraphs.find((t4) => t4.chunkParaIndex === chunkParaIndex);
+      console.log(`[docIndexer] table extraction: ${originalTag} → ${tableAtomCount} atoms (table has ${tp?.colCount ?? "?"} cols)`);
+    }
+    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
+  } catch (err2) {
+    // Any failure inside the try block lands here and reports zero tokens.
+    console.warn("[docIndexer] table extraction failed (non-blocking):", err2);
+    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
+  }
+}
+// ../api/src/services/docDiagramExtractor.ts
+init_src();
+// Matches a line that is exactly a ``` fence followed by one of DIAGRAM_FENCE_TAGS
+// (case-insensitive, optional trailing whitespace), anywhere in a multi-line text.
+var DIAGRAM_OPEN_RE = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "im");
+/**
+ * Identify a paragraph that consists of a single fenced diagram.
+ *
+ * The trimmed text must open with a ``` fence tagged with one of DIAGRAM_FENCE_TAGS
+ * (case-insensitive) and must end with "```".
+ *
+ * @param {string} text2 - Paragraph text.
+ * @returns {string | null} The lower-cased fence tag, or null when the text is not such a block.
+ */
+function detectDiagramFormat(text2) {
+  const trimmed = text2.trim();
+  const openingRe = new RegExp("^```(" + DIAGRAM_FENCE_TAGS.join("|") + ")\\s*\\n", "i");
+  const opening = openingRe.exec(trimmed);
+  if (opening === null || !trimmed.endsWith("```")) {
+    return null;
+  }
+  return opening[1].toLowerCase();
+}
+/**
+ * Run a dedicated LLM pass over the diagram paragraphs of a chunk and merge the extracted
+ * atoms into `result`.
+ *
+ * @param chunk - Chunk whose `paragraphs` (each with `text` and `sectionIndex`) are scanned.
+ * @param sections - Sections indexed by `sectionIndex`; used for the heading shown to the model.
+ * @param result - Extraction result for the chunk; its `paragraphs` array is mutated in place.
+ * @param llmService - Service exposing `generateText(prompt, { systemPrompt })`.
+ * @returns {Promise<{extracted: number, llmCalls: number, totalTokens: number}>} Counts for this pass.
+ *   Errors are logged and reported as `extracted: 0`, never thrown.
+ */
+async function extractDiagramAtoms(chunk, sections, result, llmService) {
+  // Step 1: pick paragraphs that are a single fenced diagram block.
+  const diagramParagraphs = [];
+  for (let i = 0;i < chunk.paragraphs.length; i++) {
+    const p4 = chunk.paragraphs[i];
+    const format = detectDiagramFormat(p4.text);
+    if (!format)
+      continue;
+    const section = sections[p4.sectionIndex];
+    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
+    diagramParagraphs.push({
+      chunkParaIndex: i,
+      format,
+      text: p4.text,
+      sectionHeading
+    });
+  }
+  if (diagramParagraphs.length === 0) {
+    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
+  }
+  // Step 2: build the prompt body. Diagrams are re-tagged P0..Pn for this call; the map
+  // translates those tags back to paragraph positions within the chunk.
+  const parts = [];
+  const tagToChunkIndex = new Map;
+  for (let di = 0;di < diagramParagraphs.length; di++) {
+    const dp = diagramParagraphs[di];
+    const tag2 = `P${di}`;
+    tagToChunkIndex.set(tag2, dp.chunkParaIndex);
+    if (dp.sectionHeading) {
+      parts.push(dp.sectionHeading);
+    }
+    // Include the preceding paragraph as context, unless it is itself a diagram.
+    if (dp.chunkParaIndex > 0) {
+      const prevPara = chunk.paragraphs[dp.chunkParaIndex - 1];
+      if (prevPara && !detectDiagramFormat(prevPara.text)) {
+        parts.push(prevPara.text);
+      }
+    }
+    parts.push(`[${tag2}] ${dp.text}`);
+    parts.push("");
+  }
+  const diagramText = parts.join(`
+`);
+  const prompt = buildDocDiagramAnnotationPrompt(diagramText);
+  // Step 3: a single LLM call for all diagrams of the chunk.
+  try {
+    const res = await llmService.generateText(prompt, {
+      systemPrompt: DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT
+    });
+    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
+    if (!parsed.success) {
+      console.warn(`[docIndexer] diagram extraction: parse failed — ${parsed.error.message.slice(0, 200)}`);
+      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
+    }
+    let extracted = 0;
+    for (const diagramP of parsed.data.paragraphs) {
+      // Ignore tags the model invented.
+      const chunkParaIndex = tagToChunkIndex.get(diagramP.tag);
+      if (chunkParaIndex === undefined)
+        continue;
+      // NOTE(review): assumes result.paragraphs are tagged "P<index within chunk>" — confirm against the main extraction pass.
+      const originalTag = `P${chunkParaIndex}`;
+      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
+      const diagramAtomCount = Object.values(diagramP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
+      if (diagramAtomCount === 0)
+        continue;
+      if (existing) {
+        // Each non-empty atom type from the diagram pass replaces the existing array of that type.
+        for (const [atomType, atoms2] of Object.entries(diagramP.atoms)) {
+          if (Array.isArray(atoms2) && atoms2.length > 0) {
+            existing.atoms[atomType] = atoms2;
+          }
+        }
+      } else {
+        result.paragraphs.push({ ...diagramP, tag: originalTag });
+      }
+      extracted++;
+      const dp = diagramParagraphs.find((d) => d.chunkParaIndex === chunkParaIndex);
+      console.log(`[docIndexer] diagram extraction: ${originalTag} → ${diagramAtomCount} atoms (${dp?.format ?? "unknown"} diagram)`);
+    }
+    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
+  } catch (err2) {
+    // Any failure inside the try block lands here and reports zero tokens.
+    console.warn("[docIndexer] diagram extraction failed (non-blocking):", err2);
+    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
+  }
+}
+// ../api/src/services/docAtomPostProcess.ts
+function postProcessDigestAtoms(sections) {
+  autoCompleteEntities(sections);
+  normalizeEntityNames(sections);
+  warnCrossRefIssues(sections);
+}
+function isNoiseEntityName(name21) {
+  const trimmed = name21.trim();
+  if (trimmed.length === 0)
+    return true;
+  if (trimmed.startsWith("$"))
+    return true;
+  if (/[+=]/.test(trimmed))
+    return true;
+  if (/^\d/.test(trimmed))
+    return true;
+  return false;
+}
+function autoCompleteEntities(sections) {
+  const declaredEntities = new Set;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        for (const e of entities2)
+          declaredEntities.add(e.name);
+      }
+    }
+  }
+  let autoCreated = 0;
+  let skippedNoise = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const referencedNames = new Set;
+      const relations = para.atoms.relations;
+      if (relations) {
+        for (const r of relations) {
+          referencedNames.add(r.from);
+          referencedNames.add(r.to);
+        }
+      }
+      const boundaries = para.atoms.boundaries;
+      if (boundaries) {
+        for (const b of boundaries) {
+          for (const name21 of b.contains)
+            referencedNames.add(name21);
+          if (b.excludes)
+            for (const name21 of b.excludes)
+              referencedNames.add(name21);
+        }
+      }
+      for (const name21 of referencedNames) {
+        if (!declaredEntities.has(name21)) {
+          if (isNoiseEntityName(name21)) {
+            skippedNoise++;
+            continue;
+          }
+          if (!para.atoms.entities) {
+            para.atoms.entities = [];
+          }
+          para.atoms.entities.push({
+            name: name21,
+            kind: "concept",
+            confidence: 0.6
+          });
+          declaredEntities.add(name21);
+          autoCreated++;
+        }
+      }
+    }
+  }
+  if (autoCreated > 0 || skippedNoise > 0) {
+    console.log(`[docAtomPostProcess] auto-created ${autoCreated} entities, skipped ${skippedNoise} noise names`);
+  }
+}
+function normalizeEntityNames(sections) {
+  const allNames = [];
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        for (const e of entities2)
+          allNames.push(e.name);
+      }
+    }
+  }
+  const uniqueNames = [...new Set(allNames)].sort((a, b) => b.length - a.length);
+  const mergeMap = new Map;
+  for (let i = 0;i < uniqueNames.length; i++) {
+    const short = uniqueNames[i];
+    if (short.length < 3)
+      continue;
+    if (mergeMap.has(short))
+      continue;
+    for (let j = 0;j < i; j++) {
+      const long = uniqueNames[j];
+      if (mergeMap.has(long))
+        continue;
+      if (long.includes(short) && long !== short) {
+        mergeMap.set(short, long);
+        break;
+      }
+    }
+  }
+  if (mergeMap.size === 0)
+    return;
+  let normalized = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        for (const e of entities2) {
+          const canonical = mergeMap.get(e.name);
+          if (canonical) {
+            e.name = canonical;
+            normalized++;
+          }
+        }
+        const seen = new Set;
+        para.atoms.entities = entities2.filter((e) => {
+          if (seen.has(e.name))
+            return false;
+          seen.add(e.name);
+          return true;
+        });
+      }
+      const relations = para.atoms.relations;
+      if (relations) {
+        for (const r of relations) {
+          const fromCanonical = mergeMap.get(r.from);
+          if (fromCanonical) {
+            r.from = fromCanonical;
+            normalized++;
+          }
+          const toCanonical = mergeMap.get(r.to);
+          if (toCanonical) {
+            r.to = toCanonical;
+            normalized++;
+          }
+        }
+      }
+      const boundaries = para.atoms.boundaries;
+      if (boundaries) {
+        for (const b of boundaries) {
+          b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
+          if (b.excludes)
+            b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
+        }
+      }
+    }
+  }
+  if (normalized > 0) {
+    console.log(`[docAtomPostProcess] normalized ${normalized} entity name references (${mergeMap.size} merge rules)`);
+    for (const [short, long] of mergeMap) {
+      console.log(`  "${short}" → "${long}"`);
+    }
+  }
+}
+function warnCrossRefIssues(sections) {
+  const allEntityNames = new Set;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        for (const e of entities2)
+          allEntityNames.add(e.name);
+      }
+    }
+  }
+  const allStateValues = new Set;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const states = para.atoms.states;
+      if (states) {
+        for (const s of states)
+          for (const v of s.values)
+            allStateValues.add(v);
+      }
+    }
+  }
+  let warnings = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const transitions = para.atoms.transitions;
+      if (transitions) {
+        for (const t4 of transitions) {
+          if (allStateValues.size > 0 && !allStateValues.has(t4.from)) {
+            console.warn(`[docAtomPostProcess] transition.from "${t4.from}" not in declared states`);
+            warnings++;
+          }
+          if (allStateValues.size > 0 && !allStateValues.has(t4.to)) {
+            console.warn(`[docAtomPostProcess] transition.to "${t4.to}" not in declared states`);
+            warnings++;
+          }
+        }
+      }
+      const roles = para.atoms.roles;
+      if (roles) {
+        const allBehaviorNames = new Set;
+        for (const s of sections) {
+          for (const p4 of s.paragraphs) {
+            const behaviors = p4.atoms.behaviors;
+            if (behaviors)
+              for (const b of behaviors)
+                allBehaviorNames.add(b.name);
+          }
+        }
+        for (const role of roles) {
+          if (role.performs) {
+            for (const p4 of role.performs) {
+              if (allBehaviorNames.size > 0 && !allBehaviorNames.has(p4)) {
+                console.warn(`[docAtomPostProcess] role.performs "${p4}" not in declared behaviors`);
+                warnings++;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  if (warnings > 0) {
+    console.warn(`[docAtomPostProcess] ${warnings} cross-reference warnings (non-blocking)`);
+  }
+}
+function detectNoiseCandidates(entityNames) {
+  const candidates = [];
+  for (const name21 of entityNames) {
+    if (/^[\u4e00-\u9fff]{1,4}$/.test(name21)) {
+      candidates.push(name21);
+      continue;
+    }
+    if (/^[a-zA-Z]{1,5}$/.test(name21) && name21[0] === name21[0].toLowerCase()) {
+      candidates.push(name21);
+      continue;
+    }
+    if (/^Kill\s+\d/.test(name21)) {
+      candidates.push(name21);
+      continue;
+    }
+  }
+  return candidates;
+}
+function collectExtractionStats(sections) {
+  const atomTypeCounts = {};
+  const entityNames = new Set;
+  let paragraphsWithAtoms = 0;
+  let paragraphsTotal = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      paragraphsTotal++;
+      let hasAtoms = false;
+      for (const [atomType, atoms2] of Object.entries(para.atoms)) {
+        if (!Array.isArray(atoms2) || atoms2.length === 0)
+          continue;
+        hasAtoms = true;
+        atomTypeCounts[atomType] = (atomTypeCounts[atomType] ?? 0) + atoms2.length;
+        if (atomType === "entities") {
+          for (const e of atoms2)
+            entityNames.add(e.name);
+        }
+      }
+      if (hasAtoms)
+        paragraphsWithAtoms++;
+    }
+  }
+  return {
+    entityCount: atomTypeCounts.entities ?? 0,
+    relationCount: atomTypeCounts.relations ?? 0,
+    atomTypeCounts,
+    uniqueEntityNames: [...entityNames],
+    paragraphsWithAtoms,
+    paragraphsTotal
+  };
+}
+function detectResolutionCandidates(entityNames) {
+  const candidates = [];
+  const sorted = [...entityNames].sort((a, b) => b.length - a.length);
+  for (let i = 0;i < sorted.length; i++) {
+    const long = sorted[i];
+    for (let j = i + 1;j < sorted.length; j++) {
+      const short = sorted[j];
+      if (short.length < 3)
+        continue;
+      if (short === long)
+        continue;
+      if (long.includes(short)) {
+        candidates.push({ short, long, reason: "substring match" });
+        continue;
+      }
+      if (long.toLowerCase() === short.toLowerCase()) {
+        candidates.push({ short, long, reason: "case-insensitive match" });
+        continue;
+      }
+      if (long.toLowerCase().includes(short.toLowerCase()) && short.length >= 4) {
+        candidates.push({ short, long, reason: "case-insensitive substring" });
+      }
+    }
+  }
+  return candidates;
+}
+function applyEntityMerges(sections, merges) {
+  if (merges.length === 0)
+    return 0;
+  const mergeMap = new Map;
+  for (const m of merges)
+    mergeMap.set(m.from, m.to);
+  let normalized = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        for (const e of entities2) {
+          const canonical = mergeMap.get(e.name);
+          if (canonical) {
+            e.name = canonical;
+            normalized++;
+          }
+        }
+        const seen = new Set;
+        para.atoms.entities = entities2.filter((e) => {
+          if (seen.has(e.name))
+            return false;
+          seen.add(e.name);
+          return true;
+        });
+      }
+      const relations = para.atoms.relations;
+      if (relations) {
+        for (const r of relations) {
+          const fromCanonical = mergeMap.get(r.from);
+          if (fromCanonical) {
+            r.from = fromCanonical;
+            normalized++;
+          }
+          const toCanonical = mergeMap.get(r.to);
+          if (toCanonical) {
+            r.to = toCanonical;
+            normalized++;
+          }
+        }
+      }
+      const boundaries = para.atoms.boundaries;
+      if (boundaries) {
+        for (const b of boundaries) {
+          b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
+          if (b.excludes)
+            b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
+        }
+      }
+    }
+  }
+  if (normalized > 0) {
+    console.log(`[docAtomPostProcess] LLM entity resolution: normalized ${normalized} references (${merges.length} merge rules)`);
+    for (const m of merges) {
+      console.log(`  "${m.from}" → "${m.to}"`);
+    }
+  }
+  return normalized;
+}
+function removeNoiseEntities(sections, names) {
+  if (names.length === 0)
+    return 0;
+  const removeSet = new Set(names);
+  let removed = 0;
+  for (const section of sections) {
+    for (const para of section.paragraphs) {
+      const entities2 = para.atoms.entities;
+      if (entities2) {
+        const before = entities2.length;
+        para.atoms.entities = entities2.filter((e) => !removeSet.has(e.name));
+        removed += before - para.atoms.entities.length;
+      }
+      const relations = para.atoms.relations;
+      if (relations) {
+        para.atoms.relations = relations.filter((r) => !removeSet.has(r.from) || !removeSet.has(r.to));
+      }
+      const boundaries = para.atoms.boundaries;
+      if (boundaries) {
+        for (const b of boundaries) {
+          b.contains = b.contains.filter((n) => !removeSet.has(n));
+          if (b.excludes)
+            b.excludes = b.excludes.filter((n) => !removeSet.has(n));
+        }
+      }
+    }
+  }
+  if (removed > 0) {
+    console.log(`[docAtomPostProcess] removed ${removed} noise entity instances (${names.length} names)`);
+    for (const n of names) {
+      console.log(`  ✕ "${n}"`);
+    }
+  }
+  return removed;
+}
+// ../api/src/services/docIndexer.ts
 var CHUNK_CONCURRENCY = 2;
 var GLEANING_MAX_ROUNDS = 2;
-var EMBEDDING_BATCH_SIZE = 20;
+var CODE_BLOCK_MIN_LENGTH = 500;
 function injectParagraphTags(chunk, sections) {
   const parts = [];
   if (chunk.breadcrumb.length > 0) {
@@ -196519,7 +198143,11 @@ function injectParagraphTags(chunk, sections) {
         parts.push(`${"#".repeat(section.level)} ${section.heading}`);
       }
     }
-    parts.push(`[P${i}] ${p4.text}`);
+    if (p4.text.length >= CODE_BLOCK_MIN_LENGTH && isPureCodeBlock(p4.text)) {
+      parts.push(`[P${i}] ${skeletonizeCodeBlock(p4.text)}`);
+    } else {
+      parts.push(`[P${i}] ${p4.text}`);
+    }
   }
   return parts.join(`
@@ -196584,12 +198212,14 @@ Continue the JSON output from the exact point of truncation. Output ONLY the rem
     });
     const combined = trimmed + result.text.trim();
     JSON.parse(jsonrepair(combined));
+    console.log(`[docIndexer] continuation: merged T1 (${trimmed.length} chars) + continuation (${result.text.trim().length} chars) = ${combined.length} chars`);
     return {
       text: combined,
       extraCalls: 1,
       extraTokens: result.usage.totalTokens
     };
-  } catch {
+  } catch (contErr) {
+    console.warn(`[docIndexer] continuation: merge failed, returning original (${trimmed.length} chars). ` + `Error: ${contErr instanceof Error ? contErr.message : String(contErr)}`);
     return { text: text2, extraCalls: 1, extraTokens: 0 };
   }
 }
@@ -196630,9 +198260,16 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
   llmCalls += continued.extraCalls;
   totalTokens += continued.extraTokens;
   onStep?.("T1 done", llmCalls, totalTokens);
-  const parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
+  let parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
   if (!parseResult.success) {
-    throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
+    const preview = continued.text.slice(0, 500).replace(/\n/g, "\\n");
+    console.warn(`[docIndexer] chunk ${chunkIndex} T1 strict parse failed, attempting lenient. ` + `Error: ${parseResult.error.message.slice(0, 200)}. ` + `LLM output preview: ${preview}`);
+    const lenient = tryLenientParse(continued.text, chunkIndex);
+    if (lenient) {
+      parseResult = { success: true, data: lenient };
+    } else {
+      throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
+    }
   }
   try {
     const rawJson = JSON.parse(jsonrepair(continued.text));
@@ -196689,8 +198326,19 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
     chunkText,
     previousResult: parseResult.data
   });
-  const finalResult = cumulativeResult;
-  return { result: finalResult, llmCalls, totalTokens };
+  const tableResult = await extractTableAtoms(chunk, sections, cumulativeResult, llmService);
+  if (tableResult.extracted > 0) {
+    onStep?.(`table extraction (${tableResult.extracted} tables)`, llmCalls + tableResult.llmCalls, totalTokens + tableResult.totalTokens);
+  }
+  llmCalls += tableResult.llmCalls;
+  totalTokens += tableResult.totalTokens;
+  const diagramResult = await extractDiagramAtoms(chunk, sections, cumulativeResult, llmService);
+  if (diagramResult.extracted > 0) {
+    onStep?.(`diagram extraction (${diagramResult.extracted} diagrams)`, llmCalls + diagramResult.llmCalls, totalTokens + diagramResult.totalTokens);
+  }
+  llmCalls += diagramResult.llmCalls;
+  totalTokens += diagramResult.totalTokens;
+  return { result: cumulativeResult, llmCalls, totalTokens };
 }
 function mapChunkResultToSections(chunk, chunkResult, sections) {
   for (const p4 of chunkResult.paragraphs) {
@@ -196735,6 +198383,123 @@ function mapChunkResultToSections(chunk, chunkResult, sections) {
     }
   }
 }
+var ATOM_TYPE_KEYS = new Set([
+  "entities",
+  "relations",
+  "behaviors",
+  "attributes",
+  "states",
+  "rules",
+  "transitions",
+  "events",
+  "decisions",
+  "metrics",
+  "roles",
+  "constraints",
+  "comparisons",
+  "boundaries"
+]);
+function looksLikeAtoms(obj) {
+  return Object.keys(obj).some((k) => ATOM_TYPE_KEYS.has(k) && Array.isArray(obj[k]));
+}
+var ATOM_REQUIRED_FIELDS = {
+  entities: ["name"],
+  relations: ["from", "to", "type"],
+  behaviors: ["name"],
+  attributes: ["name"],
+  states: ["name"],
+  rules: ["description"],
+  transitions: ["from", "to"],
+  events: ["name"],
+  decisions: ["description"],
+  metrics: ["name"],
+  roles: ["name"],
+  constraints: ["description"],
+  comparisons: ["description"],
+  boundaries: ["name"]
+};
+var PARAGRAPH_TAG_RE2 = /^P\d+$/;
+function tryLenientParse(rawText, chunkIndex) {
+  try {
+    let raw5 = JSON.parse(jsonrepair(rawText));
+    if (Array.isArray(raw5)) {
+      raw5 = { paragraphs: raw5 };
+    }
+    if (raw5 && typeof raw5 === "object" && !Array.isArray(raw5) && !raw5.paragraphs) {
+      const keys = Object.keys(raw5);
+      if (keys.length > 0 && keys.every((k) => PARAGRAPH_TAG_RE2.test(k))) {
+        raw5 = {
+          paragraphs: keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: raw5[tag2] }))
+        };
+      }
+    }
+    if (!Array.isArray(raw5?.paragraphs) && raw5 && typeof raw5 === "object" && looksLikeAtoms(raw5)) {
+      raw5 = { paragraphs: [{ tag: "P0", atoms: raw5 }] };
+    }
+    if (!raw5 || !Array.isArray(raw5.paragraphs))
+      return null;
+    const salvaged = { paragraphs: [] };
+    let droppedAtoms = 0;
+    let fixedTags = 0;
+    for (let idx = 0;idx < raw5.paragraphs.length; idx++) {
+      const rawPara = raw5.paragraphs[idx];
+      if (!rawPara || typeof rawPara !== "object")
+        continue;
+      let tag2 = rawPara.tag;
+      if (!tag2 || typeof tag2 !== "string" || !/^P\d+$/.test(tag2)) {
+        tag2 = `P${idx}`;
+        fixedTags++;
+      }
+      let atomsObj;
+      if (rawPara.atoms && typeof rawPara.atoms === "object") {
+        atomsObj = rawPara.atoms;
+      } else if (looksLikeAtoms(rawPara)) {
+        atomsObj = rawPara;
+      } else {
+        continue;
+      }
+      const cleanAtoms = {};
+      for (const [atomType, atoms2] of Object.entries(atomsObj)) {
+        if (!ATOM_TYPE_KEYS.has(atomType) || !Array.isArray(atoms2))
+          continue;
+        const requiredFields = ATOM_REQUIRED_FIELDS[atomType] ?? [];
+        const kept = [];
+        for (const atom of atoms2) {
+          if (!atom || typeof atom !== "object") {
+            droppedAtoms++;
+            continue;
+          }
+          const rec = atom;
+          const hasRequired = requiredFields.every((f) => rec[f] != null && rec[f] !== "");
+          if (hasRequired) {
+            kept.push(atom);
+          } else {
+            droppedAtoms++;
+          }
+        }
+        if (kept.length > 0)
+          cleanAtoms[atomType] = kept;
+      }
+      salvaged.paragraphs.push({ tag: tag2, atoms: cleanAtoms });
+    }
+    if (salvaged.paragraphs.length === 0)
+      return null;
+    const result = docChunkResultSchema.safeParse(salvaged);
+    if (!result.success)
+      return null;
+    const fixes = [];
+    if (fixedTags > 0)
+      fixes.push(`${fixedTags} tags auto-assigned`);
+    if (droppedAtoms > 0)
+      fixes.push(`${droppedAtoms} invalid atoms dropped`);
+    if (fixes.length > 0) {
+      console.warn(`[docIndexer] chunk ${chunkIndex}: lenient parse salvaged — ${fixes.join(", ")}`);
+    }
+    return result.data;
+  } catch {
+    return null;
+  }
+}
 function ensureAtomConfidence(atoms2) {
   const DEFAULT_DOC_CONFIDENCE = 0.7;
   for (const atomList of Object.values(atoms2)) {
@@ -196758,90 +198523,62 @@ function countAtoms(sections) {
   }
   return counts;
 }
-async function generateEmbeddings(digest, embeddingService, onProgress) {
-  const paragraphs = [];
-  for (let sIdx = 0;sIdx < digest.sections.length; sIdx++) {
-    const section = digest.sections[sIdx];
-    for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
-      paragraphs.push({
-        sectionIndex: sIdx,
-        paragraphIndex: pIdx,
-        text: section.paragraphs[pIdx].text
-      });
-    }
-  }
-  if (paragraphs.length === 0)
-    return 0;
-  const embeddings = [];
-  const totalParagraphs = paragraphs.length;
-  onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
-  const warmupStart = Date.now();
-  await embeddingService.getDimension();
-  const warmupMs = Date.now() - warmupStart;
-  if (warmupMs > 500) {
-    onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
+function formatExtractionStats(stats) {
+  const typeSummary = Object.entries(stats.atomTypeCounts).sort(([, a], [, b]) => b - a).map(([t4, c]) => `${t4}:${c}`).join(" ");
+  return `${stats.uniqueEntityNames.length} entities, ${stats.relationCount} relations, ` + `${stats.paragraphsWithAtoms}/${stats.paragraphsTotal} paragraphs with atoms | ${typeSummary}`;
+}
+async function runEntityResolution(sections, entityNames, llmService, onProgress) {
+  const candidates = detectResolutionCandidates(entityNames);
+  const noiseCandidates = detectNoiseCandidates(entityNames);
+  if (candidates.length === 0 && noiseCandidates.length === 0) {
+    console.log("[docIndexer] entity resolution: no duplicates or noise candidates, skipping");
+    onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.no_resolution" } });
+    return { llmCalls: 0, totalTokens: 0 };
   }
-  const totalBatches = Math.ceil(totalParagraphs / EMBEDDING_BATCH_SIZE);
-  for (let i = 0;i < totalParagraphs; i += EMBEDDING_BATCH_SIZE) {
-    const batchIndex = Math.floor(i / EMBEDDING_BATCH_SIZE) + 1;
-    const batch2 = paragraphs.slice(i, i + EMBEDDING_BATCH_SIZE);
-    const texts = batch2.map((p4) => p4.text);
-    const batchStart = Date.now();
+  console.log(`[docIndexer] entity resolution: ${candidates.length} duplicate pairs, ${noiseCandidates.length} noise candidates`);
+  onProgress?.({
+    phase: "post-processing",
+    progress: 83,
+    message: { key: "index.doc.msg.resolving", params: { duplicates: candidates.length, noise: noiseCandidates.length } }
+  });
+  try {
+    const prompt = buildEntityResolutionPrompt({
+      allNames: entityNames,
+      candidates,
+      ...noiseCandidates.length > 0 ? { noiseCandidates } : {}
+    });
+    const result = await llmService.generateText(prompt, {
+      systemPrompt: ENTITY_RESOLUTION_SYSTEM_PROMPT
+    });
+    let resolution;
     try {
-      const vectors = await embeddingService.embedBatch(texts);
-      for (let j = 0;j < batch2.length; j++) {
-        embeddings.push({
-          sectionIndex: batch2[j].sectionIndex,
-          paragraphIndex: batch2[j].paragraphIndex,
-          vector: vectors[j]
-        });
-      }
+      resolution = JSON.parse(jsonrepair(result.text));
     } catch {
-      for (let fi = 0;fi < batch2.length; fi++) {
-        const p4 = batch2[fi];
-        try {
-          const vector = await embeddingService.embed(p4.text);
-          embeddings.push({
-            sectionIndex: p4.sectionIndex,
-            paragraphIndex: p4.paragraphIndex,
-            vector
-          });
-        } catch {
-          console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`);
-        }
-        const embedded2 = i + fi + 1;
-        const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
-        onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
-      }
-      continue;
+      console.warn("[docIndexer] entity resolution: failed to parse LLM response, skipping");
+      onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_parse_failed" } });
+      return { llmCalls: 1, totalTokens: result.usage.totalTokens };
     }
-    const embedded = Math.min(i + EMBEDDING_BATCH_SIZE, totalParagraphs);
-    const batchMs = Date.now() - batchStart;
-    const progress = 86 + Math.round(embedded / totalParagraphs * 9);
-    onProgress?.({ phase: "embedding", progress, message: `Batch ${batchIndex}/${totalBatches} (${embedded}/${totalParagraphs}, ${(batchMs / 1000).toFixed(1)}s)` });
-  }
-  digest.embeddings = embeddings;
-  return embeddings.length;
-}
-async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
-  if (digest.embeddings.length === 0)
-    return;
-  try {
-    await vectorStore.deleteByPrefix(`${hashId}:`);
-    await vectorStore.add(digest.embeddings.map((e) => ({
-      id: `${hashId}:${e.sectionIndex}:${e.paragraphIndex}`,
-      embedding: e.vector,
-      metadata: {
-        layer: "digest",
-        sourceId,
-        hashId,
-        sourcePath,
-        sectionIndex: e.sectionIndex,
-        paragraphIndex: e.paragraphIndex
+    const mergeCount = applyEntityMerges(sections, resolution.merges ?? []);
+    const removeCount = removeNoiseEntities(sections, resolution.remove ?? []);
+    onProgress?.({
+      phase: "post-processing",
+      progress: 84,
+      message: {
+        key: "index.doc.msg.resolution_result",
+        params: {
+          merges: resolution.merges?.length ?? 0,
+          mergeRefs: mergeCount,
+          removed: resolution.remove?.length ?? 0,
+          removeRefs: removeCount,
+          ambiguous: resolution.ambiguous?.length ?? 0
+        }
       }
-    })));
+    });
+    return { llmCalls: 1, totalTokens: result.usage.totalTokens };
   } catch (err2) {
-    console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
+    console.warn("[docIndexer] entity resolution LLM call failed (non-blocking):", err2);
+    onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_failed" } });
+    return { llmCalls: 0, totalTokens: 0 };
   }
 }
 async function indexDocument(input) {
@@ -196854,16 +198591,22 @@ async function indexDocument(input) {
     digestStore: digestStore2,
     onProgress
   } = input;
+  const { content: normalizedContent, stats: normalizeStats } = normalizeMarkdown(content);
+  const repairCount = Object.values(normalizeStats.repairs).reduce((a, b) => a + b, 0);
+  if (repairCount > 0) {
+    const repairSummary = Object.entries(normalizeStats.repairs).map(([k, v]) => `${k}:${v}`).join(" ");
+    console.log(`[docIndexer] markdown normalized: ${repairCount} repairs (${repairSummary})`);
+  }
   onProgress?.({ phase: "chunking", progress: 3 });
-  const chunks = chunkMarkdown(content);
-  const parsedSections = parseSections(content);
+  const chunks = chunkMarkdown(normalizedContent);
+  const parsedSections = parseSections(normalizedContent);
   if (chunks.length === 0) {
     throw new Error("Document produced no chunks — content may be empty");
   }
-  onProgress?.({ phase: "chunking", progress: 8, message: `${chunks.length} chunks, ${parsedSections.length} sections` });
+  onProgress?.({ phase: "chunking", progress: 8, message: { key: "index.doc.msg.chunking_result", params: { chunks: chunks.length, sections: parsedSections.length } } });
   const totalChunks = chunks.length;
-  const annotateMsg = input.llmModel ? `${totalChunks} chunks, LLM ${input.llmModel}` : `${totalChunks} chunks`;
-  onProgress?.({ phase: "annotating", progress: 10, message: annotateMsg });
+  const annotateStartMsg = input.llmModel ? { key: "index.doc.msg.annotating_start_model", params: { n: totalChunks, model: input.llmModel } } : { key: "index.doc.msg.annotating_start", params: { n: totalChunks } };
+  onProgress?.({ phase: "annotating", progress: 10, message: annotateStartMsg });
   let completedChunks = 0;
   let totalLlmCalls = 0;
   let totalTokens = 0;
@@ -196873,7 +198616,7 @@ async function indexDocument(input) {
       onProgress?.({
         phase: "annotating",
         progress: baseProgress,
-        message: `Chunk ${completedChunks + 1}/${totalChunks} ${step} (${calls} calls, ${tokens} tokens)`
+        message: { key: "index.doc.msg.annotating_chunk", params: { current: completedChunks + 1, total: totalChunks, step, calls, tokens } }
       });
     });
     completedChunks++;
@@ -196883,7 +198626,7 @@ async function indexDocument(input) {
     onProgress?.({
       phase: "annotating",
       progress,
-      message: `Chunk ${completedChunks}/${totalChunks} done, total ${totalLlmCalls} calls ${totalTokens} tokens`
+      message: { key: "index.doc.msg.annotating_chunk_done", params: { current: completedChunks, total: totalChunks, calls: totalLlmCalls, tokens: totalTokens } }
     });
     return result;
   });
@@ -196894,17 +198637,25 @@ async function indexDocument(input) {
   const sectionsMap = new Map;
   for (let i = 0;i < parsedSections.length; i++) {
     const s = parsedSections[i];
-    const sectionKey = `${i}`;
-    sectionsMap.set(sectionKey, {
+    sectionsMap.set(`${i}`, {
       heading: s.heading,
       level: s.level,
       paragraphs: new Map
     });
-    for (let pIdx = 0;pIdx < s.paragraphs.length; pIdx++) {
-      sectionsMap.get(sectionKey).paragraphs.set(`${i}:${pIdx}`, {
-        text: s.paragraphs[pIdx],
-        atoms: {}
-      });
+  }
+  for (const chunk of chunks) {
+    for (const cp of chunk.paragraphs) {
+      const sectionKey = `${cp.sectionIndex}`;
+      if (!sectionsMap.has(sectionKey)) {
+        sectionsMap.set(sectionKey, { heading: "", level: 0, paragraphs: new Map });
+      }
+      const paragraphKey = `${cp.sectionIndex}:${cp.paragraphIndex}`;
+      if (!sectionsMap.get(sectionKey).paragraphs.has(paragraphKey)) {
+        sectionsMap.get(sectionKey).paragraphs.set(paragraphKey, {
+          text: cp.text,
+          atoms: {}
+        });
+      }
     }
   }
   for (const success2 of chunkProcessResult.successes) {
@@ -196934,6 +198685,25 @@ async function indexDocument(input) {
       ensureAtomConfidence(para.atoms);
     }
   }
+  onProgress?.({ phase: "post-processing", progress: 81, message: { key: "index.doc.msg.post_process_start" } });
+  postProcessDigestAtoms(digestSections);
+  const preStats = collectExtractionStats(digestSections);
+  const statsMsg = formatExtractionStats(preStats);
+  console.log(`[docIndexer] extraction stats: ${statsMsg}`);
+  onProgress?.({ phase: "post-processing", progress: 82, message: {
+    key: "index.doc.msg.extraction_stats",
+    params: {
+      entities: preStats.uniqueEntityNames.length,
+      relations: preStats.relationCount,
+      withAtoms: preStats.paragraphsWithAtoms,
+      totalParas: preStats.paragraphsTotal
+    }
+  } });
+  if ((input.enableEntityResolution ?? true) && preStats.uniqueEntityNames.length > 1) {
+    const resolutionResult = await runEntityResolution(digestSections, preStats.uniqueEntityNames, llmService, onProgress);
+    totalLlmCalls += resolutionResult.llmCalls;
+    totalTokens += resolutionResult.totalTokens;
+  }
   const atomCounts = countAtoms(sectionsMap);
   const paragraphCount = digestSections.reduce((sum, s) => sum + s.paragraphs.length, 0);
   if (paragraphCount === 0) {
@@ -196955,7 +198725,7 @@ async function indexDocument(input) {
       processedAt: new Date().toISOString()
     }
   };
-  const embedMsg = input.embeddingModel ? `Embedding ${input.embeddingModel}` : undefined;
+  const embedMsg = input.embeddingModel ? { key: "index.doc.msg.embedding_model", params: { model: input.embeddingModel } } : undefined;
   onProgress?.({ phase: "embedding", progress: 85, ...embedMsg ? { message: embedMsg } : {} });
   let embeddingCount = 0;
   if (input.embeddingService) {
@@ -197250,45 +199020,55 @@ async function runDocIndexPipeline(opts) {
     const llmModelId = serverConfig2.llm[llmProvider]?.default_model ?? llmProvider;
     const embProvider = serverConfig2.embedding?.provider;
     const embModelId = embProvider ? serverConfig2.embedding[embProvider]?.model_id ?? embProvider : undefined;
+    const fileTimeoutMs = serverConfig2.indexing?.file_timeout_ms ?? 15 * 60 * 1000;
+    const abortSignal = indexTaskManager.getAbortSignal?.(sourceId) ?? null;
     for (let fileIdx = 0;fileIdx < filesToIndex.length; fileIdx++) {
+      if (abortSignal?.aborted) {
+        const reason = typeof abortSignal.reason === "string" ? abortSignal.reason : "Task aborted";
+        console.warn(`[runDocIndexPipeline] aborted before file ${fileIdx + 1}/${filesToIndex.length}: ${reason}`);
+        break;
+      }
       const file2 = filesToIndex[fileIdx];
-      const fileLabel = `[${fileIdx + 1}/${filesToIndex.length}] ${file2.sourcePath}`;
       if (indexTaskManager.hasTask(sourceId)) {
         indexTaskManager.updateProgress(sourceId, {
-          stage: "annotating",
+          stage: "chunking",
           percent: 0,
-          message: `${fileLabel} 开始索引`
+          message: { key: "index.doc.msg.file_start", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
         });
       }
       try {
-        await indexDocument({
-          sourceId,
-          hashId: file2.hashId,
-          sourcePath: file2.sourcePath,
-          content: file2.content,
-          contentType: "markdown",
-          llmService,
-          embeddingService,
-          vectorStore,
-          digestStore: digestStore2,
-          llmModel: `${llmProvider}/${llmModelId}`,
-          ...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
-          onProgress: (p4) => {
-            if (indexTaskManager.hasTask(sourceId)) {
-              indexTaskManager.updateProgress(sourceId, {
-                stage: p4.phase,
-                percent: p4.progress,
-                ...p4.message != null ? { message: `[${fileIdx + 1}/${filesToIndex.length}] ${p4.message}` } : {}
-              });
+        const fileTimeout = new Promise((_, reject) => setTimeout(() => reject(new Error(`File timeout after ${Math.round(fileTimeoutMs / 60000)}min: ${file2.sourcePath}`)), fileTimeoutMs));
+        await Promise.race([
+          indexDocument({
+            sourceId,
+            hashId: file2.hashId,
+            sourcePath: file2.sourcePath,
+            content: file2.content,
+            contentType: "markdown",
+            llmService,
+            embeddingService,
+            vectorStore,
+            digestStore: digestStore2,
+            llmModel: `${llmProvider}/${llmModelId}`,
+            ...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
+            onProgress: (p4) => {
+              if (indexTaskManager.hasTask(sourceId)) {
+                indexTaskManager.updateProgress(sourceId, {
+                  stage: p4.phase,
+                  percent: p4.progress,
+                  ...p4.message != null ? { message: p4.message } : {}
+                });
+              }
             }
-          }
-        });
+          }),
+          fileTimeout
+        ]);
         stored.push({ hash_id: file2.hashId, status: "created" });
         if (indexTaskManager.hasTask(sourceId)) {
           indexTaskManager.updateProgress(sourceId, {
             stage: "storing",
             percent: 100,
-            message: `${fileLabel} 索引完成`
+            message: { key: "index.doc.msg.file_done", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
           });
         }
       } catch (err2) {
@@ -197299,11 +199079,15 @@ async function runDocIndexPipeline(opts) {
           indexTaskManager.updateProgress(sourceId, {
             stage: "annotating",
             percent: 0,
-            message: `${fileLabel} 索引失败: ${msg}`
+            message: { key: "index.doc.msg.file_error", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath, error: msg } }
           });
         }
       }
     }
+    if (abortSignal?.aborted) {
+      console.warn(`[runDocIndexPipeline] pipeline aborted for ${sourceId}, skipping completion`);
+      return;
+    }
     if (stored.length === 0 && errors5.length > 0) {
       const errorCode = errors5[0].code ?? "DOC_INDEX_LLM_EXHAUSTED" /* DOC_INDEX_LLM_EXHAUSTED */;
       indexTaskManager.failTask(sourceId, errors5[0].error, errorCode);
@@ -197417,7 +199201,7 @@ async function handleDocIndex(c, storageProvider, source2) {
     throw new C4AError("DOC_INDEX_EMBEDDING_UNAVAILABLE" /* DOC_INDEX_EMBEDDING_UNAVAILABLE */, "Embedding service not configured", null);
   }
   const modulePaths = modules?.map((m) => m.path);
-  indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths);
+  indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths, serverConfig2.indexing?.task_timeout_ms);
   const hashToPath = new Map;
   for (const sf of latestByPath.values()) {
     hashToPath.set(sf.hash_id, sf.source_path ?? "");
@@ -199261,6 +201045,10 @@ function mergeServerConfig2(parsed) {
         ...isPlainObject5(input.llm?.google) ? input.llm?.google : {}
       }
     },
+    indexing: {
+      ...defaults2.indexing,
+      ...isPlainObject5(input.indexing) ? input.indexing : {}
+    },
     embedding: {
       ...defaults2.embedding,
       ...isPlainObject5(input.embedding) ? input.embedding : {},
@@ -199935,7 +201723,8 @@ import path9 from "node:path";
 import { fileURLToPath } from "node:url";
 // ../server/src/indexTaskManager.ts
-var DEFAULT_INDEX_TASK_TIMEOUT_MS = 20 * 60 * 1000;
+var DEFAULT_INDEX_TASK_TIMEOUT_MS = 150 * 60 * 1000;
+var DEFAULT_FILE_TIMEOUT_MS = 15 * 60 * 1000;
 class IndexTaskManager {
   broadcaster;
@@ -199955,12 +201744,18 @@ class IndexTaskManager {
   getTask(sourceId) {
     return this.indexTasks.get(sourceId) ?? null;
   }
-  createTask(sourceId, machineId, targetCommit, modules) {
+  getAbortSignal(sourceId) {
+    return this.indexTasks.get(sourceId)?.abortController.signal ?? null;
+  }
+  createTask(sourceId, machineId, targetCommit, modules, timeoutMs) {
     const existing = this.indexTasks.get(sourceId);
     if (existing) {
       clearTimeout(existing.timer);
+      existing.abortController.abort("Task replaced by new task");
       this.indexTasks.delete(sourceId);
     }
+    const abortController = new AbortController;
+    const effectiveTimeout = timeoutMs ?? this.timeoutMs;
     const task = {
       sourceId,
       machineId,
@@ -199968,8 +201763,10 @@ class IndexTaskManager {
       startedAt: new Date,
       timer: setTimeout(() => {
         this.timeoutTask(sourceId);
-      }, this.timeoutMs),
+      }, effectiveTimeout),
+      timeoutMs: effectiveTimeout,
       progress: null,
+      abortController,
       ...modules && modules.length > 0 ? { modules } : {}
     };
     this.indexTasks.set(sourceId, task);
@@ -200007,7 +201804,7 @@ class IndexTaskManager {
       clearTimeout(task.timer);
       task.timer = setTimeout(() => {
         this.timeoutTask(sourceId);
-      }, this.timeoutMs);
+      }, task.timeoutMs);
       nextPhase();
       return;
     }
@@ -200026,6 +201823,7 @@ class IndexTaskManager {
       return;
     this.pendingPhases.delete(sourceId);
     clearTimeout(task.timer);
+    task.abortController.abort(error40);
     this.indexTasks.delete(sourceId);
     this.broadcaster.error({
       source_id: sourceId,
@@ -200040,6 +201838,7 @@ class IndexTaskManager {
       return;
     this.pendingPhases.delete(sourceId);
     clearTimeout(task.timer);
+    task.abortController.abort("Task timed out");
     this.indexTasks.delete(sourceId);
     this.broadcaster.timeout({
       source_id: sourceId,
@@ -200055,6 +201854,7 @@ class IndexTaskManager {
   destroy() {
     for (const task of this.indexTasks.values()) {
       clearTimeout(task.timer);
+      task.abortController.abort("Manager destroyed");
     }
     this.indexTasks.clear();
     this.pendingPhases.clear();