npm - @c4a/server-cli - Versions diffs - 0.4.15-alpha.4 → 0.4.15-alpha.6 - Mend

@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/README.md +10 -5
package/index.js +965 -90
package/package.json +1 -1
package/serve.js +2027 -221
package/web/assets/ContentDetail--oZBzWh0.js +1 -0
package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
package/web/assets/ContentDetail-C0zfArPg.js +1 -0
package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
package/web/assets/ContentDetail-D-2xyerw.js +1 -0
package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
package/web/assets/ContentDetail-y0yi2qln.js +1 -0
package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
package/web/assets/EntityDetail-BI3etmj4.js +1 -0
package/web/assets/EntityDetail-C9k4cMVL.js +1 -0
package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
package/web/assets/EntityDetail-DiJPemDY.js +1 -0
package/web/assets/EntityDetail-DihnDvhA.js +1 -0
package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
package/web/assets/RelationDetail-B2gHrceI.js +1 -0
package/web/assets/RelationDetail-BK8C5waL.js +1 -0
package/web/assets/RelationDetail-CEq9vopD.js +1 -0
package/web/assets/RelationDetail-CaYrspaS.js +1 -0
package/web/assets/RelationDetail-CpoGdy25.js +1 -0
package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
package/web/assets/index-BKETuM1m.js +111 -0
package/web/assets/index-BPMqeFze.js +111 -0
package/web/assets/index-BgRuvBL5.js +111 -0
package/web/assets/index-C96WspeJ.css +1 -0
package/web/assets/index-CcrkBEZl.js +111 -0
package/web/assets/index-DGDx8sCs.js +111 -0
package/web/assets/index-DIyAwnqE.js +111 -0
package/web/assets/index-DW1cCA8v.js +111 -0
package/web/assets/index-DiAYi5t8.css +1 -0
package/web/assets/index-FOCWvgW_.css +1 -0
package/web/assets/index-daOjyLzy.css +1 -0
package/web/assets/index-moF8uSEi.js +111 -0
package/web/assets/index-sPNyENFN.js +111 -0
package/web/assets/index-uGqDxUnx.css +1 -0
package/web/index.html +2 -2

package/index.js CHANGED Viewed

@@ -40342,6 +40342,10 @@ var init_serverConfig = __esm(() => {
         default_model: "gemini-3-pro-preview"
       }
     },
+    indexing: {
+      task_timeout_ms: 150 * 60 * 1000,
+      file_timeout_ms: 15 * 60 * 1000
+    },
     embedding: {
       provider: "huggingface",
       huggingface: {
@@ -44401,7 +44405,7 @@ var init_atomsSchema = __esm(() => {
   init_zod();
   init_base();
   init_baseSchema();
-  confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
+  confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
   entityAtomSchema = exports_external.object({
     name: exports_external.string(),
     kind: kindSchema.optional().catch(undefined),
@@ -220955,14 +220959,21 @@ function isRetryableStatus(status) {
 function isAuthStatus(status) {
   return status === 401 || status === 403;
 }
-function isBadRequest(status) {
-  return status === 400;
+function throwLlmError(error40, status) {
+  const detail = toErrorMessage(error40);
+  const statusTag = status ? ` [HTTP ${status}]` : "";
+  if (isAuthStatus(status)) {
+    throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
+  }
+  throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
 }
 class LlmServiceImpl {
   options;
+  supportsTemperature;
   constructor(options) {
     this.options = options;
+    this.supportsTemperature = options.provider !== "openai";
   }
   async generateText(prompt, options) {
     if (this.options.forceStream) {
@@ -220974,7 +220985,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0
       };
       if (options?.systemPrompt) {
@@ -221011,13 +221022,7 @@ class LlmServiceImpl {
         durationMs,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
   async generateTextViaStream(prompt, options) {
@@ -221027,7 +221032,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0
       };
       if (options?.systemPrompt) {
@@ -221065,13 +221070,7 @@ class LlmServiceImpl {
         durationMs: Date.now() - startedAt,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
   streamText(prompt, options) {
@@ -221094,7 +221093,7 @@ class LlmServiceImpl {
         model: this.options.languageModel,
         prompt,
         maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
-        temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
+        ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
         maxRetries: 0,
         onFinish: (event) => {
           const finishEvent = event;
@@ -221140,13 +221139,7 @@ class LlmServiceImpl {
         durationMs: Date.now() - startedAt,
         error: toErrorMessage(error40)
       });
-      if (isAuthStatus(status)) {
-        throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
-      }
-      if (isBadRequest(status)) {
-        throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
-      }
-      throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
+      throwLlmError(error40, status);
     }
   }
 }
@@ -221857,7 +221850,12 @@ function parseExtractionOutput(raw, schema) {
       return { success: false, error: new Error("Empty output") };
     }
     const protocolParsed = tryParseProtocol(trimmed);
-    const parsed = protocolParsed ?? tryParseJson(trimmed);
+    let parsed = protocolParsed ?? tryParseJson(trimmed);
+    if (Array.isArray(parsed)) {
+      parsed = { paragraphs: parsed };
+    }
+    parsed = normalizeFlatOutput(parsed);
+    parsed = stripNulls(parsed);
     const result = schema.safeParse(parsed);
     if (!result.success) {
       return { success: false, error: result.error };
@@ -221927,6 +221925,37 @@ function tryParseJson(raw) {
 function repairAndParse(raw) {
   return JSON.parse(jsonrepair(raw));
 }
+var PARAGRAPH_TAG_RE = /^P\d+$/;
+function normalizeFlatOutput(parsed) {
+  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
+    return parsed;
+  const obj = parsed;
+  if ("paragraphs" in obj)
+    return parsed;
+  const keys = Object.keys(obj);
+  if (keys.length === 0)
+    return { paragraphs: [] };
+  const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
+  if (!allTags)
+    return parsed;
+  const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag) => ({ tag, atoms: obj[tag] }));
+  return { paragraphs };
+}
+function stripNulls(value) {
+  if (value === null)
+    return;
+  if (Array.isArray(value))
+    return value.map(stripNulls);
+  if (typeof value === "object" && value !== null) {
+    const out = {};
+    for (const [k, v] of Object.entries(value)) {
+      if (v !== null)
+        out[k] = stripNulls(v);
+    }
+    return out;
+  }
+  return value;
+}
 function isRecord(value) {
   return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
 }
@@ -222183,20 +222212,20 @@ class GleaningExtractor {
 // ../llm/src/prompts/docAtomAnnotation.ts
 init_src();
 var DOC_ATOM_DEFS = [
-  ["entities", "Named things: systems, services, modules, documents, APIs (NOT people/teams — use roles for those)", entityAtomSchema],
+  ["entities", "Named things with independent identity — something you can ask questions about ('What does X do?', 'Who owns X?'). Examples: systems, services, modules, APIs, products. If it is a value, address, path, or configuration detail, it is an attribute of an entity, not an entity itself. (NOT people/teams — use roles for those). kind: implementation=internal systems/services, external=third-party dependencies, concept=abstract/not-yet-implemented", entityAtomSchema],
   ["relations", "Connections between entities", relationAtomSchema],
   ["behaviors", "Actions/operations: functions, API calls, user actions, workflows", behaviorAtomSchema],
   ["attributes", "Properties of entities", attributeAtomSchema],
   ["states", "Possible states of entities", stateAtomSchema],
-  ["rules", "Business/domain rules: validations, constraints expressed as logic", ruleAtomSchema],
+  ["rules", "Conditional business/domain logic: IF condition THEN consequence (e.g., 'IF user not authenticated THEN reject request')", ruleAtomSchema],
   ["transitions", "State changes: from→to triggered by events or guards", transitionAtomSchema],
   ["events", "Occurrences that trigger behaviors", eventAtomSchema],
   ["decisions", "Architectural or business decisions", decisionAtomSchema],
   ["metrics", "Measurable targets: SLA, throughput, error_rate, with thresholds", metricAtomSchema],
-  ["roles", "Actors: human roles, teams, personas that perform behaviors", roleAtomSchema],
-  ["constraints", "Hard/soft requirements", constraintAtomSchema],
+  ["roles", "Actors: human roles, teams, personas that perform behaviors. kind: human=individual role, team=group/department, persona=user archetype. System-triggered actions use entity relations, NOT roles", roleAtomSchema],
+  ["constraints", "Declarative requirements: 'X must/should/must-not Y' (e.g., 'passwords must be >= 8 chars'). Unlike rules, constraints have no IF-THEN condition — they are unconditional mandates or restrictions", constraintAtomSchema],
   ["comparisons", "Side-by-side evaluations", comparisonAtomSchema],
-  ["boundaries", "System/domain boundaries: what is included vs excluded", boundaryAtomSchema]
+  ["boundaries", "Explicit scope declarations: what is included vs excluded. Only extract when the text explicitly declares scope (e.g., 'this product covers X but NOT Y'). Implicit containment (A runs inside B) is expressed via entity relations, not boundaries", boundaryAtomSchema]
 ];
 function buildAtomTypesBlock() {
   return DOC_ATOM_DEFS.map(([name21, desc, schema], i) => {
@@ -222219,30 +222248,38 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
 ${ATOM_TYPES_BLOCK}
 ## Output Format
-Return a single JSON object with this structure:
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
 {
-  "paragraphs": [
-    {
-      "tag": "P0",
-      "atoms": {
-        "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
-        "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }],
-        "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
-      }
-    }
-  ]
+  "P0": {
+    "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
+    "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
+  },
+  "P3": {
+    "constraints": [{ "description": "User must be authenticated before access", "severity": "must", "confidence": 0.9 }],
+    "rules": [{ "description": "Reject request if user is not authenticated", "expression": "IF !user.isAuthenticated THEN reject", "confidence": 0.85 }]
+  }
 }
 ## Rules
-- Each paragraph tag (P0, P1, ...) corresponds to the tagged paragraph in the input.
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
 - Only include atom types that are actually found in a paragraph (all types are optional).
 - Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
 - **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
 - Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
 - **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
+- **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
+- **Cross-atom reference consistency:** transitions[].from and transitions[].to values MUST exist in states[].values of the same entity. roles[].performs values MUST match names declared in behaviors[].name.
+- **Constraints vs rules distinction:** Use "constraints" for unconditional declarative mandates ('X must Y'). Use "rules" for conditional logic ('IF X THEN Y'). Do not mix them — a requirement with no condition is a constraint, a requirement triggered by a condition is a rule. Do NOT invent a rule for every constraint — only create a rule when the text explicitly states conditional logic.
+- **One statement, multiple atoms:** A single sentence can produce several atom types simultaneously. Do NOT force a choice — extract all that apply. Example: "system uptime must be ≥ 99.9%" → constraint (severity: must) + metric (threshold: "≥ 99.9%").
+- **Relation types:** Use standard relation types when possible: CONTAINS (parent→child composition), DEPENDS_ON (runtime dependency), IMPLEMENTS (code/component→spec realization), PRODUCES (process→output), TRIGGERS (event/process triggering), REFERENCES (weak cross-reference). Only invent a new type when none of these fit.
+- **Decisions:** Extract as "decisions" when the text records a deliberate choice between alternatives with rationale (e.g., "we chose X because Y", "after evaluating A/B/C, selected B"). Do not extract routine descriptions as decisions.
 - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
 - JSON structure keys (tag, atom type names, field names) must always be in English.
 - Be thorough: extract ALL relevant atoms from each paragraph.
+- **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
+- **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
+- **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
 - Do NOT include "claims" — they are system-generated and not part of document extraction.`;
 function buildDocAtomAnnotationPrompt(chunkText) {
   return `Extract all semantic atoms from the following document text.
@@ -222254,6 +222291,13 @@ ${chunkText}
 Return ONLY a valid JSON object. No markdown fences, no explanation.`;
 }
+function toFlatFormat(result) {
+  const flat = {};
+  for (const p of result.paragraphs) {
+    flat[p.tag] = p.atoms;
+  }
+  return flat;
+}
 function buildDocGleaningPrompt(chunkText, previousResult) {
   return `Review the following document text and the previously extracted atoms.
 Check for any MISSING atoms that were not captured in the first pass.
@@ -222262,66 +222306,483 @@ Check for any MISSING atoms that were not captured in the first pass.
 ${chunkText}
 ## Previously Extracted Atoms
-${JSON.stringify(previousResult, null, 2)}
+${JSON.stringify(toFlatFormat(previousResult), null, 2)}
 ## Instructions
-- If you find missing atoms, output them in the same JSON format (with paragraph tags).
+- If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
 - Only include NEW atoms not already in the previous extraction.
 - Every atom MUST include a "confidence" field (0.0-1.0).
-- If nothing is missing, return: {"paragraphs": []}
+- **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
+- If nothing is missing, return: {}
 - Respond in the same language as the input text.
 Return ONLY a valid JSON object. No markdown fences, no explanation.`;
 }
 var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
+// ../llm/src/prompts/entityResolution.ts
+var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
+## Task 1: Merge Duplicates
+- Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
+- Prefer the LONGER, more descriptive name as the canonical name
+- Do NOT merge names that share a substring but refer to different things
+- When uncertain, do NOT merge — add to "ambiguous" instead
+- Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
+- Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
+## Task 2: Remove Noise
+- Apply the **identity test**: a real entity is something you can discuss independently ("What is X?", "How does X work?", "Who owns X?"). Names that fail this test — values, addresses, actions, generic descriptions — are noise.
+- Remove names that are NOT meaningful named entities: generic words, action descriptions, or things that are attributes/values rather than independent subjects
+- Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台) — these all pass the identity test
+- When uncertain, KEEP the name — only remove if it clearly fails the identity test
+## Output
+Valid JSON only. No markdown fences, no explanation.`;
+function buildEntityResolutionPrompt(input) {
+  const parts = [];
+  parts.push(`## All Entity Names (${input.allNames.length} total)`);
+  parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
+`));
+  if (input.candidates.length > 0) {
+    parts.push("");
+    parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
+    parts.push("Review each pair and decide whether to merge:");
+    for (const c of input.candidates) {
+      parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
+    }
+  }
+  if (input.noiseCandidates && input.noiseCandidates.length > 0) {
+    parts.push("");
+    parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
+    parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
+    for (const n of input.noiseCandidates) {
+      parts.push(`- "${n}"`);
+    }
+  }
+  if (input.contextSnippets && input.contextSnippets.length > 0) {
+    parts.push("");
+    parts.push("## Context Snippets");
+    for (const s of input.contextSnippets) {
+      parts.push(`- **${s.name}**: ${s.snippet}`);
+    }
+  }
+  parts.push("");
+  parts.push(`## Output Format
+Return a JSON object:
+{
+  "merges": [
+    { "from": "alias name", "to": "canonical name" }
+  ],
+  "remove": ["noise_name_1", "noise_name_2"],
+  "ambiguous": ["name1", "name2"]
+}
+- "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
+- "remove": names confirmed as noise. They will be deleted from entity list.
+- "ambiguous": names you're unsure about (optional, for logging).
+Return ONLY valid JSON. No markdown fences, no explanation.`);
+  return parts.join(`
+`);
+}
+// ../llm/src/prompts/docTableAnnotation.ts
+init_src();
+var entityFields = zodObjectToPromptFields(entityAtomSchema);
+var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
+var relationFields = zodObjectToPromptFields(relationAtomSchema);
+var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
+var metricFields = zodObjectToPromptFields(metricAtomSchema);
+var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
+var eventFields = zodObjectToPromptFields(eventAtomSchema);
+var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
+var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
+var stateFields = zodObjectToPromptFields(stateAtomSchema);
+var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
+var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
+Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
+## Step 1: Classify the Table
+Determine the table type by examining the relationship between rows:
+### Type A: Collection / Record Table
+**Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
+- Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
+- Key signal: removing one row does not affect the meaning of other rows
+### Type B: Single-Object Property Table
+**Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
+- Examples: API field definitions, configuration schema, entity attribute lists
+- Key signal: all rows refer to the same parent entity
+### Type C: Comparison / Evaluation Table
+**Rows or columns represent different subjects being compared** across the same dimensions.
+- Examples: technology selection, vendor evaluation, feature comparison
+- Key signal: multiple named subjects evaluated on shared criteria
+### Type D: Matrix / Cross-Reference Table
+**Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
+- Examples: permission matrices (role × operation), compatibility matrices, dependency tables
+- Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
+### Type E: Metrics / KPI Table
+**Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
+- Examples: SLA tables, performance baselines, capacity planning tables
+- Key signal: columns include target/threshold/unit/SLA-style values
+### Type F: Timeline / Process Table
+**Rows represent ordered steps or phases** in a sequence.
+- Examples: deployment steps, approval workflows, version changelog, migration plans
+- Key signal: rows have implicit ordering, may have phase/step/date columns
+## Step 2: Extract Atoms by Table Type
+### Type A → Single attribute with row-object array
+1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
+   Entity schema: ${entityFields}
+2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
+   Attribute schema: ${attributeFields}
+   Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
+3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
+4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
+   State schema: ${stateFields}
+   Rule schema: ${ruleFields}
+### Type B → Multiple attribute atoms
+1. Create ONE entity for the parent structure.
+   Entity schema: ${entityFields}
+2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
+   Attribute schema: ${attributeFields}
+3. Extract constraints from "required" or "validation" columns.
+   Constraint schema: ${constraintFields}
+### Type C → Comparison atom
+1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
+   Comparison schema: ${comparisonFields}
+2. Extract "decisions" atoms if the table leads to a conclusion.
+### Type D → Relations or table attribute
+1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
+   Relation schema: ${relationFields}
+   Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
+2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
+3. Create entities for both row headers and column headers if they are named concepts.
+### Type E → Metrics atoms
+1. Create one "metrics" atom per row.
+   Metric schema: ${metricFields}
+2. Also create the parent entity if named (e.g., "SLA Requirements").
+### Type F → Behaviors/Events/Transitions
+1. Create one "behaviors" atom per step/phase.
+   Behavior schema: ${behaviorFields}
+2. If there are triggers: extract "events" atoms.
+   Event schema: ${eventFields}
+3. If there are state changes: extract "transitions" atoms.
+   Transition schema: ${transitionFields}
+4. Create the parent entity for the process/workflow.
+## Output Format
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
+{
+  "P0": {
+    "tableType": "A",
+    "entities": [...],
+    "attributes": [...]
+  },
+  "P3": {
+    "tableType": "C",
+    "comparisons": [...]
+  }
+}
+## Rules
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
+- Every atom MUST include a "confidence" field (0.0-1.0).
+- The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
+- Only include atom types that are actually extracted.
+- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
+- JSON structure keys must always be in English.
+- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
+- Do NOT include "claims" — they are system-generated.`;
+function buildDocTableAnnotationPrompt(tableText) {
+  return `Classify and extract atoms from the following table paragraphs.
+Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.
+---
+${tableText}
+---
+Return ONLY a valid JSON object. No markdown fences, no explanation.`;
+}
+var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
+// ../llm/src/prompts/docDiagramAnnotation.ts
+init_src();
+var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
+var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
+var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
+var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
+var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
+var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
+var roleFields = zodObjectToPromptFields(roleAtomSchema);
+var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
+var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
+var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
+var DIAGRAM_FENCE_TAGS = [
+  "mermaid",
+  "plantuml",
+  "puml",
+  "dot",
+  "graphviz",
+  "viz",
+  "d2",
+  "c4plantuml",
+  "ditaa",
+  "nomnoml",
+  "wavedrom",
+  "vega",
+  "vega-lite"
+];
+var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
+// System prompt text for diagram annotation. It is a module-level template literal, so the
+// ${...Fields} placeholders are interpolated once, when this statement is evaluated.
+// NOTE(review): the *Fields bindings are not defined in this section — presumably earlier
+// in the bundle; confirm they are initialized before this line runs.
+var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
+Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
+## Step 1: Identify the Diagram Format and Type
+### Formats
+- **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
+- **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
+- **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
+- **D2**: modern declarative diagrams with shape/connection syntax
+- **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
+### Diagram Types (by semantic content)
+- **Flowchart / Process**: decision trees, algorithms, business process flows
+- **Sequence**: interaction between participants over time (API calls, protocols)
+- **State Machine**: states and transitions triggered by events/guards
+- **Class / ER**: data models, entity relationships, inheritance hierarchies
+- **Architecture**: system components, containers, deployment topology
+- **Gantt / Timeline**: project schedules, milestones, phases
+- **Pie / Data Viz**: statistical distributions, metrics visualization
+## Step 2: Extract Atoms by Diagram Type
+### Flowchart / Process → entities + relations + behaviors + decisions
+1. Extract each node as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
+   Relation schema: ${relationFields2}
+3. Extract action nodes as behaviors (what the process does at each step).
+   Behavior schema: ${behaviorFields2}
+4. Extract diamond/condition nodes: if it represents a deliberate choice with rationale → "decisions"; if it represents conditional branching logic (IF-THEN) → "rules".
+   Decision schema: ${decisionFields}
+### Sequence → entities + relations + behaviors + events
+1. Extract each participant/actor as an entity (or role if it's a person/team).
+   Entity schema: ${entityFields2}
+   Role schema: ${roleFields}
+2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
+   Relation schema: ${relationFields2}
+3. Extract significant interactions as behaviors.
+   Behavior schema: ${behaviorFields2}
+4. Extract triggers, responses, and async messages as events.
+   Event schema: ${eventFields2}
+### State Machine → entities + states + transitions + events
+1. Extract the state machine subject as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each state as a state atom.
+   State schema: ${stateFields2}
+3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
+   Transition schema: ${transitionFields2}
+4. Extract triggers as events.
+   Event schema: ${eventFields2}
+### Class / ER → entities + attributes + relations
+1. Extract each class/entity as an entity.
+   Entity schema: ${entityFields2}
+2. Extract fields/properties as attributes.
+   Attribute schema: ${attributeFields2}
+3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
+   Relation schema: ${relationFields2}
+### Architecture → entities + relations + constraints
+1. Extract each system/service/container/component as an entity. Use kind to indicate origin: "implementation" for internal systems/services, "external" for third-party dependencies (databases, cloud services, external APIs).
+   Entity schema: ${entityFields2}
+2. Extract connections between components as relations. Use standard types: CONTAINS (parent→child), DEPENDS_ON (runtime dependency), TRIGGERS (event/process triggering).
+   Relation schema: ${relationFields2}
+3. Extract deployment constraints, technology choices.
+   Constraint schema: ${constraintFields2}
+### Gantt / Timeline → behaviors + events + constraints
+1. Extract each task/phase as a behavior.
+   Behavior schema: ${behaviorFields2}
+2. Extract milestones and deadlines as events.
+   Event schema: ${eventFields2}
+3. Extract dependencies and critical path constraints.
+   Constraint schema: ${constraintFields2}
+### Pie / Data Viz → attributes (summary only)
+1. Extract the chart title as an entity.
+   Entity schema: ${entityFields2}
+2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
+   Attribute schema: ${attributeFields2}
+## Additional Extraction: Diagram Description
+For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
+- \`name\`: "diagram_description"
+- \`type\`: "description"
+- \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
+This description is critical for downstream AI consumers who cannot render the diagram.
+## Output Format
+Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
+{
+  "P0": {
+    "diagramFormat": "mermaid",
+    "diagramType": "sequence",
+    "entities": [...],
+    "relations": [...]
+  }
+}
+## Rules
+- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
+- Skip paragraphs with no atoms — do NOT emit empty objects.
+- Every atom MUST include a "confidence" field (0.0-1.0).
+- The "diagramFormat" and "diagramType" fields are required for each paragraph.
+- Only include atom types that are actually extracted.
+- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
+- JSON structure keys must always be in English.
+- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
+- **Extract ALL nodes and edges** — do not sample or skip.
+- Do NOT include "claims" — they are system-generated.`;
+/**
+ * Builds the user prompt for diagram annotation by wrapping the tagged diagram text
+ * between two "---" rules, with instructions before and after.
+ * @param {string} diagramText - Diagram paragraphs, already tagged with [P0], [P1], ...
+ * @returns {string} The complete prompt text.
+ */
+function buildDocDiagramAnnotationPrompt(diagramText) {
+  const promptLines = [
+    "Analyze and extract atoms from the following diagram paragraphs.",
+    "Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.",
+    "---",
+    `${diagramText}`,
+    "---",
+    "Return ONLY a valid JSON object. No markdown fences, no explanation."
+  ];
+  return promptLines.join("\n");
+}
+// Second name bound to the same prompt string as DIAGRAM_SYSTEM_PROMPT.
+var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
 // ../llm/src/chunking/markdownChunker.ts
+// Chunk budget used by chunkMarkdown when options.maxTokens is not supplied.
 var DEFAULT_MAX_TOKENS2 = 4000;
+// Paragraph budget used by chunkMarkdown when options.paragraphMaxTokens is not supplied.
+var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
+/**
+ * Estimates a token count as one token per four UTF-16 code units, rounded up.
+ * @param {string} text2 - Text to measure.
+ * @returns {number} Estimated token count (0 for an empty string).
+ */
 function estimateTokens(text2) {
   return Math.ceil(text2.length / 4);
 }
+/**
+ * Locates fenced code blocks (``` or ~~~ fences that start at column 0 of a line).
+ * A block closes at the next fence line that uses the same character and is at least
+ * as long as the opening fence; an unclosed block runs to the end of the content.
+ * @param {string} content - Markdown text.
+ * @returns {{start: number, end: number}[]} Ranges in document order; `end` is exclusive
+ *   and points just past the closing fence characters.
+ */
+function findCodeBlockRanges(content) {
+  const ranges = [];
+  let open = null;
+  for (const hit of content.matchAll(/^(`{3,}|~{3,})/gm)) {
+    const marker = hit[1];
+    if (open === null) {
+      open = { start: hit.index, char: marker[0], length: marker.length };
+      continue;
+    }
+    // Fences of the other character, or shorter than the opener, are block content.
+    if (marker[0] === open.char && marker.length >= open.length) {
+      ranges.push({ start: open.start, end: hit.index + hit[0].length });
+      open = null;
+    }
+  }
+  if (open !== null) {
+    ranges.push({ start: open.start, end: content.length });
+  }
+  return ranges;
+}
+/**
+ * Tests whether a character offset falls inside one of the given ranges.
+ * The scan stops at the first range that starts after `pos`, so it relies on the
+ * ranges being ordered by `start`.
+ * @param {number} pos - Character offset to test.
+ * @param {{start: number, end: number}[]} ranges - Half-open [start, end) ranges.
+ * @returns {boolean} True when `pos` lies within a range reached by the scan.
+ */
+function isInsideCodeBlock(pos, ranges) {
+  for (let idx = 0; idx < ranges.length; idx++) {
+    const { start, end } = ranges[idx];
+    if (start > pos) {
+      return false;
+    }
+    if (pos < end) {
+      return true;
+    }
+  }
+  return false;
+}
+/**
+ * Splits markdown into sections by heading. Collects ATX ("# ...") and setext
+ * ("===" / "---" underline) headings that lie outside fenced code blocks, drops
+ * overlapping matches, and hands the rest to buildSectionsFromMatches.
+ * @param {string} content - Markdown text; line offsets assume "\n" separators.
+ * @returns The value returned by buildSectionsFromMatches(content, deduped).
+ */
 function parseSections(content) {
-  const headingRe = /^(#{1,6})\s+(.*)$/gm;
-  const sections = [];
+  const codeRanges = findCodeBlockRanges(content);
   const matches = [];
-  let match;
-  while ((match = headingRe.exec(content)) !== null) {
-    matches.push({
-      index: match.index,
-      level: match[1].length,
-      heading: match[2].trim()
-    });
+  // ATX headings: one to six "#" followed by whitespace, skipped when inside a code range.
+  const atxRe = /^(#{1,6})\s+(.*)$/gm;
+  let m;
+  while ((m = atxRe.exec(content)) !== null) {
+    if (!isInsideCodeBlock(m.index, codeRanges)) {
+      matches.push({
+        index: m.index,
+        endIndex: m.index + m[0].length,
+        level: m[1].length,
+        heading: m[2].trim()
+      });
+    }
   }
+  // Setext headings: a non-empty line followed by an underline of 2+ "=" (level 1) or
+  // 2+ "-" (level 2). `offset` is the start offset of the current line.
+  const lines = content.split(`
+`);
+  let offset = 0;
+  for (let i = 0;i < lines.length; i++) {
+    const line = lines[i];
+    if (i > 0) {
+      const prevLine = lines[i - 1].trim();
+      const prevLineStart = offset - lines[i - 1].length - 1;
+      if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
+        if (/^={2,}\s*$/.test(line)) {
+          matches.push({
+            index: prevLineStart < 0 ? 0 : prevLineStart,
+            endIndex: offset + line.length,
+            level: 1,
+            heading: prevLine
+          });
+        } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
+          // A dash underline does not count when the line above is itself 3+ dashes.
+          matches.push({
+            index: prevLineStart < 0 ? 0 : prevLineStart,
+            endIndex: offset + line.length,
+            level: 2,
+            heading: prevLine
+          });
+        }
+      }
+    }
+    offset += line.length + 1;
+  }
+  // Order by position, then drop any match that starts before the previously kept match ends.
+  matches.sort((a, b) => a.index - b.index);
+  const deduped = [];
+  for (const match of matches) {
+    const last = deduped[deduped.length - 1];
+    if (last && match.index < last.endIndex)
+      continue;
+    deduped.push(match);
+  }
+  return buildSectionsFromMatches(content, deduped);
+}
+function buildSectionsFromMatches(content, matches) {
+  const sections = [];
   if (matches.length === 0) {
     const body = content.trim();
     if (body) {
-      sections.push({
-        heading: "",
-        level: 0,
-        body,
-        paragraphs: splitParagraphs(body)
-      });
+      sections.push({ heading: "", level: 0, body, paragraphs: splitParagraphs(body) });
     }
     return sections;
   }
   if (matches[0].index > 0) {
     const preBody = content.slice(0, matches[0].index).trim();
     if (preBody) {
-      sections.push({
-        heading: "",
-        level: 0,
-        body: preBody,
-        paragraphs: splitParagraphs(preBody)
-      });
+      sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
     }
   }
   for (let i = 0;i < matches.length; i++) {
     const m = matches[i];
-    const start = m.index;
-    const end = i + 1 < matches.length ? matches[i + 1].index : content.length;
-    const fullText = content.slice(start, end).trim();
-    const headingLineEnd = fullText.indexOf(`
-`);
-    const body = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
+    const bodyStart = m.endIndex;
+    const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
+    const body = content.slice(bodyStart, bodyEnd).trim();
     sections.push({
       heading: m.heading,
       level: m.level,
@@ -222336,6 +222797,128 @@ function splitParagraphs(text2) {
     return [];
   return text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
 }
+/**
+ * Splits text that exceeds a token budget into smaller pieces, trying progressively
+ * finer boundaries: blank-line paragraphs first, then lines (keeping fenced code blocks
+ * and table runs together), then a hard character break.
+ *
+ * Adjacent small units are packed greedily and joined with a single "\n", so paragraph
+ * breaks inside one packed piece are collapsed to single newlines. Fenced blocks and
+ * table runs are never broken, even when they exceed the budget; an oversized plain
+ * line is hard-broken with forceBreakText.
+ * @param {string} text2 - Text to split.
+ * @param {number} maxTokens - Budget per piece, measured with estimateTokens.
+ * @returns {string[]} Pieces in document order.
+ */
+function splitOversizedText(text2, maxTokens) {
+  // Packs units into "\n"-joined groups under the budget; a unit that is over budget on
+  // its own flushes the current group and is delegated to `splitOversized`.
+  const pack = (units, splitOversized) => {
+    const results = [];
+    let acc = "";
+    let accTokens = 0;
+    const flush = () => {
+      if (acc) {
+        results.push(acc);
+        acc = "";
+        accTokens = 0;
+      }
+    };
+    for (const unit of units) {
+      const unitTokens = estimateTokens(unit);
+      if (unitTokens > maxTokens) {
+        flush();
+        results.push(...splitOversized(unit));
+        continue;
+      }
+      if (accTokens + unitTokens > maxTokens) {
+        flush();
+      }
+      acc = acc ? acc + "\n" + unit : unit;
+      accTokens += unitTokens;
+    }
+    flush();
+    return results;
+  };
+  const paragraphs = text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
+  if (paragraphs.length > 1) {
+    return pack(paragraphs, (part) => splitOversizedText(part, maxTokens));
+  }
+  const lines = text2.split("\n");
+  if (lines.length > 1) {
+    // Multi-line blocks from mergeAtomicBlocks are fences or table runs; single lines that
+    // open a fence or start with "|" are treated as atomic too.
+    const isAtomic = (block) => block.includes("\n") || /^\s*(?:\||`{3,}|~{3,})/.test(block);
+    return pack(mergeAtomicBlocks(lines), (block) => isAtomic(block) ? [block] : forceBreakText(block, maxTokens));
+  }
+  return forceBreakText(text2, maxTokens);
+}
+/**
+ * Groups lines into units that must not be split: a fenced code block (opening fence
+ * through its closing fence, or to the end of input when unclosed) and a run of
+ * consecutive lines starting with "|" each become one "\n"-joined string. Every other
+ * line is passed through unchanged.
+ * @param {string[]} lines - Lines of text, without line terminators.
+ * @returns {string[]} Units in the original order.
+ */
+function mergeAtomicBlocks(lines) {
+  const result = [];
+  let i = 0;
+  while (i < lines.length) {
+    const line = lines[i];
+    const trimmed = line.trimStart();
+    const opener = /^(`{3,}|~{3,})/.exec(trimmed);
+    if (opener) {
+      const fence = opener[1];
+      // Built once per fence: the closer is the same character, at least as long, alone on its line.
+      const closingRe = new RegExp("^" + fence[0] + "{" + fence.length + ",}\\s*$");
+      const blockLines = [line];
+      i++;
+      while (i < lines.length) {
+        const candidate = lines[i];
+        blockLines.push(candidate);
+        i++;
+        if (closingRe.test(candidate.trimStart())) {
+          break;
+        }
+      }
+      result.push(blockLines.join("\n"));
+      continue;
+    }
+    if (trimmed.startsWith("|")) {
+      const tableLines = [line];
+      i++;
+      while (i < lines.length && lines[i].trimStart().startsWith("|")) {
+        tableLines.push(lines[i]);
+        i++;
+      }
+      result.push(tableLines.join("\n"));
+      continue;
+    }
+    result.push(line);
+    i++;
+  }
+  return result;
+}
+/**
+ * Hard-breaks text into pieces of at most `maxTokens * 4` characters. A break is moved
+ * back to the last space when that space lies in the final 30% of the window; otherwise
+ * the text is cut at the window edge, stepping off a surrogate pair so that no
+ * astral-plane character is split. Pieces are trimmed and empty pieces are not emitted.
+ * @param {string} text2 - Text to break.
+ * @param {number} maxTokens - Budget per piece (four characters per token).
+ * @returns {string[]} Pieces in order. Text that already fits is returned as a single,
+ *   untrimmed element; an empty string yields [].
+ */
+function forceBreakText(text2, maxTokens) {
+  const maxChars = maxTokens * 4;
+  // A window below one character can never consume input; return the text unsplit
+  // instead of looping forever (also covers NaN).
+  if (!(maxChars >= 1)) {
+    return text2 ? [text2] : [];
+  }
+  const results = [];
+  let remaining = text2;
+  while (remaining.length > maxChars) {
+    let breakAt = Math.floor(maxChars);
+    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
+    if (spaceIdx > maxChars * 0.7) {
+      breakAt = spaceIdx;
+    } else {
+      const before = remaining.charCodeAt(breakAt - 1);
+      const after = remaining.charCodeAt(breakAt);
+      const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
+      if (splitsPair) {
+        // Step back before the pair; with a one-unit window take the whole pair so progress is kept.
+        breakAt += breakAt > 1 ? -1 : 1;
+      }
+    }
+    const piece = remaining.slice(0, breakAt).trim();
+    if (piece) {
+      results.push(piece);
+    }
+    remaining = remaining.slice(breakAt).trim();
+  }
+  if (remaining) {
+    results.push(remaining);
+  }
+  return results;
+}
 function buildBreadcrumb(sections, sectionIndex) {
   const current = sections[sectionIndex];
   if (current.level <= 0)
@@ -222364,11 +222947,53 @@ function sectionHeadingLine(section) {
     return "";
   return `${"#".repeat(section.level)} ${section.heading}`;
 }
+/**
+ * Builds the document-wide paragraph list used for chunking. Each non-empty section body
+ * becomes one entry, or several when it exceeds `paragraphMaxTokens` (via
+ * splitOversizedText). Consecutive entries that are both under 150 estimated tokens are
+ * then merged with "\n" while the merged size stays within the budget.
+ * NOTE(review): merging is not limited to entries of the same section, and a merged entry
+ * keeps the sectionIndex of its first part — confirm that is intended.
+ * @param {{body: string}[]} sections - Parsed sections; only `body` is read here.
+ * @param {number} paragraphMaxTokens - Budget per paragraph, measured with estimateTokens.
+ * @returns {{sectionIndex: number, paragraphIndex: number, text: string}[]} Paragraphs with
+ *   `paragraphIndex` numbered consecutively from 0 across the whole document.
+ */
+function buildCoarseParagraphs(sections, paragraphMaxTokens) {
+  const MERGE_THRESHOLD = 150;
+  const rawEntries = sections.flatMap((section, sectionIndex) => {
+    if (!section.body.trim()) {
+      return [];
+    }
+    const bodyTokens = estimateTokens(section.body);
+    if (bodyTokens > paragraphMaxTokens) {
+      return splitOversizedText(section.body, paragraphMaxTokens).map((part) => ({
+        sectionIndex,
+        text: part,
+        tokens: estimateTokens(part)
+      }));
+    }
+    return [{ sectionIndex, text: section.body, tokens: bodyTokens }];
+  });
+  const merged = [];
+  for (const entry of rawEntries) {
+    const previous = merged[merged.length - 1];
+    const bothSmall = Boolean(previous) && previous.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD;
+    if (bothSmall && previous.tokens + entry.tokens <= paragraphMaxTokens) {
+      previous.text = previous.text + "\n" + entry.text;
+      previous.tokens += entry.tokens;
+      continue;
+    }
+    // Copy so that later merges never mutate the raw entry.
+    merged.push({ ...entry });
+  }
+  return merged.map((entry, paragraphIndex) => ({
+    sectionIndex: entry.sectionIndex,
+    paragraphIndex,
+    text: entry.text
+  }));
+}
 function chunkMarkdown(content, options = {}) {
   const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
+  const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
   const sections = parseSections(content);
   if (sections.length === 0)
     return [];
+  const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
   const chunks = [];
   let pendingSections = [];
   let pendingTokens = 0;
@@ -222386,14 +223011,16 @@ function chunkMarkdown(content, options = {}) {
       const heading = sectionHeadingLine(entry.section);
       if (heading)
         textParts.push(heading);
-      for (let pIdx = 0;pIdx < entry.section.paragraphs.length; pIdx++) {
-        const pText = entry.section.paragraphs[pIdx];
-        textParts.push(pText);
-        paragraphs.push({
-          sectionIndex: entry.sectionIndex,
-          paragraphIndex: pIdx,
-          text: pText
-        });
+      const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === entry.sectionIndex);
+      for (const p of sectionParas) {
+        if (!paragraphs.some((existing) => existing.paragraphIndex === p.paragraphIndex && existing.text === p.text)) {
+          textParts.push(p.text);
+          paragraphs.push({
+            sectionIndex: p.sectionIndex,
+            paragraphIndex: p.paragraphIndex,
+            text: p.text
+          });
+        }
       }
     }
     chunks.push({
@@ -222416,7 +223043,7 @@ function chunkMarkdown(content, options = {}) {
 ` : "") + section.body);
     if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
       flushPending();
-      splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
+      splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
       continue;
     }
     const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
@@ -222429,9 +223056,10 @@ function chunkMarkdown(content, options = {}) {
   flushPending();
   return chunks;
 }
-function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
+function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
   const headingLine = sectionHeadingLine(section);
   const prefix = breadcrumbPrefix(breadcrumb);
+  const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === sectionIndex);
   let accParagraphs = [];
   let accTextParts = [];
   let accTokens = 0;
@@ -222458,18 +223086,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
     accTokens = baseOverhead;
   }
   accTokens = baseOverhead;
-  for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
-    const pText = section.paragraphs[pIdx];
-    const pTokens = estimateTokens(pText);
+  for (const p of sectionParas) {
+    const pTokens = estimateTokens(p.text);
     if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
       flushAcc();
     }
-    accParagraphs.push({ sectionIndex, paragraphIndex: pIdx, text: pText });
-    accTextParts.push(pText);
+    accParagraphs.push({ sectionIndex, paragraphIndex: p.paragraphIndex, text: p.text });
+    accTextParts.push(p.text);
     accTokens += pTokens;
   }
   flushAcc();
 }
+// ../llm/src/chunking/normalizeMarkdown.ts
+/**
+ * Runs the markdown repair pipeline: strips invisible characters, normalizes line
+ * endings to "\n", then repairs the document line by line with processBlocks.
+ * @param {string} content - Raw markdown text.
+ * @returns {{content: string, stats: {repairs: Object<string, number>}}} The repaired text
+ *   plus a per-category count of how often each repair step reported a fix.
+ */
+function normalizeMarkdown(content) {
+  const repairs = {};
+  const count = (category) => {
+    repairs[category] = (repairs[category] ?? 0) + 1;
+  };
+  const withoutInvisible = stripBomAndInvisible(content, count);
+  const unixText = normalizeLineEndings(withoutInvisible, count);
+  const repairedLines = processBlocks(unixText.split("\n"), count);
+  return { content: repairedLines.join("\n"), stats: { repairs } };
+}
+/**
+ * Removes every U+FEFF (BOM), U+200B, U+200C and U+200D character from the text.
+ * NOTE(review): U+200C/U+200D are also removed inside emoji sequences and in scripts
+ * that use them as joiners — confirm that is acceptable for the inputs handled here.
+ * @param {string} text2 - Input text.
+ * @param {(category: string) => void} count - Called once with "invisible_chars" when
+ *   at least one character was removed.
+ * @returns {string} The text without those characters.
+ */
+function stripBomAndInvisible(text2, count) {
+  let removedAny = false;
+  const cleaned = text2.replace(/[\uFEFF\u200B\u200C\u200D]/g, () => {
+    removedAny = true;
+    return "";
+  });
+  if (removedAny) {
+    count("invisible_chars");
+  }
+  return cleaned;
+}
+/**
+ * Converts "\r\n" and lone "\r" line endings to "\n".
+ * @param {string} text2 - Input text.
+ * @param {(category: string) => void} count - Called once with "line_endings" when the
+ *   text contains a carriage return.
+ * @returns {string} The text with "\n" line endings; returned as-is when it has no "\r".
+ */
+function normalizeLineEndings(text2, count) {
+  if (!text2.includes("\r")) {
+    return text2;
+  }
+  count("line_endings");
+  return text2.replace(/\r\n?/g, "\n");
+}
+/**
+ * Walks the document line by line and routes each block to its repair handler.
+ * Handlers are tried in a fixed order — code fence, table row, blank-line run, HTML
+ * comment, unfenced JSON — and the first applicable one decides how many lines are
+ * consumed. Lines no handler claims are copied through unchanged.
+ * @param {string[]} inputLines - Document lines without terminators.
+ * @param {(category: string) => void} count - Repair counter passed to every handler.
+ * @returns {string[]} The repaired lines.
+ */
+function processBlocks(inputLines, count) {
+  const lines = splitInlineFences(inputLines, count);
+  const output = [];
+  let cursor = 0;
+  while (cursor < lines.length) {
+    const current = lines[cursor];
+    const trimmed = current.trimStart();
+    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
+    let handled = null;
+    if (fenceMatch) {
+      handled = handleCodeFence(lines, cursor, fenceMatch[1], count);
+    } else if (looksLikeTableRow(trimmed)) {
+      handled = handleTableBlock(lines, cursor, count);
+    } else if (trimmed === "") {
+      handled = handleBlankLines(lines, cursor, count);
+    } else if (trimmed.startsWith("<!--")) {
+      handled = handleHtmlComment(lines, cursor, count);
+    } else if (looksLikeJsonBlockStart(trimmed)) {
+      // May return null, in which case the line falls through as ordinary text.
+      handled = handleUnfencedJson(lines, cursor, count);
+    }
+    if (handled) {
+      output.push(...handled.lines);
+      cursor = handled.nextIndex;
+    } else {
+      output.push(current);
+      cursor++;
+    }
+  }
+  return output;
+}
+/**
+ * Copies a fenced code block verbatim, from its opening fence through the closing fence.
+ * When no closing fence exists before the end of the input, the remaining lines are
+ * treated as block content and a closing fence equal to `fence` is appended.
+ * @param {string[]} lines - All document lines.
+ * @param {number} startIdx - Index of the opening fence line.
+ * @param {string} fence - The opening fence characters (a run of "`" or "~").
+ * @param {(category: string) => void} count - Called with "unclosed_code_fence" when a
+ *   closing fence had to be added.
+ * @returns {{lines: string[], nextIndex: number}} The block lines and the index of the
+ *   first line after the block.
+ */
+function handleCodeFence(lines, startIdx, fence, count) {
+  const fenceChar = fence[0] === "`" ? "`" : "~";
+  // Loop-invariant, so built once: same character, at least as long, alone on its line.
+  const closingRe = new RegExp("^" + fenceChar + "{" + fence.length + ",}\\s*$");
+  const result = [lines[startIdx]];
+  for (let i = startIdx + 1; i < lines.length; i++) {
+    result.push(lines[i]);
+    if (closingRe.test(lines[i].trimStart())) {
+      return { lines: result, nextIndex: i + 1 };
+    }
+  }
+  count("unclosed_code_fence");
+  result.push(fence);
+  return { lines: result, nextIndex: lines.length };
+}
+/**
+ * Repairs a run of consecutive table-like lines. Rows without a leading pipe get "| "
+ * prepended (and " |" appended when they do not end with a pipe). When no delimiter row
+ * is present, a "| --- | --- |" row sized to the first row is inserted after it.
+ * A run of fewer than two lines is returned untouched.
+ * @param {string[]} lines - All document lines.
+ * @param {number} startIdx - Index of the first table-like line.
+ * @param {(category: string) => void} count - Called with "table_leading_pipe" per
+ *   re-piped row and "table_missing_separator" when a delimiter row is inserted.
+ * @returns {{lines: string[], nextIndex: number}} The repaired rows and the index of the
+ *   first line after the run.
+ */
+function handleTableBlock(lines, startIdx, count) {
+  const tableLines = [];
+  let i = startIdx;
+  while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
+    tableLines.push(lines[i]);
+    i++;
+  }
+  if (tableLines.length < 2) {
+    return { lines: tableLines, nextIndex: i };
+  }
+  const normalized = tableLines.map((line) => {
+    const trimmed = line.trimStart();
+    if (!trimmed.startsWith("|") && trimmed.includes("|")) {
+      count("table_leading_pipe");
+      return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
+    }
+    return line;
+  });
+  const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
+  if (hasSeparator) {
+    return { lines: normalized, nextIndex: i };
+  }
+  const firstRow = normalized[0].trim();
+  // Every normalized row starts with "|". Only subtract the trailing pipe when the row
+  // really ends with an unescaped one, so "| a | b" counts as two columns, not one.
+  const hasClosingPipe = firstRow.endsWith("|") && !firstRow.endsWith("\\|");
+  const colCount = countPipes(firstRow) - (hasClosingPipe ? 1 : 0);
+  if (colCount < 2) {
+    return { lines: normalized, nextIndex: i };
+  }
+  const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
+  count("table_missing_separator");
+  return { lines: [normalized[0], separator, ...normalized.slice(1)], nextIndex: i };
+}
+/**
+ * Consumes a run of blank (empty or whitespace-only) lines. Runs of three or more are
+ * collapsed to a single empty line; shorter runs are kept exactly as they are.
+ * @param {string[]} lines - All document lines.
+ * @param {number} startIdx - Index of the first blank line.
+ * @param {(category: string) => void} count - Called with "excessive_blank_lines" when a
+ *   run is collapsed.
+ * @returns {{lines: string[], nextIndex: number}} The lines to emit and the index of the
+ *   first non-blank line after the run.
+ */
+function handleBlankLines(lines, startIdx, count) {
+  let end = startIdx;
+  while (end < lines.length && lines[end].trim() === "") {
+    end += 1;
+  }
+  if (end - startIdx <= 2) {
+    return { lines: lines.slice(startIdx, end), nextIndex: end };
+  }
+  count("excessive_blank_lines");
+  return { lines: [""], nextIndex: end };
+}
+/**
+ * Removes an HTML comment that starts on line `startIdx`, whether it closes on the same
+ * line or on a later one. Text that follows the "-->" terminator on the closing line is
+ * kept (re-indented like the opening line) rather than discarded with the comment.
+ * A comment that is never closed is left in place and only its first line is consumed.
+ * @param {string[]} lines - All document lines.
+ * @param {number} startIdx - Index of the line that begins with "<!--".
+ * @param {(category: string) => void} count - Called with "html_comment" when a comment
+ *   is removed.
+ * @returns {{lines: string[], nextIndex: number}} The lines to emit in place of the
+ *   comment and the index of the first line after it.
+ */
+function handleHtmlComment(lines, startIdx, count) {
+  const firstLine = lines[startIdx];
+  const indent = firstLine.slice(0, firstLine.length - firstLine.trimStart().length);
+  for (let i = startIdx; i < lines.length; i++) {
+    const closeAt = lines[i].indexOf("-->");
+    if (closeAt === -1) {
+      continue;
+    }
+    count("html_comment");
+    // Anything after the terminator is visible content, not part of the comment.
+    const tail = lines[i].slice(closeAt + 3).trim();
+    return { lines: tail ? [indent + tail] : [], nextIndex: i + 1 };
+  }
+  return { lines: [firstLine], nextIndex: startIdx + 1 };
+}
+/**
+ * Tells whether a trimmed line could open a multi-line JSON block, i.e. it consists of
+ * nothing but "{" or "[".
+ * @param {string} trimmed - Line text with leading whitespace already removed.
+ * @returns {boolean} True for exactly "{" or "[".
+ */
+function looksLikeJsonBlockStart(trimmed) {
+  return ["{", "["].includes(trimmed);
+}
+// Bracket blocks spanning fewer lines than this are left unfenced by handleUnfencedJson.
+var MIN_JSON_BLOCK_LINES = 5;
+/**
+ * Wraps an unfenced, JSON-looking block in a ```json fence. Starting at `startIdx`, it
+ * tracks bracket depth across lines until the depth returns to zero. This is a bracket
+ * and quote scanner only: it does not validate JSON and does not check that "{" is
+ * closed by "}" rather than "]".
+ * @param lines - All document lines.
+ * @param startIdx - Index of the line that opens the block.
+ * @param count - Called with "unfenced_json_block" when a block is fenced.
+ * @returns {lines, nextIndex} with the fenced block, or null when the block is shorter
+ *   than MIN_JSON_BLOCK_LINES, its last line does not end with the expected closer, the
+ *   depth goes negative, or the input ends before the block closes.
+ */
+function handleUnfencedJson(lines, startIdx, count) {
+  const opener = lines[startIdx].trimStart();
+  const openChar = opener[0];
+  const closeChar = openChar === "{" ? "}" : "]";
+  let depth = 0;
+  let i = startIdx;
+  // Declared outside the line loop, so an unterminated quote carries over to the next line.
+  let inString = false;
+  while (i < lines.length) {
+    const line = lines[i];
+    for (let c = 0;c < line.length; c++) {
+      const ch = line[c];
+      // Inside a string, a backslash escapes the next character (skip it).
+      if (ch === "\\" && inString) {
+        c++;
+        continue;
+      }
+      if (ch === '"') {
+        inString = !inString;
+        continue;
+      }
+      if (inString)
+        continue;
+      // Outside a string, "//" ends scanning for this line.
+      if (ch === "/" && c + 1 < line.length && line[c + 1] === "/") {
+        break;
+      }
+      if (ch === "{" || ch === "[")
+        depth++;
+      else if (ch === "}" || ch === "]")
+        depth--;
+    }
+    i++;
+    // Depth is checked once per line, after the whole line has been scanned.
+    if (depth === 0) {
+      const blockLen = i - startIdx;
+      if (blockLen < MIN_JSON_BLOCK_LINES) {
+        return null;
+      }
+      const lastTrimmed = lines[i - 1].trimEnd();
+      if (!lastTrimmed.endsWith(closeChar)) {
+        return null;
+      }
+      count("unfenced_json_block");
+      const fenced = ["```json"];
+      for (let j = startIdx;j < i; j++) {
+        fenced.push(lines[j]);
+      }
+      fenced.push("```");
+      return { lines: fenced, nextIndex: i };
+    }
+    if (depth < 0) {
+      return null;
+    }
+  }
+  return null;
+}
+/**
+ * Moves a code fence that trails other text onto its own line. A line such as
+ * "Example: ```js" becomes "Example:" followed by "```js". Lines that already begin with
+ * a fence are left alone.
+ * @param {string[]} lines - Document lines without terminators.
+ * @param {(category: string) => void} count - Called with "inline_code_fence" per split line.
+ * @returns {string[]} The lines, with each split line replaced by two.
+ */
+function splitInlineFences(lines, count) {
+  return lines.flatMap((line) => {
+    const body = line.trimStart();
+    if (/^(`{3,}|~{3,})/.test(body)) {
+      return [line];
+    }
+    const trailing = /(`{3,}|~{3,})(\S*)\s*$/.exec(body);
+    if (trailing === null) {
+      return [line];
+    }
+    // Cut at the last occurrence of the matched fence string.
+    const cut = body.lastIndexOf(trailing[1]);
+    const prose = body.slice(0, cut);
+    if (prose.trim().length === 0) {
+      return [line];
+    }
+    const indent = line.slice(0, line.length - body.length);
+    count("inline_code_fence");
+    return [indent + prose.trimEnd(), body.slice(cut)];
+  });
+}
+/**
+ * Heuristic table-row test: the line contains at least one unescaped pipe and does not
+ * start with "#", "```" or "~~~".
+ * @param {string} trimmed - Line text with leading whitespace already removed.
+ * @returns {boolean} True when the line should be treated as a table row.
+ */
+function looksLikeTableRow(trimmed) {
+  const excluded = ["#", "```", "~~~"].some((prefix) => trimmed.startsWith(prefix));
+  return !excluded && countPipes(trimmed) >= 1;
+}
+/**
+ * Counts the pipe characters that are not immediately preceded by a backslash.
+ * @param {string} text2 - Text to scan.
+ * @returns {number} Number of unescaped "|" characters.
+ */
+function countPipes(text2) {
+  const unescaped = text2.match(/(?<!\\)\|/g);
+  return unescaped === null ? 0 : unescaped.length;
+}
 // ../llm/src/utils/mapConcurrent.ts
 async function mapConcurrent(items, concurrency, fn) {
   const results = [];