@c4a/server-cli 0.4.15-alpha.3 → 0.4.15-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +10 -5
  2. package/index.js +954 -85
  3. package/package.json +1 -1
  4. package/serve.js +8948 -5401
  5. package/wasm/tree-sitter-tsx.wasm +0 -0
  6. package/wasm/tree-sitter-typescript.wasm +0 -0
  7. package/wasm/tree-sitter.wasm +0 -0
  8. package/web/assets/ContentDetail--oZBzWh0.js +1 -0
  9. package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
  10. package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
  11. package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
  12. package/web/assets/ContentDetail-D-2xyerw.js +1 -0
  13. package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
  14. package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
  15. package/web/assets/ContentDetail-y0yi2qln.js +1 -0
  16. package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
  17. package/web/assets/EntityDetail-BI3etmj4.js +1 -0
  18. package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
  19. package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
  20. package/web/assets/EntityDetail-DiJPemDY.js +1 -0
  21. package/web/assets/EntityDetail-DihnDvhA.js +1 -0
  22. package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
  23. package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
  24. package/web/assets/RelationDetail-B2gHrceI.js +1 -0
  25. package/web/assets/RelationDetail-CEq9vopD.js +1 -0
  26. package/web/assets/RelationDetail-CaYrspaS.js +1 -0
  27. package/web/assets/RelationDetail-CpoGdy25.js +1 -0
  28. package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
  29. package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
  30. package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
  31. package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
  32. package/web/assets/index-BPMqeFze.js +111 -0
  33. package/web/assets/index-BgRuvBL5.js +111 -0
  34. package/web/assets/index-CcrkBEZl.js +111 -0
  35. package/web/assets/index-DGDx8sCs.js +111 -0
  36. package/web/assets/index-DIyAwnqE.js +111 -0
  37. package/web/assets/index-DW1cCA8v.js +111 -0
  38. package/web/assets/index-DiAYi5t8.css +1 -0
  39. package/web/assets/index-FOCWvgW_.css +1 -0
  40. package/web/assets/index-daOjyLzy.css +1 -0
  41. package/web/assets/index-moF8uSEi.js +111 -0
  42. package/web/assets/index-sPNyENFN.js +111 -0
  43. package/web/assets/index-uGqDxUnx.css +1 -0
  44. package/web/index.html +2 -2
package/index.js CHANGED
@@ -40342,6 +40342,10 @@ var init_serverConfig = __esm(() => {
40342
40342
  default_model: "gemini-3-pro-preview"
40343
40343
  }
40344
40344
  },
40345
+ indexing: {
40346
+ task_timeout_ms: 150 * 60 * 1000,
40347
+ file_timeout_ms: 15 * 60 * 1000
40348
+ },
40345
40349
  embedding: {
40346
40350
  provider: "huggingface",
40347
40351
  huggingface: {
@@ -44401,7 +44405,7 @@ var init_atomsSchema = __esm(() => {
44401
44405
  init_zod();
44402
44406
  init_base();
44403
44407
  init_baseSchema();
44404
- confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
44408
+ confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
44405
44409
  entityAtomSchema = exports_external.object({
44406
44410
  name: exports_external.string(),
44407
44411
  kind: kindSchema.optional().catch(undefined),
@@ -220955,14 +220959,21 @@ function isRetryableStatus(status) {
220955
220959
  function isAuthStatus(status) {
220956
220960
  return status === 401 || status === 403;
220957
220961
  }
220958
- function isBadRequest(status) {
220959
- return status === 400;
220962
+ function throwLlmError(error40, status) {
220963
+ const detail = toErrorMessage(error40);
220964
+ const statusTag = status ? ` [HTTP ${status}]` : "";
220965
+ if (isAuthStatus(status)) {
220966
+ throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
220967
+ }
220968
+ throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
220960
220969
  }
220961
220970
 
220962
220971
  class LlmServiceImpl {
220963
220972
  options;
220973
+ supportsTemperature;
220964
220974
  constructor(options) {
220965
220975
  this.options = options;
220976
+ this.supportsTemperature = options.provider !== "openai";
220966
220977
  }
220967
220978
  async generateText(prompt, options) {
220968
220979
  if (this.options.forceStream) {
@@ -220974,7 +220985,7 @@ class LlmServiceImpl {
220974
220985
  model: this.options.languageModel,
220975
220986
  prompt,
220976
220987
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
220977
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
220988
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
220978
220989
  maxRetries: 0
220979
220990
  };
220980
220991
  if (options?.systemPrompt) {
@@ -221011,13 +221022,7 @@ class LlmServiceImpl {
221011
221022
  durationMs,
221012
221023
  error: toErrorMessage(error40)
221013
221024
  });
221014
- if (isAuthStatus(status)) {
221015
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221016
- }
221017
- if (isBadRequest(status)) {
221018
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221019
- }
221020
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221025
+ throwLlmError(error40, status);
221021
221026
  }
221022
221027
  }
221023
221028
  async generateTextViaStream(prompt, options) {
@@ -221027,7 +221032,7 @@ class LlmServiceImpl {
221027
221032
  model: this.options.languageModel,
221028
221033
  prompt,
221029
221034
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
221030
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
221035
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
221031
221036
  maxRetries: 0
221032
221037
  };
221033
221038
  if (options?.systemPrompt) {
@@ -221065,13 +221070,7 @@ class LlmServiceImpl {
221065
221070
  durationMs: Date.now() - startedAt,
221066
221071
  error: toErrorMessage(error40)
221067
221072
  });
221068
- if (isAuthStatus(status)) {
221069
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221070
- }
221071
- if (isBadRequest(status)) {
221072
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221073
- }
221074
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221073
+ throwLlmError(error40, status);
221075
221074
  }
221076
221075
  }
221077
221076
  streamText(prompt, options) {
@@ -221094,7 +221093,7 @@ class LlmServiceImpl {
221094
221093
  model: this.options.languageModel,
221095
221094
  prompt,
221096
221095
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
221097
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
221096
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
221098
221097
  maxRetries: 0,
221099
221098
  onFinish: (event) => {
221100
221099
  const finishEvent = event;
@@ -221140,13 +221139,7 @@ class LlmServiceImpl {
221140
221139
  durationMs: Date.now() - startedAt,
221141
221140
  error: toErrorMessage(error40)
221142
221141
  });
221143
- if (isAuthStatus(status)) {
221144
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221145
- }
221146
- if (isBadRequest(status)) {
221147
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221148
- }
221149
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221142
+ throwLlmError(error40, status);
221150
221143
  }
221151
221144
  }
221152
221145
  }
@@ -221857,7 +221850,12 @@ function parseExtractionOutput(raw, schema) {
221857
221850
  return { success: false, error: new Error("Empty output") };
221858
221851
  }
221859
221852
  const protocolParsed = tryParseProtocol(trimmed);
221860
- const parsed = protocolParsed ?? tryParseJson(trimmed);
221853
+ let parsed = protocolParsed ?? tryParseJson(trimmed);
221854
+ if (Array.isArray(parsed)) {
221855
+ parsed = { paragraphs: parsed };
221856
+ }
221857
+ parsed = normalizeFlatOutput(parsed);
221858
+ parsed = stripNulls(parsed);
221861
221859
  const result = schema.safeParse(parsed);
221862
221860
  if (!result.success) {
221863
221861
  return { success: false, error: result.error };
@@ -221927,6 +221925,37 @@ function tryParseJson(raw) {
221927
221925
  function repairAndParse(raw) {
221928
221926
  return JSON.parse(jsonrepair(raw));
221929
221927
  }
221928
+ var PARAGRAPH_TAG_RE = /^P\d+$/;
221929
+ function normalizeFlatOutput(parsed) {
221930
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
221931
+ return parsed;
221932
+ const obj = parsed;
221933
+ if ("paragraphs" in obj)
221934
+ return parsed;
221935
+ const keys = Object.keys(obj);
221936
+ if (keys.length === 0)
221937
+ return { paragraphs: [] };
221938
+ const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
221939
+ if (!allTags)
221940
+ return parsed;
221941
+ const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag) => ({ tag, atoms: obj[tag] }));
221942
+ return { paragraphs };
221943
+ }
221944
+ function stripNulls(value) {
221945
+ if (value === null)
221946
+ return;
221947
+ if (Array.isArray(value))
221948
+ return value.map(stripNulls);
221949
+ if (typeof value === "object" && value !== null) {
221950
+ const out = {};
221951
+ for (const [k, v] of Object.entries(value)) {
221952
+ if (v !== null)
221953
+ out[k] = stripNulls(v);
221954
+ }
221955
+ return out;
221956
+ }
221957
+ return value;
221958
+ }
221930
221959
  function isRecord(value) {
221931
221960
  return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
221932
221961
  }
@@ -222219,30 +222248,32 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
222219
222248
  ${ATOM_TYPES_BLOCK}
222220
222249
 
222221
222250
  ## Output Format
222222
- Return a single JSON object with this structure:
222251
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222223
222252
  {
222224
- "paragraphs": [
222225
- {
222226
- "tag": "P0",
222227
- "atoms": {
222228
- "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
222229
- "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }],
222230
- "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
222231
- }
222232
- }
222233
- ]
222253
+ "P0": {
222254
+ "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
222255
+ "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
222256
+ },
222257
+ "P3": {
222258
+ "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
222259
+ }
222234
222260
  }
222235
222261
 
222236
222262
  ## Rules
222237
- - Each paragraph tag (P0, P1, ...) corresponds to the tagged paragraph in the input.
222263
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222264
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222238
222265
  - Only include atom types that are actually found in a paragraph (all types are optional).
222239
222266
  - Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
222240
222267
  - **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
222241
222268
  - Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
222242
222269
  - **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
222270
+ - **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
222243
222271
  - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
222244
222272
  - JSON structure keys (tag, atom type names, field names) must always be in English.
222245
222273
  - Be thorough: extract ALL relevant atoms from each paragraph.
222274
+ - **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
222275
+ - **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
222276
+ - **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
222246
222277
  - Do NOT include "claims" — they are system-generated and not part of document extraction.`;
222247
222278
  function buildDocAtomAnnotationPrompt(chunkText) {
222248
222279
  return `Extract all semantic atoms from the following document text.
@@ -222254,6 +222285,13 @@ ${chunkText}
222254
222285
 
222255
222286
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
222256
222287
  }
222288
+ function toFlatFormat(result) {
222289
+ const flat = {};
222290
+ for (const p of result.paragraphs) {
222291
+ flat[p.tag] = p.atoms;
222292
+ }
222293
+ return flat;
222294
+ }
222257
222295
  function buildDocGleaningPrompt(chunkText, previousResult) {
222258
222296
  return `Review the following document text and the previously extracted atoms.
222259
222297
  Check for any MISSING atoms that were not captured in the first pass.
@@ -222262,66 +222300,483 @@ Check for any MISSING atoms that were not captured in the first pass.
222262
222300
  ${chunkText}
222263
222301
 
222264
222302
  ## Previously Extracted Atoms
222265
- ${JSON.stringify(previousResult, null, 2)}
222303
+ ${JSON.stringify(toFlatFormat(previousResult), null, 2)}
222266
222304
 
222267
222305
  ## Instructions
222268
- - If you find missing atoms, output them in the same JSON format (with paragraph tags).
222306
+ - If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
222269
222307
  - Only include NEW atoms not already in the previous extraction.
222270
222308
  - Every atom MUST include a "confidence" field (0.0-1.0).
222271
- - If nothing is missing, return: {"paragraphs": []}
222309
+ - **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
222310
+ - If nothing is missing, return: {}
222272
222311
  - Respond in the same language as the input text.
222273
222312
 
222274
222313
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
222275
222314
  }
222276
222315
  var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
222316
+ // ../llm/src/prompts/entityResolution.ts
222317
+ var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
222318
+
222319
+ ## Task 1: Merge Duplicates
222320
+ - Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
222321
+ - Prefer the LONGER, more descriptive name as the canonical name
222322
+ - Do NOT merge names that share a substring but refer to different things
222323
+ - When uncertain, do NOT merge — add to "ambiguous" instead
222324
+ - Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
222325
+ - Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
222326
+
222327
+ ## Task 2: Remove Noise
222328
+ - Remove names that are NOT meaningful named entities — they are generic words, actions, or descriptions
222329
+ - Examples of noise: common verbs/nouns (登录, 路由, 直连), generic technical terms (Env, query), action descriptions (Kill 3001 进程)
222330
+ - Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台)
222331
+ - When uncertain, KEEP the name — only remove if clearly not a named entity
222332
+
222333
+ ## Output
222334
+ Valid JSON only. No markdown fences, no explanation.`;
222335
+ function buildEntityResolutionPrompt(input) {
222336
+ const parts = [];
222337
+ parts.push(`## All Entity Names (${input.allNames.length} total)`);
222338
+ parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
222339
+ `));
222340
+ if (input.candidates.length > 0) {
222341
+ parts.push("");
222342
+ parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
222343
+ parts.push("Review each pair and decide whether to merge:");
222344
+ for (const c of input.candidates) {
222345
+ parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
222346
+ }
222347
+ }
222348
+ if (input.noiseCandidates && input.noiseCandidates.length > 0) {
222349
+ parts.push("");
222350
+ parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
222351
+ parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
222352
+ for (const n of input.noiseCandidates) {
222353
+ parts.push(`- "${n}"`);
222354
+ }
222355
+ }
222356
+ if (input.contextSnippets && input.contextSnippets.length > 0) {
222357
+ parts.push("");
222358
+ parts.push("## Context Snippets");
222359
+ for (const s of input.contextSnippets) {
222360
+ parts.push(`- **${s.name}**: ${s.snippet}`);
222361
+ }
222362
+ }
222363
+ parts.push("");
222364
+ parts.push(`## Output Format
222365
+ Return a JSON object:
222366
+ {
222367
+ "merges": [
222368
+ { "from": "alias name", "to": "canonical name" }
222369
+ ],
222370
+ "remove": ["noise_name_1", "noise_name_2"],
222371
+ "ambiguous": ["name1", "name2"]
222372
+ }
222373
+
222374
+ - "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
222375
+ - "remove": names confirmed as noise. They will be deleted from entity list.
222376
+ - "ambiguous": names you're unsure about (optional, for logging).
222377
+
222378
+ Return ONLY valid JSON. No markdown fences, no explanation.`);
222379
+ return parts.join(`
222380
+ `);
222381
+ }
222382
+ // ../llm/src/prompts/docTableAnnotation.ts
222383
+ init_src();
222384
+ var entityFields = zodObjectToPromptFields(entityAtomSchema);
222385
+ var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
222386
+ var relationFields = zodObjectToPromptFields(relationAtomSchema);
222387
+ var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
222388
+ var metricFields = zodObjectToPromptFields(metricAtomSchema);
222389
+ var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
222390
+ var eventFields = zodObjectToPromptFields(eventAtomSchema);
222391
+ var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
222392
+ var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
222393
+ var stateFields = zodObjectToPromptFields(stateAtomSchema);
222394
+ var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
222395
+ var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
222396
+
222397
+ Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
222398
+
222399
+ ## Step 1: Classify the Table
222400
+
222401
+ Determine the table type by examining the relationship between rows:
222402
+
222403
+ ### Type A: Collection / Record Table
222404
+ **Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
222405
+ - Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
222406
+ - Key signal: removing one row does not affect the meaning of other rows
222407
+
222408
+ ### Type B: Single-Object Property Table
222409
+ **Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
222410
+ - Examples: API field definitions, configuration schema, entity attribute lists
222411
+ - Key signal: all rows refer to the same parent entity
222412
+
222413
+ ### Type C: Comparison / Evaluation Table
222414
+ **Rows or columns represent different subjects being compared** across the same dimensions.
222415
+ - Examples: technology selection, vendor evaluation, feature comparison
222416
+ - Key signal: multiple named subjects evaluated on shared criteria
222417
+
222418
+ ### Type D: Matrix / Cross-Reference Table
222419
+ **Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
222420
+ - Examples: permission matrices (role × operation), compatibility matrices, dependency tables
222421
+ - Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
222422
+
222423
+ ### Type E: Metrics / KPI Table
222424
+ **Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
222425
+ - Examples: SLA tables, performance baselines, capacity planning tables
222426
+ - Key signal: columns include target/threshold/unit/SLA-style values
222427
+
222428
+ ### Type F: Timeline / Process Table
222429
+ **Rows represent ordered steps or phases** in a sequence.
222430
+ - Examples: deployment steps, approval workflows, version changelog, migration plans
222431
+ - Key signal: rows have implicit ordering, may have phase/step/date columns
222432
+
222433
+ ## Step 2: Extract Atoms by Table Type
222434
+
222435
+ ### Type A → Single attribute with row-object array
222436
+ 1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
222437
+ Entity schema: ${entityFields}
222438
+ 2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
222439
+ Attribute schema: ${attributeFields}
222440
+ Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
222441
+ 3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
222442
+ 4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
222443
+ State schema: ${stateFields}
222444
+ Rule schema: ${ruleFields}
222445
+
222446
+ ### Type B → Multiple attribute atoms
222447
+ 1. Create ONE entity for the parent structure.
222448
+ Entity schema: ${entityFields}
222449
+ 2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
222450
+ Attribute schema: ${attributeFields}
222451
+ 3. Extract constraints from "required" or "validation" columns.
222452
+ Constraint schema: ${constraintFields}
222453
+
222454
+ ### Type C → Comparison atom
222455
+ 1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
222456
+ Comparison schema: ${comparisonFields}
222457
+ 2. Extract "decisions" atoms if the table leads to a conclusion.
222458
+
222459
+ ### Type D → Relations or table attribute
222460
+ 1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
222461
+ Relation schema: ${relationFields}
222462
+ Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
222463
+ 2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
222464
+ 3. Create entities for both row headers and column headers if they are named concepts.
222465
+
222466
+ ### Type E → Metrics atoms
222467
+ 1. Create one "metrics" atom per row.
222468
+ Metric schema: ${metricFields}
222469
+ 2. Also create the parent entity if named (e.g., "SLA Requirements").
222470
+
222471
+ ### Type F → Behaviors/Events/Transitions
222472
+ 1. Create one "behaviors" atom per step/phase.
222473
+ Behavior schema: ${behaviorFields}
222474
+ 2. If there are triggers: extract "events" atoms.
222475
+ Event schema: ${eventFields}
222476
+ 3. If there are state changes: extract "transitions" atoms.
222477
+ Transition schema: ${transitionFields}
222478
+ 4. Create the parent entity for the process/workflow.
222479
+
222480
+ ## Output Format
222481
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222482
+ {
222483
+ "P0": {
222484
+ "tableType": "A",
222485
+ "entities": [...],
222486
+ "attributes": [...]
222487
+ },
222488
+ "P3": {
222489
+ "tableType": "C",
222490
+ "comparisons": [...]
222491
+ }
222492
+ }
222493
+
222494
+ ## Rules
222495
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222496
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222497
+ - Every atom MUST include a "confidence" field (0.0-1.0).
222498
+ - The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
222499
+ - Only include atom types that are actually extracted.
222500
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
222501
+ - JSON structure keys must always be in English.
222502
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
222503
+ - Do NOT include "claims" — they are system-generated.`;
222504
+ function buildDocTableAnnotationPrompt(tableText) {
222505
+ return `Classify and extract atoms from the following table paragraphs.
222506
+ Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.
222507
+
222508
+ ---
222509
+ ${tableText}
222510
+ ---
222511
+
222512
+ Return ONLY a valid JSON object. No markdown fences, no explanation.`;
222513
+ }
222514
+ var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
222515
+ // ../llm/src/prompts/docDiagramAnnotation.ts
222516
+ init_src();
222517
+ var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
222518
+ var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
222519
+ var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
222520
+ var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
222521
+ var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
222522
+ var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
222523
+ var roleFields = zodObjectToPromptFields(roleAtomSchema);
222524
+ var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
222525
+ var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
222526
+ var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
222527
+ var DIAGRAM_FENCE_TAGS = [
222528
+ "mermaid",
222529
+ "plantuml",
222530
+ "puml",
222531
+ "dot",
222532
+ "graphviz",
222533
+ "viz",
222534
+ "d2",
222535
+ "c4plantuml",
222536
+ "ditaa",
222537
+ "nomnoml",
222538
+ "wavedrom",
222539
+ "vega",
222540
+ "vega-lite"
222541
+ ];
222542
+ var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
222543
+ var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
222544
+
222545
+ Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
222546
+
222547
+ ## Step 1: Identify the Diagram Format and Type
222548
+
222549
+ ### Formats
222550
+ - **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
222551
+ - **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
222552
+ - **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
222553
+ - **D2**: modern declarative diagrams with shape/connection syntax
222554
+ - **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
222555
+
222556
+ ### Diagram Types (by semantic content)
222557
+ - **Flowchart / Process**: decision trees, algorithms, business process flows
222558
+ - **Sequence**: interaction between participants over time (API calls, protocols)
222559
+ - **State Machine**: states and transitions triggered by events/guards
222560
+ - **Class / ER**: data models, entity relationships, inheritance hierarchies
222561
+ - **Architecture**: system components, containers, deployment topology
222562
+ - **Gantt / Timeline**: project schedules, milestones, phases
222563
+ - **Pie / Data Viz**: statistical distributions, metrics visualization
222564
+
222565
+ ## Step 2: Extract Atoms by Diagram Type
222566
+
222567
+ ### Flowchart / Process → entities + relations + behaviors + decisions
222568
+ 1. Extract each node as an entity.
222569
+ Entity schema: ${entityFields2}
222570
+ 2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
222571
+ Relation schema: ${relationFields2}
222572
+ 3. Extract action nodes as behaviors (what the process does at each step).
222573
+ Behavior schema: ${behaviorFields2}
222574
+ 4. Extract diamond/decision nodes as decisions.
222575
+ Decision schema: ${decisionFields}
222576
+
222577
+ ### Sequence → entities + relations + behaviors + events
222578
+ 1. Extract each participant/actor as an entity (or role if it's a person/team).
222579
+ Entity schema: ${entityFields2}
222580
+ Role schema: ${roleFields}
222581
+ 2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
222582
+ Relation schema: ${relationFields2}
222583
+ 3. Extract significant interactions as behaviors.
222584
+ Behavior schema: ${behaviorFields2}
222585
+ 4. Extract triggers, responses, and async messages as events.
222586
+ Event schema: ${eventFields2}
222587
+
222588
+ ### State Machine → entities + states + transitions + events
222589
+ 1. Extract the state machine subject as an entity.
222590
+ Entity schema: ${entityFields2}
222591
+ 2. Extract each state as a state atom.
222592
+ State schema: ${stateFields2}
222593
+ 3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
222594
+ Transition schema: ${transitionFields2}
222595
+ 4. Extract triggers as events.
222596
+ Event schema: ${eventFields2}
222597
+
222598
+ ### Class / ER → entities + attributes + relations
222599
+ 1. Extract each class/entity as an entity.
222600
+ Entity schema: ${entityFields2}
222601
+ 2. Extract fields/properties as attributes.
222602
+ Attribute schema: ${attributeFields2}
222603
+ 3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
222604
+ Relation schema: ${relationFields2}
222605
+
222606
+ ### Architecture → entities + relations + constraints
222607
+ 1. Extract each system/service/container/component as an entity.
222608
+ Entity schema: ${entityFields2}
222609
+ 2. Extract connections between components as relations.
222610
+ Relation schema: ${relationFields2}
222611
+ 3. Extract deployment constraints, technology choices.
222612
+ Constraint schema: ${constraintFields2}
222613
+
222614
+ ### Gantt / Timeline → behaviors + events + constraints
222615
+ 1. Extract each task/phase as a behavior.
222616
+ Behavior schema: ${behaviorFields2}
222617
+ 2. Extract milestones and deadlines as events.
222618
+ Event schema: ${eventFields2}
222619
+ 3. Extract dependencies and critical path constraints.
222620
+ Constraint schema: ${constraintFields2}
222621
+
222622
+ ### Pie / Data Viz → attributes (summary only)
222623
+ 1. Extract the chart title as an entity.
222624
+ Entity schema: ${entityFields2}
222625
+ 2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
222626
+ Attribute schema: ${attributeFields2}
222627
+
222628
+ ## Additional Extraction: Diagram Description
222629
+
222630
+ For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
222631
+ - \`name\`: "diagram_description"
222632
+ - \`type\`: "description"
222633
+ - \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
222634
+
222635
+ This description is critical for downstream AI consumers who cannot render the diagram.
222636
+
222637
+ ## Output Format
222638
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222639
+ {
222640
+ "P0": {
222641
+ "diagramFormat": "mermaid",
222642
+ "diagramType": "sequence",
222643
+ "entities": [...],
222644
+ "relations": [...]
222645
+ }
222646
+ }
222647
+
222648
+ ## Rules
222649
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222650
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222651
+ - Every atom MUST include a "confidence" field (0.0-1.0).
222652
+ - The "diagramFormat" and "diagramType" fields are required for each paragraph.
222653
+ - Only include atom types that are actually extracted.
222654
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
222655
+ - JSON structure keys must always be in English.
222656
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
222657
+ - **Extract ALL nodes and edges** — do not sample or skip.
222658
+ - Do NOT include "claims" — they are system-generated.`;
222659
/**
 * Build the user prompt for diagram-paragraph atom extraction.
 * @param {string} diagramText - Diagram paragraphs already tagged [P0], [P1], …
 * @returns {string} Prompt instructing the model to return bare JSON only.
 */
function buildDocDiagramAnnotationPrompt(diagramText) {
  const prompt = `Analyze and extract atoms from the following diagram paragraphs.
Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.

---
${diagramText}
---

Return ONLY a valid JSON object. No markdown fences, no explanation.`;
  return prompt;
}
222669
// Alias: the document-pipeline diagram annotator reuses the same system prompt.
var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
222277
222670
// ../llm/src/chunking/markdownChunker.ts
// Default budget (estimated tokens) for a whole markdown chunk (chunkMarkdown options.maxTokens).
var DEFAULT_MAX_TOKENS2 = 4000;
// Default budget (estimated tokens) for one coarse paragraph (options.paragraphMaxTokens).
var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
222279
222673
// Rough token estimate for budgeting: assumes ~4 characters per token.
function estimateTokens(text2) {
  const CHARS_PER_TOKEN = 4;
  return Math.ceil(text2.length / CHARS_PER_TOKEN);
}
222676
/**
 * Locate fenced code blocks (``` or ~~~) in markdown text.
 * A closing fence must use the same character and be at least as long as the
 * opener; an unclosed fence extends to the end of the content.
 * @param {string} content - Markdown source.
 * @returns {{start: number, end: number}[]} Half-open character ranges, in order.
 */
function findCodeBlockRanges(content) {
  const fencePattern = /^(`{3,}|~{3,})/gm;
  const ranges = [];
  let open = null;
  for (let hit = fencePattern.exec(content); hit !== null; hit = fencePattern.exec(content)) {
    const marker = hit[1];
    if (open === null) {
      open = { start: hit.index, marker };
    } else if (marker[0] === open.marker[0] && marker.length >= open.marker.length) {
      ranges.push({ start: open.start, end: hit.index + hit[0].length });
      open = null;
    }
  }
  if (open !== null) {
    // Unterminated fence: treat the rest of the document as code.
    ranges.push({ start: open.start, end: content.length });
  }
  return ranges;
}
222698
// True if character position `pos` falls inside any of the (sorted,
// non-overlapping) half-open code-block ranges.
function isInsideCodeBlock(pos, ranges) {
  for (let i = 0; i < ranges.length; i++) {
    const { start, end } = ranges[i];
    if (start > pos) {
      // Ranges are in ascending order: no later range can contain pos.
      return false;
    }
    if (pos < end) {
      return true;
    }
  }
  return false;
}
222282
222707
// Split markdown `content` into heading-delimited sections. Recognizes ATX
// headings ("## Title") and setext headings (a line underlined with === or
// ---), skipping heading-like text inside fenced code blocks.
function parseSections(content) {
  const codeRanges = findCodeBlockRanges(content);
  const matches = [];
  // ATX headings: 1-6 '#' characters, whitespace, then the title text.
  const atxRe = /^(#{1,6})\s+(.*)$/gm;
  let m;
  while ((m = atxRe.exec(content)) !== null) {
    if (!isInsideCodeBlock(m.index, codeRanges)) {
      matches.push({
        index: m.index,
        endIndex: m.index + m[0].length,
        level: m[1].length,
        heading: m[2].trim()
      });
    }
  }
  // Setext headings: a non-empty line followed by an ===/--- underline.
  const lines = content.split(`
`);
  let offset = 0;
  for (let i = 0;i < lines.length; i++) {
    const line = lines[i];
    if (i > 0) {
      const prevLine = lines[i - 1].trim();
      // Character offset where the previous line (the heading text) starts.
      const prevLineStart = offset - lines[i - 1].length - 1;
      if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
        if (/^={2,}\s*$/.test(line)) {
          matches.push({
            index: prevLineStart < 0 ? 0 : prevLineStart,
            endIndex: offset + line.length,
            level: 1,
            heading: prevLine
          });
        } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
          // A dash line under another dash line is a thematic break, not a heading.
          matches.push({
            index: prevLineStart < 0 ? 0 : prevLineStart,
            endIndex: offset + line.length,
            level: 2,
            heading: prevLine
          });
        }
      }
    }
    // +1 accounts for the newline removed by split().
    offset += line.length + 1;
  }
  // Merge ATX + setext matches into document order, then drop any match that
  // starts inside a previously accepted match's span.
  matches.sort((a, b) => a.index - b.index);
  const deduped = [];
  for (const match of matches) {
    const last = deduped[deduped.length - 1];
    if (last && match.index < last.endIndex)
      continue;
    deduped.push(match);
  }
  return buildSectionsFromMatches(content, deduped);
}
222760
+ function buildSectionsFromMatches(content, matches) {
222761
+ const sections = [];
222294
222762
  if (matches.length === 0) {
222295
222763
  const body = content.trim();
222296
222764
  if (body) {
222297
- sections.push({
222298
- heading: "",
222299
- level: 0,
222300
- body,
222301
- paragraphs: splitParagraphs(body)
222302
- });
222765
+ sections.push({ heading: "", level: 0, body, paragraphs: splitParagraphs(body) });
222303
222766
  }
222304
222767
  return sections;
222305
222768
  }
222306
222769
  if (matches[0].index > 0) {
222307
222770
  const preBody = content.slice(0, matches[0].index).trim();
222308
222771
  if (preBody) {
222309
- sections.push({
222310
- heading: "",
222311
- level: 0,
222312
- body: preBody,
222313
- paragraphs: splitParagraphs(preBody)
222314
- });
222772
+ sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
222315
222773
  }
222316
222774
  }
222317
222775
  for (let i = 0;i < matches.length; i++) {
222318
222776
  const m = matches[i];
222319
- const start = m.index;
222320
- const end = i + 1 < matches.length ? matches[i + 1].index : content.length;
222321
- const fullText = content.slice(start, end).trim();
222322
- const headingLineEnd = fullText.indexOf(`
222323
- `);
222324
- const body = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
222777
+ const bodyStart = m.endIndex;
222778
+ const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
222779
+ const body = content.slice(bodyStart, bodyEnd).trim();
222325
222780
  sections.push({
222326
222781
  heading: m.heading,
222327
222782
  level: m.level,
@@ -222336,6 +222791,128 @@ function splitParagraphs(text2) {
222336
222791
  return [];
222337
222792
  return text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
222338
222793
  }
222794
// Recursively split `text2` into pieces of at most ~maxTokens (estimated).
// Strategy, in order of preference:
//   1. Split on blank lines and greedily re-pack paragraphs up to the budget.
//   2. Split on single newlines, keeping fenced code blocks and tables intact
//      (mergeAtomicBlocks); an oversized atomic block is emitted whole.
//   3. Hard character break (forceBreakText) as a last resort.
function splitOversizedText(text2, maxTokens) {
  const doubleNewlineParts = text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
  if (doubleNewlineParts.length > 1) {
    const results = [];
    let acc = "";
    let accTokens = 0;
    for (const part of doubleNewlineParts) {
      const partTokens = estimateTokens(part);
      if (partTokens > maxTokens) {
        // Oversized paragraph: flush the accumulator, then recurse into it.
        if (acc) {
          results.push(acc);
          acc = "";
          accTokens = 0;
        }
        results.push(...splitOversizedText(part, maxTokens));
        continue;
      }
      // Flush before this part would overflow the budget.
      if (acc && accTokens + partTokens > maxTokens) {
        results.push(acc);
        acc = "";
        accTokens = 0;
      }
      acc = acc ? acc + `

` + part : part;
      accTokens += partTokens;
    }
    if (acc)
      results.push(acc);
    return results;
  }
  const lines = text2.split(`
`);
  if (lines.length > 1) {
    // Single paragraph with internal newlines: pack line-level blocks.
    const blocks = mergeAtomicBlocks(lines);
    const results = [];
    let acc = "";
    let accTokens = 0;
    for (const block of blocks) {
      const blockTokens = estimateTokens(block);
      if (blockTokens > maxTokens) {
        // Atomic block (code fence / table) over budget: emit whole rather
        // than breaking it apart.
        if (acc) {
          results.push(acc);
          acc = "";
          accTokens = 0;
        }
        results.push(block);
        continue;
      }
      if (acc && accTokens + blockTokens > maxTokens) {
        results.push(acc);
        acc = "";
        accTokens = 0;
      }
      acc = acc ? acc + `
` + block : block;
      accTokens += blockTokens;
    }
    if (acc)
      results.push(acc);
    return results;
  }
  return forceBreakText(text2, maxTokens);
}
222858
/**
 * Group lines that must not be split apart: a fenced code block (opening
 * fence through matching close, or to EOF if unclosed) and a run of
 * consecutive table rows each become one multi-line string; all other lines
 * pass through individually.
 * @param {string[]} lines - Raw lines of a paragraph.
 * @returns {string[]} Atomic blocks, in original order.
 */
function mergeAtomicBlocks(lines) {
  const merged = [];
  const fenceOpenRe = /^(`{3,}|~{3,})/;
  let idx = 0;
  while (idx < lines.length) {
    const current = lines[idx];
    const stripped = current.trimStart();
    const fenceMatch = stripped.match(fenceOpenRe);
    if (fenceMatch) {
      const marker = fenceMatch[1];
      // Closing fence: same character, at least as long, whitespace-only tail.
      const closeRe = new RegExp(`^${marker[0] === "\`" ? "\`" : "~"}{${marker.length},}\\s*$`);
      const collected = [current];
      idx += 1;
      while (idx < lines.length) {
        const candidate = lines[idx];
        collected.push(candidate);
        idx += 1;
        const candidateStripped = candidate.trimStart();
        if (candidateStripped.startsWith(marker[0]) && closeRe.test(candidateStripped)) {
          break;
        }
      }
      merged.push(collected.join("\n"));
      continue;
    }
    if (stripped.startsWith("|")) {
      const collected = [current];
      idx += 1;
      while (idx < lines.length && lines[idx].trimStart().startsWith("|")) {
        collected.push(lines[idx]);
        idx += 1;
      }
      merged.push(collected.join("\n"));
      continue;
    }
    merged.push(current);
    idx += 1;
  }
  return merged;
}
222899
/**
 * Last-resort splitter: hard-break text into pieces of at most ~maxTokens
 * (approximated as 4 chars/token), preferring to break at a space when one
 * falls in the last 30% of the window.
 *
 * @param {string} text2 - Text with no paragraph or line structure left to use.
 * @param {number} maxTokens - Target token budget per piece.
 * @returns {string[]} Trimmed, non-empty pieces.
 */
function forceBreakText(text2, maxTokens) {
  // Guard against non-positive/fractional budgets: the original
  // `maxTokens * 4` could yield a window < 1, making `slice(0, 0)` a no-op
  // and spinning this loop forever. Clamp to at least one character.
  const maxChars = Math.max(1, Math.floor(maxTokens * 4));
  const results = [];
  let remaining = text2;
  while (remaining.length > maxChars) {
    let breakAt = maxChars;
    // Prefer the last space inside the window, but only if it is past 70% of
    // the window — avoids emitting tiny fragments.
    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
    if (spaceIdx > maxChars * 0.7) {
      breakAt = spaceIdx;
    }
    results.push(remaining.slice(0, breakAt).trim());
    remaining = remaining.slice(breakAt).trim();
  }
  if (remaining)
    results.push(remaining);
  return results;
}
222339
222916
  function buildBreadcrumb(sections, sectionIndex) {
222340
222917
  const current = sections[sectionIndex];
222341
222918
  if (current.level <= 0)
@@ -222364,11 +222941,53 @@ function sectionHeadingLine(section) {
222364
222941
  return "";
222365
222942
  return `${"#".repeat(section.level)} ${section.heading}`;
222366
222943
  }
222944
/**
 * Flatten section bodies into "coarse paragraphs" sized for annotation:
 * oversized bodies are split (splitOversizedText), then adjacent small
 * entries (< 150 estimated tokens each) are greedily merged while staying
 * within `paragraphMaxTokens`. Paragraph indices are assigned globally, in
 * document order.
 * @param {Array<{body: string}>} sections - Parsed sections.
 * @param {number} paragraphMaxTokens - Per-paragraph token budget.
 * @returns {Array<{sectionIndex: number, paragraphIndex: number, text: string}>}
 */
function buildCoarseParagraphs(sections, paragraphMaxTokens) {
  const rawEntries = [];
  sections.forEach((section, sIdx) => {
    if (!section.body.trim())
      return;
    const bodyTokens = estimateTokens(section.body);
    if (bodyTokens > paragraphMaxTokens) {
      for (const part of splitOversizedText(section.body, paragraphMaxTokens)) {
        rawEntries.push({ sectionIndex: sIdx, text: part, tokens: estimateTokens(part) });
      }
    } else {
      rawEntries.push({ sectionIndex: sIdx, text: section.body, tokens: bodyTokens });
    }
  });
  // Merge neighboring small entries; a merged entry keeps the first entry's
  // sectionIndex.
  const MERGE_THRESHOLD = 150;
  const merged = [];
  for (const entry of rawEntries) {
    const prev = merged[merged.length - 1];
    const bothSmall = prev && prev.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD;
    if (bothSmall && prev.tokens + entry.tokens <= paragraphMaxTokens) {
      prev.text = `${prev.text}\n\n${entry.text}`;
      prev.tokens += entry.tokens;
    } else {
      merged.push({ ...entry });
    }
  }
  return merged.map((entry, pIdx) => ({
    sectionIndex: entry.sectionIndex,
    paragraphIndex: pIdx,
    text: entry.text
  }));
}
222367
222984
  function chunkMarkdown(content, options = {}) {
222368
222985
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
222986
+ const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
222369
222987
  const sections = parseSections(content);
222370
222988
  if (sections.length === 0)
222371
222989
  return [];
222990
+ const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
222372
222991
  const chunks = [];
222373
222992
  let pendingSections = [];
222374
222993
  let pendingTokens = 0;
@@ -222386,14 +223005,16 @@ function chunkMarkdown(content, options = {}) {
222386
223005
  const heading = sectionHeadingLine(entry.section);
222387
223006
  if (heading)
222388
223007
  textParts.push(heading);
222389
- for (let pIdx = 0;pIdx < entry.section.paragraphs.length; pIdx++) {
222390
- const pText = entry.section.paragraphs[pIdx];
222391
- textParts.push(pText);
222392
- paragraphs.push({
222393
- sectionIndex: entry.sectionIndex,
222394
- paragraphIndex: pIdx,
222395
- text: pText
222396
- });
223008
+ const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === entry.sectionIndex);
223009
+ for (const p of sectionParas) {
223010
+ if (!paragraphs.some((existing) => existing.paragraphIndex === p.paragraphIndex && existing.text === p.text)) {
223011
+ textParts.push(p.text);
223012
+ paragraphs.push({
223013
+ sectionIndex: p.sectionIndex,
223014
+ paragraphIndex: p.paragraphIndex,
223015
+ text: p.text
223016
+ });
223017
+ }
222397
223018
  }
222398
223019
  }
222399
223020
  chunks.push({
@@ -222416,7 +223037,7 @@ function chunkMarkdown(content, options = {}) {
222416
223037
  ` : "") + section.body);
222417
223038
  if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
222418
223039
  flushPending();
222419
- splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
223040
+ splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
222420
223041
  continue;
222421
223042
  }
222422
223043
  const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
@@ -222429,9 +223050,10 @@ function chunkMarkdown(content, options = {}) {
222429
223050
  flushPending();
222430
223051
  return chunks;
222431
223052
  }
222432
- function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
223053
+ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
222433
223054
  const headingLine = sectionHeadingLine(section);
222434
223055
  const prefix = breadcrumbPrefix(breadcrumb);
223056
+ const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === sectionIndex);
222435
223057
  let accParagraphs = [];
222436
223058
  let accTextParts = [];
222437
223059
  let accTokens = 0;
@@ -222458,18 +223080,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
222458
223080
  accTokens = baseOverhead;
222459
223081
  }
222460
223082
  accTokens = baseOverhead;
222461
- for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
222462
- const pText = section.paragraphs[pIdx];
222463
- const pTokens = estimateTokens(pText);
223083
+ for (const p of sectionParas) {
223084
+ const pTokens = estimateTokens(p.text);
222464
223085
  if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
222465
223086
  flushAcc();
222466
223087
  }
222467
- accParagraphs.push({ sectionIndex, paragraphIndex: pIdx, text: pText });
222468
- accTextParts.push(pText);
223088
+ accParagraphs.push({ sectionIndex, paragraphIndex: p.paragraphIndex, text: p.text });
223089
+ accTextParts.push(p.text);
222469
223090
  accTokens += pTokens;
222470
223091
  }
222471
223092
  flushAcc();
222472
223093
  }
223094
// ../llm/src/chunking/normalizeMarkdown.ts
/**
 * Normalize raw markdown before chunking: strip BOM/zero-width characters,
 * unify line endings to LF, then run the line-oriented block repairs.
 * @param {string} content - Raw markdown text.
 * @returns {{content: string, stats: {repairs: Object<string, number>}}}
 *   Repaired text plus per-category repair counters.
 */
function normalizeMarkdown(content) {
  const stats = { repairs: {} };
  const count = (category) => {
    stats.repairs[category] = (stats.repairs[category] ?? 0) + 1;
  };
  const withoutInvisible = stripBomAndInvisible(content, count);
  const unixText = normalizeLineEndings(withoutInvisible, count);
  const repairedLines = processBlocks(unixText.split("\n"), count);
  return { content: repairedLines.join("\n"), stats };
}
+ }
223108
+ function stripBomAndInvisible(text2, count) {
223109
+ const cleaned = text2.replace(/[\uFEFF\u200B\u200C\u200D]/g, "");
223110
+ if (cleaned.length !== text2.length) {
223111
+ count("invisible_chars");
223112
+ }
223113
+ return cleaned;
223114
+ }
223115
+ function normalizeLineEndings(text2, count) {
223116
+ if (text2.includes("\r")) {
223117
+ count("line_endings");
223118
+ return text2.replace(/\r\n?/g, `
223119
+ `);
223120
+ }
223121
+ return text2;
223122
+ }
223123
+ function processBlocks(inputLines, count) {
223124
+ const lines = splitInlineFences(inputLines, count);
223125
+ const output = [];
223126
+ let i = 0;
223127
+ while (i < lines.length) {
223128
+ const line = lines[i];
223129
+ const trimmed = line.trimStart();
223130
+ const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
223131
+ if (fenceMatch) {
223132
+ const result = handleCodeFence(lines, i, fenceMatch[1], count);
223133
+ output.push(...result.lines);
223134
+ i = result.nextIndex;
223135
+ continue;
223136
+ }
223137
+ if (looksLikeTableRow(trimmed)) {
223138
+ const result = handleTableBlock(lines, i, count);
223139
+ output.push(...result.lines);
223140
+ i = result.nextIndex;
223141
+ continue;
223142
+ }
223143
+ if (trimmed === "") {
223144
+ const result = handleBlankLines(lines, i, count);
223145
+ output.push(...result.lines);
223146
+ i = result.nextIndex;
223147
+ continue;
223148
+ }
223149
+ if (trimmed.startsWith("<!--")) {
223150
+ const result = handleHtmlComment(lines, i, count);
223151
+ output.push(...result.lines);
223152
+ i = result.nextIndex;
223153
+ continue;
223154
+ }
223155
+ if (looksLikeJsonBlockStart(trimmed)) {
223156
+ const result = handleUnfencedJson(lines, i, count);
223157
+ if (result) {
223158
+ output.push(...result.lines);
223159
+ i = result.nextIndex;
223160
+ continue;
223161
+ }
223162
+ }
223163
+ output.push(line);
223164
+ i++;
223165
+ }
223166
+ return output;
223167
+ }
223168
+ function handleCodeFence(lines, startIdx, fence, count) {
223169
+ const fenceChar = fence[0];
223170
+ const fenceLen = fence.length;
223171
+ const result = [lines[startIdx]];
223172
+ let i = startIdx + 1;
223173
+ while (i < lines.length) {
223174
+ const trimmed = lines[i].trimStart();
223175
+ result.push(lines[i]);
223176
+ const closingRe = new RegExp(`^${fenceChar === "`" ? "`" : "~"}{${fenceLen},}\\s*$`);
223177
+ if (closingRe.test(trimmed)) {
223178
+ return { lines: result, nextIndex: i + 1 };
223179
+ }
223180
+ i++;
223181
+ }
223182
+ count("unclosed_code_fence");
223183
+ result.push(fence);
223184
+ return { lines: result, nextIndex: i };
223185
+ }
223186
+ function handleTableBlock(lines, startIdx, count) {
223187
+ const tableLines = [];
223188
+ let i = startIdx;
223189
+ while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
223190
+ tableLines.push(lines[i]);
223191
+ i++;
223192
+ }
223193
+ if (tableLines.length < 2) {
223194
+ return { lines: tableLines, nextIndex: i };
223195
+ }
223196
+ const normalized = tableLines.map((line) => {
223197
+ const trimmed = line.trimStart();
223198
+ if (!trimmed.startsWith("|") && trimmed.includes("|")) {
223199
+ count("table_leading_pipe");
223200
+ return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
223201
+ }
223202
+ return line;
223203
+ });
223204
+ const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
223205
+ if (!hasSeparator && normalized.length >= 2) {
223206
+ const firstRow = normalized[0].trim();
223207
+ const colCount = countPipes(firstRow) - 1;
223208
+ if (colCount >= 2) {
223209
+ const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
223210
+ count("table_missing_separator");
223211
+ const result = [normalized[0], separator, ...normalized.slice(1)];
223212
+ return { lines: result, nextIndex: i };
223213
+ }
223214
+ }
223215
+ return { lines: normalized, nextIndex: i };
223216
+ }
223217
+ function handleBlankLines(lines, startIdx, count) {
223218
+ let i = startIdx;
223219
+ while (i < lines.length && lines[i].trim() === "") {
223220
+ i++;
223221
+ }
223222
+ const blankCount = i - startIdx;
223223
+ if (blankCount > 2) {
223224
+ count("excessive_blank_lines");
223225
+ return { lines: [""], nextIndex: i };
223226
+ }
223227
+ return { lines: lines.slice(startIdx, i), nextIndex: i };
223228
+ }
223229
+ function handleHtmlComment(lines, startIdx, count) {
223230
+ const firstLine = lines[startIdx];
223231
+ if (firstLine.includes("-->")) {
223232
+ count("html_comment");
223233
+ return { lines: [], nextIndex: startIdx + 1 };
223234
+ }
223235
+ let i = startIdx + 1;
223236
+ while (i < lines.length) {
223237
+ if (lines[i].includes("-->")) {
223238
+ count("html_comment");
223239
+ return { lines: [], nextIndex: i + 1 };
223240
+ }
223241
+ i++;
223242
+ }
223243
+ return { lines: [firstLine], nextIndex: startIdx + 1 };
223244
+ }
223245
+ function looksLikeJsonBlockStart(trimmed) {
223246
+ return trimmed === "{" || trimmed === "[";
223247
+ }
223248
+ var MIN_JSON_BLOCK_LINES = 5;
223249
+ function handleUnfencedJson(lines, startIdx, count) {
223250
+ const opener = lines[startIdx].trimStart();
223251
+ const openChar = opener[0];
223252
+ const closeChar = openChar === "{" ? "}" : "]";
223253
+ let depth = 0;
223254
+ let i = startIdx;
223255
+ let inString = false;
223256
+ while (i < lines.length) {
223257
+ const line = lines[i];
223258
+ for (let c = 0;c < line.length; c++) {
223259
+ const ch = line[c];
223260
+ if (ch === "\\" && inString) {
223261
+ c++;
223262
+ continue;
223263
+ }
223264
+ if (ch === '"') {
223265
+ inString = !inString;
223266
+ continue;
223267
+ }
223268
+ if (inString)
223269
+ continue;
223270
+ if (ch === "/" && c + 1 < line.length && line[c + 1] === "/") {
223271
+ break;
223272
+ }
223273
+ if (ch === "{" || ch === "[")
223274
+ depth++;
223275
+ else if (ch === "}" || ch === "]")
223276
+ depth--;
223277
+ }
223278
+ i++;
223279
+ if (depth === 0) {
223280
+ const blockLen = i - startIdx;
223281
+ if (blockLen < MIN_JSON_BLOCK_LINES) {
223282
+ return null;
223283
+ }
223284
+ const lastTrimmed = lines[i - 1].trimEnd();
223285
+ if (!lastTrimmed.endsWith(closeChar)) {
223286
+ return null;
223287
+ }
223288
+ count("unfenced_json_block");
223289
+ const fenced = ["```json"];
223290
+ for (let j = startIdx;j < i; j++) {
223291
+ fenced.push(lines[j]);
223292
+ }
223293
+ fenced.push("```");
223294
+ return { lines: fenced, nextIndex: i };
223295
+ }
223296
+ if (depth < 0) {
223297
+ return null;
223298
+ }
223299
+ }
223300
+ return null;
223301
+ }
223302
+ function splitInlineFences(lines, count) {
223303
+ const result = [];
223304
+ for (const line of lines) {
223305
+ const trimmed = line.trimStart();
223306
+ if (/^(`{3,}|~{3,})/.test(trimmed)) {
223307
+ result.push(line);
223308
+ continue;
223309
+ }
223310
+ const inlineMatch = trimmed.match(/(`{3,}|~{3,})(\S*)\s*$/);
223311
+ if (inlineMatch) {
223312
+ const fenceStr = inlineMatch[1];
223313
+ const fenceIdx = trimmed.lastIndexOf(fenceStr);
223314
+ const beforeFence = trimmed.substring(0, fenceIdx);
223315
+ if (beforeFence.trim().length > 0) {
223316
+ const leadingWhitespace = line.substring(0, line.length - trimmed.length);
223317
+ count("inline_code_fence");
223318
+ result.push(leadingWhitespace + beforeFence.trimEnd());
223319
+ result.push(trimmed.substring(fenceIdx));
223320
+ continue;
223321
+ }
223322
+ }
223323
+ result.push(line);
223324
+ }
223325
+ return result;
223326
+ }
223327
+ function looksLikeTableRow(trimmed) {
223328
+ if (trimmed.startsWith("#") || trimmed.startsWith("```") || trimmed.startsWith("~~~")) {
223329
+ return false;
223330
+ }
223331
+ return countPipes(trimmed) >= 1;
223332
+ }
223333
+ function countPipes(text2) {
223334
+ let count = 0;
223335
+ for (let i = 0;i < text2.length; i++) {
223336
+ if (text2[i] === "|" && (i === 0 || text2[i - 1] !== "\\")) {
223337
+ count++;
223338
+ }
223339
+ }
223340
+ return count;
223341
+ }
222473
223342
  // ../llm/src/utils/mapConcurrent.ts
222474
223343
  async function mapConcurrent(items, concurrency, fn) {
222475
223344
  const results = [];