@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +10 -5
  2. package/index.js +965 -90
  3. package/package.json +1 -1
  4. package/serve.js +2027 -221
  5. package/web/assets/ContentDetail--oZBzWh0.js +1 -0
  6. package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
  7. package/web/assets/ContentDetail-C0zfArPg.js +1 -0
  8. package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
  9. package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
  10. package/web/assets/ContentDetail-D-2xyerw.js +1 -0
  11. package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
  12. package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
  13. package/web/assets/ContentDetail-y0yi2qln.js +1 -0
  14. package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
  15. package/web/assets/EntityDetail-BI3etmj4.js +1 -0
  16. package/web/assets/EntityDetail-C9k4cMVL.js +1 -0
  17. package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
  18. package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
  19. package/web/assets/EntityDetail-DiJPemDY.js +1 -0
  20. package/web/assets/EntityDetail-DihnDvhA.js +1 -0
  21. package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
  22. package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
  23. package/web/assets/RelationDetail-B2gHrceI.js +1 -0
  24. package/web/assets/RelationDetail-BK8C5waL.js +1 -0
  25. package/web/assets/RelationDetail-CEq9vopD.js +1 -0
  26. package/web/assets/RelationDetail-CaYrspaS.js +1 -0
  27. package/web/assets/RelationDetail-CpoGdy25.js +1 -0
  28. package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
  29. package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
  30. package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
  31. package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
  32. package/web/assets/index-BKETuM1m.js +111 -0
  33. package/web/assets/index-BPMqeFze.js +111 -0
  34. package/web/assets/index-BgRuvBL5.js +111 -0
  35. package/web/assets/index-C96WspeJ.css +1 -0
  36. package/web/assets/index-CcrkBEZl.js +111 -0
  37. package/web/assets/index-DGDx8sCs.js +111 -0
  38. package/web/assets/index-DIyAwnqE.js +111 -0
  39. package/web/assets/index-DW1cCA8v.js +111 -0
  40. package/web/assets/index-DiAYi5t8.css +1 -0
  41. package/web/assets/index-FOCWvgW_.css +1 -0
  42. package/web/assets/index-daOjyLzy.css +1 -0
  43. package/web/assets/index-moF8uSEi.js +111 -0
  44. package/web/assets/index-sPNyENFN.js +111 -0
  45. package/web/assets/index-uGqDxUnx.css +1 -0
  46. package/web/index.html +2 -2
package/index.js CHANGED
@@ -40342,6 +40342,10 @@ var init_serverConfig = __esm(() => {
40342
40342
  default_model: "gemini-3-pro-preview"
40343
40343
  }
40344
40344
  },
40345
+ indexing: {
40346
+ task_timeout_ms: 150 * 60 * 1000,
40347
+ file_timeout_ms: 15 * 60 * 1000
40348
+ },
40345
40349
  embedding: {
40346
40350
  provider: "huggingface",
40347
40351
  huggingface: {
@@ -44401,7 +44405,7 @@ var init_atomsSchema = __esm(() => {
44401
44405
  init_zod();
44402
44406
  init_base();
44403
44407
  init_baseSchema();
44404
- confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
44408
+ confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
44405
44409
  entityAtomSchema = exports_external.object({
44406
44410
  name: exports_external.string(),
44407
44411
  kind: kindSchema.optional().catch(undefined),
@@ -220955,14 +220959,21 @@ function isRetryableStatus(status) {
220955
220959
  function isAuthStatus(status) {
220956
220960
  return status === 401 || status === 403;
220957
220961
  }
220958
- function isBadRequest(status) {
220959
- return status === 400;
220962
// throwLlmError: shared failure path for the LLM service methods; it always throws and never returns.
//   error40 - the caught error; its text is obtained via toErrorMessage(error40).
//   status  - HTTP status associated with the failure, if any. When truthy it is appended
//             to the message as " [HTTP <status>]"; when falsy no tag is added.
// Throws C4AError with code "LLM_AUTH_FAILED" when isAuthStatus(status) is true, otherwise
// C4AError with code "LLM_CALL_FAILED". The detail text is passed both inside the message
// and as the third constructor argument.
+ function throwLlmError(error40, status) {
220963
+ const detail = toErrorMessage(error40);
220964
+ const statusTag = status ? ` [HTTP ${status}]` : "";
220965
+ if (isAuthStatus(status)) {
220966
+ throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
220967
+ }
220968
+ throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
220960
220969
  }
220961
220970
 
220962
220971
  class LlmServiceImpl {
220963
220972
  options;
220973
+ supportsTemperature;
220964
220974
  constructor(options) {
220965
220975
  this.options = options;
220976
+ this.supportsTemperature = options.provider !== "openai";
220966
220977
  }
220967
220978
  async generateText(prompt, options) {
220968
220979
  if (this.options.forceStream) {
@@ -220974,7 +220985,7 @@ class LlmServiceImpl {
220974
220985
  model: this.options.languageModel,
220975
220986
  prompt,
220976
220987
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
220977
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
220988
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
220978
220989
  maxRetries: 0
220979
220990
  };
220980
220991
  if (options?.systemPrompt) {
@@ -221011,13 +221022,7 @@ class LlmServiceImpl {
221011
221022
  durationMs,
221012
221023
  error: toErrorMessage(error40)
221013
221024
  });
221014
- if (isAuthStatus(status)) {
221015
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221016
- }
221017
- if (isBadRequest(status)) {
221018
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221019
- }
221020
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221025
+ throwLlmError(error40, status);
221021
221026
  }
221022
221027
  }
221023
221028
  async generateTextViaStream(prompt, options) {
@@ -221027,7 +221032,7 @@ class LlmServiceImpl {
221027
221032
  model: this.options.languageModel,
221028
221033
  prompt,
221029
221034
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
221030
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
221035
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
221031
221036
  maxRetries: 0
221032
221037
  };
221033
221038
  if (options?.systemPrompt) {
@@ -221065,13 +221070,7 @@ class LlmServiceImpl {
221065
221070
  durationMs: Date.now() - startedAt,
221066
221071
  error: toErrorMessage(error40)
221067
221072
  });
221068
- if (isAuthStatus(status)) {
221069
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221070
- }
221071
- if (isBadRequest(status)) {
221072
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221073
- }
221074
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221073
+ throwLlmError(error40, status);
221075
221074
  }
221076
221075
  }
221077
221076
  streamText(prompt, options) {
@@ -221094,7 +221093,7 @@ class LlmServiceImpl {
221094
221093
  model: this.options.languageModel,
221095
221094
  prompt,
221096
221095
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
221097
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
221096
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
221098
221097
  maxRetries: 0,
221099
221098
  onFinish: (event) => {
221100
221099
  const finishEvent = event;
@@ -221140,13 +221139,7 @@ class LlmServiceImpl {
221140
221139
  durationMs: Date.now() - startedAt,
221141
221140
  error: toErrorMessage(error40)
221142
221141
  });
221143
- if (isAuthStatus(status)) {
221144
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
221145
- }
221146
- if (isBadRequest(status)) {
221147
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221148
- }
221149
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
221142
+ throwLlmError(error40, status);
221150
221143
  }
221151
221144
  }
221152
221145
  }
@@ -221857,7 +221850,12 @@ function parseExtractionOutput(raw, schema) {
221857
221850
  return { success: false, error: new Error("Empty output") };
221858
221851
  }
221859
221852
  const protocolParsed = tryParseProtocol(trimmed);
221860
- const parsed = protocolParsed ?? tryParseJson(trimmed);
221853
+ let parsed = protocolParsed ?? tryParseJson(trimmed);
221854
+ if (Array.isArray(parsed)) {
221855
+ parsed = { paragraphs: parsed };
221856
+ }
221857
+ parsed = normalizeFlatOutput(parsed);
221858
+ parsed = stripNulls(parsed);
221861
221859
  const result = schema.safeParse(parsed);
221862
221860
  if (!result.success) {
221863
221861
  return { success: false, error: result.error };
@@ -221927,6 +221925,37 @@ function tryParseJson(raw) {
221927
221925
  function repairAndParse(raw) {
221928
221926
  return JSON.parse(jsonrepair(raw));
221929
221927
  }
221928
// Matches paragraph tags of the form "P" followed by one or more digits (e.g. "P0", "P12").
// Kept as `var` to match the surrounding bundled module-level declarations.
var PARAGRAPH_TAG_RE = /^P\d+$/;

/**
 * Normalizes a flat, tag-keyed extraction result into the `{ paragraphs: [...] }` shape.
 *
 * - Non-objects, arrays and falsy values are returned unchanged.
 * - Objects that already have a "paragraphs" key are returned unchanged.
 * - An empty object becomes `{ paragraphs: [] }`.
 * - An object whose keys are ALL paragraph tags ("P0", "P3", ...) becomes
 *   `{ paragraphs: [{ tag, atoms }, ...] }`, ordered by the numeric part of the tag.
 * - Any other object (at least one non-tag key) is returned unchanged.
 *
 * @param {unknown} parsed - Parsed model output.
 * @returns {unknown} The normalized value, or `parsed` itself when no rewrite applies.
 */
function normalizeFlatOutput(parsed) {
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
    return parsed;
  }
  if ("paragraphs" in parsed) {
    return parsed;
  }
  const keys = Object.keys(parsed);
  if (keys.length === 0) {
    return { paragraphs: [] };
  }
  if (!keys.every((key) => PARAGRAPH_TAG_RE.test(key))) {
    return parsed;
  }
  // Explicit radix: tags are decimal indices, so "P08" must sort as 8.
  const tagIndex = (tag) => Number.parseInt(tag.slice(1), 10);
  // `keys` is a fresh array from Object.keys, so sorting in place does not touch `parsed`.
  const paragraphs = keys
    .sort((a, b) => tagIndex(a) - tagIndex(b))
    .map((tag) => ({ tag, atoms: parsed[tag] }));
  return { paragraphs };
}
221944
/**
 * Recursively replaces `null` with "absent" in a JSON-like value.
 *
 * - `null` itself becomes `undefined`.
 * - Arrays are mapped element-wise (a `null` element becomes `undefined`; length and
 *   positions are preserved).
 * - Objects are copied without their `null`-valued keys; remaining values are processed
 *   recursively.
 * - Every other value is returned unchanged.
 *
 * The object copy is built with Object.fromEntries so that every key — including a
 * JSON-supplied "__proto__" — is defined as an own data property. Plain `out[k] = v`
 * assignment would instead invoke the `__proto__` setter and silently re-parent the copy.
 *
 * @param {unknown} value - Parsed JSON value.
 * @returns {unknown} A null-free copy of `value`.
 */
function stripNulls(value) {
  if (value === null) {
    return undefined;
  }
  if (Array.isArray(value)) {
    return value.map(stripNulls);
  }
  if (typeof value !== "object") {
    return value;
  }
  const keptEntries = Object.entries(value)
    .filter(([, child]) => child !== null)
    .map(([key, child]) => [key, stripNulls(child)]);
  return Object.fromEntries(keptEntries);
}
221930
221959
  function isRecord(value) {
221931
221960
  return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
221932
221961
  }
@@ -222183,20 +222212,20 @@ class GleaningExtractor {
222183
222212
  // ../llm/src/prompts/docAtomAnnotation.ts
222184
222213
  init_src();
222185
222214
  var DOC_ATOM_DEFS = [
222186
- ["entities", "Named things: systems, services, modules, documents, APIs (NOT people/teams — use roles for those)", entityAtomSchema],
222215
+ ["entities", "Named things with independent identity — something you can ask questions about ('What does X do?', 'Who owns X?'). Examples: systems, services, modules, APIs, products. If it is a value, address, path, or configuration detail, it is an attribute of an entity, not an entity itself. (NOT people/teams — use roles for those). kind: implementation=internal systems/services, external=third-party dependencies, concept=abstract/not-yet-implemented", entityAtomSchema],
222187
222216
  ["relations", "Connections between entities", relationAtomSchema],
222188
222217
  ["behaviors", "Actions/operations: functions, API calls, user actions, workflows", behaviorAtomSchema],
222189
222218
  ["attributes", "Properties of entities", attributeAtomSchema],
222190
222219
  ["states", "Possible states of entities", stateAtomSchema],
222191
- ["rules", "Business/domain rules: validations, constraints expressed as logic", ruleAtomSchema],
222220
+ ["rules", "Conditional business/domain logic: IF condition THEN consequence (e.g., 'IF user not authenticated THEN reject request')", ruleAtomSchema],
222192
222221
  ["transitions", "State changes: from→to triggered by events or guards", transitionAtomSchema],
222193
222222
  ["events", "Occurrences that trigger behaviors", eventAtomSchema],
222194
222223
  ["decisions", "Architectural or business decisions", decisionAtomSchema],
222195
222224
  ["metrics", "Measurable targets: SLA, throughput, error_rate, with thresholds", metricAtomSchema],
222196
- ["roles", "Actors: human roles, teams, personas that perform behaviors", roleAtomSchema],
222197
- ["constraints", "Hard/soft requirements", constraintAtomSchema],
222225
+ ["roles", "Actors: human roles, teams, personas that perform behaviors. kind: human=individual role, team=group/department, persona=user archetype. System-triggered actions use entity relations, NOT roles", roleAtomSchema],
222226
+ ["constraints", "Declarative requirements: 'X must/should/must-not Y' (e.g., 'passwords must be >= 8 chars'). Unlike rules, constraints have no IF-THEN condition — they are unconditional mandates or restrictions", constraintAtomSchema],
222198
222227
  ["comparisons", "Side-by-side evaluations", comparisonAtomSchema],
222199
- ["boundaries", "System/domain boundaries: what is included vs excluded", boundaryAtomSchema]
222228
+ ["boundaries", "Explicit scope declarations: what is included vs excluded. Only extract when the text explicitly declares scope (e.g., 'this product covers X but NOT Y'). Implicit containment (A runs inside B) is expressed via entity relations, not boundaries", boundaryAtomSchema]
222200
222229
  ];
222201
222230
  function buildAtomTypesBlock() {
222202
222231
  return DOC_ATOM_DEFS.map(([name21, desc, schema], i) => {
@@ -222219,30 +222248,38 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
222219
222248
  ${ATOM_TYPES_BLOCK}
222220
222249
 
222221
222250
  ## Output Format
222222
- Return a single JSON object with this structure:
222251
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222223
222252
  {
222224
- "paragraphs": [
222225
- {
222226
- "tag": "P0",
222227
- "atoms": {
222228
- "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
222229
- "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }],
222230
- "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
222231
- }
222232
- }
222233
- ]
222253
+ "P0": {
222254
+ "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
222255
+ "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
222256
+ },
222257
+ "P3": {
222258
+ "constraints": [{ "description": "User must be authenticated before access", "severity": "must", "confidence": 0.9 }],
222259
+ "rules": [{ "description": "Reject request if user is not authenticated", "expression": "IF !user.isAuthenticated THEN reject", "confidence": 0.85 }]
222260
+ }
222234
222261
  }
222235
222262
 
222236
222263
  ## Rules
222237
- - Each paragraph tag (P0, P1, ...) corresponds to the tagged paragraph in the input.
222264
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222265
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222238
222266
  - Only include atom types that are actually found in a paragraph (all types are optional).
222239
222267
  - Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
222240
222268
  - **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
222241
222269
  - Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
222242
222270
  - **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
222271
+ - **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
222272
+ - **Cross-atom reference consistency:** transitions[].from and transitions[].to values MUST exist in states[].values of the same entity. roles[].performs values MUST match names declared in behaviors[].name.
222273
+ - **Constraints vs rules distinction:** Use "constraints" for unconditional declarative mandates ('X must Y'). Use "rules" for conditional logic ('IF X THEN Y'). Do not mix them — a requirement with no condition is a constraint, a requirement triggered by a condition is a rule. Do NOT invent a rule for every constraint — only create a rule when the text explicitly states conditional logic.
222274
+ - **One statement, multiple atoms:** A single sentence can produce several atom types simultaneously. Do NOT force a choice — extract all that apply. Example: "system uptime must be ≥ 99.9%" → constraint (severity: must) + metric (threshold: "≥ 99.9%").
222275
+ - **Relation types:** Use standard relation types when possible: CONTAINS (parent→child composition), DEPENDS_ON (runtime dependency), IMPLEMENTS (code/component→spec realization), PRODUCES (process→output), TRIGGERS (event/process triggering), REFERENCES (weak cross-reference). Only invent a new type when none of these fit.
222276
+ - **Decisions:** Extract as "decisions" when the text records a deliberate choice between alternatives with rationale (e.g., "we chose X because Y", "after evaluating A/B/C, selected B"). Do not extract routine descriptions as decisions.
222243
222277
  - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
222244
222278
  - JSON structure keys (tag, atom type names, field names) must always be in English.
222245
222279
  - Be thorough: extract ALL relevant atoms from each paragraph.
222280
+ - **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
222281
+ - **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
222282
+ - **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
222246
222283
  - Do NOT include "claims" — they are system-generated and not part of document extraction.`;
222247
222284
  function buildDocAtomAnnotationPrompt(chunkText) {
222248
222285
  return `Extract all semantic atoms from the following document text.
@@ -222254,6 +222291,13 @@ ${chunkText}
222254
222291
 
222255
222292
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
222256
222293
  }
222294
/**
 * Converts an extraction result of the form `{ paragraphs: [{ tag, atoms }, ...] }` into
 * the flat object keyed by paragraph tag (`{ [tag]: atoms }`). When the same tag occurs
 * more than once, the last occurrence wins.
 *
 * Built with Object.fromEntries so each tag becomes an own data property; assigning
 * `flat[tag] = atoms` would treat a "__proto__" tag as a prototype change and drop it
 * from the serialized output.
 *
 * @param {{ paragraphs: Array<{ tag: string, atoms: unknown }> }} result
 * @returns {Record<string, unknown>} Atoms keyed by paragraph tag.
 */
function toFlatFormat(result) {
  return Object.fromEntries(
    result.paragraphs.map((paragraph) => [paragraph.tag, paragraph.atoms])
  );
}
222257
222301
  function buildDocGleaningPrompt(chunkText, previousResult) {
222258
222302
  return `Review the following document text and the previously extracted atoms.
222259
222303
  Check for any MISSING atoms that were not captured in the first pass.
@@ -222262,66 +222306,483 @@ Check for any MISSING atoms that were not captured in the first pass.
222262
222306
  ${chunkText}
222263
222307
 
222264
222308
  ## Previously Extracted Atoms
222265
- ${JSON.stringify(previousResult, null, 2)}
222309
+ ${JSON.stringify(toFlatFormat(previousResult), null, 2)}
222266
222310
 
222267
222311
  ## Instructions
222268
- - If you find missing atoms, output them in the same JSON format (with paragraph tags).
222312
+ - If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
222269
222313
  - Only include NEW atoms not already in the previous extraction.
222270
222314
  - Every atom MUST include a "confidence" field (0.0-1.0).
222271
- - If nothing is missing, return: {"paragraphs": []}
222315
+ - **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
222316
+ - If nothing is missing, return: {}
222272
222317
  - Respond in the same language as the input text.
222273
222318
 
222274
222319
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
222275
222320
  }
222276
222321
  var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
222322
+ // ../llm/src/prompts/entityResolution.ts
222323
+ var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
222324
+
222325
+ ## Task 1: Merge Duplicates
222326
+ - Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
222327
+ - Prefer the LONGER, more descriptive name as the canonical name
222328
+ - Do NOT merge names that share a substring but refer to different things
222329
+ - When uncertain, do NOT merge — add to "ambiguous" instead
222330
+ - Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
222331
+ - Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
222332
+
222333
+ ## Task 2: Remove Noise
222334
+ - Apply the **identity test**: a real entity is something you can discuss independently ("What is X?", "How does X work?", "Who owns X?"). Names that fail this test — values, addresses, actions, generic descriptions — are noise.
222335
+ - Remove names that are NOT meaningful named entities: generic words, action descriptions, or things that are attributes/values rather than independent subjects
222336
+ - Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台) — these all pass the identity test
222337
+ - When uncertain, KEEP the name — only remove if it clearly fails the identity test
222338
+
222339
+ ## Output
222340
+ Valid JSON only. No markdown fences, no explanation.`;
222341
// buildEntityResolutionPrompt: assembles the user prompt for the entity-resolution pass.
// Sections are pushed onto `parts` in order and joined with a newline:
//   1. "All Entity Names"     - input.allNames as a 1-based numbered list (always emitted).
//   2. "Suspected Duplicates" - one line per input.candidates entry ({ short, long, reason }),
//                               emitted only when the array is non-empty.
//   3. "Suspected Noise"      - one line per input.noiseCandidates entry, emitted only when
//                               the array is present and non-empty.
//   4. "Context Snippets"     - one line per input.contextSnippets entry ({ name, snippet }),
//                               emitted only when the array is present and non-empty.
//   5. "Output Format"        - fixed instructions describing the merges/remove/ambiguous JSON.
// Returns the assembled prompt string.
// NOTE(review): input.candidates is read without a presence check, unlike noiseCandidates and
// contextSnippets, so callers must always supply an array (possibly empty) — confirm against callers.
+ function buildEntityResolutionPrompt(input) {
222342
+ const parts = [];
222343
+ parts.push(`## All Entity Names (${input.allNames.length} total)`);
222344
+ parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
222345
+ `));
222346
+ if (input.candidates.length > 0) {
222347
+ parts.push("");
222348
+ parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
222349
+ parts.push("Review each pair and decide whether to merge:");
222350
+ for (const c of input.candidates) {
222351
+ parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
222352
+ }
222353
+ }
222354
+ if (input.noiseCandidates && input.noiseCandidates.length > 0) {
222355
+ parts.push("");
222356
+ parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
222357
+ parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
222358
+ for (const n of input.noiseCandidates) {
222359
+ parts.push(`- "${n}"`);
222360
+ }
222361
+ }
222362
+ if (input.contextSnippets && input.contextSnippets.length > 0) {
222363
+ parts.push("");
222364
+ parts.push("## Context Snippets");
222365
+ for (const s of input.contextSnippets) {
222366
+ parts.push(`- **${s.name}**: ${s.snippet}`);
222367
+ }
222368
+ }
222369
+ parts.push("");
222370
+ parts.push(`## Output Format
222371
+ Return a JSON object:
222372
+ {
222373
+ "merges": [
222374
+ { "from": "alias name", "to": "canonical name" }
222375
+ ],
222376
+ "remove": ["noise_name_1", "noise_name_2"],
222377
+ "ambiguous": ["name1", "name2"]
222378
+ }
222379
+
222380
+ - "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
222381
+ - "remove": names confirmed as noise. They will be deleted from entity list.
222382
+ - "ambiguous": names you're unsure about (optional, for logging).
222383
+
222384
+ Return ONLY valid JSON. No markdown fences, no explanation.`);
222385
+ return parts.join(`
222386
+ `);
222387
+ }
222388
+ // ../llm/src/prompts/docTableAnnotation.ts
222389
+ init_src();
222390
+ var entityFields = zodObjectToPromptFields(entityAtomSchema);
222391
+ var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
222392
+ var relationFields = zodObjectToPromptFields(relationAtomSchema);
222393
+ var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
222394
+ var metricFields = zodObjectToPromptFields(metricAtomSchema);
222395
+ var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
222396
+ var eventFields = zodObjectToPromptFields(eventAtomSchema);
222397
+ var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
222398
+ var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
222399
+ var stateFields = zodObjectToPromptFields(stateAtomSchema);
222400
+ var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
222401
+ var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
222402
+
222403
+ Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
222404
+
222405
+ ## Step 1: Classify the Table
222406
+
222407
+ Determine the table type by examining the relationship between rows:
222408
+
222409
+ ### Type A: Collection / Record Table
222410
+ **Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
222411
+ - Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
222412
+ - Key signal: removing one row does not affect the meaning of other rows
222413
+
222414
+ ### Type B: Single-Object Property Table
222415
+ **Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
222416
+ - Examples: API field definitions, configuration schema, entity attribute lists
222417
+ - Key signal: all rows refer to the same parent entity
222418
+
222419
+ ### Type C: Comparison / Evaluation Table
222420
+ **Rows or columns represent different subjects being compared** across the same dimensions.
222421
+ - Examples: technology selection, vendor evaluation, feature comparison
222422
+ - Key signal: multiple named subjects evaluated on shared criteria
222423
+
222424
+ ### Type D: Matrix / Cross-Reference Table
222425
+ **Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
222426
+ - Examples: permission matrices (role × operation), compatibility matrices, dependency tables
222427
+ - Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
222428
+
222429
+ ### Type E: Metrics / KPI Table
222430
+ **Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
222431
+ - Examples: SLA tables, performance baselines, capacity planning tables
222432
+ - Key signal: columns include target/threshold/unit/SLA-style values
222433
+
222434
+ ### Type F: Timeline / Process Table
222435
+ **Rows represent ordered steps or phases** in a sequence.
222436
+ - Examples: deployment steps, approval workflows, version changelog, migration plans
222437
+ - Key signal: rows have implicit ordering, may have phase/step/date columns
222438
+
222439
+ ## Step 2: Extract Atoms by Table Type
222440
+
222441
+ ### Type A → Single attribute with row-object array
222442
+ 1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
222443
+ Entity schema: ${entityFields}
222444
+ 2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
222445
+ Attribute schema: ${attributeFields}
222446
+ Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
222447
+ 3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
222448
+ 4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
222449
+ State schema: ${stateFields}
222450
+ Rule schema: ${ruleFields}
222451
+
222452
+ ### Type B → Multiple attribute atoms
222453
+ 1. Create ONE entity for the parent structure.
222454
+ Entity schema: ${entityFields}
222455
+ 2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
222456
+ Attribute schema: ${attributeFields}
222457
+ 3. Extract constraints from "required" or "validation" columns.
222458
+ Constraint schema: ${constraintFields}
222459
+
222460
+ ### Type C → Comparison atom
222461
+ 1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
222462
+ Comparison schema: ${comparisonFields}
222463
+ 2. Extract "decisions" atoms if the table leads to a conclusion.
222464
+
222465
+ ### Type D → Relations or table attribute
222466
+ 1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
222467
+ Relation schema: ${relationFields}
222468
+ Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
222469
+ 2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
222470
+ 3. Create entities for both row headers and column headers if they are named concepts.
222471
+
222472
+ ### Type E → Metrics atoms
222473
+ 1. Create one "metrics" atom per row.
222474
+ Metric schema: ${metricFields}
222475
+ 2. Also create the parent entity if named (e.g., "SLA Requirements").
222476
+
222477
+ ### Type F → Behaviors/Events/Transitions
222478
+ 1. Create one "behaviors" atom per step/phase.
222479
+ Behavior schema: ${behaviorFields}
222480
+ 2. If there are triggers: extract "events" atoms.
222481
+ Event schema: ${eventFields}
222482
+ 3. If there are state changes: extract "transitions" atoms.
222483
+ Transition schema: ${transitionFields}
222484
+ 4. Create the parent entity for the process/workflow.
222485
+
222486
+ ## Output Format
222487
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222488
+ {
222489
+ "P0": {
222490
+ "tableType": "A",
222491
+ "entities": [...],
222492
+ "attributes": [...]
222493
+ },
222494
+ "P3": {
222495
+ "tableType": "C",
222496
+ "comparisons": [...]
222497
+ }
222498
+ }
222499
+
222500
+ ## Rules
222501
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222502
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222503
+ - Every atom MUST include a "confidence" field (0.0-1.0).
222504
+ - The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
222505
+ - Only include atom types that are actually extracted.
222506
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
222507
+ - JSON structure keys must always be in English.
222508
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
222509
+ - Do NOT include "claims" — they are system-generated.`;
222510
/**
 * Build the user prompt that wraps tagged table paragraphs for the
 * table-annotation request.
 *
 * @param {string} tableText - Table paragraphs, each tagged [P0], [P1], ...
 * @returns {string} The prompt with `tableText` placed between `---` rules.
 */
function buildDocTableAnnotationPrompt(tableText) {
  const promptLines = [
    "Classify and extract atoms from the following table paragraphs.",
    "Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.",
    "",
    "---",
    // Template interpolation keeps the same string coercion as the inline form.
    `${tableText}`,
    "---",
    "",
    "Return ONLY a valid JSON object. No markdown fences, no explanation."
  ];
  return promptLines.join("\n");
}
222520
// Alias of TABLE_SYSTEM_PROMPT under the doc-table-annotation name.
var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
222521
+ // ../llm/src/prompts/docDiagramAnnotation.ts
222522
init_src();
// Per-atom schema field listings produced by zodObjectToPromptFields; each one
// is interpolated into DIAGRAM_SYSTEM_PROMPT next to the matching "… schema:" label.
var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
var roleFields = zodObjectToPromptFields(roleAtomSchema);
var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
// Code-fence info strings that are treated as text-based diagram languages.
var DIAGRAM_FENCE_TAGS = [
  "mermaid",
  "plantuml",
  "puml",
  "dot",
  "graphviz",
  "viz",
  "d2",
  "c4plantuml",
  "ditaa",
  "nomnoml",
  "wavedrom",
  "vega",
  "vega-lite"
];
// Matches a whole line made of exactly three backticks, one of the tags above,
// and optional trailing whitespace; the match is case-insensitive.
var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
222549
+ var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
222550
+
222551
+ Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
222552
+
222553
+ ## Step 1: Identify the Diagram Format and Type
222554
+
222555
+ ### Formats
222556
+ - **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
222557
+ - **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
222558
+ - **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
222559
+ - **D2**: modern declarative diagrams with shape/connection syntax
222560
+ - **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
222561
+
222562
+ ### Diagram Types (by semantic content)
222563
+ - **Flowchart / Process**: decision trees, algorithms, business process flows
222564
+ - **Sequence**: interaction between participants over time (API calls, protocols)
222565
+ - **State Machine**: states and transitions triggered by events/guards
222566
+ - **Class / ER**: data models, entity relationships, inheritance hierarchies
222567
+ - **Architecture**: system components, containers, deployment topology
222568
+ - **Gantt / Timeline**: project schedules, milestones, phases
222569
+ - **Pie / Data Viz**: statistical distributions, metrics visualization
222570
+
222571
+ ## Step 2: Extract Atoms by Diagram Type
222572
+
222573
+ ### Flowchart / Process → entities + relations + behaviors + decisions
222574
+ 1. Extract each node as an entity.
222575
+ Entity schema: ${entityFields2}
222576
+ 2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
222577
+ Relation schema: ${relationFields2}
222578
+ 3. Extract action nodes as behaviors (what the process does at each step).
222579
+ Behavior schema: ${behaviorFields2}
222580
+ 4. Extract diamond/condition nodes: if it represents a deliberate choice with rationale → "decisions"; if it represents conditional branching logic (IF-THEN) → "rules".
222581
+ Decision schema: ${decisionFields}
222582
+
222583
+ ### Sequence → entities + relations + behaviors + events
222584
+ 1. Extract each participant/actor as an entity (or role if it's a person/team).
222585
+ Entity schema: ${entityFields2}
222586
+ Role schema: ${roleFields}
222587
+ 2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
222588
+ Relation schema: ${relationFields2}
222589
+ 3. Extract significant interactions as behaviors.
222590
+ Behavior schema: ${behaviorFields2}
222591
+ 4. Extract triggers, responses, and async messages as events.
222592
+ Event schema: ${eventFields2}
222593
+
222594
+ ### State Machine → entities + states + transitions + events
222595
+ 1. Extract the state machine subject as an entity.
222596
+ Entity schema: ${entityFields2}
222597
+ 2. Extract each state as a state atom.
222598
+ State schema: ${stateFields2}
222599
+ 3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
222600
+ Transition schema: ${transitionFields2}
222601
+ 4. Extract triggers as events.
222602
+ Event schema: ${eventFields2}
222603
+
222604
+ ### Class / ER → entities + attributes + relations
222605
+ 1. Extract each class/entity as an entity.
222606
+ Entity schema: ${entityFields2}
222607
+ 2. Extract fields/properties as attributes.
222608
+ Attribute schema: ${attributeFields2}
222609
+ 3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
222610
+ Relation schema: ${relationFields2}
222611
+
222612
+ ### Architecture → entities + relations + constraints
222613
+ 1. Extract each system/service/container/component as an entity. Use kind to indicate origin: "implementation" for internal systems/services, "external" for third-party dependencies (databases, cloud services, external APIs).
222614
+ Entity schema: ${entityFields2}
222615
+ 2. Extract connections between components as relations. Use standard types: CONTAINS (parent→child), DEPENDS_ON (runtime dependency), TRIGGERS (event/process triggering).
222616
+ Relation schema: ${relationFields2}
222617
+ 3. Extract deployment constraints, technology choices.
222618
+ Constraint schema: ${constraintFields2}
222619
+
222620
+ ### Gantt / Timeline → behaviors + events + constraints
222621
+ 1. Extract each task/phase as a behavior.
222622
+ Behavior schema: ${behaviorFields2}
222623
+ 2. Extract milestones and deadlines as events.
222624
+ Event schema: ${eventFields2}
222625
+ 3. Extract dependencies and critical path constraints.
222626
+ Constraint schema: ${constraintFields2}
222627
+
222628
+ ### Pie / Data Viz → attributes (summary only)
222629
+ 1. Extract the chart title as an entity.
222630
+ Entity schema: ${entityFields2}
222631
+ 2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
222632
+ Attribute schema: ${attributeFields2}
222633
+
222634
+ ## Additional Extraction: Diagram Description
222635
+
222636
+ For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
222637
+ - \`name\`: "diagram_description"
222638
+ - \`type\`: "description"
222639
+ - \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
222640
+
222641
+ This description is critical for downstream AI consumers who cannot render the diagram.
222642
+
222643
+ ## Output Format
222644
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
222645
+ {
222646
+ "P0": {
222647
+ "diagramFormat": "mermaid",
222648
+ "diagramType": "sequence",
222649
+ "entities": [...],
222650
+ "relations": [...]
222651
+ }
222652
+ }
222653
+
222654
+ ## Rules
222655
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
222656
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
222657
+ - Every atom MUST include a "confidence" field (0.0-1.0).
222658
+ - The "diagramFormat" and "diagramType" fields are required for each paragraph.
222659
+ - Only include atom types that are actually extracted.
222660
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
222661
+ - JSON structure keys must always be in English.
222662
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
222663
+ - **Extract ALL nodes and edges** — do not sample or skip.
222664
+ - Do NOT include "claims" — they are system-generated.`;
222665
/**
 * Build the user prompt that wraps tagged diagram paragraphs for the
 * diagram-annotation request.
 *
 * @param {string} diagramText - Diagram paragraphs, each tagged [P0], [P1], ...
 * @returns {string} The prompt with `diagramText` placed between `---` rules.
 */
function buildDocDiagramAnnotationPrompt(diagramText) {
  const header =
    "Analyze and extract atoms from the following diagram paragraphs.\n" +
    "Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.\n";
  const footer = "\nReturn ONLY a valid JSON object. No markdown fences, no explanation.";
  // Template interpolation keeps the same string coercion as the inline form.
  const fencedInput = `---\n${diagramText}\n---\n`;
  return `${header}\n${fencedInput}${footer}`;
}
222675
// Alias of DIAGRAM_SYSTEM_PROMPT under the doc-diagram-annotation name.
var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
222277
222676
  // ../llm/src/chunking/markdownChunker.ts
222278
222677
  // Fallback for options.maxTokens in chunkMarkdown (estimated tokens per chunk).
  var DEFAULT_MAX_TOKENS2 = 4000;
  // Fallback for options.paragraphMaxTokens in chunkMarkdown (estimated tokens per coarse paragraph).
  var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
222279
222679
  function estimateTokens(text2) {
222280
222680
  return Math.ceil(text2.length / 4);
222281
222681
  }
222682
/**
 * Locate fenced code blocks (``` or ~~~) in markdown text.
 *
 * Fence rules follow CommonMark closely enough to agree with the other fence
 * handlers in this file:
 * - a fence line may be indented by up to three spaces;
 * - a backtick opener may not carry backticks in its info string (that is an
 *   inline code span such as ```code```, not a fence);
 * - a closing fence uses the same character, is at least as long as the
 *   opener, and is followed by whitespace only. A line such as "```js" inside
 *   an open block is content, not a closer.
 *
 * @param {string} content - Markdown source.
 * @returns {{start: number, end: number}[]} Half-open ranges in document
 *   order. `start` is the offset of the opening fence line, `end` is the
 *   offset just past the closing fence line. An unclosed fence extends to
 *   `content.length`.
 */
function findCodeBlockRanges(content) {
  const ranges = [];
  // Group 1: the fence run. Group 2: the rest of the line (info string or trailing whitespace).
  const fenceLineRe = /^ {0,3}(`{3,}|~{3,})([^\n]*)$/gm;
  let openStart = -1;
  let openFence = "";
  let match;
  while ((match = fenceLineRe.exec(content)) !== null) {
    const fence = match[1];
    const rest = match[2];
    if (openStart === -1) {
      // Backtick fences cannot have backticks in the info string.
      if (fence[0] === "`" && rest.includes("`")) {
        continue;
      }
      openStart = match.index;
      openFence = fence;
      continue;
    }
    const sameKind = fence[0] === openFence[0];
    const longEnough = fence.length >= openFence.length;
    const bareFence = rest.trim() === "";
    if (sameKind && longEnough && bareFence) {
      ranges.push({ start: openStart, end: match.index + match[0].length });
      openStart = -1;
      openFence = "";
    }
  }
  if (openStart !== -1) {
    ranges.push({ start: openStart, end: content.length });
  }
  return ranges;
}
222704
/**
 * Tell whether an offset lies inside one of the given code-block ranges.
 *
 * The scan stops at the first range that starts after `pos`, so `ranges` is
 * expected to be ordered by `start`.
 *
 * @param {number} pos - Character offset to test.
 * @param {{start: number, end: number}[]} ranges - Half-open [start, end) ranges.
 * @returns {boolean} True when `start <= pos < end` for a scanned range.
 */
function isInsideCodeBlock(pos, ranges) {
  let idx = 0;
  while (idx < ranges.length) {
    const { start, end } = ranges[idx];
    if (start <= pos && pos < end) {
      return true;
    }
    if (pos < start) {
      return false;
    }
    idx += 1;
  }
  return false;
}
222282
222713
  function parseSections(content) {
222283
- const headingRe = /^(#{1,6})\s+(.*)$/gm;
222284
- const sections = [];
222714
+ const codeRanges = findCodeBlockRanges(content);
222285
222715
  const matches = [];
222286
- let match;
222287
- while ((match = headingRe.exec(content)) !== null) {
222288
- matches.push({
222289
- index: match.index,
222290
- level: match[1].length,
222291
- heading: match[2].trim()
222292
- });
222716
+ const atxRe = /^(#{1,6})\s+(.*)$/gm;
222717
+ let m;
222718
+ while ((m = atxRe.exec(content)) !== null) {
222719
+ if (!isInsideCodeBlock(m.index, codeRanges)) {
222720
+ matches.push({
222721
+ index: m.index,
222722
+ endIndex: m.index + m[0].length,
222723
+ level: m[1].length,
222724
+ heading: m[2].trim()
222725
+ });
222726
+ }
222293
222727
  }
222728
+ const lines = content.split(`
222729
+ `);
222730
+ let offset = 0;
222731
+ for (let i = 0;i < lines.length; i++) {
222732
+ const line = lines[i];
222733
+ if (i > 0) {
222734
+ const prevLine = lines[i - 1].trim();
222735
+ const prevLineStart = offset - lines[i - 1].length - 1;
222736
+ if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
222737
+ if (/^={2,}\s*$/.test(line)) {
222738
+ matches.push({
222739
+ index: prevLineStart < 0 ? 0 : prevLineStart,
222740
+ endIndex: offset + line.length,
222741
+ level: 1,
222742
+ heading: prevLine
222743
+ });
222744
+ } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
222745
+ matches.push({
222746
+ index: prevLineStart < 0 ? 0 : prevLineStart,
222747
+ endIndex: offset + line.length,
222748
+ level: 2,
222749
+ heading: prevLine
222750
+ });
222751
+ }
222752
+ }
222753
+ }
222754
+ offset += line.length + 1;
222755
+ }
222756
+ matches.sort((a, b) => a.index - b.index);
222757
+ const deduped = [];
222758
+ for (const match of matches) {
222759
+ const last = deduped[deduped.length - 1];
222760
+ if (last && match.index < last.endIndex)
222761
+ continue;
222762
+ deduped.push(match);
222763
+ }
222764
+ return buildSectionsFromMatches(content, deduped);
222765
+ }
222766
+ function buildSectionsFromMatches(content, matches) {
222767
+ const sections = [];
222294
222768
  if (matches.length === 0) {
222295
222769
  const body = content.trim();
222296
222770
  if (body) {
222297
- sections.push({
222298
- heading: "",
222299
- level: 0,
222300
- body,
222301
- paragraphs: splitParagraphs(body)
222302
- });
222771
+ sections.push({ heading: "", level: 0, body, paragraphs: splitParagraphs(body) });
222303
222772
  }
222304
222773
  return sections;
222305
222774
  }
222306
222775
  if (matches[0].index > 0) {
222307
222776
  const preBody = content.slice(0, matches[0].index).trim();
222308
222777
  if (preBody) {
222309
- sections.push({
222310
- heading: "",
222311
- level: 0,
222312
- body: preBody,
222313
- paragraphs: splitParagraphs(preBody)
222314
- });
222778
+ sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
222315
222779
  }
222316
222780
  }
222317
222781
  for (let i = 0;i < matches.length; i++) {
222318
222782
  const m = matches[i];
222319
- const start = m.index;
222320
- const end = i + 1 < matches.length ? matches[i + 1].index : content.length;
222321
- const fullText = content.slice(start, end).trim();
222322
- const headingLineEnd = fullText.indexOf(`
222323
- `);
222324
- const body = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
222783
+ const bodyStart = m.endIndex;
222784
+ const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
222785
+ const body = content.slice(bodyStart, bodyEnd).trim();
222325
222786
  sections.push({
222326
222787
  heading: m.heading,
222327
222788
  level: m.level,
@@ -222336,6 +222797,128 @@ function splitParagraphs(text2) {
222336
222797
  return [];
222337
222798
  return text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
222338
222799
  }
222800
/**
 * Break text that exceeds a token budget into smaller pieces.
 *
 * Strategy, in order:
 * 1. several blank-line-separated paragraphs: pack them greedily, recursing
 *    into any paragraph that is itself over budget;
 * 2. several lines: group them with mergeAtomicBlocks and pack the blocks
 *    greedily; a block that is over budget on its own is emitted whole;
 * 3. a single line: defer to forceBreakText.
 *
 * @param {string} text2 - Text to split.
 * @param {number} maxTokens - Budget per piece, in estimateTokens units.
 * @returns {string[]} Pieces in original order.
 */
function splitOversizedText(text2, maxTokens) {
  // Greedy packer shared by the paragraph and line strategies.
  const pack = (units, joiner, expandOversized) => {
    const packed = [];
    let buffer = "";
    let bufferTokens = 0;
    const flush = () => {
      if (buffer) {
        packed.push(buffer);
      }
      buffer = "";
      bufferTokens = 0;
    };
    for (const unit of units) {
      const unitTokens = estimateTokens(unit);
      if (unitTokens > maxTokens) {
        flush();
        packed.push(...expandOversized(unit));
        continue;
      }
      if (buffer && bufferTokens + unitTokens > maxTokens) {
        flush();
      }
      buffer = buffer ? buffer + joiner + unit : unit;
      bufferTokens += unitTokens;
    }
    if (buffer) {
      packed.push(buffer);
    }
    return packed;
  };

  const paragraphs = text2
    .split(/\n\n+/)
    .map((p) => p.trim())
    .filter(Boolean);
  if (paragraphs.length > 1) {
    return pack(paragraphs, "\n\n", (paragraph) => splitOversizedText(paragraph, maxTokens));
  }

  const lines = text2.split("\n");
  if (lines.length > 1) {
    // Oversized blocks are kept intact rather than split further.
    return pack(mergeAtomicBlocks(lines), "\n", (block) => [block]);
  }

  return forceBreakText(text2, maxTokens);
}
222864
/**
 * Group lines into blocks that should not be split apart.
 *
 * - A fenced code block (opening fence through its closing fence, or to the end
 *   of input when unclosed) becomes one newline-joined block.
 * - A run of consecutive lines starting with "|" becomes one block.
 * - Every other line is a block by itself.
 *
 * @param {string[]} lines - Input lines without trailing newlines.
 * @returns {string[]} Blocks in original order.
 */
function mergeAtomicBlocks(lines) {
  const merged = [];
  const total = lines.length;
  let cursor = 0;
  while (cursor < total) {
    const stripped = lines[cursor].trimStart();
    const opener = /^(`{3,}|~{3,})/.exec(stripped);
    let stop = cursor + 1; // exclusive end of the current block
    if (opener !== null) {
      const marker = opener[1];
      // Closing fence: same character, at least as long, nothing else but whitespace.
      const closer = new RegExp(`^${marker[0]}{${marker.length},}\\s*$`);
      while (stop < total) {
        const closesHere = closer.test(lines[stop].trimStart());
        stop += 1;
        if (closesHere) {
          break;
        }
      }
    } else if (stripped.startsWith("|")) {
      while (stop < total && lines[stop].trimStart().startsWith("|")) {
        stop += 1;
      }
    }
    merged.push(lines.slice(cursor, stop).join("\n"));
    cursor = stop;
  }
  return merged;
}
222905
/**
 * Hard-wrap text into pieces of at most `maxTokens * 4` characters.
 *
 * A break is moved back to the last space when that space lies in the final
 * 30% of the window. Otherwise the text is cut at the window edge, nudged by
 * one code unit if the cut would fall between the two halves of a UTF-16
 * surrogate pair. Pieces are trimmed and whitespace-only pieces are dropped.
 *
 * @param {string} text2 - Text to break.
 * @param {number} maxTokens - Budget per piece; four characters per token.
 * @returns {string[]} Pieces in order. If the budget allows less than one
 *   character, the text is returned unsplit, because no progress could be
 *   made and the loop would never end.
 */
function forceBreakText(text2, maxTokens) {
  const maxChars = Math.floor(maxTokens * 4);
  // Also catches NaN, zero and negative budgets.
  if (!(maxChars >= 1)) {
    return text2 ? [text2] : [];
  }
  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const isLowSurrogate = (code) => code >= 0xdc00 && code <= 0xdfff;
  const results = [];
  let remaining = text2;
  while (remaining.length > maxChars) {
    let breakAt = maxChars;
    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
    if (spaceIdx > maxChars * 0.7) {
      breakAt = spaceIdx;
    } else if (
      isHighSurrogate(remaining.charCodeAt(breakAt - 1)) &&
      isLowSurrogate(remaining.charCodeAt(breakAt))
    ) {
      // Keep the pair together; step forward only when stepping back would stall.
      breakAt += breakAt > 1 ? -1 : 1;
    }
    const piece = remaining.slice(0, breakAt).trim();
    if (piece) {
      results.push(piece);
    }
    remaining = remaining.slice(breakAt).trim();
  }
  if (remaining) {
    results.push(remaining);
  }
  return results;
}
222339
222922
  function buildBreadcrumb(sections, sectionIndex) {
222340
222923
  const current = sections[sectionIndex];
222341
222924
  if (current.level <= 0)
@@ -222364,11 +222947,53 @@ function sectionHeadingLine(section) {
222364
222947
  return "";
222365
222948
  return `${"#".repeat(section.level)} ${section.heading}`;
222366
222949
  }
222950
/**
 * Turn section bodies into "coarse" paragraphs sized for annotation.
 *
 * Each non-blank section body becomes one entry, or several when its token
 * estimate exceeds `paragraphMaxTokens` (via splitOversizedText). Adjacent
 * entries are then merged when both are under 150 tokens and their sum fits the
 * budget. A merged entry keeps the section index of its first member, so the
 * merge can span section boundaries.
 *
 * @param {{body: string}[]} sections - Parsed sections, in document order.
 * @param {number} paragraphMaxTokens - Token budget per coarse paragraph.
 * @returns {{sectionIndex: number, paragraphIndex: number, text: string}[]}
 *   Paragraphs numbered consecutively from 0 across the whole document.
 */
function buildCoarseParagraphs(sections, paragraphMaxTokens) {
  const MERGE_THRESHOLD = 150;

  const rawEntries = sections.flatMap((section, sectionIndex) => {
    const { body } = section;
    if (!body.trim()) {
      return [];
    }
    const bodyTokens = estimateTokens(body);
    if (bodyTokens > paragraphMaxTokens) {
      return splitOversizedText(body, paragraphMaxTokens).map((part) => ({
        sectionIndex,
        text: part,
        tokens: estimateTokens(part)
      }));
    }
    return [{ sectionIndex, text: body, tokens: bodyTokens }];
  });

  const merged = rawEntries.reduce((acc, entry) => {
    const tail = acc[acc.length - 1];
    const bothSmall = Boolean(tail) && tail.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD;
    if (bothSmall && tail.tokens + entry.tokens <= paragraphMaxTokens) {
      tail.text = `${tail.text}\n\n${entry.text}`;
      tail.tokens += entry.tokens;
    } else {
      // Copy so that merging never mutates the raw entries.
      acc.push({ ...entry });
    }
    return acc;
  }, []);

  return merged.map((entry, paragraphIndex) => ({
    sectionIndex: entry.sectionIndex,
    paragraphIndex,
    text: entry.text
  }));
}
222367
222990
  function chunkMarkdown(content, options = {}) {
222368
222991
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
222992
+ const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
222369
222993
  const sections = parseSections(content);
222370
222994
  if (sections.length === 0)
222371
222995
  return [];
222996
+ const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
222372
222997
  const chunks = [];
222373
222998
  let pendingSections = [];
222374
222999
  let pendingTokens = 0;
@@ -222386,14 +223011,16 @@ function chunkMarkdown(content, options = {}) {
222386
223011
  const heading = sectionHeadingLine(entry.section);
222387
223012
  if (heading)
222388
223013
  textParts.push(heading);
222389
- for (let pIdx = 0;pIdx < entry.section.paragraphs.length; pIdx++) {
222390
- const pText = entry.section.paragraphs[pIdx];
222391
- textParts.push(pText);
222392
- paragraphs.push({
222393
- sectionIndex: entry.sectionIndex,
222394
- paragraphIndex: pIdx,
222395
- text: pText
222396
- });
223014
+ const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === entry.sectionIndex);
223015
+ for (const p of sectionParas) {
223016
+ if (!paragraphs.some((existing) => existing.paragraphIndex === p.paragraphIndex && existing.text === p.text)) {
223017
+ textParts.push(p.text);
223018
+ paragraphs.push({
223019
+ sectionIndex: p.sectionIndex,
223020
+ paragraphIndex: p.paragraphIndex,
223021
+ text: p.text
223022
+ });
223023
+ }
222397
223024
  }
222398
223025
  }
222399
223026
  chunks.push({
@@ -222416,7 +223043,7 @@ function chunkMarkdown(content, options = {}) {
222416
223043
  ` : "") + section.body);
222417
223044
  if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
222418
223045
  flushPending();
222419
- splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
223046
+ splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
222420
223047
  continue;
222421
223048
  }
222422
223049
  const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
@@ -222429,9 +223056,10 @@ function chunkMarkdown(content, options = {}) {
222429
223056
  flushPending();
222430
223057
  return chunks;
222431
223058
  }
222432
- function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
223059
+ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
222433
223060
  const headingLine = sectionHeadingLine(section);
222434
223061
  const prefix = breadcrumbPrefix(breadcrumb);
223062
+ const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === sectionIndex);
222435
223063
  let accParagraphs = [];
222436
223064
  let accTextParts = [];
222437
223065
  let accTokens = 0;
@@ -222458,18 +223086,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
222458
223086
  accTokens = baseOverhead;
222459
223087
  }
222460
223088
  accTokens = baseOverhead;
222461
- for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
222462
- const pText = section.paragraphs[pIdx];
222463
- const pTokens = estimateTokens(pText);
223089
+ for (const p of sectionParas) {
223090
+ const pTokens = estimateTokens(p.text);
222464
223091
  if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
222465
223092
  flushAcc();
222466
223093
  }
222467
- accParagraphs.push({ sectionIndex, paragraphIndex: pIdx, text: pText });
222468
- accTextParts.push(pText);
223094
+ accParagraphs.push({ sectionIndex, paragraphIndex: p.paragraphIndex, text: p.text });
223095
+ accTextParts.push(p.text);
222469
223096
  accTokens += pTokens;
222470
223097
  }
222471
223098
  flushAcc();
222472
223099
  }
223100
+ // ../llm/src/chunking/normalizeMarkdown.ts
223101
/**
 * Repair common markdown defects before chunking.
 *
 * Runs, in order: invisible-character stripping, line-ending normalisation,
 * then the line-based block repairs in processBlocks. Each helper reports what
 * it fixed through a shared counter callback.
 *
 * @param {string} content - Raw markdown.
 * @returns {{content: string, stats: {repairs: Object<string, number>}}}
 *   The repaired text plus a per-category repair count.
 */
function normalizeMarkdown(content) {
  const repairs = {};
  const stats = { repairs };
  const count = (category) => {
    const seen = stats.repairs[category] ?? 0;
    stats.repairs[category] = seen + 1;
  };
  const withoutInvisible = stripBomAndInvisible(content, count);
  const unixText = normalizeLineEndings(withoutInvisible, count);
  const repairedLines = processBlocks(unixText.split("\n"), count);
  return { content: repairedLines.join("\n"), stats };
}
223114
/**
 * Remove U+FEFF (BOM), U+200B, U+200C and U+200D wherever they occur.
 *
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Called once with
 *   "invisible_chars" when at least one character was removed.
 * @returns {string} Text without those code points.
 */
function stripBomAndInvisible(text2, count) {
  const invisibleChars = /[\uFEFF\u200B\u200C\u200D]/g;
  const cleaned = text2.replace(invisibleChars, "");
  const removedSomething = cleaned.length !== text2.length;
  if (removedSomething) {
    count("invisible_chars");
  }
  return cleaned;
}
223121
/**
 * Convert CRLF and lone CR line endings to LF.
 *
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Called once with "line_endings"
 *   when the text contains a carriage return.
 * @returns {string} Text with "\n" line endings; the input itself when it
 *   contains no carriage return.
 */
function normalizeLineEndings(text2, count) {
  const hasCarriageReturn = text2.indexOf("\r") !== -1;
  if (!hasCarriageReturn) {
    return text2;
  }
  count("line_endings");
  return text2.replace(/\r\n?/g, "\n");
}
223129
/**
 * Walk the document line by line and route each block to its repair handler.
 *
 * Inline fences are split onto their own lines first. Handlers are then tried
 * in a fixed priority: code fence, table row, blank run, HTML comment,
 * unfenced JSON. The first handler that applies supplies the output lines and
 * the index to resume from; a line nobody claims is copied through unchanged.
 *
 * @param {string[]} inputLines - Document lines.
 * @param {(category: string) => void} count - Repair counter callback.
 * @returns {string[]} Repaired lines.
 */
function processBlocks(inputLines, count) {
  const lines = splitInlineFences(inputLines, count);
  const output = [];
  let cursor = 0;
  while (cursor < lines.length) {
    const rawLine = lines[cursor];
    const text = rawLine.trimStart();
    const fence = /^(`{3,}|~{3,})/.exec(text);

    let handled = null;
    if (fence) {
      handled = handleCodeFence(lines, cursor, fence[1], count);
    } else if (looksLikeTableRow(text)) {
      handled = handleTableBlock(lines, cursor, count);
    } else if (text === "") {
      handled = handleBlankLines(lines, cursor, count);
    } else if (text.startsWith("<!--")) {
      handled = handleHtmlComment(lines, cursor, count);
    } else if (looksLikeJsonBlockStart(text)) {
      // May decline (null); the line is then passed through as-is.
      handled = handleUnfencedJson(lines, cursor, count);
    }

    if (handled) {
      output.push(...handled.lines);
      cursor = handled.nextIndex;
    } else {
      output.push(rawLine);
      cursor += 1;
    }
  }
  return output;
}
223174
/**
 * Copy a fenced code block through untouched, closing it if it never ends.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the opening fence line.
 * @param {string} fence - The opening fence characters (e.g. "```" or "~~~~").
 * @param {(category: string) => void} count - Called with
 *   "unclosed_code_fence" when no closing fence is found.
 * @returns {{lines: string[], nextIndex: number}} The block's lines, with a
 *   synthetic closing fence appended when none was found, and the index of the
 *   first line after the block.
 */
function handleCodeFence(lines, startIdx, fence, count) {
  const marker = fence[0] === "`" ? "`" : "~";
  // Closing fence: same character, at least as long, trailing whitespace only.
  const closingRe = new RegExp(`^${marker}{${fence.length},}\\s*$`);
  const collected = [lines[startIdx]];
  for (let idx = startIdx + 1; idx < lines.length; idx += 1) {
    const candidate = lines[idx];
    collected.push(candidate);
    if (closingRe.test(candidate.trimStart())) {
      return { lines: collected, nextIndex: idx + 1 };
    }
  }
  count("unclosed_code_fence");
  collected.push(fence);
  return { lines: collected, nextIndex: Math.max(startIdx + 1, lines.length) };
}
223192
/**
 * Repair a run of table-looking lines.
 *
 * Two repairs are applied to runs of at least two lines:
 * - rows that contain a pipe but do not start with one get a leading pipe
 *   (and a trailing one if missing);
 * - when no delimiter row exists, one is inserted after the first row, sized
 *   to that row's number of cells.
 *
 * The cell count treats one leading and one unescaped trailing pipe as row
 * borders and counts the remaining unescaped pipes as cell separators. This
 * keeps the delimiter row in step with header rows written without a trailing
 * pipe, such as "| a | b | c".
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the first table-looking line.
 * @param {(category: string) => void} count - Called with "table_leading_pipe"
 *   per repaired row and "table_missing_separator" when a delimiter row is added.
 * @returns {{lines: string[], nextIndex: number}} Repaired rows and the index
 *   of the first line after the run.
 */
function handleTableBlock(lines, startIdx, count) {
  const tableLines = [];
  let i = startIdx;
  while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
    tableLines.push(lines[i]);
    i++;
  }
  // A single pipe-bearing line is left alone.
  if (tableLines.length < 2) {
    return { lines: tableLines, nextIndex: i };
  }
  const normalized = tableLines.map((line) => {
    const trimmed = line.trimStart();
    if (!trimmed.startsWith("|") && trimmed.includes("|")) {
      count("table_leading_pipe");
      return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
    }
    return line;
  });
  const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
  if (!hasSeparator) {
    const firstRow = normalized[0].trim();
    let innerPipes = countPipes(firstRow);
    if (firstRow.startsWith("|")) {
      innerPipes -= 1; // leading border
    }
    if (firstRow.length > 1 && firstRow.endsWith("|") && !firstRow.endsWith("\\|")) {
      innerPipes -= 1; // trailing border; an escaped pipe is cell text
    }
    const colCount = innerPipes + 1;
    if (colCount >= 2) {
      const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
      count("table_missing_separator");
      return { lines: [normalized[0], separator, ...normalized.slice(1)], nextIndex: i };
    }
  }
  return { lines: normalized, nextIndex: i };
}
223223
/**
 * Collapse a long run of blank lines.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the first blank line of the run.
 * @param {(category: string) => void} count - Called with
 *   "excessive_blank_lines" when the run is collapsed.
 * @returns {{lines: string[], nextIndex: number}} A single empty line when the
 *   run has more than two whitespace-only lines, otherwise the run unchanged;
 *   `nextIndex` is the first non-blank line after it.
 */
function handleBlankLines(lines, startIdx, count) {
  const isBlank = (line) => line.trim() === "";
  let runEnd = startIdx;
  for (; runEnd < lines.length; runEnd += 1) {
    if (!isBlank(lines[runEnd])) {
      break;
    }
  }
  const runLength = runEnd - startIdx;
  if (runLength <= 2) {
    return { lines: lines.slice(startIdx, runEnd), nextIndex: runEnd };
  }
  count("excessive_blank_lines");
  return { lines: [""], nextIndex: runEnd };
}
223235
/**
 * Remove an HTML comment that starts at `startIdx`, keeping any visible text
 * that follows its closing `-->` on the same line.
 *
 * Dropping the whole closing line would silently delete content such as the
 * "Important" in `<!-- note --> Important`. The remainder is trimmed, and any
 * further complete comments at its start are stripped as well.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the line whose trimmed text starts with "<!--".
 * @param {(category: string) => void} count - Called with "html_comment" once
 *   per removed comment.
 * @returns {{lines: string[], nextIndex: number}} The surviving text (zero or
 *   one line) and the index after the comment. When the comment never closes,
 *   the opening line is returned unchanged and only that line is consumed.
 */
function handleHtmlComment(lines, startIdx, count) {
  const CLOSE = "-->";
  for (let i = startIdx; i < lines.length; i++) {
    const closeAt = lines[i].indexOf(CLOSE);
    if (closeAt === -1) {
      continue;
    }
    count("html_comment");
    let tail = lines[i].slice(closeAt + CLOSE.length).trim();
    // Strip back-to-back comments such as "<!-- a --> <!-- b --> text".
    while (tail.startsWith("<!--")) {
      const nextClose = tail.indexOf(CLOSE);
      if (nextClose === -1) {
        break;
      }
      count("html_comment");
      tail = tail.slice(nextClose + CLOSE.length).trim();
    }
    return { lines: tail ? [tail] : [], nextIndex: i + 1 };
  }
  // Unterminated comment: leave the text alone rather than eating the document.
  return { lines: [lines[startIdx]], nextIndex: startIdx + 1 };
}
223251
/**
 * Tell whether a trimmed line is a lone JSON opener.
 *
 * @param {string} trimmed - Line text with leading whitespace removed.
 * @returns {boolean} True only when the line is exactly "{" or "[".
 */
function looksLikeJsonBlockStart(trimmed) {
  const openers = ["{", "["];
  return openers.includes(trimmed);
}
223254
// Shortest balanced block (in lines) that handleUnfencedJson will wrap in a fence.
var MIN_JSON_BLOCK_LINES = 5;
/**
 * Wrap a bare multi-line JSON block in a ```json fence.
 *
 * Starting at `startIdx`, brackets are balanced line by line. Double-quoted
 * strings (with backslash escapes) are skipped, and everything after `//`
 * outside a string is ignored. String state carries over from one line to the
 * next.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the line holding the opening "{" or "[".
 * @param {(category: string) => void} count - Called with
 *   "unfenced_json_block" when a block is wrapped.
 * @returns {{lines: string[], nextIndex: number} | null} The fenced block and
 *   the index after it, or null when the brackets never balance, go negative,
 *   the block is shorter than MIN_JSON_BLOCK_LINES, or its last line does not
 *   end with the closer that pairs with the opener.
 */
function handleUnfencedJson(lines, startIdx, count) {
  const firstChar = lines[startIdx].trimStart()[0];
  const expectedCloser = firstChar === "{" ? "}" : "]";
  let nesting = 0;
  let insideString = false;

  // Update nesting / string state for one line of text.
  const scan = (text) => {
    let col = 0;
    while (col < text.length) {
      const ch = text[col];
      if (insideString) {
        if (ch === "\\") {
          col += 1; // skip the escaped character
        } else if (ch === '"') {
          insideString = false;
        }
      } else if (ch === '"') {
        insideString = true;
      } else if (ch === "/" && text[col + 1] === "/") {
        return; // rest of the line is a comment
      } else if (ch === "{" || ch === "[") {
        nesting += 1;
      } else if (ch === "}" || ch === "]") {
        nesting -= 1;
      }
      col += 1;
    }
  };

  for (let idx = startIdx; idx < lines.length; idx += 1) {
    scan(lines[idx]);
    if (nesting < 0) {
      return null;
    }
    if (nesting > 0) {
      continue;
    }
    // Balanced: decide whether this is worth fencing.
    const blockEnd = idx + 1;
    if (blockEnd - startIdx < MIN_JSON_BLOCK_LINES) {
      return null;
    }
    if (!lines[idx].trimEnd().endsWith(expectedCloser)) {
      return null;
    }
    count("unfenced_json_block");
    return {
      lines: ["```json", ...lines.slice(startIdx, blockEnd), "```"],
      nextIndex: blockEnd
    };
  }
  return null;
}
223308
/**
 * Move a code fence glued to the end of a text line onto its own line.
 *
 * A line that already starts with a fence is kept. Otherwise, when the line
 * ends with a run of three or more backticks or tildes (optionally followed by
 * non-space characters), it is split at the last occurrence of that run: the
 * text before it keeps the original indentation, and the fence part follows
 * without indentation.
 *
 * @param {string[]} lines - Document lines.
 * @param {(category: string) => void} count - Called with "inline_code_fence"
 *   per split line.
 * @returns {string[]} Lines with such fences separated.
 */
function splitInlineFences(lines, count) {
  const leadingFence = /^(`{3,}|~{3,})/;
  const trailingFence = /(`{3,}|~{3,})(\S*)\s*$/;
  const result = [];
  for (const line of lines) {
    const trimmed = line.trimStart();
    const tailMatch = leadingFence.test(trimmed) ? null : trimmed.match(trailingFence);
    if (tailMatch) {
      const cutAt = trimmed.lastIndexOf(tailMatch[1]);
      const head = trimmed.substring(0, cutAt);
      if (head.trim().length > 0) {
        const indent = line.substring(0, line.length - trimmed.length);
        count("inline_code_fence");
        result.push(indent + head.trimEnd(), trimmed.substring(cutAt));
        continue;
      }
    }
    result.push(line);
  }
  return result;
}
223333
/**
 * Heuristic for "this line could be a table row".
 *
 * @param {string} trimmed - Line text with leading whitespace removed.
 * @returns {boolean} True when the line has at least one unescaped pipe and
 *   does not start with "#", "```" or "~~~".
 */
function looksLikeTableRow(trimmed) {
  const excludedPrefixes = ["#", "```", "~~~"];
  const isExcluded = excludedPrefixes.some((prefix) => trimmed.startsWith(prefix));
  return !isExcluded && countPipes(trimmed) >= 1;
}
223339
/**
 * Count pipe characters that are not directly preceded by a backslash.
 *
 * @param {string} text2 - Text to scan.
 * @returns {number} Number of unescaped "|" characters.
 */
function countPipes(text2) {
  let total = 0;
  let at = text2.indexOf("|");
  while (at !== -1) {
    const escaped = at > 0 && text2[at - 1] === "\\";
    if (!escaped) {
      total += 1;
    }
    at = text2.indexOf("|", at + 1);
  }
  return total;
}
222473
223348
  // ../llm/src/utils/mapConcurrent.ts
222474
223349
  async function mapConcurrent(items, concurrency, fn) {
222475
223350
  const results = [];