@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/README.md +10 -5
  2. package/index.js +954 -85
  3. package/package.json +1 -1
  4. package/serve.js +2016 -216
  5. package/web/assets/ContentDetail--oZBzWh0.js +1 -0
  6. package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
  7. package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
  8. package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
  9. package/web/assets/ContentDetail-D-2xyerw.js +1 -0
  10. package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
  11. package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
  12. package/web/assets/ContentDetail-y0yi2qln.js +1 -0
  13. package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
  14. package/web/assets/EntityDetail-BI3etmj4.js +1 -0
  15. package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
  16. package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
  17. package/web/assets/EntityDetail-DiJPemDY.js +1 -0
  18. package/web/assets/EntityDetail-DihnDvhA.js +1 -0
  19. package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
  20. package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
  21. package/web/assets/RelationDetail-B2gHrceI.js +1 -0
  22. package/web/assets/RelationDetail-CEq9vopD.js +1 -0
  23. package/web/assets/RelationDetail-CaYrspaS.js +1 -0
  24. package/web/assets/RelationDetail-CpoGdy25.js +1 -0
  25. package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
  26. package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
  27. package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
  28. package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
  29. package/web/assets/index-BPMqeFze.js +111 -0
  30. package/web/assets/index-BgRuvBL5.js +111 -0
  31. package/web/assets/index-CcrkBEZl.js +111 -0
  32. package/web/assets/index-DGDx8sCs.js +111 -0
  33. package/web/assets/index-DIyAwnqE.js +111 -0
  34. package/web/assets/index-DW1cCA8v.js +111 -0
  35. package/web/assets/index-DiAYi5t8.css +1 -0
  36. package/web/assets/index-FOCWvgW_.css +1 -0
  37. package/web/assets/index-daOjyLzy.css +1 -0
  38. package/web/assets/index-moF8uSEi.js +111 -0
  39. package/web/assets/index-sPNyENFN.js +111 -0
  40. package/web/assets/index-uGqDxUnx.css +1 -0
  41. package/web/index.html +2 -2
package/serve.js CHANGED
@@ -281,6 +281,10 @@ var init_serverConfig = __esm(() => {
281
281
  default_model: "gemini-3-pro-preview"
282
282
  }
283
283
  },
284
+ indexing: {
285
+ task_timeout_ms: 150 * 60 * 1000,
286
+ file_timeout_ms: 15 * 60 * 1000
287
+ },
284
288
  embedding: {
285
289
  provider: "huggingface",
286
290
  huggingface: {
@@ -4340,7 +4344,7 @@ var init_atomsSchema = __esm(() => {
4340
4344
  init_zod();
4341
4345
  init_base();
4342
4346
  init_baseSchema();
4343
- confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
4347
+ confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
4344
4348
  entityAtomSchema = exports_external.object({
4345
4349
  name: exports_external.string(),
4346
4350
  kind: kindSchema.optional().catch(undefined),
@@ -186001,6 +186005,10 @@ function mergeServerConfig(parsed) {
186001
186005
  ...isPlainObject3(input.llm?.google) ? input.llm?.google : {}
186002
186006
  }
186003
186007
  },
186008
+ indexing: {
186009
+ ...defaults2.indexing,
186010
+ ...isPlainObject3(input.indexing) ? input.indexing : {}
186011
+ },
186004
186012
  embedding: {
186005
186013
  ...defaults2.embedding,
186006
186014
  ...isPlainObject3(input.embedding) ? input.embedding : {},
@@ -194956,14 +194964,21 @@ function isRetryableStatus(status) {
194956
194964
  function isAuthStatus(status) {
194957
194965
  return status === 401 || status === 403;
194958
194966
  }
194959
- function isBadRequest(status) {
194960
- return status === 400;
194967
+ function throwLlmError(error40, status) {
194968
+ const detail = toErrorMessage(error40);
194969
+ const statusTag = status ? ` [HTTP ${status}]` : "";
194970
+ if (isAuthStatus(status)) {
194971
+ throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
194972
+ }
194973
+ throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
194961
194974
  }
194962
194975
 
194963
194976
  class LlmServiceImpl {
194964
194977
  options;
194978
+ supportsTemperature;
194965
194979
  constructor(options) {
194966
194980
  this.options = options;
194981
+ this.supportsTemperature = options.provider !== "openai";
194967
194982
  }
194968
194983
  async generateText(prompt, options) {
194969
194984
  if (this.options.forceStream) {
@@ -194975,7 +194990,7 @@ class LlmServiceImpl {
194975
194990
  model: this.options.languageModel,
194976
194991
  prompt,
194977
194992
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
194978
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
194993
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
194979
194994
  maxRetries: 0
194980
194995
  };
194981
194996
  if (options?.systemPrompt) {
@@ -195012,13 +195027,7 @@ class LlmServiceImpl {
195012
195027
  durationMs,
195013
195028
  error: toErrorMessage(error40)
195014
195029
  });
195015
- if (isAuthStatus(status)) {
195016
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
195017
- }
195018
- if (isBadRequest(status)) {
195019
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195020
- }
195021
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195030
+ throwLlmError(error40, status);
195022
195031
  }
195023
195032
  }
195024
195033
  async generateTextViaStream(prompt, options) {
@@ -195028,7 +195037,7 @@ class LlmServiceImpl {
195028
195037
  model: this.options.languageModel,
195029
195038
  prompt,
195030
195039
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
195031
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
195040
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
195032
195041
  maxRetries: 0
195033
195042
  };
195034
195043
  if (options?.systemPrompt) {
@@ -195066,13 +195075,7 @@ class LlmServiceImpl {
195066
195075
  durationMs: Date.now() - startedAt,
195067
195076
  error: toErrorMessage(error40)
195068
195077
  });
195069
- if (isAuthStatus(status)) {
195070
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
195071
- }
195072
- if (isBadRequest(status)) {
195073
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195074
- }
195075
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195078
+ throwLlmError(error40, status);
195076
195079
  }
195077
195080
  }
195078
195081
  streamText(prompt, options) {
@@ -195095,7 +195098,7 @@ class LlmServiceImpl {
195095
195098
  model: this.options.languageModel,
195096
195099
  prompt,
195097
195100
  maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
195098
- temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
195101
+ ...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
195099
195102
  maxRetries: 0,
195100
195103
  onFinish: (event) => {
195101
195104
  const finishEvent = event;
@@ -195141,13 +195144,7 @@ class LlmServiceImpl {
195141
195144
  durationMs: Date.now() - startedAt,
195142
195145
  error: toErrorMessage(error40)
195143
195146
  });
195144
- if (isAuthStatus(status)) {
195145
- throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
195146
- }
195147
- if (isBadRequest(status)) {
195148
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195149
- }
195150
- throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
195147
+ throwLlmError(error40, status);
195151
195148
  }
195152
195149
  }
195153
195150
  }
@@ -195858,7 +195855,12 @@ function parseExtractionOutput(raw5, schema2) {
195858
195855
  return { success: false, error: new Error("Empty output") };
195859
195856
  }
195860
195857
  const protocolParsed = tryParseProtocol(trimmed);
195861
- const parsed = protocolParsed ?? tryParseJson(trimmed);
195858
+ let parsed = protocolParsed ?? tryParseJson(trimmed);
195859
+ if (Array.isArray(parsed)) {
195860
+ parsed = { paragraphs: parsed };
195861
+ }
195862
+ parsed = normalizeFlatOutput(parsed);
195863
+ parsed = stripNulls(parsed);
195862
195864
  const result = schema2.safeParse(parsed);
195863
195865
  if (!result.success) {
195864
195866
  return { success: false, error: result.error };
@@ -195928,6 +195930,37 @@ function tryParseJson(raw5) {
195928
195930
  function repairAndParse(raw5) {
195929
195931
  return JSON.parse(jsonrepair(raw5));
195930
195932
  }
195933
+ var PARAGRAPH_TAG_RE = /^P\d+$/;
195934
+ function normalizeFlatOutput(parsed) {
195935
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
195936
+ return parsed;
195937
+ const obj = parsed;
195938
+ if ("paragraphs" in obj)
195939
+ return parsed;
195940
+ const keys = Object.keys(obj);
195941
+ if (keys.length === 0)
195942
+ return { paragraphs: [] };
195943
+ const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
195944
+ if (!allTags)
195945
+ return parsed;
195946
+ const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: obj[tag2] }));
195947
+ return { paragraphs };
195948
+ }
195949
+ function stripNulls(value) {
195950
+ if (value === null)
195951
+ return;
195952
+ if (Array.isArray(value))
195953
+ return value.map(stripNulls);
195954
+ if (typeof value === "object" && value !== null) {
195955
+ const out2 = {};
195956
+ for (const [k, v] of Object.entries(value)) {
195957
+ if (v !== null)
195958
+ out2[k] = stripNulls(v);
195959
+ }
195960
+ return out2;
195961
+ }
195962
+ return value;
195963
+ }
195931
195964
  function isRecord(value) {
195932
195965
  return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
195933
195966
  }
@@ -196220,30 +196253,32 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
196220
196253
  ${ATOM_TYPES_BLOCK}
196221
196254
 
196222
196255
  ## Output Format
196223
- Return a single JSON object with this structure:
196256
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
196224
196257
  {
196225
- "paragraphs": [
196226
- {
196227
- "tag": "P0",
196228
- "atoms": {
196229
- "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
196230
- "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }],
196231
- "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
196232
- }
196233
- }
196234
- ]
196258
+ "P0": {
196259
+ "entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
196260
+ "relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
196261
+ },
196262
+ "P3": {
196263
+ "rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
196264
+ }
196235
196265
  }
196236
196266
 
196237
196267
  ## Rules
196238
- - Each paragraph tag (P0, P1, ...) corresponds to the tagged paragraph in the input.
196268
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
196269
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
196239
196270
  - Only include atom types that are actually found in a paragraph (all types are optional).
196240
196271
  - Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
196241
196272
  - **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
196242
196273
  - Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
196243
196274
  - **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
196275
+ - **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
196244
196276
  - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
196245
196277
  - JSON structure keys (tag, atom type names, field names) must always be in English.
196246
196278
  - Be thorough: extract ALL relevant atoms from each paragraph.
196279
+ - **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
196280
+ - **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
196281
+ - **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
196247
196282
  - Do NOT include "claims" — they are system-generated and not part of document extraction.`;
196248
196283
  function buildDocAtomAnnotationPrompt(chunkText) {
196249
196284
  return `Extract all semantic atoms from the following document text.
@@ -196255,6 +196290,13 @@ ${chunkText}
196255
196290
 
196256
196291
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
196257
196292
  }
196293
+ function toFlatFormat(result) {
196294
+ const flat = {};
196295
+ for (const p4 of result.paragraphs) {
196296
+ flat[p4.tag] = p4.atoms;
196297
+ }
196298
+ return flat;
196299
+ }
196258
196300
  function buildDocGleaningPrompt(chunkText, previousResult) {
196259
196301
  return `Review the following document text and the previously extracted atoms.
196260
196302
  Check for any MISSING atoms that were not captured in the first pass.
@@ -196263,66 +196305,483 @@ Check for any MISSING atoms that were not captured in the first pass.
196263
196305
  ${chunkText}
196264
196306
 
196265
196307
  ## Previously Extracted Atoms
196266
- ${JSON.stringify(previousResult, null, 2)}
196308
+ ${JSON.stringify(toFlatFormat(previousResult), null, 2)}
196267
196309
 
196268
196310
  ## Instructions
196269
- - If you find missing atoms, output them in the same JSON format (with paragraph tags).
196311
+ - If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
196270
196312
  - Only include NEW atoms not already in the previous extraction.
196271
196313
  - Every atom MUST include a "confidence" field (0.0-1.0).
196272
- - If nothing is missing, return: {"paragraphs": []}
196314
+ - **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
196315
+ - If nothing is missing, return: {}
196273
196316
  - Respond in the same language as the input text.
196274
196317
 
196275
196318
  Return ONLY a valid JSON object. No markdown fences, no explanation.`;
196276
196319
  }
196277
196320
  var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
196321
+ // ../llm/src/prompts/entityResolution.ts
196322
+ var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
196323
+
196324
+ ## Task 1: Merge Duplicates
196325
+ - Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
196326
+ - Prefer the LONGER, more descriptive name as the canonical name
196327
+ - Do NOT merge names that share a substring but refer to different things
196328
+ - When uncertain, do NOT merge — add to "ambiguous" instead
196329
+ - Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
196330
+ - Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
196331
+
196332
+ ## Task 2: Remove Noise
196333
+ - Remove names that are NOT meaningful named entities — they are generic words, actions, or descriptions
196334
+ - Examples of noise: common verbs/nouns (登录, 路由, 直连), generic technical terms (Env, query), action descriptions (Kill 3001 进程)
196335
+ - Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台)
196336
+ - When uncertain, KEEP the name — only remove if clearly not a named entity
196337
+
196338
+ ## Output
196339
+ Valid JSON only. No markdown fences, no explanation.`;
196340
+ function buildEntityResolutionPrompt(input) {
196341
+ const parts = [];
196342
+ parts.push(`## All Entity Names (${input.allNames.length} total)`);
196343
+ parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
196344
+ `));
196345
+ if (input.candidates.length > 0) {
196346
+ parts.push("");
196347
+ parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
196348
+ parts.push("Review each pair and decide whether to merge:");
196349
+ for (const c of input.candidates) {
196350
+ parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
196351
+ }
196352
+ }
196353
+ if (input.noiseCandidates && input.noiseCandidates.length > 0) {
196354
+ parts.push("");
196355
+ parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
196356
+ parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
196357
+ for (const n of input.noiseCandidates) {
196358
+ parts.push(`- "${n}"`);
196359
+ }
196360
+ }
196361
+ if (input.contextSnippets && input.contextSnippets.length > 0) {
196362
+ parts.push("");
196363
+ parts.push("## Context Snippets");
196364
+ for (const s of input.contextSnippets) {
196365
+ parts.push(`- **${s.name}**: ${s.snippet}`);
196366
+ }
196367
+ }
196368
+ parts.push("");
196369
+ parts.push(`## Output Format
196370
+ Return a JSON object:
196371
+ {
196372
+ "merges": [
196373
+ { "from": "alias name", "to": "canonical name" }
196374
+ ],
196375
+ "remove": ["noise_name_1", "noise_name_2"],
196376
+ "ambiguous": ["name1", "name2"]
196377
+ }
196378
+
196379
+ - "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
196380
+ - "remove": names confirmed as noise. They will be deleted from entity list.
196381
+ - "ambiguous": names you're unsure about (optional, for logging).
196382
+
196383
+ Return ONLY valid JSON. No markdown fences, no explanation.`);
196384
+ return parts.join(`
196385
+ `);
196386
+ }
196387
+ // ../llm/src/prompts/docTableAnnotation.ts
196388
+ init_src();
196389
+ var entityFields = zodObjectToPromptFields(entityAtomSchema);
196390
+ var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
196391
+ var relationFields = zodObjectToPromptFields(relationAtomSchema);
196392
+ var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
196393
+ var metricFields = zodObjectToPromptFields(metricAtomSchema);
196394
+ var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
196395
+ var eventFields = zodObjectToPromptFields(eventAtomSchema);
196396
+ var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
196397
+ var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
196398
+ var stateFields = zodObjectToPromptFields(stateAtomSchema);
196399
+ var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
196400
+ var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
196401
+
196402
+ Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
196403
+
196404
+ ## Step 1: Classify the Table
196405
+
196406
+ Determine the table type by examining the relationship between rows:
196407
+
196408
+ ### Type A: Collection / Record Table
196409
+ **Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
196410
+ - Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
196411
+ - Key signal: removing one row does not affect the meaning of other rows
196412
+
196413
+ ### Type B: Single-Object Property Table
196414
+ **Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
196415
+ - Examples: API field definitions, configuration schema, entity attribute lists
196416
+ - Key signal: all rows refer to the same parent entity
196417
+
196418
+ ### Type C: Comparison / Evaluation Table
196419
+ **Rows or columns represent different subjects being compared** across the same dimensions.
196420
+ - Examples: technology selection, vendor evaluation, feature comparison
196421
+ - Key signal: multiple named subjects evaluated on shared criteria
196422
+
196423
+ ### Type D: Matrix / Cross-Reference Table
196424
+ **Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
196425
+ - Examples: permission matrices (role × operation), compatibility matrices, dependency tables
196426
+ - Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
196427
+
196428
+ ### Type E: Metrics / KPI Table
196429
+ **Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
196430
+ - Examples: SLA tables, performance baselines, capacity planning tables
196431
+ - Key signal: columns include target/threshold/unit/SLA-style values
196432
+
196433
+ ### Type F: Timeline / Process Table
196434
+ **Rows represent ordered steps or phases** in a sequence.
196435
+ - Examples: deployment steps, approval workflows, version changelog, migration plans
196436
+ - Key signal: rows have implicit ordering, may have phase/step/date columns
196437
+
196438
+ ## Step 2: Extract Atoms by Table Type
196439
+
196440
+ ### Type A → Single attribute with row-object array
196441
+ 1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
196442
+ Entity schema: ${entityFields}
196443
+ 2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
196444
+ Attribute schema: ${attributeFields}
196445
+ Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
196446
+ 3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
196447
+ 4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
196448
+ State schema: ${stateFields}
196449
+ Rule schema: ${ruleFields}
196450
+
196451
+ ### Type B → Multiple attribute atoms
196452
+ 1. Create ONE entity for the parent structure.
196453
+ Entity schema: ${entityFields}
196454
+ 2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
196455
+ Attribute schema: ${attributeFields}
196456
+ 3. Extract constraints from "required" or "validation" columns.
196457
+ Constraint schema: ${constraintFields}
196458
+
196459
+ ### Type C → Comparison atom
196460
+ 1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
196461
+ Comparison schema: ${comparisonFields}
196462
+ 2. Extract "decisions" atoms if the table leads to a conclusion.
196463
+
196464
+ ### Type D → Relations or table attribute
196465
+ 1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
196466
+ Relation schema: ${relationFields}
196467
+ Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
196468
+ 2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
196469
+ 3. Create entities for both row headers and column headers if they are named concepts.
196470
+
196471
+ ### Type E → Metrics atoms
196472
+ 1. Create one "metrics" atom per row.
196473
+ Metric schema: ${metricFields}
196474
+ 2. Also create the parent entity if named (e.g., "SLA Requirements").
196475
+
196476
+ ### Type F → Behaviors/Events/Transitions
196477
+ 1. Create one "behaviors" atom per step/phase.
196478
+ Behavior schema: ${behaviorFields}
196479
+ 2. If there are triggers: extract "events" atoms.
196480
+ Event schema: ${eventFields}
196481
+ 3. If there are state changes: extract "transitions" atoms.
196482
+ Transition schema: ${transitionFields}
196483
+ 4. Create the parent entity for the process/workflow.
196484
+
196485
+ ## Output Format
196486
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
196487
+ {
196488
+ "P0": {
196489
+ "tableType": "A",
196490
+ "entities": [...],
196491
+ "attributes": [...]
196492
+ },
196493
+ "P3": {
196494
+ "tableType": "C",
196495
+ "comparisons": [...]
196496
+ }
196497
+ }
196498
+
196499
+ ## Rules
196500
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
196501
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
196502
+ - Every atom MUST include a "confidence" field (0.0-1.0).
196503
+ - The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
196504
+ - Only include atom types that are actually extracted.
196505
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
196506
+ - JSON structure keys must always be in English.
196507
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
196508
+ - Do NOT include "claims" — they are system-generated.`;
196509
+ function buildDocTableAnnotationPrompt(tableText) {
196510
+ return `Classify and extract atoms from the following table paragraphs.
196511
+ Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.
196512
+
196513
+ ---
196514
+ ${tableText}
196515
+ ---
196516
+
196517
+ Return ONLY a valid JSON object. No markdown fences, no explanation.`;
196518
+ }
196519
+ var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
196520
+ // ../llm/src/prompts/docDiagramAnnotation.ts
196521
+ init_src();
196522
+ var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
196523
+ var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
196524
+ var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
196525
+ var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
196526
+ var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
196527
+ var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
196528
+ var roleFields = zodObjectToPromptFields(roleAtomSchema);
196529
+ var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
196530
+ var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
196531
+ var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
196532
+ var DIAGRAM_FENCE_TAGS = [
196533
+ "mermaid",
196534
+ "plantuml",
196535
+ "puml",
196536
+ "dot",
196537
+ "graphviz",
196538
+ "viz",
196539
+ "d2",
196540
+ "c4plantuml",
196541
+ "ditaa",
196542
+ "nomnoml",
196543
+ "wavedrom",
196544
+ "vega",
196545
+ "vega-lite"
196546
+ ];
196547
+ var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
196548
+ var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
196549
+
196550
+ Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
196551
+
196552
+ ## Step 1: Identify the Diagram Format and Type
196553
+
196554
+ ### Formats
196555
+ - **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
196556
+ - **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
196557
+ - **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
196558
+ - **D2**: modern declarative diagrams with shape/connection syntax
196559
+ - **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
196560
+
196561
+ ### Diagram Types (by semantic content)
196562
+ - **Flowchart / Process**: decision trees, algorithms, business process flows
196563
+ - **Sequence**: interaction between participants over time (API calls, protocols)
196564
+ - **State Machine**: states and transitions triggered by events/guards
196565
+ - **Class / ER**: data models, entity relationships, inheritance hierarchies
196566
+ - **Architecture**: system components, containers, deployment topology
196567
+ - **Gantt / Timeline**: project schedules, milestones, phases
196568
+ - **Pie / Data Viz**: statistical distributions, metrics visualization
196569
+
196570
+ ## Step 2: Extract Atoms by Diagram Type
196571
+
196572
+ ### Flowchart / Process → entities + relations + behaviors + decisions
196573
+ 1. Extract each node as an entity.
196574
+ Entity schema: ${entityFields2}
196575
+ 2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
196576
+ Relation schema: ${relationFields2}
196577
+ 3. Extract action nodes as behaviors (what the process does at each step).
196578
+ Behavior schema: ${behaviorFields2}
196579
+ 4. Extract diamond/decision nodes as decisions.
196580
+ Decision schema: ${decisionFields}
196581
+
196582
+ ### Sequence → entities + relations + behaviors + events
196583
+ 1. Extract each participant/actor as an entity (or role if it's a person/team).
196584
+ Entity schema: ${entityFields2}
196585
+ Role schema: ${roleFields}
196586
+ 2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
196587
+ Relation schema: ${relationFields2}
196588
+ 3. Extract significant interactions as behaviors.
196589
+ Behavior schema: ${behaviorFields2}
196590
+ 4. Extract triggers, responses, and async messages as events.
196591
+ Event schema: ${eventFields2}
196592
+
196593
+ ### State Machine → entities + states + transitions + events
196594
+ 1. Extract the state machine subject as an entity.
196595
+ Entity schema: ${entityFields2}
196596
+ 2. Extract each state as a state atom.
196597
+ State schema: ${stateFields2}
196598
+ 3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
196599
+ Transition schema: ${transitionFields2}
196600
+ 4. Extract triggers as events.
196601
+ Event schema: ${eventFields2}
196602
+
196603
+ ### Class / ER → entities + attributes + relations
196604
+ 1. Extract each class/entity as an entity.
196605
+ Entity schema: ${entityFields2}
196606
+ 2. Extract fields/properties as attributes.
196607
+ Attribute schema: ${attributeFields2}
196608
+ 3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
196609
+ Relation schema: ${relationFields2}
196610
+
196611
+ ### Architecture → entities + relations + constraints
196612
+ 1. Extract each system/service/container/component as an entity.
196613
+ Entity schema: ${entityFields2}
196614
+ 2. Extract connections between components as relations.
196615
+ Relation schema: ${relationFields2}
196616
+ 3. Extract deployment constraints, technology choices.
196617
+ Constraint schema: ${constraintFields2}
196618
+
196619
+ ### Gantt / Timeline → behaviors + events + constraints
196620
+ 1. Extract each task/phase as a behavior.
196621
+ Behavior schema: ${behaviorFields2}
196622
+ 2. Extract milestones and deadlines as events.
196623
+ Event schema: ${eventFields2}
196624
+ 3. Extract dependencies and critical path constraints.
196625
+ Constraint schema: ${constraintFields2}
196626
+
196627
+ ### Pie / Data Viz → attributes (summary only)
196628
+ 1. Extract the chart title as an entity.
196629
+ Entity schema: ${entityFields2}
196630
+ 2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
196631
+ Attribute schema: ${attributeFields2}
196632
+
196633
+ ## Additional Extraction: Diagram Description
196634
+
196635
+ For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
196636
+ - \`name\`: "diagram_description"
196637
+ - \`type\`: "description"
196638
+ - \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
196639
+
196640
+ This description is critical for downstream AI consumers who cannot render the diagram.
196641
+
196642
+ ## Output Format
196643
+ Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
196644
+ {
196645
+ "P0": {
196646
+ "diagramFormat": "mermaid",
196647
+ "diagramType": "sequence",
196648
+ "entities": [...],
196649
+ "relations": [...]
196650
+ }
196651
+ }
196652
+
196653
+ ## Rules
196654
+ - Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
196655
+ - Skip paragraphs with no atoms — do NOT emit empty objects.
196656
+ - Every atom MUST include a "confidence" field (0.0-1.0).
196657
+ - The "diagramFormat" and "diagramType" fields are required for each paragraph.
196658
+ - Only include atom types that are actually extracted.
196659
+ - Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
196660
+ - JSON structure keys must always be in English.
196661
+ - **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
196662
+ - **Extract ALL nodes and edges** — do not sample or skip.
196663
+ - Do NOT include "claims" — they are system-generated.`;
196664
+ function buildDocDiagramAnnotationPrompt(diagramText) {
196665
+ return `Analyze and extract atoms from the following diagram paragraphs.
196666
+ Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.
196667
+
196668
+ ---
196669
+ ${diagramText}
196670
+ ---
196671
+
196672
+ Return ONLY a valid JSON object. No markdown fences, no explanation.`;
196673
+ }
196674
+ var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
196278
196675
  // ../llm/src/chunking/markdownChunker.ts
196279
196676
  var DEFAULT_MAX_TOKENS2 = 4000;
196677
+ var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
196280
196678
  function estimateTokens(text2) {
196281
196679
  return Math.ceil(text2.length / 4);
196282
196680
  }
196681
+ function findCodeBlockRanges(content) {
196682
+ const ranges = [];
196683
+ const fenceRe = /^(`{3,}|~{3,})/gm;
196684
+ let openStart = -1;
196685
+ let openFence = "";
196686
+ let match2;
196687
+ while ((match2 = fenceRe.exec(content)) !== null) {
196688
+ const fence = match2[1];
196689
+ if (openStart === -1) {
196690
+ openStart = match2.index;
196691
+ openFence = fence[0].repeat(fence.length);
196692
+ } else if (fence[0] === openFence[0] && fence.length >= openFence.length) {
196693
+ ranges.push({ start: openStart, end: match2.index + match2[0].length });
196694
+ openStart = -1;
196695
+ openFence = "";
196696
+ }
196697
+ }
196698
+ if (openStart !== -1) {
196699
+ ranges.push({ start: openStart, end: content.length });
196700
+ }
196701
+ return ranges;
196702
+ }
196703
+ function isInsideCodeBlock(pos, ranges) {
196704
+ for (const r of ranges) {
196705
+ if (pos >= r.start && pos < r.end)
196706
+ return true;
196707
+ if (r.start > pos)
196708
+ break;
196709
+ }
196710
+ return false;
196711
+ }
196283
196712
  function parseSections(content) {
196284
- const headingRe = /^(#{1,6})\s+(.*)$/gm;
196285
- const sections = [];
196713
+ const codeRanges = findCodeBlockRanges(content);
196286
196714
  const matches = [];
196287
- let match2;
196288
- while ((match2 = headingRe.exec(content)) !== null) {
196289
- matches.push({
196290
- index: match2.index,
196291
- level: match2[1].length,
196292
- heading: match2[2].trim()
196293
- });
196715
+ const atxRe = /^(#{1,6})\s+(.*)$/gm;
196716
+ let m;
196717
+ while ((m = atxRe.exec(content)) !== null) {
196718
+ if (!isInsideCodeBlock(m.index, codeRanges)) {
196719
+ matches.push({
196720
+ index: m.index,
196721
+ endIndex: m.index + m[0].length,
196722
+ level: m[1].length,
196723
+ heading: m[2].trim()
196724
+ });
196725
+ }
196726
+ }
196727
+ const lines = content.split(`
196728
+ `);
196729
+ let offset = 0;
196730
+ for (let i = 0;i < lines.length; i++) {
196731
+ const line = lines[i];
196732
+ if (i > 0) {
196733
+ const prevLine = lines[i - 1].trim();
196734
+ const prevLineStart = offset - lines[i - 1].length - 1;
196735
+ if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
196736
+ if (/^={2,}\s*$/.test(line)) {
196737
+ matches.push({
196738
+ index: prevLineStart < 0 ? 0 : prevLineStart,
196739
+ endIndex: offset + line.length,
196740
+ level: 1,
196741
+ heading: prevLine
196742
+ });
196743
+ } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
196744
+ matches.push({
196745
+ index: prevLineStart < 0 ? 0 : prevLineStart,
196746
+ endIndex: offset + line.length,
196747
+ level: 2,
196748
+ heading: prevLine
196749
+ });
196750
+ }
196751
+ }
196752
+ }
196753
+ offset += line.length + 1;
196754
+ }
196755
+ matches.sort((a, b) => a.index - b.index);
196756
+ const deduped = [];
196757
+ for (const match2 of matches) {
196758
+ const last = deduped[deduped.length - 1];
196759
+ if (last && match2.index < last.endIndex)
196760
+ continue;
196761
+ deduped.push(match2);
196294
196762
  }
196763
+ return buildSectionsFromMatches(content, deduped);
196764
+ }
196765
+ function buildSectionsFromMatches(content, matches) {
196766
+ const sections = [];
196295
196767
  if (matches.length === 0) {
196296
196768
  const body2 = content.trim();
196297
196769
  if (body2) {
196298
- sections.push({
196299
- heading: "",
196300
- level: 0,
196301
- body: body2,
196302
- paragraphs: splitParagraphs(body2)
196303
- });
196770
+ sections.push({ heading: "", level: 0, body: body2, paragraphs: splitParagraphs(body2) });
196304
196771
  }
196305
196772
  return sections;
196306
196773
  }
196307
196774
  if (matches[0].index > 0) {
196308
196775
  const preBody = content.slice(0, matches[0].index).trim();
196309
196776
  if (preBody) {
196310
- sections.push({
196311
- heading: "",
196312
- level: 0,
196313
- body: preBody,
196314
- paragraphs: splitParagraphs(preBody)
196315
- });
196777
+ sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
196316
196778
  }
196317
196779
  }
196318
196780
  for (let i = 0;i < matches.length; i++) {
196319
196781
  const m = matches[i];
196320
- const start2 = m.index;
196321
- const end = i + 1 < matches.length ? matches[i + 1].index : content.length;
196322
- const fullText = content.slice(start2, end).trim();
196323
- const headingLineEnd = fullText.indexOf(`
196324
- `);
196325
- const body2 = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
196782
+ const bodyStart = m.endIndex;
196783
+ const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
196784
+ const body2 = content.slice(bodyStart, bodyEnd).trim();
196326
196785
  sections.push({
196327
196786
  heading: m.heading,
196328
196787
  level: m.level,
@@ -196337,6 +196796,128 @@ function splitParagraphs(text2) {
196337
196796
  return [];
196338
196797
  return text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
196339
196798
  }
196799
+ function splitOversizedText(text2, maxTokens) {
196800
+ const doubleNewlineParts = text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
196801
+ if (doubleNewlineParts.length > 1) {
196802
+ const results = [];
196803
+ let acc = "";
196804
+ let accTokens = 0;
196805
+ for (const part of doubleNewlineParts) {
196806
+ const partTokens = estimateTokens(part);
196807
+ if (partTokens > maxTokens) {
196808
+ if (acc) {
196809
+ results.push(acc);
196810
+ acc = "";
196811
+ accTokens = 0;
196812
+ }
196813
+ results.push(...splitOversizedText(part, maxTokens));
196814
+ continue;
196815
+ }
196816
+ if (acc && accTokens + partTokens > maxTokens) {
196817
+ results.push(acc);
196818
+ acc = "";
196819
+ accTokens = 0;
196820
+ }
196821
+ acc = acc ? acc + `
196822
+
196823
+ ` + part : part;
196824
+ accTokens += partTokens;
196825
+ }
196826
+ if (acc)
196827
+ results.push(acc);
196828
+ return results;
196829
+ }
196830
+ const lines = text2.split(`
196831
+ `);
196832
+ if (lines.length > 1) {
196833
+ const blocks = mergeAtomicBlocks(lines);
196834
+ const results = [];
196835
+ let acc = "";
196836
+ let accTokens = 0;
196837
+ for (const block of blocks) {
196838
+ const blockTokens = estimateTokens(block);
196839
+ if (blockTokens > maxTokens) {
196840
+ if (acc) {
196841
+ results.push(acc);
196842
+ acc = "";
196843
+ accTokens = 0;
196844
+ }
196845
+ results.push(block);
196846
+ continue;
196847
+ }
196848
+ if (acc && accTokens + blockTokens > maxTokens) {
196849
+ results.push(acc);
196850
+ acc = "";
196851
+ accTokens = 0;
196852
+ }
196853
+ acc = acc ? acc + `
196854
+ ` + block : block;
196855
+ accTokens += blockTokens;
196856
+ }
196857
+ if (acc)
196858
+ results.push(acc);
196859
+ return results;
196860
+ }
196861
+ return forceBreakText(text2, maxTokens);
196862
+ }
196863
+ function mergeAtomicBlocks(lines) {
196864
+ const result = [];
196865
+ let i = 0;
196866
+ while (i < lines.length) {
196867
+ const line = lines[i];
196868
+ const trimmed = line.trimStart();
196869
+ if (/^(`{3,}|~{3,})/.test(trimmed)) {
196870
+ const fence = trimmed.match(/^(`{3,}|~{3,})/)[1];
196871
+ const fenceChar = fence[0];
196872
+ const fenceLen = fence.length;
196873
+ const blockLines = [line];
196874
+ i++;
196875
+ while (i < lines.length) {
196876
+ blockLines.push(lines[i]);
196877
+ const inner = lines[i].trimStart();
196878
+ if (inner.startsWith(fenceChar) && inner.match(new RegExp(`^${fenceChar === "`" ? "`" : "~"}{${fenceLen},}\\s*$`))) {
196879
+ i++;
196880
+ break;
196881
+ }
196882
+ i++;
196883
+ }
196884
+ result.push(blockLines.join(`
196885
+ `));
196886
+ continue;
196887
+ }
196888
+ if (trimmed.startsWith("|")) {
196889
+ const tableLines = [line];
196890
+ i++;
196891
+ while (i < lines.length && lines[i].trimStart().startsWith("|")) {
196892
+ tableLines.push(lines[i]);
196893
+ i++;
196894
+ }
196895
+ result.push(tableLines.join(`
196896
+ `));
196897
+ continue;
196898
+ }
196899
+ result.push(line);
196900
+ i++;
196901
+ }
196902
+ return result;
196903
+ }
196904
+ function forceBreakText(text2, maxTokens) {
196905
+ const maxChars = maxTokens * 4;
196906
+ const results = [];
196907
+ let remaining = text2;
196908
+ while (remaining.length > maxChars) {
196909
+ let breakAt = maxChars;
196910
+ const spaceIdx = remaining.lastIndexOf(" ", maxChars);
196911
+ if (spaceIdx > maxChars * 0.7) {
196912
+ breakAt = spaceIdx;
196913
+ }
196914
+ results.push(remaining.slice(0, breakAt).trim());
196915
+ remaining = remaining.slice(breakAt).trim();
196916
+ }
196917
+ if (remaining)
196918
+ results.push(remaining);
196919
+ return results;
196920
+ }
196340
196921
  function buildBreadcrumb(sections, sectionIndex) {
196341
196922
  const current = sections[sectionIndex];
196342
196923
  if (current.level <= 0)
@@ -196365,11 +196946,53 @@ function sectionHeadingLine(section) {
196365
196946
  return "";
196366
196947
  return `${"#".repeat(section.level)} ${section.heading}`;
196367
196948
  }
196949
+ function buildCoarseParagraphs(sections, paragraphMaxTokens) {
196950
+ const result = [];
196951
+ const rawEntries = [];
196952
+ for (let sIdx = 0;sIdx < sections.length; sIdx++) {
196953
+ const section = sections[sIdx];
196954
+ if (!section.body.trim())
196955
+ continue;
196956
+ const bodyTokens = estimateTokens(section.body);
196957
+ if (bodyTokens > paragraphMaxTokens) {
196958
+ const parts = splitOversizedText(section.body, paragraphMaxTokens);
196959
+ for (const part of parts) {
196960
+ rawEntries.push({ sectionIndex: sIdx, text: part, tokens: estimateTokens(part) });
196961
+ }
196962
+ } else {
196963
+ rawEntries.push({ sectionIndex: sIdx, text: section.body, tokens: bodyTokens });
196964
+ }
196965
+ }
196966
+ const MERGE_THRESHOLD = 150;
196967
+ const merged = [];
196968
+ for (const entry of rawEntries) {
196969
+ const last = merged[merged.length - 1];
196970
+ if (last && last.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD && last.tokens + entry.tokens <= paragraphMaxTokens) {
196971
+ last.text = last.text + `
196972
+
196973
+ ` + entry.text;
196974
+ last.tokens += entry.tokens;
196975
+ } else {
196976
+ merged.push({ ...entry });
196977
+ }
196978
+ }
196979
+ let pIdx = 0;
196980
+ for (const entry of merged) {
196981
+ result.push({
196982
+ sectionIndex: entry.sectionIndex,
196983
+ paragraphIndex: pIdx++,
196984
+ text: entry.text
196985
+ });
196986
+ }
196987
+ return result;
196988
+ }
196368
196989
  function chunkMarkdown(content, options = {}) {
196369
196990
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
196991
+ const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
196370
196992
  const sections = parseSections(content);
196371
196993
  if (sections.length === 0)
196372
196994
  return [];
196995
+ const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
196373
196996
  const chunks = [];
196374
196997
  let pendingSections = [];
196375
196998
  let pendingTokens = 0;
@@ -196387,14 +197010,16 @@ function chunkMarkdown(content, options = {}) {
196387
197010
  const heading = sectionHeadingLine(entry.section);
196388
197011
  if (heading)
196389
197012
  textParts.push(heading);
196390
- for (let pIdx = 0;pIdx < entry.section.paragraphs.length; pIdx++) {
196391
- const pText = entry.section.paragraphs[pIdx];
196392
- textParts.push(pText);
196393
- paragraphs.push({
196394
- sectionIndex: entry.sectionIndex,
196395
- paragraphIndex: pIdx,
196396
- text: pText
196397
- });
197013
+ const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === entry.sectionIndex);
197014
+ for (const p4 of sectionParas) {
197015
+ if (!paragraphs.some((existing) => existing.paragraphIndex === p4.paragraphIndex && existing.text === p4.text)) {
197016
+ textParts.push(p4.text);
197017
+ paragraphs.push({
197018
+ sectionIndex: p4.sectionIndex,
197019
+ paragraphIndex: p4.paragraphIndex,
197020
+ text: p4.text
197021
+ });
197022
+ }
196398
197023
  }
196399
197024
  }
196400
197025
  chunks.push({
@@ -196417,7 +197042,7 @@ function chunkMarkdown(content, options = {}) {
196417
197042
  ` : "") + section.body);
196418
197043
  if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
196419
197044
  flushPending();
196420
- splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
197045
+ splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
196421
197046
  continue;
196422
197047
  }
196423
197048
  const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
@@ -196430,9 +197055,10 @@ function chunkMarkdown(content, options = {}) {
196430
197055
  flushPending();
196431
197056
  return chunks;
196432
197057
  }
196433
- function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
197058
+ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
196434
197059
  const headingLine = sectionHeadingLine(section);
196435
197060
  const prefix = breadcrumbPrefix(breadcrumb);
197061
+ const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === sectionIndex);
196436
197062
  let accParagraphs = [];
196437
197063
  let accTextParts = [];
196438
197064
  let accTokens = 0;
@@ -196459,18 +197085,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
196459
197085
  accTokens = baseOverhead;
196460
197086
  }
196461
197087
  accTokens = baseOverhead;
196462
- for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
196463
- const pText = section.paragraphs[pIdx];
196464
- const pTokens = estimateTokens(pText);
197088
+ for (const p4 of sectionParas) {
197089
+ const pTokens = estimateTokens(p4.text);
196465
197090
  if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
196466
197091
  flushAcc();
196467
197092
  }
196468
- accParagraphs.push({ sectionIndex, paragraphIndex: pIdx, text: pText });
196469
- accTextParts.push(pText);
197093
+ accParagraphs.push({ sectionIndex, paragraphIndex: p4.paragraphIndex, text: p4.text });
197094
+ accTextParts.push(p4.text);
196470
197095
  accTokens += pTokens;
196471
197096
  }
196472
197097
  flushAcc();
196473
197098
  }
197099
+ // ../llm/src/chunking/normalizeMarkdown.ts
197100
+ function normalizeMarkdown(content) {
197101
+ const stats = { repairs: {} };
197102
+ function count(category) {
197103
+ stats.repairs[category] = (stats.repairs[category] ?? 0) + 1;
197104
+ }
197105
+ let result = stripBomAndInvisible(content, count);
197106
+ result = normalizeLineEndings(result, count);
197107
+ const lines = result.split(`
197108
+ `);
197109
+ const output = processBlocks(lines, count);
197110
+ return { content: output.join(`
197111
+ `), stats };
197112
+ }
197113
+ function stripBomAndInvisible(text2, count) {
197114
+ const cleaned = text2.replace(/[\uFEFF\u200B\u200C\u200D]/g, "");
197115
+ if (cleaned.length !== text2.length) {
197116
+ count("invisible_chars");
197117
+ }
197118
+ return cleaned;
197119
+ }
197120
+ function normalizeLineEndings(text2, count) {
197121
+ if (text2.includes("\r")) {
197122
+ count("line_endings");
197123
+ return text2.replace(/\r\n?/g, `
197124
+ `);
197125
+ }
197126
+ return text2;
197127
+ }
197128
+ function processBlocks(inputLines, count) {
197129
+ const lines = splitInlineFences(inputLines, count);
197130
+ const output = [];
197131
+ let i = 0;
197132
+ while (i < lines.length) {
197133
+ const line = lines[i];
197134
+ const trimmed = line.trimStart();
197135
+ const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
197136
+ if (fenceMatch) {
197137
+ const result = handleCodeFence(lines, i, fenceMatch[1], count);
197138
+ output.push(...result.lines);
197139
+ i = result.nextIndex;
197140
+ continue;
197141
+ }
197142
+ if (looksLikeTableRow(trimmed)) {
197143
+ const result = handleTableBlock(lines, i, count);
197144
+ output.push(...result.lines);
197145
+ i = result.nextIndex;
197146
+ continue;
197147
+ }
197148
+ if (trimmed === "") {
197149
+ const result = handleBlankLines(lines, i, count);
197150
+ output.push(...result.lines);
197151
+ i = result.nextIndex;
197152
+ continue;
197153
+ }
197154
+ if (trimmed.startsWith("<!--")) {
197155
+ const result = handleHtmlComment(lines, i, count);
197156
+ output.push(...result.lines);
197157
+ i = result.nextIndex;
197158
+ continue;
197159
+ }
197160
+ if (looksLikeJsonBlockStart(trimmed)) {
197161
+ const result = handleUnfencedJson(lines, i, count);
197162
+ if (result) {
197163
+ output.push(...result.lines);
197164
+ i = result.nextIndex;
197165
+ continue;
197166
+ }
197167
+ }
197168
+ output.push(line);
197169
+ i++;
197170
+ }
197171
+ return output;
197172
+ }
197173
+ function handleCodeFence(lines, startIdx, fence, count) {
197174
+ const fenceChar = fence[0];
197175
+ const fenceLen = fence.length;
197176
+ const result = [lines[startIdx]];
197177
+ let i = startIdx + 1;
197178
+ while (i < lines.length) {
197179
+ const trimmed = lines[i].trimStart();
197180
+ result.push(lines[i]);
197181
+ const closingRe = new RegExp(`^${fenceChar === "`" ? "`" : "~"}{${fenceLen},}\\s*$`);
197182
+ if (closingRe.test(trimmed)) {
197183
+ return { lines: result, nextIndex: i + 1 };
197184
+ }
197185
+ i++;
197186
+ }
197187
+ count("unclosed_code_fence");
197188
+ result.push(fence);
197189
+ return { lines: result, nextIndex: i };
197190
+ }
197191
+ function handleTableBlock(lines, startIdx, count) {
197192
+ const tableLines = [];
197193
+ let i = startIdx;
197194
+ while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
197195
+ tableLines.push(lines[i]);
197196
+ i++;
197197
+ }
197198
+ if (tableLines.length < 2) {
197199
+ return { lines: tableLines, nextIndex: i };
197200
+ }
197201
+ const normalized = tableLines.map((line) => {
197202
+ const trimmed = line.trimStart();
197203
+ if (!trimmed.startsWith("|") && trimmed.includes("|")) {
197204
+ count("table_leading_pipe");
197205
+ return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
197206
+ }
197207
+ return line;
197208
+ });
197209
+ const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
197210
+ if (!hasSeparator && normalized.length >= 2) {
197211
+ const firstRow = normalized[0].trim();
197212
+ const colCount = countPipes(firstRow) - 1;
197213
+ if (colCount >= 2) {
197214
+ const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
197215
+ count("table_missing_separator");
197216
+ const result = [normalized[0], separator, ...normalized.slice(1)];
197217
+ return { lines: result, nextIndex: i };
197218
+ }
197219
+ }
197220
+ return { lines: normalized, nextIndex: i };
197221
+ }
197222
+ function handleBlankLines(lines, startIdx, count) {
197223
+ let i = startIdx;
197224
+ while (i < lines.length && lines[i].trim() === "") {
197225
+ i++;
197226
+ }
197227
+ const blankCount = i - startIdx;
197228
+ if (blankCount > 2) {
197229
+ count("excessive_blank_lines");
197230
+ return { lines: [""], nextIndex: i };
197231
+ }
197232
+ return { lines: lines.slice(startIdx, i), nextIndex: i };
197233
+ }
197234
+ function handleHtmlComment(lines, startIdx, count) {
197235
+ const firstLine = lines[startIdx];
197236
+ if (firstLine.includes("-->")) {
197237
+ count("html_comment");
197238
+ return { lines: [], nextIndex: startIdx + 1 };
197239
+ }
197240
+ let i = startIdx + 1;
197241
+ while (i < lines.length) {
197242
+ if (lines[i].includes("-->")) {
197243
+ count("html_comment");
197244
+ return { lines: [], nextIndex: i + 1 };
197245
+ }
197246
+ i++;
197247
+ }
197248
+ return { lines: [firstLine], nextIndex: startIdx + 1 };
197249
+ }
197250
+ function looksLikeJsonBlockStart(trimmed) {
197251
+ return trimmed === "{" || trimmed === "[";
197252
+ }
197253
+ var MIN_JSON_BLOCK_LINES = 5;
197254
+ function handleUnfencedJson(lines, startIdx, count) {
197255
+ const opener = lines[startIdx].trimStart();
197256
+ const openChar = opener[0];
197257
+ const closeChar = openChar === "{" ? "}" : "]";
197258
+ let depth = 0;
197259
+ let i = startIdx;
197260
+ let inString = false;
197261
+ while (i < lines.length) {
197262
+ const line = lines[i];
197263
+ for (let c = 0;c < line.length; c++) {
197264
+ const ch = line[c];
197265
+ if (ch === "\\" && inString) {
197266
+ c++;
197267
+ continue;
197268
+ }
197269
+ if (ch === '"') {
197270
+ inString = !inString;
197271
+ continue;
197272
+ }
197273
+ if (inString)
197274
+ continue;
197275
+ if (ch === "/" && c + 1 < line.length && line[c + 1] === "/") {
197276
+ break;
197277
+ }
197278
+ if (ch === "{" || ch === "[")
197279
+ depth++;
197280
+ else if (ch === "}" || ch === "]")
197281
+ depth--;
197282
+ }
197283
+ i++;
197284
+ if (depth === 0) {
197285
+ const blockLen = i - startIdx;
197286
+ if (blockLen < MIN_JSON_BLOCK_LINES) {
197287
+ return null;
197288
+ }
197289
+ const lastTrimmed = lines[i - 1].trimEnd();
197290
+ if (!lastTrimmed.endsWith(closeChar)) {
197291
+ return null;
197292
+ }
197293
+ count("unfenced_json_block");
197294
+ const fenced = ["```json"];
197295
+ for (let j = startIdx;j < i; j++) {
197296
+ fenced.push(lines[j]);
197297
+ }
197298
+ fenced.push("```");
197299
+ return { lines: fenced, nextIndex: i };
197300
+ }
197301
+ if (depth < 0) {
197302
+ return null;
197303
+ }
197304
+ }
197305
+ return null;
197306
+ }
197307
+ function splitInlineFences(lines, count) {
197308
+ const result = [];
197309
+ for (const line of lines) {
197310
+ const trimmed = line.trimStart();
197311
+ if (/^(`{3,}|~{3,})/.test(trimmed)) {
197312
+ result.push(line);
197313
+ continue;
197314
+ }
197315
+ const inlineMatch = trimmed.match(/(`{3,}|~{3,})(\S*)\s*$/);
197316
+ if (inlineMatch) {
197317
+ const fenceStr = inlineMatch[1];
197318
+ const fenceIdx = trimmed.lastIndexOf(fenceStr);
197319
+ const beforeFence = trimmed.substring(0, fenceIdx);
197320
+ if (beforeFence.trim().length > 0) {
197321
+ const leadingWhitespace = line.substring(0, line.length - trimmed.length);
197322
+ count("inline_code_fence");
197323
+ result.push(leadingWhitespace + beforeFence.trimEnd());
197324
+ result.push(trimmed.substring(fenceIdx));
197325
+ continue;
197326
+ }
197327
+ }
197328
+ result.push(line);
197329
+ }
197330
+ return result;
197331
+ }
197332
+ function looksLikeTableRow(trimmed) {
197333
+ if (trimmed.startsWith("#") || trimmed.startsWith("```") || trimmed.startsWith("~~~")) {
197334
+ return false;
197335
+ }
197336
+ return countPipes(trimmed) >= 1;
197337
+ }
197338
+ function countPipes(text2) {
197339
+ let count = 0;
197340
+ for (let i = 0;i < text2.length; i++) {
197341
+ if (text2[i] === "|" && (i === 0 || text2[i - 1] !== "\\")) {
197342
+ count++;
197343
+ }
197344
+ }
197345
+ return count;
197346
+ }
196474
197347
  // ../llm/src/utils/mapConcurrent.ts
196475
197348
  async function mapConcurrent(items, concurrency, fn) {
196476
197349
  const results = [];
@@ -196499,9 +197372,760 @@ async function mapConcurrent(items, concurrency, fn) {
196499
197372
  }
196500
197373
  // ../api/src/services/docIndexer.ts
196501
197374
  init_src();
197375
+
197376
+ // ../api/src/services/docEmbedding.ts
197377
+ var EMBEDDING_BATCH_SIZE = 20;
197378
+ var EMBEDDING_MAX_TOKENS = 480;
197379
+ function isPureCodeBlock(text2) {
197380
+ const trimmed = text2.trim();
197381
+ if (/^```[\s\S]*```\s*$/.test(trimmed))
197382
+ return true;
197383
+ const lines = trimmed.split(`
197384
+ `).filter(Boolean);
197385
+ if (lines.length < 3)
197386
+ return false;
197387
+ const indentedLines = lines.filter((l) => /^\s{2,}/.test(l)).length;
197388
+ const indentRatio = indentedLines / lines.length;
197389
+ if (indentRatio > 0.8)
197390
+ return true;
197391
+ const codeChars = (trimmed.match(/[{}();=><|&![\]]/g) || []).length;
197392
+ const ratio = codeChars / trimmed.length;
197393
+ if (ratio > 0.15 && indentRatio > 0.6)
197394
+ return true;
197395
+ return false;
197396
+ }
197397
+ var CODE_SKELETON_MAX_LINES = 20;
197398
+ var CODE_SKELETON_MAX_CHARS = 800;
197399
+ function skeletonizeCodeBlock(text2) {
197400
+ const lines = text2.split(`
197401
+ `);
197402
+ let indentUnit = 2;
197403
+ for (const line of lines) {
197404
+ const match2 = line.match(/^(\s+)\S/);
197405
+ if (match2) {
197406
+ const spaces = match2[1].replace(/\t/g, " ").length;
197407
+ if (spaces > 0) {
197408
+ indentUnit = spaces;
197409
+ break;
197410
+ }
197411
+ }
197412
+ }
197413
+ const maxIndent = indentUnit * 2;
197414
+ const kept = [];
197415
+ let lastWasElided = false;
197416
+ for (const line of lines) {
197417
+ const trimmed = line.trimStart();
197418
+ if (trimmed === "")
197419
+ continue;
197420
+ const leadingSpaces = line.replace(/\t/g, " ").length - trimmed.length;
197421
+ if (leadingSpaces <= maxIndent) {
197422
+ if (lastWasElided) {
197423
+ kept.push(" ...");
197424
+ lastWasElided = false;
197425
+ }
197426
+ kept.push(line);
197427
+ } else {
197428
+ lastWasElided = true;
197429
+ }
197430
+ }
197431
+ if (lastWasElided)
197432
+ kept.push(" ...");
197433
+ let result = kept;
197434
+ if (result.length > CODE_SKELETON_MAX_LINES) {
197435
+ result = result.slice(0, CODE_SKELETON_MAX_LINES);
197436
+ result.push("[...]");
197437
+ }
197438
+ let joined = result.join(`
197439
+ `);
197440
+ if (joined.length > CODE_SKELETON_MAX_CHARS) {
197441
+ joined = joined.slice(0, CODE_SKELETON_MAX_CHARS) + `
197442
+ [...]`;
197443
+ }
197444
+ return joined;
197445
+ }
197446
+ function truncateForEmbedding(text2, maxTokens) {
197447
+ const maxChars = maxTokens * 4;
197448
+ if (text2.length <= maxChars)
197449
+ return text2;
197450
+ const spaceIdx = text2.lastIndexOf(" ", maxChars);
197451
+ const breakAt = spaceIdx > maxChars * 0.8 ? spaceIdx : maxChars;
197452
+ return text2.slice(0, breakAt);
197453
+ }
197454
+ async function generateEmbeddings(digest, embeddingService, onProgress) {
197455
+ const paragraphs = [];
197456
+ let skippedCode = 0;
197457
+ for (let sIdx = 0;sIdx < digest.sections.length; sIdx++) {
197458
+ const section = digest.sections[sIdx];
197459
+ for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
197460
+ const text2 = section.paragraphs[pIdx].text;
197461
+ if (isPureCodeBlock(text2)) {
197462
+ skippedCode++;
197463
+ continue;
197464
+ }
197465
+ paragraphs.push({
197466
+ sectionIndex: sIdx,
197467
+ paragraphIndex: pIdx,
197468
+ text: truncateForEmbedding(text2, EMBEDDING_MAX_TOKENS)
197469
+ });
197470
+ }
197471
+ }
197472
+ if (paragraphs.length === 0)
197473
+ return 0;
197474
+ if (skippedCode > 0) {
197475
+ onProgress?.({ phase: "embedding", progress: 85, message: `Skipped ${skippedCode} code-only paragraphs` });
197476
+ }
197477
+ const embeddings = [];
197478
+ const totalParagraphs = paragraphs.length;
197479
+ onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
197480
+ const warmupStart = Date.now();
197481
+ await embeddingService.getDimension();
197482
+ const warmupMs = Date.now() - warmupStart;
197483
+ if (warmupMs > 500) {
197484
+ onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
197485
+ }
197486
+ const totalBatches = Math.ceil(totalParagraphs / EMBEDDING_BATCH_SIZE);
197487
+ for (let i = 0;i < totalParagraphs; i += EMBEDDING_BATCH_SIZE) {
197488
+ const batchIndex = Math.floor(i / EMBEDDING_BATCH_SIZE) + 1;
197489
+ const batch2 = paragraphs.slice(i, i + EMBEDDING_BATCH_SIZE);
197490
+ const texts = batch2.map((p4) => p4.text);
197491
+ const batchStart = Date.now();
197492
+ try {
197493
+ const vectors = await embeddingService.embedBatch(texts);
197494
+ for (let j = 0;j < batch2.length; j++) {
197495
+ embeddings.push({
197496
+ sectionIndex: batch2[j].sectionIndex,
197497
+ paragraphIndex: batch2[j].paragraphIndex,
197498
+ vector: vectors[j]
197499
+ });
197500
+ }
197501
+ } catch {
197502
+ for (let fi = 0;fi < batch2.length; fi++) {
197503
+ const p4 = batch2[fi];
197504
+ try {
197505
+ const vector = await embeddingService.embed(p4.text);
197506
+ embeddings.push({
197507
+ sectionIndex: p4.sectionIndex,
197508
+ paragraphIndex: p4.paragraphIndex,
197509
+ vector
197510
+ });
197511
+ } catch {
197512
+ console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`);
197513
+ }
197514
+ const embedded2 = i + fi + 1;
197515
+ const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
197516
+ onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
197517
+ }
197518
+ continue;
197519
+ }
197520
+ const embedded = Math.min(i + EMBEDDING_BATCH_SIZE, totalParagraphs);
197521
+ const batchMs = Date.now() - batchStart;
197522
+ const progress = 86 + Math.round(embedded / totalParagraphs * 9);
197523
+ onProgress?.({ phase: "embedding", progress, message: `Batch ${batchIndex}/${totalBatches} (${embedded}/${totalParagraphs}, ${(batchMs / 1000).toFixed(1)}s)` });
197524
+ }
197525
+ digest.embeddings = embeddings;
197526
+ return embeddings.length;
197527
+ }
197528
+ async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
197529
+ if (digest.embeddings.length === 0)
197530
+ return;
197531
+ try {
197532
+ await vectorStore.deleteByPrefix(`${hashId}:`);
197533
+ await vectorStore.add(digest.embeddings.map((e) => ({
197534
+ id: `${hashId}:${e.sectionIndex}:${e.paragraphIndex}`,
197535
+ embedding: e.vector,
197536
+ metadata: {
197537
+ layer: "digest",
197538
+ sourceId,
197539
+ hashId,
197540
+ sourcePath,
197541
+ sectionIndex: e.sectionIndex,
197542
+ paragraphIndex: e.paragraphIndex
197543
+ }
197544
+ })));
197545
+ } catch (err2) {
197546
+ console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
197547
+ }
197548
+ }
197549
+
197550
+ // ../api/src/services/docTableExtractor.ts
197551
+ init_src();
197552
+ function detectTableColumnCount(text2) {
197553
+ const sepMatch = text2.match(/^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/m);
197554
+ if (!sepMatch)
197555
+ return 0;
197556
+ return (sepMatch[0].match(/\|/g)?.length ?? 1) - 1;
197557
+ }
197558
+ async function extractTableAtoms(chunk, sections, result, llmService) {
197559
+ const tableParagraphs = [];
197560
+ for (let i = 0;i < chunk.paragraphs.length; i++) {
197561
+ const p4 = chunk.paragraphs[i];
197562
+ const colCount = detectTableColumnCount(p4.text);
197563
+ if (colCount < 2)
197564
+ continue;
197565
+ const section = sections[p4.sectionIndex];
197566
+ const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
197567
+ tableParagraphs.push({
197568
+ chunkParaIndex: i,
197569
+ colCount,
197570
+ text: p4.text,
197571
+ sectionHeading
197572
+ });
197573
+ }
197574
+ if (tableParagraphs.length === 0) {
197575
+ return { extracted: 0, llmCalls: 0, totalTokens: 0 };
197576
+ }
197577
+ const parts = [];
197578
+ const tagToChunkIndex = new Map;
197579
+ for (let ti = 0;ti < tableParagraphs.length; ti++) {
197580
+ const tp = tableParagraphs[ti];
197581
+ const tag2 = `P${ti}`;
197582
+ tagToChunkIndex.set(tag2, tp.chunkParaIndex);
197583
+ if (tp.sectionHeading) {
197584
+ parts.push(tp.sectionHeading);
197585
+ }
197586
+ if (tp.chunkParaIndex > 0) {
197587
+ const prevPara = chunk.paragraphs[tp.chunkParaIndex - 1];
197588
+ if (prevPara && detectTableColumnCount(prevPara.text) === 0) {
197589
+ parts.push(prevPara.text);
197590
+ }
197591
+ }
197592
+ parts.push(`[${tag2}] ${tp.text}`);
197593
+ parts.push("");
197594
+ }
197595
+ const tableText = parts.join(`
197596
+
197597
+ `);
197598
+ const prompt = buildDocTableAnnotationPrompt(tableText);
197599
+ try {
197600
+ const res = await llmService.generateText(prompt, {
197601
+ systemPrompt: DOC_TABLE_ANNOTATION_SYSTEM_PROMPT
197602
+ });
197603
+ const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
197604
+ if (!parsed.success) {
197605
+ console.warn(`[docIndexer] table extraction: parse failed: ${parsed.error.message.slice(0, 200)}`);
197606
+ return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
197607
+ }
197608
+ let extracted = 0;
197609
+ for (const tableP of parsed.data.paragraphs) {
197610
+ const chunkParaIndex = tagToChunkIndex.get(tableP.tag);
197611
+ if (chunkParaIndex === undefined)
197612
+ continue;
197613
+ const originalTag = `P${chunkParaIndex}`;
197614
+ const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
197615
+ const tableAtomCount = Object.values(tableP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
197616
+ if (tableAtomCount === 0)
197617
+ continue;
197618
+ if (existing) {
197619
+ for (const [atomType, atoms2] of Object.entries(tableP.atoms)) {
197620
+ if (Array.isArray(atoms2) && atoms2.length > 0) {
197621
+ existing.atoms[atomType] = atoms2;
197622
+ }
197623
+ }
197624
+ } else {
197625
+ result.paragraphs.push({ ...tableP, tag: originalTag });
197626
+ }
197627
+ extracted++;
197628
+ const tp = tableParagraphs.find((t4) => t4.chunkParaIndex === chunkParaIndex);
197629
+ console.log(`[docIndexer] table extraction: ${originalTag} → ${tableAtomCount} atoms (table has ${tp?.colCount ?? "?"} cols)`);
197630
+ }
197631
+ return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
197632
+ } catch (err2) {
197633
+ console.warn("[docIndexer] table extraction failed (non-blocking):", err2);
197634
+ return { extracted: 0, llmCalls: 1, totalTokens: 0 };
197635
+ }
197636
+ }
197637
+
197638
+ // ../api/src/services/docDiagramExtractor.ts
197639
+ init_src();
197640
+ var DIAGRAM_OPEN_RE = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "im");
197641
+ function detectDiagramFormat(text2) {
197642
+ const trimmed = text2.trim();
197643
+ const match2 = trimmed.match(new RegExp(`^\`\`\`(${DIAGRAM_FENCE_TAGS.join("|")})\\s*\\n`, "i"));
197644
+ if (!match2)
197645
+ return null;
197646
+ if (!trimmed.endsWith("```"))
197647
+ return null;
197648
+ return match2[1].toLowerCase();
197649
+ }
197650
+ async function extractDiagramAtoms(chunk, sections, result, llmService) {
197651
+ const diagramParagraphs = [];
197652
+ for (let i = 0;i < chunk.paragraphs.length; i++) {
197653
+ const p4 = chunk.paragraphs[i];
197654
+ const format = detectDiagramFormat(p4.text);
197655
+ if (!format)
197656
+ continue;
197657
+ const section = sections[p4.sectionIndex];
197658
+ const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
197659
+ diagramParagraphs.push({
197660
+ chunkParaIndex: i,
197661
+ format,
197662
+ text: p4.text,
197663
+ sectionHeading
197664
+ });
197665
+ }
197666
+ if (diagramParagraphs.length === 0) {
197667
+ return { extracted: 0, llmCalls: 0, totalTokens: 0 };
197668
+ }
197669
+ const parts = [];
197670
+ const tagToChunkIndex = new Map;
197671
+ for (let di = 0;di < diagramParagraphs.length; di++) {
197672
+ const dp = diagramParagraphs[di];
197673
+ const tag2 = `P${di}`;
197674
+ tagToChunkIndex.set(tag2, dp.chunkParaIndex);
197675
+ if (dp.sectionHeading) {
197676
+ parts.push(dp.sectionHeading);
197677
+ }
197678
+ if (dp.chunkParaIndex > 0) {
197679
+ const prevPara = chunk.paragraphs[dp.chunkParaIndex - 1];
197680
+ if (prevPara && !detectDiagramFormat(prevPara.text)) {
197681
+ parts.push(prevPara.text);
197682
+ }
197683
+ }
197684
+ parts.push(`[${tag2}] ${dp.text}`);
197685
+ parts.push("");
197686
+ }
197687
+ const diagramText = parts.join(`
197688
+
197689
+ `);
197690
+ const prompt = buildDocDiagramAnnotationPrompt(diagramText);
197691
+ try {
197692
+ const res = await llmService.generateText(prompt, {
197693
+ systemPrompt: DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT
197694
+ });
197695
+ const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
197696
+ if (!parsed.success) {
197697
+ console.warn(`[docIndexer] diagram extraction: parse failed — ${parsed.error.message.slice(0, 200)}`);
197698
+ return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
197699
+ }
197700
+ let extracted = 0;
197701
+ for (const diagramP of parsed.data.paragraphs) {
197702
+ const chunkParaIndex = tagToChunkIndex.get(diagramP.tag);
197703
+ if (chunkParaIndex === undefined)
197704
+ continue;
197705
+ const originalTag = `P${chunkParaIndex}`;
197706
+ const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
197707
+ const diagramAtomCount = Object.values(diagramP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
197708
+ if (diagramAtomCount === 0)
197709
+ continue;
197710
+ if (existing) {
197711
+ for (const [atomType, atoms2] of Object.entries(diagramP.atoms)) {
197712
+ if (Array.isArray(atoms2) && atoms2.length > 0) {
197713
+ existing.atoms[atomType] = atoms2;
197714
+ }
197715
+ }
197716
+ } else {
197717
+ result.paragraphs.push({ ...diagramP, tag: originalTag });
197718
+ }
197719
+ extracted++;
197720
+ const dp = diagramParagraphs.find((d) => d.chunkParaIndex === chunkParaIndex);
197721
+ console.log(`[docIndexer] diagram extraction: ${originalTag} → ${diagramAtomCount} atoms (${dp?.format ?? "unknown"} diagram)`);
197722
+ }
197723
+ return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
197724
+ } catch (err2) {
197725
+ console.warn("[docIndexer] diagram extraction failed (non-blocking):", err2);
197726
+ return { extracted: 0, llmCalls: 1, totalTokens: 0 };
197727
+ }
197728
+ }
197729
+
197730
+ // ../api/src/services/docAtomPostProcess.ts
197731
+ function postProcessDigestAtoms(sections) {
197732
+ autoCompleteEntities(sections);
197733
+ normalizeEntityNames(sections);
197734
+ warnCrossRefIssues(sections);
197735
+ }
197736
+ function isNoiseEntityName(name21) {
197737
+ const trimmed = name21.trim();
197738
+ if (trimmed.length === 0)
197739
+ return true;
197740
+ if (trimmed.startsWith("$"))
197741
+ return true;
197742
+ if (/[+=]/.test(trimmed))
197743
+ return true;
197744
+ if (/^\d/.test(trimmed))
197745
+ return true;
197746
+ return false;
197747
+ }
197748
+ function autoCompleteEntities(sections) {
197749
+ const declaredEntities = new Set;
197750
+ for (const section of sections) {
197751
+ for (const para of section.paragraphs) {
197752
+ const entities2 = para.atoms.entities;
197753
+ if (entities2) {
197754
+ for (const e of entities2)
197755
+ declaredEntities.add(e.name);
197756
+ }
197757
+ }
197758
+ }
197759
+ let autoCreated = 0;
197760
+ let skippedNoise = 0;
197761
+ for (const section of sections) {
197762
+ for (const para of section.paragraphs) {
197763
+ const referencedNames = new Set;
197764
+ const relations = para.atoms.relations;
197765
+ if (relations) {
197766
+ for (const r of relations) {
197767
+ referencedNames.add(r.from);
197768
+ referencedNames.add(r.to);
197769
+ }
197770
+ }
197771
+ const boundaries = para.atoms.boundaries;
197772
+ if (boundaries) {
197773
+ for (const b of boundaries) {
197774
+ for (const name21 of b.contains)
197775
+ referencedNames.add(name21);
197776
+ if (b.excludes)
197777
+ for (const name21 of b.excludes)
197778
+ referencedNames.add(name21);
197779
+ }
197780
+ }
197781
+ for (const name21 of referencedNames) {
197782
+ if (!declaredEntities.has(name21)) {
197783
+ if (isNoiseEntityName(name21)) {
197784
+ skippedNoise++;
197785
+ continue;
197786
+ }
197787
+ if (!para.atoms.entities) {
197788
+ para.atoms.entities = [];
197789
+ }
197790
+ para.atoms.entities.push({
197791
+ name: name21,
197792
+ kind: "concept",
197793
+ confidence: 0.6
197794
+ });
197795
+ declaredEntities.add(name21);
197796
+ autoCreated++;
197797
+ }
197798
+ }
197799
+ }
197800
+ }
197801
+ if (autoCreated > 0 || skippedNoise > 0) {
197802
+ console.log(`[docAtomPostProcess] auto-created ${autoCreated} entities, skipped ${skippedNoise} noise names`);
197803
+ }
197804
+ }
197805
+ function normalizeEntityNames(sections) {
197806
+ const allNames = [];
197807
+ for (const section of sections) {
197808
+ for (const para of section.paragraphs) {
197809
+ const entities2 = para.atoms.entities;
197810
+ if (entities2) {
197811
+ for (const e of entities2)
197812
+ allNames.push(e.name);
197813
+ }
197814
+ }
197815
+ }
197816
+ const uniqueNames = [...new Set(allNames)].sort((a, b) => b.length - a.length);
197817
+ const mergeMap = new Map;
197818
+ for (let i = 0;i < uniqueNames.length; i++) {
197819
+ const short = uniqueNames[i];
197820
+ if (short.length < 3)
197821
+ continue;
197822
+ if (mergeMap.has(short))
197823
+ continue;
197824
+ for (let j = 0;j < i; j++) {
197825
+ const long = uniqueNames[j];
197826
+ if (mergeMap.has(long))
197827
+ continue;
197828
+ if (long.includes(short) && long !== short) {
197829
+ mergeMap.set(short, long);
197830
+ break;
197831
+ }
197832
+ }
197833
+ }
197834
+ if (mergeMap.size === 0)
197835
+ return;
197836
+ let normalized = 0;
197837
+ for (const section of sections) {
197838
+ for (const para of section.paragraphs) {
197839
+ const entities2 = para.atoms.entities;
197840
+ if (entities2) {
197841
+ for (const e of entities2) {
197842
+ const canonical = mergeMap.get(e.name);
197843
+ if (canonical) {
197844
+ e.name = canonical;
197845
+ normalized++;
197846
+ }
197847
+ }
197848
+ const seen = new Set;
197849
+ para.atoms.entities = entities2.filter((e) => {
197850
+ if (seen.has(e.name))
197851
+ return false;
197852
+ seen.add(e.name);
197853
+ return true;
197854
+ });
197855
+ }
197856
+ const relations = para.atoms.relations;
197857
+ if (relations) {
197858
+ for (const r of relations) {
197859
+ const fromCanonical = mergeMap.get(r.from);
197860
+ if (fromCanonical) {
197861
+ r.from = fromCanonical;
197862
+ normalized++;
197863
+ }
197864
+ const toCanonical = mergeMap.get(r.to);
197865
+ if (toCanonical) {
197866
+ r.to = toCanonical;
197867
+ normalized++;
197868
+ }
197869
+ }
197870
+ }
197871
+ const boundaries = para.atoms.boundaries;
197872
+ if (boundaries) {
197873
+ for (const b of boundaries) {
197874
+ b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
197875
+ if (b.excludes)
197876
+ b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
197877
+ }
197878
+ }
197879
+ }
197880
+ }
197881
+ if (normalized > 0) {
197882
+ console.log(`[docAtomPostProcess] normalized ${normalized} entity name references (${mergeMap.size} merge rules)`);
197883
+ for (const [short, long] of mergeMap) {
197884
+ console.log(` "${short}" → "${long}"`);
197885
+ }
197886
+ }
197887
+ }
197888
+ function warnCrossRefIssues(sections) {
197889
+ const allEntityNames = new Set;
197890
+ for (const section of sections) {
197891
+ for (const para of section.paragraphs) {
197892
+ const entities2 = para.atoms.entities;
197893
+ if (entities2) {
197894
+ for (const e of entities2)
197895
+ allEntityNames.add(e.name);
197896
+ }
197897
+ }
197898
+ }
197899
+ const allStateValues = new Set;
197900
+ for (const section of sections) {
197901
+ for (const para of section.paragraphs) {
197902
+ const states = para.atoms.states;
197903
+ if (states) {
197904
+ for (const s of states)
197905
+ for (const v of s.values)
197906
+ allStateValues.add(v);
197907
+ }
197908
+ }
197909
+ }
197910
+ let warnings = 0;
197911
+ for (const section of sections) {
197912
+ for (const para of section.paragraphs) {
197913
+ const transitions = para.atoms.transitions;
197914
+ if (transitions) {
197915
+ for (const t4 of transitions) {
197916
+ if (allStateValues.size > 0 && !allStateValues.has(t4.from)) {
197917
+ console.warn(`[docAtomPostProcess] transition.from "${t4.from}" not in declared states`);
197918
+ warnings++;
197919
+ }
197920
+ if (allStateValues.size > 0 && !allStateValues.has(t4.to)) {
197921
+ console.warn(`[docAtomPostProcess] transition.to "${t4.to}" not in declared states`);
197922
+ warnings++;
197923
+ }
197924
+ }
197925
+ }
197926
+ const roles = para.atoms.roles;
197927
+ if (roles) {
197928
+ const allBehaviorNames = new Set;
197929
+ for (const s of sections) {
197930
+ for (const p4 of s.paragraphs) {
197931
+ const behaviors = p4.atoms.behaviors;
197932
+ if (behaviors)
197933
+ for (const b of behaviors)
197934
+ allBehaviorNames.add(b.name);
197935
+ }
197936
+ }
197937
+ for (const role of roles) {
197938
+ if (role.performs) {
197939
+ for (const p4 of role.performs) {
197940
+ if (allBehaviorNames.size > 0 && !allBehaviorNames.has(p4)) {
197941
+ console.warn(`[docAtomPostProcess] role.performs "${p4}" not in declared behaviors`);
197942
+ warnings++;
197943
+ }
197944
+ }
197945
+ }
197946
+ }
197947
+ }
197948
+ }
197949
+ }
197950
+ if (warnings > 0) {
197951
+ console.warn(`[docAtomPostProcess] ${warnings} cross-reference warnings (non-blocking)`);
197952
+ }
197953
+ }
197954
+ function detectNoiseCandidates(entityNames) {
197955
+ const candidates = [];
197956
+ for (const name21 of entityNames) {
197957
+ if (/^[\u4e00-\u9fff]{1,4}$/.test(name21)) {
197958
+ candidates.push(name21);
197959
+ continue;
197960
+ }
197961
+ if (/^[a-zA-Z]{1,5}$/.test(name21) && name21[0] === name21[0].toLowerCase()) {
197962
+ candidates.push(name21);
197963
+ continue;
197964
+ }
197965
+ if (/^Kill\s+\d/.test(name21)) {
197966
+ candidates.push(name21);
197967
+ continue;
197968
+ }
197969
+ }
197970
+ return candidates;
197971
+ }
197972
+ function collectExtractionStats(sections) {
197973
+ const atomTypeCounts = {};
197974
+ const entityNames = new Set;
197975
+ let paragraphsWithAtoms = 0;
197976
+ let paragraphsTotal = 0;
197977
+ for (const section of sections) {
197978
+ for (const para of section.paragraphs) {
197979
+ paragraphsTotal++;
197980
+ let hasAtoms = false;
197981
+ for (const [atomType, atoms2] of Object.entries(para.atoms)) {
197982
+ if (!Array.isArray(atoms2) || atoms2.length === 0)
197983
+ continue;
197984
+ hasAtoms = true;
197985
+ atomTypeCounts[atomType] = (atomTypeCounts[atomType] ?? 0) + atoms2.length;
197986
+ if (atomType === "entities") {
197987
+ for (const e of atoms2)
197988
+ entityNames.add(e.name);
197989
+ }
197990
+ }
197991
+ if (hasAtoms)
197992
+ paragraphsWithAtoms++;
197993
+ }
197994
+ }
197995
+ return {
197996
+ entityCount: atomTypeCounts.entities ?? 0,
197997
+ relationCount: atomTypeCounts.relations ?? 0,
197998
+ atomTypeCounts,
197999
+ uniqueEntityNames: [...entityNames],
198000
+ paragraphsWithAtoms,
198001
+ paragraphsTotal
198002
+ };
198003
+ }
198004
+ function detectResolutionCandidates(entityNames) {
198005
+ const candidates = [];
198006
+ const sorted = [...entityNames].sort((a, b) => b.length - a.length);
198007
+ for (let i = 0;i < sorted.length; i++) {
198008
+ const long = sorted[i];
198009
+ for (let j = i + 1;j < sorted.length; j++) {
198010
+ const short = sorted[j];
198011
+ if (short.length < 3)
198012
+ continue;
198013
+ if (short === long)
198014
+ continue;
198015
+ if (long.includes(short)) {
198016
+ candidates.push({ short, long, reason: "substring match" });
198017
+ continue;
198018
+ }
198019
+ if (long.toLowerCase() === short.toLowerCase()) {
198020
+ candidates.push({ short, long, reason: "case-insensitive match" });
198021
+ continue;
198022
+ }
198023
+ if (long.toLowerCase().includes(short.toLowerCase()) && short.length >= 4) {
198024
+ candidates.push({ short, long, reason: "case-insensitive substring" });
198025
+ }
198026
+ }
198027
+ }
198028
+ return candidates;
198029
+ }
198030
+ function applyEntityMerges(sections, merges) {
198031
+ if (merges.length === 0)
198032
+ return 0;
198033
+ const mergeMap = new Map;
198034
+ for (const m of merges)
198035
+ mergeMap.set(m.from, m.to);
198036
+ let normalized = 0;
198037
+ for (const section of sections) {
198038
+ for (const para of section.paragraphs) {
198039
+ const entities2 = para.atoms.entities;
198040
+ if (entities2) {
198041
+ for (const e of entities2) {
198042
+ const canonical = mergeMap.get(e.name);
198043
+ if (canonical) {
198044
+ e.name = canonical;
198045
+ normalized++;
198046
+ }
198047
+ }
198048
+ const seen = new Set;
198049
+ para.atoms.entities = entities2.filter((e) => {
198050
+ if (seen.has(e.name))
198051
+ return false;
198052
+ seen.add(e.name);
198053
+ return true;
198054
+ });
198055
+ }
198056
+ const relations = para.atoms.relations;
198057
+ if (relations) {
198058
+ for (const r of relations) {
198059
+ const fromCanonical = mergeMap.get(r.from);
198060
+ if (fromCanonical) {
198061
+ r.from = fromCanonical;
198062
+ normalized++;
198063
+ }
198064
+ const toCanonical = mergeMap.get(r.to);
198065
+ if (toCanonical) {
198066
+ r.to = toCanonical;
198067
+ normalized++;
198068
+ }
198069
+ }
198070
+ }
198071
+ const boundaries = para.atoms.boundaries;
198072
+ if (boundaries) {
198073
+ for (const b of boundaries) {
198074
+ b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
198075
+ if (b.excludes)
198076
+ b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
198077
+ }
198078
+ }
198079
+ }
198080
+ }
198081
+ if (normalized > 0) {
198082
+ console.log(`[docAtomPostProcess] LLM entity resolution: normalized ${normalized} references (${merges.length} merge rules)`);
198083
+ for (const m of merges) {
198084
+ console.log(` "${m.from}" → "${m.to}"`);
198085
+ }
198086
+ }
198087
+ return normalized;
198088
+ }
198089
+ function removeNoiseEntities(sections, names) {
198090
+ if (names.length === 0)
198091
+ return 0;
198092
+ const removeSet = new Set(names);
198093
+ let removed = 0;
198094
+ for (const section of sections) {
198095
+ for (const para of section.paragraphs) {
198096
+ const entities2 = para.atoms.entities;
198097
+ if (entities2) {
198098
+ const before = entities2.length;
198099
+ para.atoms.entities = entities2.filter((e) => !removeSet.has(e.name));
198100
+ removed += before - para.atoms.entities.length;
198101
+ }
198102
+ const relations = para.atoms.relations;
198103
+ if (relations) {
198104
+ para.atoms.relations = relations.filter((r) => !removeSet.has(r.from) || !removeSet.has(r.to));
198105
+ }
198106
+ const boundaries = para.atoms.boundaries;
198107
+ if (boundaries) {
198108
+ for (const b of boundaries) {
198109
+ b.contains = b.contains.filter((n) => !removeSet.has(n));
198110
+ if (b.excludes)
198111
+ b.excludes = b.excludes.filter((n) => !removeSet.has(n));
198112
+ }
198113
+ }
198114
+ }
198115
+ }
198116
+ if (removed > 0) {
198117
+ console.log(`[docAtomPostProcess] removed ${removed} noise entity instances (${names.length} names)`);
198118
+ for (const n of names) {
198119
+ console.log(` ✕ "${n}"`);
198120
+ }
198121
+ }
198122
+ return removed;
198123
+ }
198124
+
198125
+ // ../api/src/services/docIndexer.ts
196502
198126
  var CHUNK_CONCURRENCY = 2;
196503
198127
  var GLEANING_MAX_ROUNDS = 2;
196504
- var EMBEDDING_BATCH_SIZE = 20;
198128
+ var CODE_BLOCK_MIN_LENGTH = 500;
196505
198129
  function injectParagraphTags(chunk, sections) {
196506
198130
  const parts = [];
196507
198131
  if (chunk.breadcrumb.length > 0) {
@@ -196519,7 +198143,11 @@ function injectParagraphTags(chunk, sections) {
196519
198143
  parts.push(`${"#".repeat(section.level)} ${section.heading}`);
196520
198144
  }
196521
198145
  }
196522
- parts.push(`[P${i}] ${p4.text}`);
198146
+ if (p4.text.length >= CODE_BLOCK_MIN_LENGTH && isPureCodeBlock(p4.text)) {
198147
+ parts.push(`[P${i}] ${skeletonizeCodeBlock(p4.text)}`);
198148
+ } else {
198149
+ parts.push(`[P${i}] ${p4.text}`);
198150
+ }
196523
198151
  }
196524
198152
  return parts.join(`
196525
198153
 
@@ -196584,12 +198212,14 @@ Continue the JSON output from the exact point of truncation. Output ONLY the rem
196584
198212
  });
196585
198213
  const combined = trimmed + result.text.trim();
196586
198214
  JSON.parse(jsonrepair(combined));
198215
+ console.log(`[docIndexer] continuation: merged T1 (${trimmed.length} chars) + continuation (${result.text.trim().length} chars) = ${combined.length} chars`);
196587
198216
  return {
196588
198217
  text: combined,
196589
198218
  extraCalls: 1,
196590
198219
  extraTokens: result.usage.totalTokens
196591
198220
  };
196592
- } catch {
198221
+ } catch (contErr) {
198222
+ console.warn(`[docIndexer] continuation: merge failed, returning original (${trimmed.length} chars). ` + `Error: ${contErr instanceof Error ? contErr.message : String(contErr)}`);
196593
198223
  return { text: text2, extraCalls: 1, extraTokens: 0 };
196594
198224
  }
196595
198225
  }
@@ -196630,9 +198260,16 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
196630
198260
  llmCalls += continued.extraCalls;
196631
198261
  totalTokens += continued.extraTokens;
196632
198262
  onStep?.("T1 done", llmCalls, totalTokens);
196633
- const parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
198263
+ let parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
196634
198264
  if (!parseResult.success) {
196635
- throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
198265
+ const preview = continued.text.slice(0, 500).replace(/\n/g, "\\n");
198266
+ console.warn(`[docIndexer] chunk ${chunkIndex} T1 strict parse failed, attempting lenient. ` + `Error: ${parseResult.error.message.slice(0, 200)}. ` + `LLM output preview: ${preview}`);
198267
+ const lenient = tryLenientParse(continued.text, chunkIndex);
198268
+ if (lenient) {
198269
+ parseResult = { success: true, data: lenient };
198270
+ } else {
198271
+ throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
198272
+ }
196636
198273
  }
196637
198274
  try {
196638
198275
  const rawJson = JSON.parse(jsonrepair(continued.text));
@@ -196689,8 +198326,19 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
196689
198326
  chunkText,
196690
198327
  previousResult: parseResult.data
196691
198328
  });
196692
- const finalResult = cumulativeResult;
196693
- return { result: finalResult, llmCalls, totalTokens };
198329
+ const tableResult = await extractTableAtoms(chunk, sections, cumulativeResult, llmService);
198330
+ if (tableResult.extracted > 0) {
198331
+ onStep?.(`table extraction (${tableResult.extracted} tables)`, llmCalls + tableResult.llmCalls, totalTokens + tableResult.totalTokens);
198332
+ }
198333
+ llmCalls += tableResult.llmCalls;
198334
+ totalTokens += tableResult.totalTokens;
198335
+ const diagramResult = await extractDiagramAtoms(chunk, sections, cumulativeResult, llmService);
198336
+ if (diagramResult.extracted > 0) {
198337
+ onStep?.(`diagram extraction (${diagramResult.extracted} diagrams)`, llmCalls + diagramResult.llmCalls, totalTokens + diagramResult.totalTokens);
198338
+ }
198339
+ llmCalls += diagramResult.llmCalls;
198340
+ totalTokens += diagramResult.totalTokens;
198341
+ return { result: cumulativeResult, llmCalls, totalTokens };
196694
198342
  }
196695
198343
  function mapChunkResultToSections(chunk, chunkResult, sections) {
196696
198344
  for (const p4 of chunkResult.paragraphs) {
@@ -196735,6 +198383,123 @@ function mapChunkResultToSections(chunk, chunkResult, sections) {
196735
198383
  }
196736
198384
  }
196737
198385
  }
198386
+ var ATOM_TYPE_KEYS = new Set([
198387
+ "entities",
198388
+ "relations",
198389
+ "behaviors",
198390
+ "attributes",
198391
+ "states",
198392
+ "rules",
198393
+ "transitions",
198394
+ "events",
198395
+ "decisions",
198396
+ "metrics",
198397
+ "roles",
198398
+ "constraints",
198399
+ "comparisons",
198400
+ "boundaries"
198401
+ ]);
198402
+ function looksLikeAtoms(obj) {
198403
+ return Object.keys(obj).some((k) => ATOM_TYPE_KEYS.has(k) && Array.isArray(obj[k]));
198404
+ }
198405
+ var ATOM_REQUIRED_FIELDS = {
198406
+ entities: ["name"],
198407
+ relations: ["from", "to", "type"],
198408
+ behaviors: ["name"],
198409
+ attributes: ["name"],
198410
+ states: ["name"],
198411
+ rules: ["description"],
198412
+ transitions: ["from", "to"],
198413
+ events: ["name"],
198414
+ decisions: ["description"],
198415
+ metrics: ["name"],
198416
+ roles: ["name"],
198417
+ constraints: ["description"],
198418
+ comparisons: ["description"],
198419
+ boundaries: ["name"]
198420
+ };
198421
+ var PARAGRAPH_TAG_RE2 = /^P\d+$/;
198422
+ function tryLenientParse(rawText, chunkIndex) {
198423
+ try {
198424
+ let raw5 = JSON.parse(jsonrepair(rawText));
198425
+ if (Array.isArray(raw5)) {
198426
+ raw5 = { paragraphs: raw5 };
198427
+ }
198428
+ if (raw5 && typeof raw5 === "object" && !Array.isArray(raw5) && !raw5.paragraphs) {
198429
+ const keys = Object.keys(raw5);
198430
+ if (keys.length > 0 && keys.every((k) => PARAGRAPH_TAG_RE2.test(k))) {
198431
+ raw5 = {
198432
+ paragraphs: keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: raw5[tag2] }))
198433
+ };
198434
+ }
198435
+ }
198436
+ if (!Array.isArray(raw5?.paragraphs) && raw5 && typeof raw5 === "object" && looksLikeAtoms(raw5)) {
198437
+ raw5 = { paragraphs: [{ tag: "P0", atoms: raw5 }] };
198438
+ }
198439
+ if (!raw5 || !Array.isArray(raw5.paragraphs))
198440
+ return null;
198441
+ const salvaged = { paragraphs: [] };
198442
+ let droppedAtoms = 0;
198443
+ let fixedTags = 0;
198444
+ for (let idx = 0;idx < raw5.paragraphs.length; idx++) {
198445
+ const rawPara = raw5.paragraphs[idx];
198446
+ if (!rawPara || typeof rawPara !== "object")
198447
+ continue;
198448
+ let tag2 = rawPara.tag;
198449
+ if (!tag2 || typeof tag2 !== "string" || !/^P\d+$/.test(tag2)) {
198450
+ tag2 = `P${idx}`;
198451
+ fixedTags++;
198452
+ }
198453
+ let atomsObj;
198454
+ if (rawPara.atoms && typeof rawPara.atoms === "object") {
198455
+ atomsObj = rawPara.atoms;
198456
+ } else if (looksLikeAtoms(rawPara)) {
198457
+ atomsObj = rawPara;
198458
+ } else {
198459
+ continue;
198460
+ }
198461
+ const cleanAtoms = {};
198462
+ for (const [atomType, atoms2] of Object.entries(atomsObj)) {
198463
+ if (!ATOM_TYPE_KEYS.has(atomType) || !Array.isArray(atoms2))
198464
+ continue;
198465
+ const requiredFields = ATOM_REQUIRED_FIELDS[atomType] ?? [];
198466
+ const kept = [];
198467
+ for (const atom of atoms2) {
198468
+ if (!atom || typeof atom !== "object") {
198469
+ droppedAtoms++;
198470
+ continue;
198471
+ }
198472
+ const rec = atom;
198473
+ const hasRequired = requiredFields.every((f) => rec[f] != null && rec[f] !== "");
198474
+ if (hasRequired) {
198475
+ kept.push(atom);
198476
+ } else {
198477
+ droppedAtoms++;
198478
+ }
198479
+ }
198480
+ if (kept.length > 0)
198481
+ cleanAtoms[atomType] = kept;
198482
+ }
198483
+ salvaged.paragraphs.push({ tag: tag2, atoms: cleanAtoms });
198484
+ }
198485
+ if (salvaged.paragraphs.length === 0)
198486
+ return null;
198487
+ const result = docChunkResultSchema.safeParse(salvaged);
198488
+ if (!result.success)
198489
+ return null;
198490
+ const fixes = [];
198491
+ if (fixedTags > 0)
198492
+ fixes.push(`${fixedTags} tags auto-assigned`);
198493
+ if (droppedAtoms > 0)
198494
+ fixes.push(`${droppedAtoms} invalid atoms dropped`);
198495
+ if (fixes.length > 0) {
198496
+ console.warn(`[docIndexer] chunk ${chunkIndex}: lenient parse salvaged — ${fixes.join(", ")}`);
198497
+ }
198498
+ return result.data;
198499
+ } catch {
198500
+ return null;
198501
+ }
198502
+ }
196738
198503
  function ensureAtomConfidence(atoms2) {
196739
198504
  const DEFAULT_DOC_CONFIDENCE = 0.7;
196740
198505
  for (const atomList of Object.values(atoms2)) {
@@ -196758,90 +198523,62 @@ function countAtoms(sections) {
196758
198523
  }
196759
198524
  return counts;
196760
198525
  }
196761
- async function generateEmbeddings(digest, embeddingService, onProgress) {
196762
- const paragraphs = [];
196763
- for (let sIdx = 0;sIdx < digest.sections.length; sIdx++) {
196764
- const section = digest.sections[sIdx];
196765
- for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
196766
- paragraphs.push({
196767
- sectionIndex: sIdx,
196768
- paragraphIndex: pIdx,
196769
- text: section.paragraphs[pIdx].text
196770
- });
196771
- }
196772
- }
196773
- if (paragraphs.length === 0)
196774
- return 0;
196775
- const embeddings = [];
196776
- const totalParagraphs = paragraphs.length;
196777
- onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
196778
- const warmupStart = Date.now();
196779
- await embeddingService.getDimension();
196780
- const warmupMs = Date.now() - warmupStart;
196781
- if (warmupMs > 500) {
196782
- onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
198526
+ function formatExtractionStats(stats) {
198527
+ const typeSummary = Object.entries(stats.atomTypeCounts).sort(([, a], [, b]) => b - a).map(([t4, c]) => `${t4}:${c}`).join(" ");
198528
+ return `${stats.uniqueEntityNames.length} entities, ${stats.relationCount} relations, ` + `${stats.paragraphsWithAtoms}/${stats.paragraphsTotal} paragraphs with atoms | ${typeSummary}`;
198529
+ }
198530
+ async function runEntityResolution(sections, entityNames, llmService, onProgress) {
198531
+ const candidates = detectResolutionCandidates(entityNames);
198532
+ const noiseCandidates = detectNoiseCandidates(entityNames);
198533
+ if (candidates.length === 0 && noiseCandidates.length === 0) {
198534
+ console.log("[docIndexer] entity resolution: no duplicates or noise candidates, skipping");
198535
+ onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.no_resolution" } });
198536
+ return { llmCalls: 0, totalTokens: 0 };
196783
198537
  }
196784
- const totalBatches = Math.ceil(totalParagraphs / EMBEDDING_BATCH_SIZE);
196785
- for (let i = 0;i < totalParagraphs; i += EMBEDDING_BATCH_SIZE) {
196786
- const batchIndex = Math.floor(i / EMBEDDING_BATCH_SIZE) + 1;
196787
- const batch2 = paragraphs.slice(i, i + EMBEDDING_BATCH_SIZE);
196788
- const texts = batch2.map((p4) => p4.text);
196789
- const batchStart = Date.now();
198538
+ console.log(`[docIndexer] entity resolution: ${candidates.length} duplicate pairs, ${noiseCandidates.length} noise candidates`);
198539
+ onProgress?.({
198540
+ phase: "post-processing",
198541
+ progress: 83,
198542
+ message: { key: "index.doc.msg.resolving", params: { duplicates: candidates.length, noise: noiseCandidates.length } }
198543
+ });
198544
+ try {
198545
+ const prompt = buildEntityResolutionPrompt({
198546
+ allNames: entityNames,
198547
+ candidates,
198548
+ ...noiseCandidates.length > 0 ? { noiseCandidates } : {}
198549
+ });
198550
+ const result = await llmService.generateText(prompt, {
198551
+ systemPrompt: ENTITY_RESOLUTION_SYSTEM_PROMPT
198552
+ });
198553
+ let resolution;
196790
198554
  try {
196791
- const vectors = await embeddingService.embedBatch(texts);
196792
- for (let j = 0;j < batch2.length; j++) {
196793
- embeddings.push({
196794
- sectionIndex: batch2[j].sectionIndex,
196795
- paragraphIndex: batch2[j].paragraphIndex,
196796
- vector: vectors[j]
196797
- });
196798
- }
198555
+ resolution = JSON.parse(jsonrepair(result.text));
196799
198556
  } catch {
196800
- for (let fi = 0;fi < batch2.length; fi++) {
196801
- const p4 = batch2[fi];
196802
- try {
196803
- const vector = await embeddingService.embed(p4.text);
196804
- embeddings.push({
196805
- sectionIndex: p4.sectionIndex,
196806
- paragraphIndex: p4.paragraphIndex,
196807
- vector
196808
- });
196809
- } catch {
196810
- console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`);
196811
- }
196812
- const embedded2 = i + fi + 1;
196813
- const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
196814
- onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
196815
- }
196816
- continue;
198557
+ console.warn("[docIndexer] entity resolution: failed to parse LLM response, skipping");
198558
+ onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_parse_failed" } });
198559
+ return { llmCalls: 1, totalTokens: result.usage.totalTokens };
196817
198560
  }
196818
- const embedded = Math.min(i + EMBEDDING_BATCH_SIZE, totalParagraphs);
196819
- const batchMs = Date.now() - batchStart;
196820
- const progress = 86 + Math.round(embedded / totalParagraphs * 9);
196821
- onProgress?.({ phase: "embedding", progress, message: `Batch ${batchIndex}/${totalBatches} (${embedded}/${totalParagraphs}, ${(batchMs / 1000).toFixed(1)}s)` });
196822
- }
196823
- digest.embeddings = embeddings;
196824
- return embeddings.length;
196825
- }
196826
- async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
196827
- if (digest.embeddings.length === 0)
196828
- return;
196829
- try {
196830
- await vectorStore.deleteByPrefix(`${hashId}:`);
196831
- await vectorStore.add(digest.embeddings.map((e) => ({
196832
- id: `${hashId}:${e.sectionIndex}:${e.paragraphIndex}`,
196833
- embedding: e.vector,
196834
- metadata: {
196835
- layer: "digest",
196836
- sourceId,
196837
- hashId,
196838
- sourcePath,
196839
- sectionIndex: e.sectionIndex,
196840
- paragraphIndex: e.paragraphIndex
198561
+ const mergeCount = applyEntityMerges(sections, resolution.merges ?? []);
198562
+ const removeCount = removeNoiseEntities(sections, resolution.remove ?? []);
198563
+ onProgress?.({
198564
+ phase: "post-processing",
198565
+ progress: 84,
198566
+ message: {
198567
+ key: "index.doc.msg.resolution_result",
198568
+ params: {
198569
+ merges: resolution.merges?.length ?? 0,
198570
+ mergeRefs: mergeCount,
198571
+ removed: resolution.remove?.length ?? 0,
198572
+ removeRefs: removeCount,
198573
+ ambiguous: resolution.ambiguous?.length ?? 0
198574
+ }
196841
198575
  }
196842
- })));
198576
+ });
198577
+ return { llmCalls: 1, totalTokens: result.usage.totalTokens };
196843
198578
  } catch (err2) {
196844
- console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
198579
+ console.warn("[docIndexer] entity resolution LLM call failed (non-blocking):", err2);
198580
+ onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_failed" } });
198581
+ return { llmCalls: 0, totalTokens: 0 };
196845
198582
  }
196846
198583
  }
196847
198584
  async function indexDocument(input) {
@@ -196854,16 +198591,22 @@ async function indexDocument(input) {
196854
198591
  digestStore: digestStore2,
196855
198592
  onProgress
196856
198593
  } = input;
198594
+ const { content: normalizedContent, stats: normalizeStats } = normalizeMarkdown(content);
198595
+ const repairCount = Object.values(normalizeStats.repairs).reduce((a, b) => a + b, 0);
198596
+ if (repairCount > 0) {
198597
+ const repairSummary = Object.entries(normalizeStats.repairs).map(([k, v]) => `${k}:${v}`).join(" ");
198598
+ console.log(`[docIndexer] markdown normalized: ${repairCount} repairs (${repairSummary})`);
198599
+ }
196857
198600
  onProgress?.({ phase: "chunking", progress: 3 });
196858
- const chunks = chunkMarkdown(content);
196859
- const parsedSections = parseSections(content);
198601
+ const chunks = chunkMarkdown(normalizedContent);
198602
+ const parsedSections = parseSections(normalizedContent);
196860
198603
  if (chunks.length === 0) {
196861
198604
  throw new Error("Document produced no chunks — content may be empty");
196862
198605
  }
196863
- onProgress?.({ phase: "chunking", progress: 8, message: `${chunks.length} chunks, ${parsedSections.length} sections` });
198606
+ onProgress?.({ phase: "chunking", progress: 8, message: { key: "index.doc.msg.chunking_result", params: { chunks: chunks.length, sections: parsedSections.length } } });
196864
198607
  const totalChunks = chunks.length;
196865
- const annotateMsg = input.llmModel ? `${totalChunks} chunks, LLM ${input.llmModel}` : `${totalChunks} chunks`;
196866
- onProgress?.({ phase: "annotating", progress: 10, message: annotateMsg });
198608
+ const annotateStartMsg = input.llmModel ? { key: "index.doc.msg.annotating_start_model", params: { n: totalChunks, model: input.llmModel } } : { key: "index.doc.msg.annotating_start", params: { n: totalChunks } };
198609
+ onProgress?.({ phase: "annotating", progress: 10, message: annotateStartMsg });
196867
198610
  let completedChunks = 0;
196868
198611
  let totalLlmCalls = 0;
196869
198612
  let totalTokens = 0;
@@ -196873,7 +198616,7 @@ async function indexDocument(input) {
196873
198616
  onProgress?.({
196874
198617
  phase: "annotating",
196875
198618
  progress: baseProgress,
196876
- message: `Chunk ${completedChunks + 1}/${totalChunks} ${step} (${calls} calls, ${tokens} tokens)`
198619
+ message: { key: "index.doc.msg.annotating_chunk", params: { current: completedChunks + 1, total: totalChunks, step, calls, tokens } }
196877
198620
  });
196878
198621
  });
196879
198622
  completedChunks++;
@@ -196883,7 +198626,7 @@ async function indexDocument(input) {
196883
198626
  onProgress?.({
196884
198627
  phase: "annotating",
196885
198628
  progress,
196886
- message: `Chunk ${completedChunks}/${totalChunks} done, total ${totalLlmCalls} calls ${totalTokens} tokens`
198629
+ message: { key: "index.doc.msg.annotating_chunk_done", params: { current: completedChunks, total: totalChunks, calls: totalLlmCalls, tokens: totalTokens } }
196887
198630
  });
196888
198631
  return result;
196889
198632
  });
@@ -196894,17 +198637,25 @@ async function indexDocument(input) {
196894
198637
  const sectionsMap = new Map;
196895
198638
  for (let i = 0;i < parsedSections.length; i++) {
196896
198639
  const s = parsedSections[i];
196897
- const sectionKey = `${i}`;
196898
- sectionsMap.set(sectionKey, {
198640
+ sectionsMap.set(`${i}`, {
196899
198641
  heading: s.heading,
196900
198642
  level: s.level,
196901
198643
  paragraphs: new Map
196902
198644
  });
196903
- for (let pIdx = 0;pIdx < s.paragraphs.length; pIdx++) {
196904
- sectionsMap.get(sectionKey).paragraphs.set(`${i}:${pIdx}`, {
196905
- text: s.paragraphs[pIdx],
196906
- atoms: {}
196907
- });
198645
+ }
198646
+ for (const chunk of chunks) {
198647
+ for (const cp of chunk.paragraphs) {
198648
+ const sectionKey = `${cp.sectionIndex}`;
198649
+ if (!sectionsMap.has(sectionKey)) {
198650
+ sectionsMap.set(sectionKey, { heading: "", level: 0, paragraphs: new Map });
198651
+ }
198652
+ const paragraphKey = `${cp.sectionIndex}:${cp.paragraphIndex}`;
198653
+ if (!sectionsMap.get(sectionKey).paragraphs.has(paragraphKey)) {
198654
+ sectionsMap.get(sectionKey).paragraphs.set(paragraphKey, {
198655
+ text: cp.text,
198656
+ atoms: {}
198657
+ });
198658
+ }
196908
198659
  }
196909
198660
  }
196910
198661
  for (const success2 of chunkProcessResult.successes) {
@@ -196934,6 +198685,25 @@ async function indexDocument(input) {
196934
198685
  ensureAtomConfidence(para.atoms);
196935
198686
  }
196936
198687
  }
198688
+ onProgress?.({ phase: "post-processing", progress: 81, message: { key: "index.doc.msg.post_process_start" } });
198689
+ postProcessDigestAtoms(digestSections);
198690
+ const preStats = collectExtractionStats(digestSections);
198691
+ const statsMsg = formatExtractionStats(preStats);
198692
+ console.log(`[docIndexer] extraction stats: ${statsMsg}`);
198693
+ onProgress?.({ phase: "post-processing", progress: 82, message: {
198694
+ key: "index.doc.msg.extraction_stats",
198695
+ params: {
198696
+ entities: preStats.uniqueEntityNames.length,
198697
+ relations: preStats.relationCount,
198698
+ withAtoms: preStats.paragraphsWithAtoms,
198699
+ totalParas: preStats.paragraphsTotal
198700
+ }
198701
+ } });
198702
+ if ((input.enableEntityResolution ?? true) && preStats.uniqueEntityNames.length > 1) {
198703
+ const resolutionResult = await runEntityResolution(digestSections, preStats.uniqueEntityNames, llmService, onProgress);
198704
+ totalLlmCalls += resolutionResult.llmCalls;
198705
+ totalTokens += resolutionResult.totalTokens;
198706
+ }
196937
198707
  const atomCounts = countAtoms(sectionsMap);
196938
198708
  const paragraphCount = digestSections.reduce((sum, s) => sum + s.paragraphs.length, 0);
196939
198709
  if (paragraphCount === 0) {
@@ -196955,7 +198725,7 @@ async function indexDocument(input) {
196955
198725
  processedAt: new Date().toISOString()
196956
198726
  }
196957
198727
  };
196958
- const embedMsg = input.embeddingModel ? `Embedding ${input.embeddingModel}` : undefined;
198728
+ const embedMsg = input.embeddingModel ? { key: "index.doc.msg.embedding_model", params: { model: input.embeddingModel } } : undefined;
196959
198729
  onProgress?.({ phase: "embedding", progress: 85, ...embedMsg ? { message: embedMsg } : {} });
196960
198730
  let embeddingCount = 0;
196961
198731
  if (input.embeddingService) {
@@ -197250,45 +199020,55 @@ async function runDocIndexPipeline(opts) {
197250
199020
  const llmModelId = serverConfig2.llm[llmProvider]?.default_model ?? llmProvider;
197251
199021
  const embProvider = serverConfig2.embedding?.provider;
197252
199022
  const embModelId = embProvider ? serverConfig2.embedding[embProvider]?.model_id ?? embProvider : undefined;
199023
+ const fileTimeoutMs = serverConfig2.indexing?.file_timeout_ms ?? 15 * 60 * 1000;
199024
+ const abortSignal = indexTaskManager.getAbortSignal?.(sourceId) ?? null;
197253
199025
  for (let fileIdx = 0;fileIdx < filesToIndex.length; fileIdx++) {
199026
+ if (abortSignal?.aborted) {
199027
+ const reason = typeof abortSignal.reason === "string" ? abortSignal.reason : "Task aborted";
199028
+ console.warn(`[runDocIndexPipeline] aborted before file ${fileIdx + 1}/${filesToIndex.length}: ${reason}`);
199029
+ break;
199030
+ }
197254
199031
  const file2 = filesToIndex[fileIdx];
197255
- const fileLabel = `[${fileIdx + 1}/${filesToIndex.length}] ${file2.sourcePath}`;
197256
199032
  if (indexTaskManager.hasTask(sourceId)) {
197257
199033
  indexTaskManager.updateProgress(sourceId, {
197258
- stage: "annotating",
199034
+ stage: "chunking",
197259
199035
  percent: 0,
197260
- message: `${fileLabel} 开始索引`
199036
+ message: { key: "index.doc.msg.file_start", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
197261
199037
  });
197262
199038
  }
197263
199039
  try {
197264
- await indexDocument({
197265
- sourceId,
197266
- hashId: file2.hashId,
197267
- sourcePath: file2.sourcePath,
197268
- content: file2.content,
197269
- contentType: "markdown",
197270
- llmService,
197271
- embeddingService,
197272
- vectorStore,
197273
- digestStore: digestStore2,
197274
- llmModel: `${llmProvider}/${llmModelId}`,
197275
- ...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
197276
- onProgress: (p4) => {
197277
- if (indexTaskManager.hasTask(sourceId)) {
197278
- indexTaskManager.updateProgress(sourceId, {
197279
- stage: p4.phase,
197280
- percent: p4.progress,
197281
- ...p4.message != null ? { message: `[${fileIdx + 1}/${filesToIndex.length}] ${p4.message}` } : {}
197282
- });
199040
+ const fileTimeout = new Promise((_, reject) => setTimeout(() => reject(new Error(`File timeout after ${Math.round(fileTimeoutMs / 60000)}min: ${file2.sourcePath}`)), fileTimeoutMs));
199041
+ await Promise.race([
199042
+ indexDocument({
199043
+ sourceId,
199044
+ hashId: file2.hashId,
199045
+ sourcePath: file2.sourcePath,
199046
+ content: file2.content,
199047
+ contentType: "markdown",
199048
+ llmService,
199049
+ embeddingService,
199050
+ vectorStore,
199051
+ digestStore: digestStore2,
199052
+ llmModel: `${llmProvider}/${llmModelId}`,
199053
+ ...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
199054
+ onProgress: (p4) => {
199055
+ if (indexTaskManager.hasTask(sourceId)) {
199056
+ indexTaskManager.updateProgress(sourceId, {
199057
+ stage: p4.phase,
199058
+ percent: p4.progress,
199059
+ ...p4.message != null ? { message: p4.message } : {}
199060
+ });
199061
+ }
197283
199062
  }
197284
- }
197285
- });
199063
+ }),
199064
+ fileTimeout
199065
+ ]);
197286
199066
  stored.push({ hash_id: file2.hashId, status: "created" });
197287
199067
  if (indexTaskManager.hasTask(sourceId)) {
197288
199068
  indexTaskManager.updateProgress(sourceId, {
197289
199069
  stage: "storing",
197290
199070
  percent: 100,
197291
- message: `${fileLabel} 索引完成`
199071
+ message: { key: "index.doc.msg.file_done", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
197292
199072
  });
197293
199073
  }
197294
199074
  } catch (err2) {
@@ -197299,11 +199079,15 @@ async function runDocIndexPipeline(opts) {
197299
199079
  indexTaskManager.updateProgress(sourceId, {
197300
199080
  stage: "annotating",
197301
199081
  percent: 0,
197302
- message: `${fileLabel} 索引失败: ${msg}`
199082
+ message: { key: "index.doc.msg.file_error", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath, error: msg } }
197303
199083
  });
197304
199084
  }
197305
199085
  }
197306
199086
  }
199087
+ if (abortSignal?.aborted) {
199088
+ console.warn(`[runDocIndexPipeline] pipeline aborted for ${sourceId}, skipping completion`);
199089
+ return;
199090
+ }
197307
199091
  if (stored.length === 0 && errors5.length > 0) {
197308
199092
  const errorCode = errors5[0].code ?? "DOC_INDEX_LLM_EXHAUSTED" /* DOC_INDEX_LLM_EXHAUSTED */;
197309
199093
  indexTaskManager.failTask(sourceId, errors5[0].error, errorCode);
@@ -197417,7 +199201,7 @@ async function handleDocIndex(c, storageProvider, source2) {
197417
199201
  throw new C4AError("DOC_INDEX_EMBEDDING_UNAVAILABLE" /* DOC_INDEX_EMBEDDING_UNAVAILABLE */, "Embedding service not configured", null);
197418
199202
  }
197419
199203
  const modulePaths = modules?.map((m) => m.path);
197420
- indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths);
199204
+ indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths, serverConfig2.indexing?.task_timeout_ms);
197421
199205
  const hashToPath = new Map;
197422
199206
  for (const sf of latestByPath.values()) {
197423
199207
  hashToPath.set(sf.hash_id, sf.source_path ?? "");
@@ -199261,6 +201045,10 @@ function mergeServerConfig2(parsed) {
199261
201045
  ...isPlainObject5(input.llm?.google) ? input.llm?.google : {}
199262
201046
  }
199263
201047
  },
201048
+ indexing: {
201049
+ ...defaults2.indexing,
201050
+ ...isPlainObject5(input.indexing) ? input.indexing : {}
201051
+ },
199264
201052
  embedding: {
199265
201053
  ...defaults2.embedding,
199266
201054
  ...isPlainObject5(input.embedding) ? input.embedding : {},
@@ -199935,7 +201723,8 @@ import path9 from "node:path";
199935
201723
  import { fileURLToPath } from "node:url";
199936
201724
 
199937
201725
  // ../server/src/indexTaskManager.ts
199938
- var DEFAULT_INDEX_TASK_TIMEOUT_MS = 20 * 60 * 1000;
201726
+ var DEFAULT_INDEX_TASK_TIMEOUT_MS = 150 * 60 * 1000;
201727
+ var DEFAULT_FILE_TIMEOUT_MS = 15 * 60 * 1000;
199939
201728
 
199940
201729
  class IndexTaskManager {
199941
201730
  broadcaster;
@@ -199955,12 +201744,18 @@ class IndexTaskManager {
199955
201744
  getTask(sourceId) {
199956
201745
  return this.indexTasks.get(sourceId) ?? null;
199957
201746
  }
199958
- createTask(sourceId, machineId, targetCommit, modules) {
201747
+ getAbortSignal(sourceId) {
201748
+ return this.indexTasks.get(sourceId)?.abortController.signal ?? null;
201749
+ }
201750
+ createTask(sourceId, machineId, targetCommit, modules, timeoutMs) {
199959
201751
  const existing = this.indexTasks.get(sourceId);
199960
201752
  if (existing) {
199961
201753
  clearTimeout(existing.timer);
201754
+ existing.abortController.abort("Task replaced by new task");
199962
201755
  this.indexTasks.delete(sourceId);
199963
201756
  }
201757
+ const abortController = new AbortController;
201758
+ const effectiveTimeout = timeoutMs ?? this.timeoutMs;
199964
201759
  const task = {
199965
201760
  sourceId,
199966
201761
  machineId,
@@ -199968,8 +201763,10 @@ class IndexTaskManager {
199968
201763
  startedAt: new Date,
199969
201764
  timer: setTimeout(() => {
199970
201765
  this.timeoutTask(sourceId);
199971
- }, this.timeoutMs),
201766
+ }, effectiveTimeout),
201767
+ timeoutMs: effectiveTimeout,
199972
201768
  progress: null,
201769
+ abortController,
199973
201770
  ...modules && modules.length > 0 ? { modules } : {}
199974
201771
  };
199975
201772
  this.indexTasks.set(sourceId, task);
@@ -200007,7 +201804,7 @@ class IndexTaskManager {
200007
201804
  clearTimeout(task.timer);
200008
201805
  task.timer = setTimeout(() => {
200009
201806
  this.timeoutTask(sourceId);
200010
- }, this.timeoutMs);
201807
+ }, task.timeoutMs);
200011
201808
  nextPhase();
200012
201809
  return;
200013
201810
  }
@@ -200026,6 +201823,7 @@ class IndexTaskManager {
200026
201823
  return;
200027
201824
  this.pendingPhases.delete(sourceId);
200028
201825
  clearTimeout(task.timer);
201826
+ task.abortController.abort(error40);
200029
201827
  this.indexTasks.delete(sourceId);
200030
201828
  this.broadcaster.error({
200031
201829
  source_id: sourceId,
@@ -200040,6 +201838,7 @@ class IndexTaskManager {
200040
201838
  return;
200041
201839
  this.pendingPhases.delete(sourceId);
200042
201840
  clearTimeout(task.timer);
201841
+ task.abortController.abort("Task timed out");
200043
201842
  this.indexTasks.delete(sourceId);
200044
201843
  this.broadcaster.timeout({
200045
201844
  source_id: sourceId,
@@ -200055,6 +201854,7 @@ class IndexTaskManager {
200055
201854
  destroy() {
200056
201855
  for (const task of this.indexTasks.values()) {
200057
201856
  clearTimeout(task.timer);
201857
+ task.abortController.abort("Manager destroyed");
200058
201858
  }
200059
201859
  this.indexTasks.clear();
200060
201860
  this.pendingPhases.clear();