@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -5
- package/index.js +954 -85
- package/package.json +1 -1
- package/serve.js +2016 -216
- package/web/assets/ContentDetail--oZBzWh0.js +1 -0
- package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
- package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
- package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
- package/web/assets/ContentDetail-D-2xyerw.js +1 -0
- package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
- package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
- package/web/assets/ContentDetail-y0yi2qln.js +1 -0
- package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
- package/web/assets/EntityDetail-BI3etmj4.js +1 -0
- package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
- package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
- package/web/assets/EntityDetail-DiJPemDY.js +1 -0
- package/web/assets/EntityDetail-DihnDvhA.js +1 -0
- package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
- package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
- package/web/assets/RelationDetail-B2gHrceI.js +1 -0
- package/web/assets/RelationDetail-CEq9vopD.js +1 -0
- package/web/assets/RelationDetail-CaYrspaS.js +1 -0
- package/web/assets/RelationDetail-CpoGdy25.js +1 -0
- package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
- package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
- package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
- package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
- package/web/assets/index-BPMqeFze.js +111 -0
- package/web/assets/index-BgRuvBL5.js +111 -0
- package/web/assets/index-CcrkBEZl.js +111 -0
- package/web/assets/index-DGDx8sCs.js +111 -0
- package/web/assets/index-DIyAwnqE.js +111 -0
- package/web/assets/index-DW1cCA8v.js +111 -0
- package/web/assets/index-DiAYi5t8.css +1 -0
- package/web/assets/index-FOCWvgW_.css +1 -0
- package/web/assets/index-daOjyLzy.css +1 -0
- package/web/assets/index-moF8uSEi.js +111 -0
- package/web/assets/index-sPNyENFN.js +111 -0
- package/web/assets/index-uGqDxUnx.css +1 -0
- package/web/index.html +2 -2
package/serve.js
CHANGED
|
@@ -281,6 +281,10 @@ var init_serverConfig = __esm(() => {
|
|
|
281
281
|
default_model: "gemini-3-pro-preview"
|
|
282
282
|
}
|
|
283
283
|
},
|
|
284
|
+
indexing: {
|
|
285
|
+
task_timeout_ms: 150 * 60 * 1000,
|
|
286
|
+
file_timeout_ms: 15 * 60 * 1000
|
|
287
|
+
},
|
|
284
288
|
embedding: {
|
|
285
289
|
provider: "huggingface",
|
|
286
290
|
huggingface: {
|
|
@@ -4340,7 +4344,7 @@ var init_atomsSchema = __esm(() => {
|
|
|
4340
4344
|
init_zod();
|
|
4341
4345
|
init_base();
|
|
4342
4346
|
init_baseSchema();
|
|
4343
|
-
confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
|
|
4347
|
+
confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
|
|
4344
4348
|
entityAtomSchema = exports_external.object({
|
|
4345
4349
|
name: exports_external.string(),
|
|
4346
4350
|
kind: kindSchema.optional().catch(undefined),
|
|
@@ -186001,6 +186005,10 @@ function mergeServerConfig(parsed) {
|
|
|
186001
186005
|
...isPlainObject3(input.llm?.google) ? input.llm?.google : {}
|
|
186002
186006
|
}
|
|
186003
186007
|
},
|
|
186008
|
+
indexing: {
|
|
186009
|
+
...defaults2.indexing,
|
|
186010
|
+
...isPlainObject3(input.indexing) ? input.indexing : {}
|
|
186011
|
+
},
|
|
186004
186012
|
embedding: {
|
|
186005
186013
|
...defaults2.embedding,
|
|
186006
186014
|
...isPlainObject3(input.embedding) ? input.embedding : {},
|
|
@@ -194956,14 +194964,21 @@ function isRetryableStatus(status) {
|
|
|
194956
194964
|
/**
 * Reports whether an HTTP status code indicates an authentication or
 * authorization failure.
 *
 * @param {number} status - HTTP status code taken from a failed request.
 * @returns {boolean} `true` only when `status` is exactly 401 or 403.
 */
function isAuthStatus(status) {
  switch (status) {
    case 401:
    case 403:
      return true;
    default:
      return false;
  }
}
|
|
194959
|
-
function
|
|
194960
|
-
|
|
194967
|
+
/**
 * Wraps a failed LLM call in a C4AError and throws it; this function never
 * returns normally.
 *
 * The error code is "LLM_AUTH_FAILED" when `isAuthStatus(status)` is true and
 * "LLM_CALL_FAILED" otherwise. When `status` is truthy it is appended to the
 * message as ` [HTTP <status>]`.
 *
 * @param {unknown} error40 - The original failure, rendered via `toErrorMessage`.
 * @param {number} [status] - HTTP status associated with the failure, if any.
 * @throws {C4AError} Always.
 */
function throwLlmError(error40, status) {
  const reason = toErrorMessage(error40);
  let httpSuffix = "";
  if (status) {
    httpSuffix = ` [HTTP ${status}]`;
  }
  const authFailed = isAuthStatus(status);
  const code = authFailed ? "LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */ : "LLM_CALL_FAILED" /* LLM_CALL_FAILED */;
  const message = authFailed ? `LLM 认证失败${httpSuffix}: ${reason}` : `LLM 调用失败${httpSuffix}: ${reason}`;
  throw new C4AError(code, message, reason);
}
|
|
194962
194975
|
|
|
194963
194976
|
class LlmServiceImpl {
|
|
194964
194977
|
options;
|
|
194978
|
+
supportsTemperature;
|
|
194965
194979
|
constructor(options) {
|
|
194966
194980
|
this.options = options;
|
|
194981
|
+
this.supportsTemperature = options.provider !== "openai";
|
|
194967
194982
|
}
|
|
194968
194983
|
async generateText(prompt, options) {
|
|
194969
194984
|
if (this.options.forceStream) {
|
|
@@ -194975,7 +194990,7 @@ class LlmServiceImpl {
|
|
|
194975
194990
|
model: this.options.languageModel,
|
|
194976
194991
|
prompt,
|
|
194977
194992
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
194978
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
194993
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
194979
194994
|
maxRetries: 0
|
|
194980
194995
|
};
|
|
194981
194996
|
if (options?.systemPrompt) {
|
|
@@ -195012,13 +195027,7 @@ class LlmServiceImpl {
|
|
|
195012
195027
|
durationMs,
|
|
195013
195028
|
error: toErrorMessage(error40)
|
|
195014
195029
|
});
|
|
195015
|
-
|
|
195016
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195017
|
-
}
|
|
195018
|
-
if (isBadRequest(status)) {
|
|
195019
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195020
|
-
}
|
|
195021
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195030
|
+
throwLlmError(error40, status);
|
|
195022
195031
|
}
|
|
195023
195032
|
}
|
|
195024
195033
|
async generateTextViaStream(prompt, options) {
|
|
@@ -195028,7 +195037,7 @@ class LlmServiceImpl {
|
|
|
195028
195037
|
model: this.options.languageModel,
|
|
195029
195038
|
prompt,
|
|
195030
195039
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
195031
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
195040
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
195032
195041
|
maxRetries: 0
|
|
195033
195042
|
};
|
|
195034
195043
|
if (options?.systemPrompt) {
|
|
@@ -195066,13 +195075,7 @@ class LlmServiceImpl {
|
|
|
195066
195075
|
durationMs: Date.now() - startedAt,
|
|
195067
195076
|
error: toErrorMessage(error40)
|
|
195068
195077
|
});
|
|
195069
|
-
|
|
195070
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195071
|
-
}
|
|
195072
|
-
if (isBadRequest(status)) {
|
|
195073
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195074
|
-
}
|
|
195075
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195078
|
+
throwLlmError(error40, status);
|
|
195076
195079
|
}
|
|
195077
195080
|
}
|
|
195078
195081
|
streamText(prompt, options) {
|
|
@@ -195095,7 +195098,7 @@ class LlmServiceImpl {
|
|
|
195095
195098
|
model: this.options.languageModel,
|
|
195096
195099
|
prompt,
|
|
195097
195100
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
195098
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
195101
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
195099
195102
|
maxRetries: 0,
|
|
195100
195103
|
onFinish: (event) => {
|
|
195101
195104
|
const finishEvent = event;
|
|
@@ -195141,13 +195144,7 @@ class LlmServiceImpl {
|
|
|
195141
195144
|
durationMs: Date.now() - startedAt,
|
|
195142
195145
|
error: toErrorMessage(error40)
|
|
195143
195146
|
});
|
|
195144
|
-
|
|
195145
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195146
|
-
}
|
|
195147
|
-
if (isBadRequest(status)) {
|
|
195148
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195149
|
-
}
|
|
195150
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195147
|
+
throwLlmError(error40, status);
|
|
195151
195148
|
}
|
|
195152
195149
|
}
|
|
195153
195150
|
}
|
|
@@ -195858,7 +195855,12 @@ function parseExtractionOutput(raw5, schema2) {
|
|
|
195858
195855
|
return { success: false, error: new Error("Empty output") };
|
|
195859
195856
|
}
|
|
195860
195857
|
const protocolParsed = tryParseProtocol(trimmed);
|
|
195861
|
-
|
|
195858
|
+
let parsed = protocolParsed ?? tryParseJson(trimmed);
|
|
195859
|
+
if (Array.isArray(parsed)) {
|
|
195860
|
+
parsed = { paragraphs: parsed };
|
|
195861
|
+
}
|
|
195862
|
+
parsed = normalizeFlatOutput(parsed);
|
|
195863
|
+
parsed = stripNulls(parsed);
|
|
195862
195864
|
const result = schema2.safeParse(parsed);
|
|
195863
195865
|
if (!result.success) {
|
|
195864
195866
|
return { success: false, error: result.error };
|
|
@@ -195928,6 +195930,37 @@ function tryParseJson(raw5) {
|
|
|
195928
195930
|
function repairAndParse(raw5) {
|
|
195929
195931
|
return JSON.parse(jsonrepair(raw5));
|
|
195930
195932
|
}
|
|
195933
|
+
// Matches a paragraph tag: the letter "P" followed by one or more digits (e.g. "P0", "P12").
var PARAGRAPH_TAG_RE = /^P\d+$/;
/**
 * Converts the flat, tag-keyed extraction shape (`{ "P0": {...}, "P3": {...} }`)
 * into the wrapped shape `{ paragraphs: [{ tag, atoms }, ...] }`.
 *
 * Values are returned untouched when they are not plain (non-array) objects,
 * when they already carry a `paragraphs` key, or when at least one key is not
 * a paragraph tag. An empty object becomes `{ paragraphs: [] }`.
 *
 * @param {unknown} parsed - Parsed model output.
 * @returns {unknown} The wrapped shape, or `parsed` itself when no conversion applies.
 */
function normalizeFlatOutput(parsed) {
  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
    return parsed;
  if ("paragraphs" in parsed)
    return parsed;
  const tags = Object.keys(parsed);
  if (tags.length === 0)
    return { paragraphs: [] };
  if (!tags.every((tag) => PARAGRAPH_TAG_RE.test(tag)))
    return parsed;
  // Order numerically (P2 before P10); always parse with an explicit radix.
  const tagIndex = (tag) => Number.parseInt(tag.slice(1), 10);
  const paragraphs = tags
    .sort((a, b) => tagIndex(a) - tagIndex(b))
    .map((tag) => ({ tag, atoms: parsed[tag] }));
  return { paragraphs };
}
|
|
195949
|
+
/**
 * Recursively removes `null` from a JSON-like value.
 *
 * - A top-level `null` yields `undefined`.
 * - Object properties whose value is `null` are omitted from the copy.
 * - `null` array elements are dropped, so no `undefined` placeholders are
 *   left behind in the result.
 * - Any other value is returned as-is.
 *
 * Objects are rebuilt with `Object.fromEntries`, which defines own properties.
 * A `"__proto__"` key coming from parsed JSON therefore stays an ordinary own
 * property and cannot replace the prototype of the copy.
 *
 * @param {unknown} value - JSON-like value to clean.
 * @returns {unknown} A copy without nulls (arrays and objects are new instances).
 */
function stripNulls(value) {
  if (value === null)
    return;
  if (Array.isArray(value)) {
    return value
      .filter((item) => item !== null)
      .map((item) => stripNulls(item));
  }
  if (typeof value === "object") {
    return Object.fromEntries(
      Object.entries(value)
        .filter(([, entryValue]) => entryValue !== null)
        .map(([key, entryValue]) => [key, stripNulls(entryValue)])
    );
  }
  return value;
}
|
|
195931
195964
|
function isRecord(value) {
|
|
195932
195965
|
return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
|
|
195933
195966
|
}
|
|
@@ -196220,30 +196253,32 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
|
|
|
196220
196253
|
${ATOM_TYPES_BLOCK}
|
|
196221
196254
|
|
|
196222
196255
|
## Output Format
|
|
196223
|
-
Return a single JSON object
|
|
196256
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196224
196257
|
{
|
|
196225
|
-
"
|
|
196226
|
-
{
|
|
196227
|
-
|
|
196228
|
-
|
|
196229
|
-
|
|
196230
|
-
|
|
196231
|
-
|
|
196232
|
-
}
|
|
196233
|
-
}
|
|
196234
|
-
]
|
|
196258
|
+
"P0": {
|
|
196259
|
+
"entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
|
|
196260
|
+
"relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
|
|
196261
|
+
},
|
|
196262
|
+
"P3": {
|
|
196263
|
+
"rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
|
|
196264
|
+
}
|
|
196235
196265
|
}
|
|
196236
196266
|
|
|
196237
196267
|
## Rules
|
|
196238
|
-
-
|
|
196268
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196269
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196239
196270
|
- Only include atom types that are actually found in a paragraph (all types are optional).
|
|
196240
196271
|
- Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
|
|
196241
196272
|
- **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
|
|
196242
196273
|
- Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
|
|
196243
196274
|
- **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
|
|
196275
|
+
- **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
|
|
196244
196276
|
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
|
|
196245
196277
|
- JSON structure keys (tag, atom type names, field names) must always be in English.
|
|
196246
196278
|
- Be thorough: extract ALL relevant atoms from each paragraph.
|
|
196279
|
+
- **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
|
|
196280
|
+
- **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
|
|
196281
|
+
- **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
|
|
196247
196282
|
- Do NOT include "claims" — they are system-generated and not part of document extraction.`;
|
|
196248
196283
|
function buildDocAtomAnnotationPrompt(chunkText) {
|
|
196249
196284
|
return `Extract all semantic atoms from the following document text.
|
|
@@ -196255,6 +196290,13 @@ ${chunkText}
|
|
|
196255
196290
|
|
|
196256
196291
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
196257
196292
|
}
|
|
196293
|
+
/**
 * Flattens an extraction result of the form `{ paragraphs: [{ tag, atoms }] }`
 * into an object keyed by paragraph tag (`{ [tag]: atoms }`).
 *
 * A missing `result` or missing `result.paragraphs` produces an empty object
 * instead of a TypeError. When two paragraphs share a tag, the later entry
 * overwrites the earlier one.
 *
 * @param {{ paragraphs?: Array<{ tag: string, atoms: unknown }> } | null | undefined} result
 * @returns {Record<string, unknown>} Atoms keyed by paragraph tag.
 */
function toFlatFormat(result) {
  const flat = {};
  for (const { tag, atoms } of result?.paragraphs ?? []) {
    flat[tag] = atoms;
  }
  return flat;
}
|
|
196258
196300
|
function buildDocGleaningPrompt(chunkText, previousResult) {
|
|
196259
196301
|
return `Review the following document text and the previously extracted atoms.
|
|
196260
196302
|
Check for any MISSING atoms that were not captured in the first pass.
|
|
@@ -196263,66 +196305,483 @@ Check for any MISSING atoms that were not captured in the first pass.
|
|
|
196263
196305
|
${chunkText}
|
|
196264
196306
|
|
|
196265
196307
|
## Previously Extracted Atoms
|
|
196266
|
-
${JSON.stringify(previousResult, null, 2)}
|
|
196308
|
+
${JSON.stringify(toFlatFormat(previousResult), null, 2)}
|
|
196267
196309
|
|
|
196268
196310
|
## Instructions
|
|
196269
|
-
- If you find missing atoms, output them in the same JSON format
|
|
196311
|
+
- If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
|
|
196270
196312
|
- Only include NEW atoms not already in the previous extraction.
|
|
196271
196313
|
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196272
|
-
- If
|
|
196314
|
+
- **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
|
|
196315
|
+
- If nothing is missing, return: {}
|
|
196273
196316
|
- Respond in the same language as the input text.
|
|
196274
196317
|
|
|
196275
196318
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
196276
196319
|
}
|
|
196277
196320
|
var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
|
|
196321
|
+
// ../llm/src/prompts/entityResolution.ts
|
|
196322
|
+
var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
|
|
196323
|
+
|
|
196324
|
+
## Task 1: Merge Duplicates
|
|
196325
|
+
- Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
|
|
196326
|
+
- Prefer the LONGER, more descriptive name as the canonical name
|
|
196327
|
+
- Do NOT merge names that share a substring but refer to different things
|
|
196328
|
+
- When uncertain, do NOT merge — add to "ambiguous" instead
|
|
196329
|
+
- Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
|
|
196330
|
+
- Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
|
|
196331
|
+
|
|
196332
|
+
## Task 2: Remove Noise
|
|
196333
|
+
- Remove names that are NOT meaningful named entities — they are generic words, actions, or descriptions
|
|
196334
|
+
- Examples of noise: common verbs/nouns (登录, 路由, 直连), generic technical terms (Env, query), action descriptions (Kill 3001 进程)
|
|
196335
|
+
- Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台)
|
|
196336
|
+
- When uncertain, KEEP the name — only remove if clearly not a named entity
|
|
196337
|
+
|
|
196338
|
+
## Output
|
|
196339
|
+
Valid JSON only. No markdown fences, no explanation.`;
|
|
196340
|
+
/**
 * Builds the user prompt for the entity-resolution pass.
 *
 * Sections are emitted in this order and joined with newlines:
 *   1. a numbered list of all entity names,
 *   2. suspected duplicate pairs (only when `candidates` is non-empty),
 *   3. suspected noise names (only when `noiseCandidates` is non-empty),
 *   4. context snippets (only when `contextSnippets` is non-empty),
 *   5. the fixed output-format instructions.
 *
 * `candidates`, `noiseCandidates` and `contextSnippets` are all treated as
 * optional; a missing list is handled the same way as an empty one.
 *
 * @param {object} input
 * @param {string[]} input.allNames - Every entity name under review.
 * @param {Array<{ short: string, long: string, reason: string }>} [input.candidates]
 * @param {string[]} [input.noiseCandidates]
 * @param {Array<{ name: string, snippet: string }>} [input.contextSnippets]
 * @returns {string} The assembled prompt text.
 */
function buildEntityResolutionPrompt(input) {
  const candidates = input.candidates ?? [];
  const noiseCandidates = input.noiseCandidates ?? [];
  const contextSnippets = input.contextSnippets ?? [];
  const parts = [
    `## All Entity Names (${input.allNames.length} total)`,
    input.allNames.map((n, i) => `${i + 1}. ${n}`).join("\n")
  ];
  if (candidates.length > 0) {
    parts.push(
      "",
      `## Suspected Duplicates (${candidates.length} pairs)`,
      "Review each pair and decide whether to merge:",
      ...candidates.map((c) => `- "${c.short}" ↔ "${c.long}" — ${c.reason}`)
    );
  }
  if (noiseCandidates.length > 0) {
    parts.push(
      "",
      `## Suspected Noise (${noiseCandidates.length} names)`,
      "Review each name — remove if NOT a meaningful named entity, keep if it IS:",
      ...noiseCandidates.map((n) => `- "${n}"`)
    );
  }
  if (contextSnippets.length > 0) {
    parts.push(
      "",
      "## Context Snippets",
      ...contextSnippets.map((s) => `- **${s.name}**: ${s.snippet}`)
    );
  }
  parts.push("");
  parts.push(`## Output Format
Return a JSON object:
{
"merges": [
{ "from": "alias name", "to": "canonical name" }
],
"remove": ["noise_name_1", "noise_name_2"],
"ambiguous": ["name1", "name2"]
}

- "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
- "remove": names confirmed as noise. They will be deleted from entity list.
- "ambiguous": names you're unsure about (optional, for logging).

Return ONLY valid JSON. No markdown fences, no explanation.`);
  return parts.join("\n");
}
|
|
196387
|
+
// ../llm/src/prompts/docTableAnnotation.ts
|
|
196388
|
+
init_src();
|
|
196389
|
+
var entityFields = zodObjectToPromptFields(entityAtomSchema);
|
|
196390
|
+
var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
|
|
196391
|
+
var relationFields = zodObjectToPromptFields(relationAtomSchema);
|
|
196392
|
+
var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
|
|
196393
|
+
var metricFields = zodObjectToPromptFields(metricAtomSchema);
|
|
196394
|
+
var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
|
|
196395
|
+
var eventFields = zodObjectToPromptFields(eventAtomSchema);
|
|
196396
|
+
var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
|
|
196397
|
+
var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
|
|
196398
|
+
var stateFields = zodObjectToPromptFields(stateAtomSchema);
|
|
196399
|
+
var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
|
|
196400
|
+
var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
|
|
196401
|
+
|
|
196402
|
+
Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
|
|
196403
|
+
|
|
196404
|
+
## Step 1: Classify the Table
|
|
196405
|
+
|
|
196406
|
+
Determine the table type by examining the relationship between rows:
|
|
196407
|
+
|
|
196408
|
+
### Type A: Collection / Record Table
|
|
196409
|
+
**Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
|
|
196410
|
+
- Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
|
|
196411
|
+
- Key signal: removing one row does not affect the meaning of other rows
|
|
196412
|
+
|
|
196413
|
+
### Type B: Single-Object Property Table
|
|
196414
|
+
**Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
|
|
196415
|
+
- Examples: API field definitions, configuration schema, entity attribute lists
|
|
196416
|
+
- Key signal: all rows refer to the same parent entity
|
|
196417
|
+
|
|
196418
|
+
### Type C: Comparison / Evaluation Table
|
|
196419
|
+
**Rows or columns represent different subjects being compared** across the same dimensions.
|
|
196420
|
+
- Examples: technology selection, vendor evaluation, feature comparison
|
|
196421
|
+
- Key signal: multiple named subjects evaluated on shared criteria
|
|
196422
|
+
|
|
196423
|
+
### Type D: Matrix / Cross-Reference Table
|
|
196424
|
+
**Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
|
|
196425
|
+
- Examples: permission matrices (role × operation), compatibility matrices, dependency tables
|
|
196426
|
+
- Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
|
|
196427
|
+
|
|
196428
|
+
### Type E: Metrics / KPI Table
|
|
196429
|
+
**Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
|
|
196430
|
+
- Examples: SLA tables, performance baselines, capacity planning tables
|
|
196431
|
+
- Key signal: columns include target/threshold/unit/SLA-style values
|
|
196432
|
+
|
|
196433
|
+
### Type F: Timeline / Process Table
|
|
196434
|
+
**Rows represent ordered steps or phases** in a sequence.
|
|
196435
|
+
- Examples: deployment steps, approval workflows, version changelog, migration plans
|
|
196436
|
+
- Key signal: rows have implicit ordering, may have phase/step/date columns
|
|
196437
|
+
|
|
196438
|
+
## Step 2: Extract Atoms by Table Type
|
|
196439
|
+
|
|
196440
|
+
### Type A → Single attribute with row-object array
|
|
196441
|
+
1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
|
|
196442
|
+
Entity schema: ${entityFields}
|
|
196443
|
+
2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
|
|
196444
|
+
Attribute schema: ${attributeFields}
|
|
196445
|
+
Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
|
|
196446
|
+
3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
|
|
196447
|
+
4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
|
|
196448
|
+
State schema: ${stateFields}
|
|
196449
|
+
Rule schema: ${ruleFields}
|
|
196450
|
+
|
|
196451
|
+
### Type B → Multiple attribute atoms
|
|
196452
|
+
1. Create ONE entity for the parent structure.
|
|
196453
|
+
Entity schema: ${entityFields}
|
|
196454
|
+
2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
|
|
196455
|
+
Attribute schema: ${attributeFields}
|
|
196456
|
+
3. Extract constraints from "required" or "validation" columns.
|
|
196457
|
+
Constraint schema: ${constraintFields}
|
|
196458
|
+
|
|
196459
|
+
### Type C → Comparison atom
|
|
196460
|
+
1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
|
|
196461
|
+
Comparison schema: ${comparisonFields}
|
|
196462
|
+
2. Extract "decisions" atoms if the table leads to a conclusion.
|
|
196463
|
+
|
|
196464
|
+
### Type D → Relations or table attribute
|
|
196465
|
+
1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
|
|
196466
|
+
Relation schema: ${relationFields}
|
|
196467
|
+
Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
|
|
196468
|
+
2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
|
|
196469
|
+
3. Create entities for both row headers and column headers if they are named concepts.
|
|
196470
|
+
|
|
196471
|
+
### Type E → Metrics atoms
|
|
196472
|
+
1. Create one "metrics" atom per row.
|
|
196473
|
+
Metric schema: ${metricFields}
|
|
196474
|
+
2. Also create the parent entity if named (e.g., "SLA Requirements").
|
|
196475
|
+
|
|
196476
|
+
### Type F → Behaviors/Events/Transitions
|
|
196477
|
+
1. Create one "behaviors" atom per step/phase.
|
|
196478
|
+
Behavior schema: ${behaviorFields}
|
|
196479
|
+
2. If there are triggers: extract "events" atoms.
|
|
196480
|
+
Event schema: ${eventFields}
|
|
196481
|
+
3. If there are state changes: extract "transitions" atoms.
|
|
196482
|
+
Transition schema: ${transitionFields}
|
|
196483
|
+
4. Create the parent entity for the process/workflow.
|
|
196484
|
+
|
|
196485
|
+
## Output Format
|
|
196486
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196487
|
+
{
|
|
196488
|
+
"P0": {
|
|
196489
|
+
"tableType": "A",
|
|
196490
|
+
"entities": [...],
|
|
196491
|
+
"attributes": [...]
|
|
196492
|
+
},
|
|
196493
|
+
"P3": {
|
|
196494
|
+
"tableType": "C",
|
|
196495
|
+
"comparisons": [...]
|
|
196496
|
+
}
|
|
196497
|
+
}
|
|
196498
|
+
|
|
196499
|
+
## Rules
|
|
196500
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196501
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196502
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196503
|
+
- The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
|
|
196504
|
+
- Only include atom types that are actually extracted.
|
|
196505
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
196506
|
+
- JSON structure keys must always be in English.
|
|
196507
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
196508
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
196509
|
+
/**
 * Builds the user prompt for the table-annotation pass by wrapping the tagged
 * table text between `---` separators, preceded by the task instructions and
 * followed by the JSON-only output reminder.
 *
 * @param {string} tableText - Table paragraphs, already tagged with [P0], [P1], ...
 * @returns {string} The assembled prompt text.
 */
function buildDocTableAnnotationPrompt(tableText) {
  const promptLines = [
    "Classify and extract atoms from the following table paragraphs.",
    "Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.",
    "",
    "---",
    // Interpolate through a template literal to keep template-string coercion of the argument.
    `${tableText}`,
    "---",
    "",
    "Return ONLY a valid JSON object. No markdown fences, no explanation."
  ];
  return promptLines.join("\n");
}
|
|
196519
|
+
var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
|
|
196520
|
+
// ../llm/src/prompts/docDiagramAnnotation.ts
|
|
196521
|
+
init_src();
|
|
196522
|
+
var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
|
|
196523
|
+
var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
|
|
196524
|
+
var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
|
|
196525
|
+
var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
|
|
196526
|
+
var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
|
|
196527
|
+
var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
|
|
196528
|
+
var roleFields = zodObjectToPromptFields(roleAtomSchema);
|
|
196529
|
+
var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
|
|
196530
|
+
var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
|
|
196531
|
+
var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
|
|
196532
|
+
// Fence info strings that mark a fenced code block as a text-based diagram.
var DIAGRAM_FENCE_TAGS = [
  "mermaid", "plantuml", "puml",
  "dot", "graphviz", "viz",
  "d2", "c4plantuml", "ditaa",
  "nomnoml", "wavedrom", "vega",
  "vega-lite"
];
// Matches a whole line consisting of an opening ``` fence, one of the tags
// above (case-insensitive), and optional trailing whitespace.
var DIAGRAM_FENCE_REGEX = new RegExp(
  ["^```(?:", DIAGRAM_FENCE_TAGS.join("|"), ")\\s*$"].join(""),
  "i"
);
|
|
196548
|
+
var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
|
|
196549
|
+
|
|
196550
|
+
Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
|
|
196551
|
+
|
|
196552
|
+
## Step 1: Identify the Diagram Format and Type
|
|
196553
|
+
|
|
196554
|
+
### Formats
|
|
196555
|
+
- **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
|
|
196556
|
+
- **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
|
|
196557
|
+
- **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
|
|
196558
|
+
- **D2**: modern declarative diagrams with shape/connection syntax
|
|
196559
|
+
- **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
|
|
196560
|
+
|
|
196561
|
+
### Diagram Types (by semantic content)
|
|
196562
|
+
- **Flowchart / Process**: decision trees, algorithms, business process flows
|
|
196563
|
+
- **Sequence**: interaction between participants over time (API calls, protocols)
|
|
196564
|
+
- **State Machine**: states and transitions triggered by events/guards
|
|
196565
|
+
- **Class / ER**: data models, entity relationships, inheritance hierarchies
|
|
196566
|
+
- **Architecture**: system components, containers, deployment topology
|
|
196567
|
+
- **Gantt / Timeline**: project schedules, milestones, phases
|
|
196568
|
+
- **Pie / Data Viz**: statistical distributions, metrics visualization
|
|
196569
|
+
|
|
196570
|
+
## Step 2: Extract Atoms by Diagram Type
|
|
196571
|
+
|
|
196572
|
+
### Flowchart / Process → entities + relations + behaviors + decisions
|
|
196573
|
+
1. Extract each node as an entity.
|
|
196574
|
+
Entity schema: ${entityFields2}
|
|
196575
|
+
2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
|
|
196576
|
+
Relation schema: ${relationFields2}
|
|
196577
|
+
3. Extract action nodes as behaviors (what the process does at each step).
|
|
196578
|
+
Behavior schema: ${behaviorFields2}
|
|
196579
|
+
4. Extract diamond/decision nodes as decisions.
|
|
196580
|
+
Decision schema: ${decisionFields}
|
|
196581
|
+
|
|
196582
|
+
### Sequence → entities + relations + behaviors + events
|
|
196583
|
+
1. Extract each participant/actor as an entity (or role if it's a person/team).
|
|
196584
|
+
Entity schema: ${entityFields2}
|
|
196585
|
+
Role schema: ${roleFields}
|
|
196586
|
+
2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
|
|
196587
|
+
Relation schema: ${relationFields2}
|
|
196588
|
+
3. Extract significant interactions as behaviors.
|
|
196589
|
+
Behavior schema: ${behaviorFields2}
|
|
196590
|
+
4. Extract triggers, responses, and async messages as events.
|
|
196591
|
+
Event schema: ${eventFields2}
|
|
196592
|
+
|
|
196593
|
+
### State Machine → entities + states + transitions + events
|
|
196594
|
+
1. Extract the state machine subject as an entity.
|
|
196595
|
+
Entity schema: ${entityFields2}
|
|
196596
|
+
2. Extract each state as a state atom.
|
|
196597
|
+
State schema: ${stateFields2}
|
|
196598
|
+
3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
|
|
196599
|
+
Transition schema: ${transitionFields2}
|
|
196600
|
+
4. Extract triggers as events.
|
|
196601
|
+
Event schema: ${eventFields2}
|
|
196602
|
+
|
|
196603
|
+
### Class / ER → entities + attributes + relations
|
|
196604
|
+
1. Extract each class/entity as an entity.
|
|
196605
|
+
Entity schema: ${entityFields2}
|
|
196606
|
+
2. Extract fields/properties as attributes.
|
|
196607
|
+
Attribute schema: ${attributeFields2}
|
|
196608
|
+
3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
|
|
196609
|
+
Relation schema: ${relationFields2}
|
|
196610
|
+
|
|
196611
|
+
### Architecture → entities + relations + constraints
|
|
196612
|
+
1. Extract each system/service/container/component as an entity.
|
|
196613
|
+
Entity schema: ${entityFields2}
|
|
196614
|
+
2. Extract connections between components as relations.
|
|
196615
|
+
Relation schema: ${relationFields2}
|
|
196616
|
+
3. Extract deployment constraints, technology choices.
|
|
196617
|
+
Constraint schema: ${constraintFields2}
|
|
196618
|
+
|
|
196619
|
+
### Gantt / Timeline → behaviors + events + constraints
|
|
196620
|
+
1. Extract each task/phase as a behavior.
|
|
196621
|
+
Behavior schema: ${behaviorFields2}
|
|
196622
|
+
2. Extract milestones and deadlines as events.
|
|
196623
|
+
Event schema: ${eventFields2}
|
|
196624
|
+
3. Extract dependencies and critical path constraints.
|
|
196625
|
+
Constraint schema: ${constraintFields2}
|
|
196626
|
+
|
|
196627
|
+
### Pie / Data Viz → attributes (summary only)
|
|
196628
|
+
1. Extract the chart title as an entity.
|
|
196629
|
+
Entity schema: ${entityFields2}
|
|
196630
|
+
2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
|
|
196631
|
+
Attribute schema: ${attributeFields2}
|
|
196632
|
+
|
|
196633
|
+
## Additional Extraction: Diagram Description
|
|
196634
|
+
|
|
196635
|
+
For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
|
|
196636
|
+
- \`name\`: "diagram_description"
|
|
196637
|
+
- \`type\`: "description"
|
|
196638
|
+
- \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
|
|
196639
|
+
|
|
196640
|
+
This description is critical for downstream AI consumers who cannot render the diagram.
|
|
196641
|
+
|
|
196642
|
+
## Output Format
|
|
196643
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196644
|
+
{
|
|
196645
|
+
"P0": {
|
|
196646
|
+
"diagramFormat": "mermaid",
|
|
196647
|
+
"diagramType": "sequence",
|
|
196648
|
+
"entities": [...],
|
|
196649
|
+
"relations": [...]
|
|
196650
|
+
}
|
|
196651
|
+
}
|
|
196652
|
+
|
|
196653
|
+
## Rules
|
|
196654
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196655
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196656
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196657
|
+
- The "diagramFormat" and "diagramType" fields are required for each paragraph.
|
|
196658
|
+
- Only include atom types that are actually extracted.
|
|
196659
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
196660
|
+
- JSON structure keys must always be in English.
|
|
196661
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
196662
|
+
- **Extract ALL nodes and edges** — do not sample or skip.
|
|
196663
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
196664
|
+
/**
 * Build the user prompt that asks the model to analyze diagrams and extract
 * atoms from them.
 *
 * @param {string} diagramText - Diagram paragraphs, already tagged [P0], [P1], ...
 * @returns {string} The prompt, with `diagramText` placed between two `---` rules.
 */
function buildDocDiagramAnnotationPrompt(diagramText) {
  const promptLines = [
    "Analyze and extract atoms from the following diagram paragraphs.",
    "Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.",
    "",
    "---",
    // Interpolated through a template so non-string input is stringified the same way as before.
    `${diagramText}`,
    "---",
    "",
    "Return ONLY a valid JSON object. No markdown fences, no explanation."
  ];
  return promptLines.join("\n");
}
|
|
196674
|
+
var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
|
|
196278
196675
|
// ../llm/src/chunking/markdownChunker.ts
|
|
196279
196676
|
var DEFAULT_MAX_TOKENS2 = 4000;
|
|
196677
|
+
var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
|
|
196280
196678
|
/**
 * Rough token estimate: one token per four UTF-16 code units, rounded up.
 *
 * @param {string} text2 - Text to measure.
 * @returns {number} Estimated token count (0 for the empty string).
 */
function estimateTokens(text2) {
  const CHARS_PER_TOKEN = 4;
  const charCount = text2.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
|
|
196681
|
+
/**
 * Locate fenced code blocks (``` or ~~~) in a Markdown string.
 *
 * A block opens at any fence line and closes at the next line that uses the
 * same fence character, is at least as long as the opener, and carries no
 * info string. A fence that is never closed extends to the end of `content`.
 *
 * @param {string} content - Markdown source using "\n" line endings.
 * @returns {{start: number, end: number}[]} Ranges in document order; `start`
 *   is the offset of the opening fence line and `end` is exclusive.
 */
function findCodeBlockRanges(content) {
  const ranges = [];
  // Leading whitespace is tolerated so indented fences (e.g. inside list
  // items) are detected, matching the trimStart()-based checks used by the
  // other fence handlers in this file.
  // Group 1 = fence run, group 2 = remainder of the line.
  const fenceLineRe = /^[ \t]*(`{3,}|~{3,})(.*)$/gm;
  let openStart = -1;
  let openFence = "";
  let match2;
  while ((match2 = fenceLineRe.exec(content)) !== null) {
    const fence = match2[1];
    if (openStart === -1) {
      openStart = match2.index;
      openFence = fence;
      continue;
    }
    // A closing fence may not carry an info string; a line such as "```js"
    // inside an open block is content, not a terminator.
    const closesBlock =
      fence[0] === openFence[0] &&
      fence.length >= openFence.length &&
      match2[2].trim() === "";
    if (closesBlock) {
      ranges.push({ start: openStart, end: match2.index + match2[0].length });
      openStart = -1;
      openFence = "";
    }
  }
  if (openStart !== -1) {
    ranges.push({ start: openStart, end: content.length });
  }
  return ranges;
}
|
|
196703
|
+
/**
 * Report whether an offset falls inside one of the given ranges.
 *
 * The scan stops at the first range that starts after `pos`, so `ranges` is
 * expected to be ordered by ascending `start`.
 *
 * @param {number} pos - Character offset to test.
 * @param {{start: number, end: number}[]} ranges - Half-open [start, end) ranges.
 * @returns {boolean} True when `start <= pos < end` for some range.
 */
function isInsideCodeBlock(pos, ranges) {
  for (let idx = 0; idx < ranges.length; idx += 1) {
    const { start, end } = ranges[idx];
    if (start > pos) {
      // Every later range starts even further right; nothing can match.
      return false;
    }
    if (pos < end) {
      return true;
    }
  }
  return false;
}
|
|
196283
196712
|
/**
 * Split Markdown into sections by locating ATX (`# ...`) and setext
 * (`===` / `---` underline) headings that are not inside fenced code.
 *
 * @param {string} content - Markdown source using "\n" line endings.
 * @returns {Array} Whatever buildSectionsFromMatches produces for the
 *   de-duplicated heading list.
 */
function parseSections(content) {
  const fencedRanges = findCodeBlockRanges(content);
  const inCode = (pos) => isInsideCodeBlock(pos, fencedRanges);
  const headings = [];

  // ATX headings: one to six '#' followed by whitespace.
  for (const hit of content.matchAll(/^(#{1,6})\s+(.*)$/gm)) {
    if (inCode(hit.index)) {
      continue;
    }
    headings.push({
      index: hit.index,
      endIndex: hit.index + hit[0].length,
      level: hit[1].length,
      heading: hit[2].trim()
    });
  }

  // Setext headings: a non-empty line followed by an underline row.
  const rows = content.split("\n");
  let cursor = 0; // offset of the current row within `content`
  rows.forEach((row, n) => {
    if (n > 0) {
      const above = rows[n - 1];
      const aboveText = above.trim();
      const aboveStart = cursor - above.length - 1;
      if (aboveText !== "" && !inCode(aboveStart)) {
        let level = 0;
        if (/^={2,}\s*$/.test(row)) {
          level = 1;
        } else if (/^-{2,}\s*$/.test(row) && !/^-{3,}\s*$/.test(aboveText)) {
          // A dash row directly below another dash row is not an underline.
          level = 2;
        }
        if (level > 0) {
          headings.push({
            index: Math.max(aboveStart, 0),
            endIndex: cursor + row.length,
            level,
            heading: aboveText
          });
        }
      }
    }
    cursor += row.length + 1;
  });

  // Order by position, then drop any heading that starts inside the previous
  // one (e.g. an ATX line that is also underlined).
  headings.sort((a, b) => a.index - b.index);
  const kept = [];
  for (const candidate of headings) {
    const previous = kept[kept.length - 1];
    if (!previous || candidate.index >= previous.endIndex) {
      kept.push(candidate);
    }
  }
  return buildSectionsFromMatches(content, kept);
}
|
|
196765
|
+
function buildSectionsFromMatches(content, matches) {
|
|
196766
|
+
const sections = [];
|
|
196295
196767
|
if (matches.length === 0) {
|
|
196296
196768
|
const body2 = content.trim();
|
|
196297
196769
|
if (body2) {
|
|
196298
|
-
sections.push({
|
|
196299
|
-
heading: "",
|
|
196300
|
-
level: 0,
|
|
196301
|
-
body: body2,
|
|
196302
|
-
paragraphs: splitParagraphs(body2)
|
|
196303
|
-
});
|
|
196770
|
+
sections.push({ heading: "", level: 0, body: body2, paragraphs: splitParagraphs(body2) });
|
|
196304
196771
|
}
|
|
196305
196772
|
return sections;
|
|
196306
196773
|
}
|
|
196307
196774
|
if (matches[0].index > 0) {
|
|
196308
196775
|
const preBody = content.slice(0, matches[0].index).trim();
|
|
196309
196776
|
if (preBody) {
|
|
196310
|
-
sections.push({
|
|
196311
|
-
heading: "",
|
|
196312
|
-
level: 0,
|
|
196313
|
-
body: preBody,
|
|
196314
|
-
paragraphs: splitParagraphs(preBody)
|
|
196315
|
-
});
|
|
196777
|
+
sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
|
|
196316
196778
|
}
|
|
196317
196779
|
}
|
|
196318
196780
|
for (let i = 0;i < matches.length; i++) {
|
|
196319
196781
|
const m = matches[i];
|
|
196320
|
-
const
|
|
196321
|
-
const
|
|
196322
|
-
const
|
|
196323
|
-
const headingLineEnd = fullText.indexOf(`
|
|
196324
|
-
`);
|
|
196325
|
-
const body2 = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
|
|
196782
|
+
const bodyStart = m.endIndex;
|
|
196783
|
+
const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
|
|
196784
|
+
const body2 = content.slice(bodyStart, bodyEnd).trim();
|
|
196326
196785
|
sections.push({
|
|
196327
196786
|
heading: m.heading,
|
|
196328
196787
|
level: m.level,
|
|
@@ -196337,6 +196796,128 @@ function splitParagraphs(text2) {
|
|
|
196337
196796
|
return [];
|
|
196338
196797
|
return text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
|
|
196339
196798
|
}
|
|
196799
|
+
/**
 * Break text into parts that each aim to stay within `maxTokens`.
 *
 * Strategy, in order: split on blank lines; otherwise split on single
 * newlines (keeping code fences and tables whole via mergeAtomicBlocks);
 * otherwise hand the text to forceBreakText.
 *
 * @param {string} text2 - Text to split.
 * @param {number} maxTokens - Token budget per part, measured by estimateTokens.
 * @returns {string[]} The parts, in original order.
 */
function splitOversizedText(text2, maxTokens) {
  // Greedily packs consecutive pieces into parts within the budget. A piece
  // that alone exceeds the budget is delegated to `expandOversized`.
  const pack = (pieces, glue, expandOversized) => {
    const packed = [];
    let buffer = "";
    let bufferTokens = 0;
    const flush = () => {
      if (buffer) {
        packed.push(buffer);
        buffer = "";
        bufferTokens = 0;
      }
    };
    for (const piece of pieces) {
      const cost = estimateTokens(piece);
      if (cost > maxTokens) {
        flush();
        packed.push(...expandOversized(piece));
        continue;
      }
      if (buffer && bufferTokens + cost > maxTokens) {
        flush();
      }
      buffer = buffer ? buffer + glue + piece : piece;
      bufferTokens += cost;
    }
    flush();
    return packed;
  };

  const paragraphs = text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
  if (paragraphs.length > 1) {
    return pack(paragraphs, "\n\n", (p4) => splitOversizedText(p4, maxTokens));
  }

  const lines = text2.split("\n");
  if (lines.length > 1) {
    // NOTE(review): an over-budget block is emitted whole here, which also
    // applies to a single very long plain line — confirm that is intended.
    return pack(mergeAtomicBlocks(lines), "\n", (block) => [block]);
  }

  return forceBreakText(text2, maxTokens);
}
|
|
196863
|
+
/**
 * Group lines so that fenced code blocks and runs of table rows become single
 * multi-line entries; every other line stays an entry of its own.
 *
 * @param {string[]} lines - Text already split on "\n".
 * @returns {string[]} Blocks in original order; multi-line blocks are joined
 *   with "\n". An unclosed fence absorbs all remaining lines.
 */
function mergeAtomicBlocks(lines) {
  const openFenceRe = /^(`{3,}|~{3,})/;
  const blocks = [];
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    const trimmed = line.trimStart();

    const fenceMatch = openFenceRe.exec(trimmed);
    if (fenceMatch) {
      const fence = fenceMatch[1];
      // Built once per fence instead of once per scanned line. A closer uses
      // the same character, is at least as long, and has nothing after it.
      const closingRe = new RegExp(`^${fence[0]}{${fence.length},}\\s*$`);
      let end = i + 1;
      while (end < lines.length && !closingRe.test(lines[end].trimStart())) {
        end++;
      }
      // Include the closing fence when one was found.
      const stop = Math.min(end + 1, lines.length);
      blocks.push(lines.slice(i, stop).join("\n"));
      i = stop;
      continue;
    }

    if (trimmed.startsWith("|")) {
      let end = i + 1;
      while (end < lines.length && lines[end].trimStart().startsWith("|")) {
        end++;
      }
      blocks.push(lines.slice(i, end).join("\n"));
      i = end;
      continue;
    }

    blocks.push(line);
    i++;
  }
  return blocks;
}
|
|
196904
|
+
/**
 * Hard-wrap text into pieces of at most `maxTokens * 4` characters.
 *
 * A break is moved back to the last space when that space lies in the final
 * 30% of the window; otherwise the text is cut at the window edge, stepping
 * back one unit if the cut would separate a surrogate pair.
 *
 * @param {string} text2 - Text to break.
 * @param {number} maxTokens - Token budget per piece (4 characters per token).
 * @returns {string[]} Trimmed, non-empty pieces in order. When the budget is
 *   not at least one character, the text is returned unsplit.
 */
function forceBreakText(text2, maxTokens) {
  const maxChars = maxTokens * 4;
  // A zero, negative or NaN window can never make progress; previously this
  // looped forever. Return the text unsplit instead.
  if (!(maxChars >= 1)) {
    return text2 ? [text2] : [];
  }
  const isHighSurrogate = (code) => code >= 0xd800 && code <= 0xdbff;
  const results = [];
  let remaining = text2;
  while (remaining.length > maxChars) {
    let breakAt = maxChars;
    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
    if (spaceIdx > maxChars * 0.7) {
      breakAt = spaceIdx;
    } else if (breakAt > 1 && isHighSurrogate(remaining.charCodeAt(breakAt - 1))) {
      // Do not leave half of an astral character at the end of a piece.
      breakAt -= 1;
    }
    const piece = remaining.slice(0, breakAt).trim();
    if (piece) {
      results.push(piece);
    }
    remaining = remaining.slice(breakAt).trim();
  }
  if (remaining) {
    results.push(remaining);
  }
  return results;
}
|
|
196340
196921
|
function buildBreadcrumb(sections, sectionIndex) {
|
|
196341
196922
|
const current = sections[sectionIndex];
|
|
196342
196923
|
if (current.level <= 0)
|
|
@@ -196365,11 +196946,53 @@ function sectionHeadingLine(section) {
|
|
|
196365
196946
|
return "";
|
|
196366
196947
|
return `${"#".repeat(section.level)} ${section.heading}`;
|
|
196367
196948
|
}
|
|
196949
|
+
/**
 * Turn section bodies into a flat, globally numbered list of coarse
 * paragraphs.
 *
 * Bodies over `paragraphMaxTokens` are split with splitOversizedText; then
 * adjacent entries that are both under 150 tokens are merged while the
 * combined size stays within `paragraphMaxTokens`.
 *
 * @param {{body: string}[]} sections - Parsed sections; only `body` is read.
 * @param {number} paragraphMaxTokens - Token budget per paragraph.
 * @returns {{sectionIndex: number, paragraphIndex: number, text: string}[]}
 */
function buildCoarseParagraphs(sections, paragraphMaxTokens) {
  const MERGE_THRESHOLD = 150;

  const pieces = sections.flatMap((section, sIdx) => {
    if (!section.body.trim()) {
      return [];
    }
    const bodyTokens = estimateTokens(section.body);
    if (bodyTokens > paragraphMaxTokens) {
      return splitOversizedText(section.body, paragraphMaxTokens).map((part) => ({
        sectionIndex: sIdx,
        text: part,
        tokens: estimateTokens(part)
      }));
    }
    return [{ sectionIndex: sIdx, text: section.body, tokens: bodyTokens }];
  });

  // NOTE(review): merging does not compare sectionIndex, so a small entry can
  // be folded into one from an earlier section and inherit its index —
  // confirm this is intended.
  const merged = [];
  for (const piece of pieces) {
    const tail = merged[merged.length - 1];
    const bothSmall = tail && tail.tokens < MERGE_THRESHOLD && piece.tokens < MERGE_THRESHOLD;
    if (bothSmall && tail.tokens + piece.tokens <= paragraphMaxTokens) {
      tail.text = `${tail.text}\n\n${piece.text}`;
      tail.tokens += piece.tokens;
    } else {
      merged.push({ ...piece });
    }
  }

  return merged.map((entry, pIdx) => ({
    sectionIndex: entry.sectionIndex,
    paragraphIndex: pIdx,
    text: entry.text
  }));
}
|
|
196368
196989
|
function chunkMarkdown(content, options = {}) {
|
|
196369
196990
|
const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
|
|
196991
|
+
const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
|
|
196370
196992
|
const sections = parseSections(content);
|
|
196371
196993
|
if (sections.length === 0)
|
|
196372
196994
|
return [];
|
|
196995
|
+
const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
|
|
196373
196996
|
const chunks = [];
|
|
196374
196997
|
let pendingSections = [];
|
|
196375
196998
|
let pendingTokens = 0;
|
|
@@ -196387,14 +197010,16 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196387
197010
|
const heading = sectionHeadingLine(entry.section);
|
|
196388
197011
|
if (heading)
|
|
196389
197012
|
textParts.push(heading);
|
|
196390
|
-
|
|
196391
|
-
|
|
196392
|
-
|
|
196393
|
-
|
|
196394
|
-
|
|
196395
|
-
|
|
196396
|
-
|
|
196397
|
-
|
|
197013
|
+
const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === entry.sectionIndex);
|
|
197014
|
+
for (const p4 of sectionParas) {
|
|
197015
|
+
if (!paragraphs.some((existing) => existing.paragraphIndex === p4.paragraphIndex && existing.text === p4.text)) {
|
|
197016
|
+
textParts.push(p4.text);
|
|
197017
|
+
paragraphs.push({
|
|
197018
|
+
sectionIndex: p4.sectionIndex,
|
|
197019
|
+
paragraphIndex: p4.paragraphIndex,
|
|
197020
|
+
text: p4.text
|
|
197021
|
+
});
|
|
197022
|
+
}
|
|
196398
197023
|
}
|
|
196399
197024
|
}
|
|
196400
197025
|
chunks.push({
|
|
@@ -196417,7 +197042,7 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196417
197042
|
` : "") + section.body);
|
|
196418
197043
|
if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
|
|
196419
197044
|
flushPending();
|
|
196420
|
-
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
|
|
197045
|
+
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
|
|
196421
197046
|
continue;
|
|
196422
197047
|
}
|
|
196423
197048
|
const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
|
|
@@ -196430,9 +197055,10 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196430
197055
|
flushPending();
|
|
196431
197056
|
return chunks;
|
|
196432
197057
|
}
|
|
196433
|
-
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
|
|
197058
|
+
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
|
|
196434
197059
|
const headingLine = sectionHeadingLine(section);
|
|
196435
197060
|
const prefix = breadcrumbPrefix(breadcrumb);
|
|
197061
|
+
const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === sectionIndex);
|
|
196436
197062
|
let accParagraphs = [];
|
|
196437
197063
|
let accTextParts = [];
|
|
196438
197064
|
let accTokens = 0;
|
|
@@ -196459,18 +197085,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
|
|
|
196459
197085
|
accTokens = baseOverhead;
|
|
196460
197086
|
}
|
|
196461
197087
|
accTokens = baseOverhead;
|
|
196462
|
-
for (
|
|
196463
|
-
const
|
|
196464
|
-
const pTokens = estimateTokens(pText);
|
|
197088
|
+
for (const p4 of sectionParas) {
|
|
197089
|
+
const pTokens = estimateTokens(p4.text);
|
|
196465
197090
|
if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
|
|
196466
197091
|
flushAcc();
|
|
196467
197092
|
}
|
|
196468
|
-
accParagraphs.push({ sectionIndex, paragraphIndex:
|
|
196469
|
-
accTextParts.push(
|
|
197093
|
+
accParagraphs.push({ sectionIndex, paragraphIndex: p4.paragraphIndex, text: p4.text });
|
|
197094
|
+
accTextParts.push(p4.text);
|
|
196470
197095
|
accTokens += pTokens;
|
|
196471
197096
|
}
|
|
196472
197097
|
flushAcc();
|
|
196473
197098
|
}
|
|
197099
|
+
// ../llm/src/chunking/normalizeMarkdown.ts
|
|
197100
|
+
/**
 * Repair common Markdown defects and report how often each repair fired.
 *
 * Pipeline: strip invisible characters, normalize line endings, then run the
 * line-based block repairs in processBlocks.
 *
 * @param {string} content - Raw Markdown.
 * @returns {{content: string, stats: {repairs: Object<string, number>}}}
 *   Normalized text plus a per-category repair counter.
 */
function normalizeMarkdown(content) {
  const repairs = {};
  const stats = { repairs };
  // Passed to every stage; bumps the counter for one repair category.
  const count = (category) => {
    repairs[category] = (repairs[category] ?? 0) + 1;
  };

  const withoutInvisibles = stripBomAndInvisible(content, count);
  const unixNewlines = normalizeLineEndings(withoutInvisibles, count);
  const repairedLines = processBlocks(unixNewlines.split("\n"), count);

  return { content: repairedLines.join("\n"), stats };
}
|
|
197113
|
+
/**
 * Remove U+FEFF, U+200B, U+200C and U+200D from the text.
 *
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Called once with
 *   "invisible_chars" when at least one character was removed.
 * @returns {string} The text without those characters.
 */
function stripBomAndInvisible(text2, count) {
  const invisibleRe = /[\uFEFF\u200B\u200C\u200D]/g;
  const cleaned = text2.replace(invisibleRe, "");
  const removedAny = cleaned.length !== text2.length;
  if (removedAny) {
    count("invisible_chars");
  }
  return cleaned;
}
|
|
197120
|
+
/**
 * Convert CRLF and lone CR line endings to LF.
 *
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Called once with "line_endings"
 *   when the text contains any carriage return.
 * @returns {string} Text with "\n" line endings only.
 */
function normalizeLineEndings(text2, count) {
  if (!text2.includes("\r")) {
    return text2;
  }
  count("line_endings");
  return text2.replace(/\r\n?/g, "\n");
}
|
|
197128
|
+
/**
 * Walk the document line by line and hand each recognised construct to its
 * repair handler.
 *
 * Checks run in a fixed order per line: code fence, table row, blank line,
 * HTML comment, bare JSON block. Lines matching none are copied through.
 *
 * @param {string[]} inputLines - Document lines.
 * @param {(category: string) => void} count - Repair counter callback.
 * @returns {string[]} The repaired lines.
 */
function processBlocks(inputLines, count) {
  const lines = splitInlineFences(inputLines, count);
  const output = [];
  let cursor = 0;

  // Append a handler's lines and jump to where it stopped reading.
  const accept = (handled) => {
    output.push(...handled.lines);
    cursor = handled.nextIndex;
  };

  while (cursor < lines.length) {
    const line = lines[cursor];
    const trimmed = line.trimStart();

    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
    if (fenceMatch) {
      accept(handleCodeFence(lines, cursor, fenceMatch[1], count));
      continue;
    }
    if (looksLikeTableRow(trimmed)) {
      accept(handleTableBlock(lines, cursor, count));
      continue;
    }
    if (trimmed === "") {
      accept(handleBlankLines(lines, cursor, count));
      continue;
    }
    if (trimmed.startsWith("<!--")) {
      accept(handleHtmlComment(lines, cursor, count));
      continue;
    }
    if (looksLikeJsonBlockStart(trimmed)) {
      // The JSON handler returns null when the candidate is rejected; the
      // line is then copied through like any other.
      const fenced = handleUnfencedJson(lines, cursor, count);
      if (fenced) {
        accept(fenced);
        continue;
      }
    }

    output.push(line);
    cursor++;
  }
  return output;
}
|
|
197173
|
+
/**
 * Copy a fenced code block through unchanged, appending a closing fence when
 * the block is never closed.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the opening fence line.
 * @param {string} fence - The opening fence run (backticks or tildes).
 * @param {(category: string) => void} count - Called with
 *   "unclosed_code_fence" when a closing fence had to be added.
 * @returns {{lines: string[], nextIndex: number}} The block's lines and the
 *   index of the first line after it.
 */
function handleCodeFence(lines, startIdx, fence, count) {
  const fenceChar = fence[0] === "`" ? "`" : "~";
  // Built once up front rather than once per scanned line. A closer uses the
  // same character, is at least as long, and has only whitespace after it.
  const closingRe = new RegExp(`^${fenceChar}{${fence.length},}\\s*$`);

  let end = startIdx + 1;
  while (end < lines.length && !closingRe.test(lines[end].trimStart())) {
    end++;
  }

  if (end < lines.length) {
    return { lines: lines.slice(startIdx, end + 1), nextIndex: end + 1 };
  }

  // Ran off the end of the document: close the block ourselves.
  count("unclosed_code_fence");
  return { lines: [...lines.slice(startIdx), fence], nextIndex: end };
}
|
|
197191
|
+
/**
 * Repair a run of table-like rows: add missing outer pipes and, when no
 * separator row exists, insert one after the first row.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the first table-like row.
 * @param {(category: string) => void} count - Called with
 *   "table_leading_pipe" per row that gained outer pipes and
 *   "table_missing_separator" when a separator row was inserted.
 * @returns {{lines: string[], nextIndex: number}} Repaired rows and the index
 *   of the first line after the run. Runs shorter than two rows are returned
 *   untouched.
 */
function handleTableBlock(lines, startIdx, count) {
  let end = startIdx;
  while (end < lines.length && looksLikeTableRow(lines[end].trimStart())) {
    end++;
  }
  const rows = lines.slice(startIdx, end);
  if (rows.length < 2) {
    return { lines: rows, nextIndex: end };
  }

  const normalized = rows.map((row) => {
    const trimmed = row.trimStart();
    const needsOuterPipes = !trimmed.startsWith("|") && trimmed.includes("|");
    if (!needsOuterPipes) {
      return row;
    }
    count("table_leading_pipe");
    const closing = trimmed.endsWith("|") ? "" : " |";
    return `| ${trimmed}${closing}`;
  });

  const separatorRe = /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/;
  const hasSeparator = normalized.some((row) => separatorRe.test(row.trim()));
  if (!hasSeparator) {
    // Column count is inferred from the pipes in the first row.
    const colCount = countPipes(normalized[0].trim()) - 1;
    if (colCount >= 2) {
      const separator = `| ${Array(colCount).fill("---").join(" | ")} |`;
      count("table_missing_separator");
      return { lines: [normalized[0], separator, ...normalized.slice(1)], nextIndex: end };
    }
  }
  return { lines: normalized, nextIndex: end };
}
|
|
197222
|
+
/**
 * Collapse a run of more than two blank lines into a single empty line;
 * shorter runs are passed through unchanged.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the first blank line in the run.
 * @param {(category: string) => void} count - Called with
 *   "excessive_blank_lines" when a run was collapsed.
 * @returns {{lines: string[], nextIndex: number}} Replacement lines and the
 *   index of the first non-blank line after the run.
 */
function handleBlankLines(lines, startIdx, count) {
  let end = startIdx;
  while (end < lines.length && lines[end].trim() === "") {
    end += 1;
  }
  const runLength = end - startIdx;
  if (runLength <= 2) {
    return { lines: lines.slice(startIdx, end), nextIndex: end };
  }
  count("excessive_blank_lines");
  return { lines: [""], nextIndex: end };
}
|
|
197234
|
+
/**
 * Drop an HTML comment that starts on `lines[startIdx]`.
 *
 * The comment ends at the first "-->" found on the opening line or any later
 * line. Text following "-->" on that closing line is kept (trimmed) rather
 * than discarded with the comment. If no "-->" exists, only the opening line
 * is emitted unchanged and scanning resumes on the next line.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the line that begins with "<!--".
 * @param {(category: string) => void} count - Called with "html_comment" when
 *   a comment was removed.
 * @returns {{lines: string[], nextIndex: number}} Surviving text (possibly
 *   none) and the index of the first line after the comment.
 */
function handleHtmlComment(lines, startIdx, count) {
  const CLOSER = "-->";
  for (let i = startIdx; i < lines.length; i++) {
    const closeAt = lines[i].indexOf(CLOSER);
    if (closeAt === -1) {
      continue;
    }
    count("html_comment");
    // Preserve real content that shares a line with the end of the comment.
    const tail = lines[i].slice(closeAt + CLOSER.length).trim();
    return { lines: tail ? [tail] : [], nextIndex: i + 1 };
  }
  // Unterminated comment: leave the opening line alone.
  return { lines: [lines[startIdx]], nextIndex: startIdx + 1 };
}
|
|
197250
|
+
/**
 * Tell whether a trimmed line is exactly a lone "{" or "[".
 *
 * @param {string} trimmed - Line content with surrounding whitespace removed.
 * @returns {boolean} True only for "{" or "[".
 */
function looksLikeJsonBlockStart(trimmed) {
  const openers = ["{", "["];
  return openers.includes(trimmed);
}
|
|
197253
|
+
// Bare JSON blocks shorter than this many lines are left alone.
var MIN_JSON_BLOCK_LINES = 5;
/**
 * Wrap a bare multi-line JSON block in a ```json fence.
 *
 * Starting at `startIdx`, brackets outside double-quoted strings are counted
 * (the rest of a line after `//` is ignored) until the nesting depth returns
 * to zero at the end of a line.
 *
 * @param {string[]} lines - Document lines.
 * @param {number} startIdx - Index of the line holding the lone "{" or "[".
 * @param {(category: string) => void} count - Called with
 *   "unfenced_json_block" when a block is fenced.
 * @returns {{lines: string[], nextIndex: number} | null} The fenced block, or
 *   null when the brackets never balance, the block is shorter than
 *   MIN_JSON_BLOCK_LINES, or its last line does not end with the matching
 *   closer.
 */
function handleUnfencedJson(lines, startIdx, count) {
  const firstChar = lines[startIdx].trimStart()[0];
  const expectedCloser = firstChar === "{" ? "}" : "]";
  let depth = 0;
  let inString = false; // carried across lines, as string state is never reset

  for (let row = startIdx; row < lines.length; row++) {
    const text = lines[row];
    let col = 0;
    while (col < text.length) {
      const ch = text[col];
      col++;
      if (inString) {
        if (ch === "\\") {
          col++; // skip the escaped character
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }
      if (ch === '"') {
        inString = true;
        continue;
      }
      if (ch === "/" && text[col] === "/") {
        break; // line comment: ignore the rest of this line
      }
      if (ch === "{" || ch === "[") {
        depth++;
      } else if (ch === "}" || ch === "]") {
        depth--;
      }
    }

    if (depth === 0) {
      const end = row + 1;
      if (end - startIdx < MIN_JSON_BLOCK_LINES) {
        return null;
      }
      if (!text.trimEnd().endsWith(expectedCloser)) {
        return null;
      }
      count("unfenced_json_block");
      return { lines: ["```json", ...lines.slice(startIdx, end), "```"], nextIndex: end };
    }
    if (depth < 0) {
      return null;
    }
  }
  return null;
}
|
|
197307
|
+
/**
 * Moves a code fence that trails other text on a line onto its own line.
 *
 * Lines that already begin (after indentation) with a fence are kept as-is.
 * When a line ends with a fence (three or more backticks or tildes, an
 * optional info string, optional whitespace) preceded by other text, the line
 * is split in two: the preceding text (original indentation kept, trailing
 * whitespace removed) and the fence with its info string.
 *
 * @param {string[]} lines - Input lines.
 * @param {(key: string) => void} count - Called with "inline_code_fence" for
 *   every line that is split.
 * @returns {string[]} A new array of lines.
 */
function splitInlineFences(lines, count) {
  const fenceAtStart = /^(`{3,}|~{3,})/;
  const fenceAtEnd = /(`{3,}|~{3,})(\S*)\s*$/;

  return lines.flatMap((line) => {
    const body = line.trimStart();
    if (fenceAtStart.test(body)) {
      return [line];
    }
    const hit = body.match(fenceAtEnd);
    if (!hit) {
      return [line];
    }
    // The split point is the last occurrence of the matched fence characters.
    const cut = body.lastIndexOf(hit[1]);
    const prefix = body.substring(0, cut);
    if (prefix.trim().length === 0) {
      return [line];
    }
    count("inline_code_fence");
    const indent = line.substring(0, line.length - body.length);
    return [indent + prefix.trimEnd(), body.substring(cut)];
  });
}
|
|
197332
|
+
/**
 * Heuristic for "this trimmed line could be a table row".
 *
 * Lines starting with "#", "```" or "~~~" are never rows; any other line is a
 * row candidate when countPipes reports at least one pipe.
 *
 * @param {string} trimmed - Line content without leading whitespace.
 * @returns {boolean}
 */
function looksLikeTableRow(trimmed) {
  const excludedPrefixes = ["#", "```", "~~~"];
  if (excludedPrefixes.some((prefix) => trimmed.startsWith(prefix))) {
    return false;
  }
  return countPipes(trimmed) > 0;
}
|
|
197338
|
+
/**
 * Counts "|" characters that are not immediately preceded by a backslash.
 *
 * @param {string} text2 - Text to scan.
 * @returns {number} Number of such pipe characters.
 */
function countPipes(text2) {
  let total = 0;
  let pos = text2.indexOf("|");
  while (pos !== -1) {
    if (pos === 0 || text2[pos - 1] !== "\\") {
      total += 1;
    }
    pos = text2.indexOf("|", pos + 1);
  }
  return total;
}
|
|
196474
197347
|
// ../llm/src/utils/mapConcurrent.ts
|
|
196475
197348
|
async function mapConcurrent(items, concurrency, fn) {
|
|
196476
197349
|
const results = [];
|
|
@@ -196499,9 +197372,760 @@ async function mapConcurrent(items, concurrency, fn) {
|
|
|
196499
197372
|
}
|
|
196500
197373
|
// ../api/src/services/docIndexer.ts
|
|
196501
197374
|
init_src();
|
|
197375
|
+
|
|
197376
|
+
// ../api/src/services/docEmbedding.ts
|
|
197377
|
+
var EMBEDDING_BATCH_SIZE = 20;
|
|
197378
|
+
var EMBEDDING_MAX_TOKENS = 480;
|
|
197379
|
+
/**
 * Heuristically decides whether a paragraph is nothing but code.
 *
 * Returns true when the trimmed text starts and ends with a triple-backtick
 * fence. Otherwise, texts with fewer than three non-empty lines are not code;
 * longer texts are code when more than 80% of their non-empty lines start
 * with at least two whitespace characters, or when more than 60% do and more
 * than 15% of all characters are code punctuation.
 *
 * @param {string} text2 - Paragraph text.
 * @returns {boolean}
 */
function isPureCodeBlock(text2) {
  const body = text2.trim();
  if (/^```[\s\S]*```\s*$/.test(body)) {
    return true;
  }

  const nonEmptyLines = body.split("\n").filter((line) => line !== "");
  if (nonEmptyLines.length < 3) {
    return false;
  }

  const indentedCount = nonEmptyLines.reduce(
    (sum, line) => (/^\s{2,}/.test(line) ? sum + 1 : sum),
    0
  );
  const indentedShare = indentedCount / nonEmptyLines.length;
  if (indentedShare > 0.8) {
    return true;
  }

  const symbolCount = (body.match(/[{}();=><|&![\]]/g) || []).length;
  const symbolShare = symbolCount / body.length;
  return symbolShare > 0.15 && indentedShare > 0.6;
}
|
|
197397
|
+
// Upper bounds applied to the skeleton produced by skeletonizeCodeBlock.
var CODE_SKELETON_MAX_LINES = 20;
var CODE_SKELETON_MAX_CHARS = 800;

/**
 * Reduces a block of code to its shallowly-indented "skeleton".
 *
 * The indentation unit is the leading-whitespace width of the first indented
 * line (2 when no line is indented). Blank lines are dropped, lines indented
 * by more than two units are replaced by a single " ..." marker per run, and
 * the result is capped at CODE_SKELETON_MAX_LINES lines and
 * CODE_SKELETON_MAX_CHARS characters, each cap appending a "[...]" marker.
 *
 * @param {string} text2 - Code text.
 * @returns {string} The skeletonized text.
 */
function skeletonizeCodeBlock(text2) {
  const sourceLines = text2.split("\n");

  const indentWidth = (line) => {
    const found = line.match(/^(\s+)\S/);
    return found ? found[1].replace(/\t/g, " ").length : 0;
  };
  const firstIndentedLine = sourceLines.find((line) => indentWidth(line) > 0);
  const indentUnit = firstIndentedLine === undefined ? 2 : indentWidth(firstIndentedLine);
  const depthLimit = indentUnit * 2;

  const skeleton = [];
  let pendingEllipsis = false;
  for (const line of sourceLines) {
    const content = line.trimStart();
    if (content === "") {
      continue;
    }
    const indent = line.replace(/\t/g, " ").length - content.length;
    if (indent > depthLimit) {
      // Too deep: remember that something was dropped, emit the marker later.
      pendingEllipsis = true;
      continue;
    }
    if (pendingEllipsis) {
      skeleton.push(" ...");
      pendingEllipsis = false;
    }
    skeleton.push(line);
  }
  if (pendingEllipsis) {
    skeleton.push(" ...");
  }

  const limited = skeleton.length > CODE_SKELETON_MAX_LINES
    ? [...skeleton.slice(0, CODE_SKELETON_MAX_LINES), "[...]"]
    : skeleton;
  const joined = limited.join("\n");
  return joined.length > CODE_SKELETON_MAX_CHARS
    ? joined.slice(0, CODE_SKELETON_MAX_CHARS) + "\n[...]"
    : joined;
}
|
|
197446
|
+
/**
 * Shortens text to a character budget of four characters per token.
 *
 * Text within the budget is returned unchanged. Otherwise the cut is made at
 * the last space at or before the budget, provided that space lies beyond 80%
 * of the budget; if not, the text is cut exactly at the budget.
 *
 * @param {string} text2 - Text to shorten.
 * @param {number} maxTokens - Token budget.
 * @returns {string}
 */
function truncateForEmbedding(text2, maxTokens) {
  const charBudget = maxTokens * 4;
  if (text2.length <= charBudget) {
    return text2;
  }
  let cut = charBudget;
  const lastSpace = text2.lastIndexOf(" ", charBudget);
  if (lastSpace > charBudget * 0.8) {
    cut = lastSpace;
  }
  return text2.slice(0, cut);
}
|
|
197454
|
+
/**
 * Embeds every non-code paragraph of a digest and stores the vectors on
 * `digest.embeddings`.
 *
 * Paragraphs for which isPureCodeBlock is true are skipped; the rest are
 * truncated with truncateForEmbedding and embedded in batches of
 * EMBEDDING_BATCH_SIZE. If a batch fails, each of its paragraphs is retried
 * individually with `embed`, and paragraphs that still fail are logged and
 * left out.
 *
 * @param {object} digest - Object with `sections[].paragraphs[].text`; its
 *   `embeddings` property is assigned on completion.
 * @param {object} embeddingService - Provides `getDimension()`,
 *   `embedBatch(texts)` and `embed(text)`.
 * @param {(event: {phase: string, progress: number, message: string}) => void} [onProgress]
 *   Optional progress callback.
 * @returns {Promise<number>} Number of embeddings stored, or 0 when no
 *   paragraph qualifies (in which case `digest.embeddings` is not touched).
 */
async function generateEmbeddings(digest, embeddingService, onProgress) {
  const report = (progress, message) => onProgress?.({ phase: "embedding", progress, message });

  const candidates = [];
  let skippedCode = 0;
  digest.sections.forEach((section, sIdx) => {
    section.paragraphs.forEach((paragraph, pIdx) => {
      const text = paragraph.text;
      if (isPureCodeBlock(text)) {
        skippedCode += 1;
        return;
      }
      candidates.push({
        sectionIndex: sIdx,
        paragraphIndex: pIdx,
        text: truncateForEmbedding(text, EMBEDDING_MAX_TOKENS)
      });
    });
  });

  const total = candidates.length;
  if (total === 0) {
    return 0;
  }
  if (skippedCode > 0) {
    report(85, `Skipped ${skippedCode} code-only paragraphs`);
  }
  report(85, `Loading model (${total} paragraphs)`);

  // The first call is timed and reported as model warm-up when it is slow.
  const warmupStart = Date.now();
  await embeddingService.getDimension();
  const warmupMs = Date.now() - warmupStart;
  if (warmupMs > 500) {
    report(86, `Model ready (${(warmupMs / 1000).toFixed(1)}s)`);
  }

  // Progress for the embedding phase runs from 86 to 95.
  const progressFor = (done) => 86 + Math.round(done / total * 9);
  const totalBatches = Math.ceil(total / EMBEDDING_BATCH_SIZE);
  const collected = [];

  for (let offset = 0, batchNo = 1; offset < total; offset += EMBEDDING_BATCH_SIZE, batchNo++) {
    const batch = candidates.slice(offset, offset + EMBEDDING_BATCH_SIZE);
    const texts = batch.map((item) => item.text);
    const batchStart = Date.now();
    try {
      const vectors = await embeddingService.embedBatch(texts);
      batch.forEach((item, j) => {
        collected.push({
          sectionIndex: item.sectionIndex,
          paragraphIndex: item.paragraphIndex,
          vector: vectors[j]
        });
      });
    } catch {
      // Batch failed: fall back to embedding the paragraphs one at a time.
      for (const [k, item] of batch.entries()) {
        try {
          const vector = await embeddingService.embed(item.text);
          collected.push({
            sectionIndex: item.sectionIndex,
            paragraphIndex: item.paragraphIndex,
            vector
          });
        } catch {
          console.warn(`[docIndexer] embedding failed for section ${item.sectionIndex} paragraph ${item.paragraphIndex}`);
        }
        const done = offset + k + 1;
        report(progressFor(done), `Fallback ${done}/${total}`);
      }
      continue;
    }
    const done = Math.min(offset + EMBEDDING_BATCH_SIZE, total);
    const batchMs = Date.now() - batchStart;
    report(progressFor(done), `Batch ${batchNo}/${totalBatches} (${done}/${total}, ${(batchMs / 1000).toFixed(1)}s)`);
  }

  digest.embeddings = collected;
  return collected.length;
}
|
|
197528
|
+
/**
 * Replaces the vector-store entries of one document with the digest's
 * current embeddings. Failures are logged and never propagated.
 *
 * Existing entries whose id starts with `${hashId}:` are deleted first, then
 * one entry per embedding is added with id
 * `${hashId}:${sectionIndex}:${paragraphIndex}`.
 *
 * @param {object} digest - Object whose `embeddings` array holds
 *   `{ sectionIndex, paragraphIndex, vector }` items. A missing or empty
 *   `embeddings` makes this a no-op.
 * @param {object} vectorStore - Provides `deleteByPrefix(prefix)` and
 *   `add(items)`.
 * @param {string} hashId - Id prefix for this document's entries.
 * @param {string} sourceId - Stored in each entry's metadata.
 * @param {string} sourcePath - Stored in each entry's metadata.
 * @returns {Promise<void>}
 */
async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
  const embeddings = digest.embeddings;
  // A digest may carry no `embeddings` at all (generateEmbeddings returns
  // early without assigning it); treat that like an empty list, not a crash.
  if (!embeddings?.length) {
    return;
  }
  try {
    await vectorStore.deleteByPrefix(`${hashId}:`);
    await vectorStore.add(embeddings.map((e) => ({
      id: `${hashId}:${e.sectionIndex}:${e.paragraphIndex}`,
      embedding: e.vector,
      metadata: {
        layer: "digest",
        sourceId,
        hashId,
        sourcePath,
        sectionIndex: e.sectionIndex,
        paragraphIndex: e.paragraphIndex
      }
    })));
  } catch (err2) {
    console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
  }
}
|
|
197549
|
+
|
|
197550
|
+
// ../api/src/services/docTableExtractor.ts
|
|
197551
|
+
init_src();
|
|
197552
|
+
/**
 * Returns the number of columns of the first pipe-table delimiter row found
 * in the text, or 0 when there is none.
 *
 * A delimiter row is a single line that starts with "|" and has at least two
 * cells made only of horizontal whitespace, ":" and "-". The trailing "|" is
 * optional, so the column count is the number of pipes minus one only when
 * the row actually ends with a pipe. Cells may not contain line breaks, which
 * keeps the match from running into a following row of empty cells.
 *
 * @param {string} text2 - Paragraph text, possibly multi-line.
 * @returns {number} Column count (>= 2 for a match) or 0.
 */
function detectTableColumnCount(text2) {
  const sepMatch = text2.match(/^\|(?:[^\S\r\n]|[:-])+(?:\|(?:[^\S\r\n]|[:-])+)+\|?\s*$/m);
  if (!sepMatch) {
    return 0;
  }
  // `\s*$` may have swallowed trailing whitespace; inspect the row itself.
  const separatorRow = sepMatch[0].trimEnd();
  const pipeCount = separatorRow.split("|").length - 1;
  return separatorRow.endsWith("|") ? pipeCount - 1 : pipeCount;
}
|
|
197558
|
+
/**
 * Runs one dedicated LLM pass over the table paragraphs of a chunk and merges
 * the atoms it returns into `result.paragraphs`.
 *
 * Paragraphs whose detectTableColumnCount is at least 2 are sent, each tagged
 * `[P<n>]` by its position among the tables and preceded by its section
 * heading (if any) and by the previous chunk paragraph when that one is not a
 * table. Returned paragraphs are mapped back to `P<chunk paragraph index>`;
 * non-empty atom arrays replace the same-named arrays of an existing result
 * paragraph, or a new result paragraph is appended.
 *
 * @param {object} chunk - Has `paragraphs[]` with `text` and `sectionIndex`.
 * @param {Array<object>} sections - Sections indexed by `sectionIndex`; their
 *   `heading` and `level` are used.
 * @param {object} result - Extraction result whose `paragraphs[]` is mutated.
 * @param {object} llmService - Provides `generateText(prompt, options)`.
 * @returns {Promise<{extracted: number, llmCalls: number, totalTokens: number}>}
 *   Counters for this pass; errors are logged and reported as zero extracted.
 */
async function extractTableAtoms(chunk, sections, result, llmService) {
  const tables = [];
  chunk.paragraphs.forEach((para, chunkParaIndex) => {
    const colCount = detectTableColumnCount(para.text);
    if (colCount < 2) {
      return;
    }
    const section = sections[para.sectionIndex];
    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
    tables.push({ chunkParaIndex, colCount, text: para.text, sectionHeading });
  });
  if (tables.length === 0) {
    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
  }

  const promptParts = [];
  const tagToChunkIndex = new Map();
  const tableByChunkIndex = new Map();
  tables.forEach((table, ti) => {
    const tag = `P${ti}`;
    tagToChunkIndex.set(tag, table.chunkParaIndex);
    tableByChunkIndex.set(table.chunkParaIndex, table);
    if (table.sectionHeading) {
      promptParts.push(table.sectionHeading);
    }
    // Include the paragraph just before the table as context, unless it is a table itself.
    const preceding = table.chunkParaIndex > 0 ? chunk.paragraphs[table.chunkParaIndex - 1] : undefined;
    if (preceding && detectTableColumnCount(preceding.text) === 0) {
      promptParts.push(preceding.text);
    }
    promptParts.push(`[${tag}] ${table.text}`, "");
  });
  const prompt = buildDocTableAnnotationPrompt(promptParts.join("\n\n"));

  try {
    const res = await llmService.generateText(prompt, {
      systemPrompt: DOC_TABLE_ANNOTATION_SYSTEM_PROMPT
    });
    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
    if (!parsed.success) {
      console.warn(`[docIndexer] table extraction: parse failed: ${parsed.error.message.slice(0, 200)}`);
      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
    }

    let extracted = 0;
    for (const tableP of parsed.data.paragraphs) {
      const chunkParaIndex = tagToChunkIndex.get(tableP.tag);
      if (chunkParaIndex === undefined) {
        continue;
      }
      const originalTag = `P${chunkParaIndex}`;
      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
      const tableAtomCount = Object.values(tableP.atoms)
        .reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
      if (tableAtomCount === 0) {
        continue;
      }
      if (!existing) {
        result.paragraphs.push({ ...tableP, tag: originalTag });
      } else {
        for (const [atomType, atoms2] of Object.entries(tableP.atoms)) {
          if (Array.isArray(atoms2) && atoms2.length > 0) {
            existing.atoms[atomType] = atoms2;
          }
        }
      }
      extracted += 1;
      const tp = tableByChunkIndex.get(chunkParaIndex);
      console.log(`[docIndexer] table extraction: ${originalTag} → ${tableAtomCount} atoms (table has ${tp?.colCount ?? "?"} cols)`);
    }
    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
  } catch (err2) {
    console.warn("[docIndexer] table extraction failed (non-blocking):", err2);
    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
  }
}
|
|
197637
|
+
|
|
197638
|
+
// ../api/src/services/docDiagramExtractor.ts
|
|
197639
|
+
init_src();
|
|
197640
|
+
// Matches a line that is only an opening fence tagged with one of DIAGRAM_FENCE_TAGS.
var DIAGRAM_OPEN_RE = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "im");

/**
 * Identifies a paragraph that is a single fenced diagram block.
 *
 * The trimmed text must open with a triple-backtick fence tagged with one of
 * DIAGRAM_FENCE_TAGS (case-insensitive) followed by a line break, and must
 * end with a triple-backtick fence.
 *
 * @param {string} text2 - Paragraph text.
 * @returns {string | null} The lower-cased fence tag, or null when the text
 *   is not such a block.
 */
function detectDiagramFormat(text2) {
  const body = text2.trim();
  const openingFence = new RegExp(`^\`\`\`(${DIAGRAM_FENCE_TAGS.join("|")})\\s*\\n`, "i");
  const opening = openingFence.exec(body);
  if (opening === null || !body.endsWith("```")) {
    return null;
  }
  return opening[1].toLowerCase();
}
|
|
197650
|
+
/**
 * Runs one dedicated LLM pass over the diagram paragraphs of a chunk and
 * merges the atoms it returns into `result.paragraphs`.
 *
 * Paragraphs recognised by detectDiagramFormat are sent, each tagged `[P<n>]`
 * by its position among the diagrams and preceded by its section heading (if
 * any) and by the previous chunk paragraph when that one is not a diagram.
 * Returned paragraphs are mapped back to `P<chunk paragraph index>`;
 * non-empty atom arrays replace the same-named arrays of an existing result
 * paragraph, or a new result paragraph is appended.
 *
 * @param {object} chunk - Has `paragraphs[]` with `text` and `sectionIndex`.
 * @param {Array<object>} sections - Sections indexed by `sectionIndex`; their
 *   `heading` and `level` are used.
 * @param {object} result - Extraction result whose `paragraphs[]` is mutated.
 * @param {object} llmService - Provides `generateText(prompt, options)`.
 * @returns {Promise<{extracted: number, llmCalls: number, totalTokens: number}>}
 *   Counters for this pass; errors are logged and reported as zero extracted.
 */
async function extractDiagramAtoms(chunk, sections, result, llmService) {
  const diagrams = [];
  chunk.paragraphs.forEach((para, chunkParaIndex) => {
    const format = detectDiagramFormat(para.text);
    if (!format) {
      return;
    }
    const section = sections[para.sectionIndex];
    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
    diagrams.push({ chunkParaIndex, format, text: para.text, sectionHeading });
  });
  if (diagrams.length === 0) {
    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
  }

  const promptParts = [];
  const tagToChunkIndex = new Map();
  const diagramByChunkIndex = new Map();
  diagrams.forEach((diagram, di) => {
    const tag = `P${di}`;
    tagToChunkIndex.set(tag, diagram.chunkParaIndex);
    diagramByChunkIndex.set(diagram.chunkParaIndex, diagram);
    if (diagram.sectionHeading) {
      promptParts.push(diagram.sectionHeading);
    }
    // Include the paragraph just before the diagram as context, unless it is a diagram itself.
    const preceding = diagram.chunkParaIndex > 0 ? chunk.paragraphs[diagram.chunkParaIndex - 1] : undefined;
    if (preceding && !detectDiagramFormat(preceding.text)) {
      promptParts.push(preceding.text);
    }
    promptParts.push(`[${tag}] ${diagram.text}`, "");
  });
  const prompt = buildDocDiagramAnnotationPrompt(promptParts.join("\n\n"));

  try {
    const res = await llmService.generateText(prompt, {
      systemPrompt: DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT
    });
    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
    if (!parsed.success) {
      console.warn(`[docIndexer] diagram extraction: parse failed — ${parsed.error.message.slice(0, 200)}`);
      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
    }

    let extracted = 0;
    for (const diagramP of parsed.data.paragraphs) {
      const chunkParaIndex = tagToChunkIndex.get(diagramP.tag);
      if (chunkParaIndex === undefined) {
        continue;
      }
      const originalTag = `P${chunkParaIndex}`;
      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
      const diagramAtomCount = Object.values(diagramP.atoms)
        .reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
      if (diagramAtomCount === 0) {
        continue;
      }
      if (!existing) {
        result.paragraphs.push({ ...diagramP, tag: originalTag });
      } else {
        for (const [atomType, atoms2] of Object.entries(diagramP.atoms)) {
          if (Array.isArray(atoms2) && atoms2.length > 0) {
            existing.atoms[atomType] = atoms2;
          }
        }
      }
      extracted += 1;
      const dp = diagramByChunkIndex.get(chunkParaIndex);
      console.log(`[docIndexer] diagram extraction: ${originalTag} → ${diagramAtomCount} atoms (${dp?.format ?? "unknown"} diagram)`);
    }
    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
  } catch (err2) {
    console.warn("[docIndexer] diagram extraction failed (non-blocking):", err2);
    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
  }
}
|
|
197729
|
+
|
|
197730
|
+
// ../api/src/services/docAtomPostProcess.ts
|
|
197731
|
+
/**
 * Runs the atom post-processing passes over the sections, in order:
 * autoCompleteEntities, normalizeEntityNames, warnCrossRefIssues.
 *
 * @param {Array<object>} sections - Digest sections, passed to each pass.
 * @returns {void}
 */
function postProcessDigestAtoms(sections) {
  const passes = [autoCompleteEntities, normalizeEntityNames, warnCrossRefIssues];
  for (const runPass of passes) {
    runPass(sections);
  }
}
|
|
197736
|
+
/**
 * Flags names that should not be auto-created as entities.
 *
 * A name is noise when, after trimming, it is empty, starts with "$",
 * contains "+" or "=", or starts with a digit.
 *
 * @param {string} name21 - Candidate entity name.
 * @returns {boolean}
 */
function isNoiseEntityName(name21) {
  const candidate = name21.trim();
  return (
    candidate.length === 0 ||
    candidate.startsWith("$") ||
    /[+=]/.test(candidate) ||
    /^\d/.test(candidate)
  );
}
|
|
197748
|
+
/**
 * Adds a placeholder entity for every name that relations or boundaries
 * refer to but no paragraph declares.
 *
 * Declared names are gathered from all paragraphs first. Then, per paragraph,
 * names referenced by `relations` (from/to) and `boundaries`
 * (contains/excludes) that are still undeclared are appended to that
 * paragraph's `entities` as `{ name, kind: "concept", confidence: 0.6 }`,
 * unless isNoiseEntityName rejects them. A summary line is logged when
 * anything was created or skipped.
 *
 * @param {Array<object>} sections - Sections whose `paragraphs[].atoms` are
 *   mutated in place.
 * @returns {void}
 */
function autoCompleteEntities(sections) {
  const known = new Set();
  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      for (const entity of atoms.entities || []) {
        known.add(entity.name);
      }
    }
  }

  let created = 0;
  let noise = 0;
  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      const referenced = new Set();
      for (const relation of atoms.relations || []) {
        referenced.add(relation.from);
        referenced.add(relation.to);
      }
      for (const boundary of atoms.boundaries || []) {
        for (const member of boundary.contains) {
          referenced.add(member);
        }
        for (const member of boundary.excludes || []) {
          referenced.add(member);
        }
      }

      for (const name of referenced) {
        if (known.has(name)) {
          continue;
        }
        if (isNoiseEntityName(name)) {
          noise += 1;
          continue;
        }
        if (!atoms.entities) {
          atoms.entities = [];
        }
        atoms.entities.push({ name, kind: "concept", confidence: 0.6 });
        // Later paragraphs must see this name as declared.
        known.add(name);
        created += 1;
      }
    }
  }

  if (created > 0 || noise > 0) {
    console.log(`[docAtomPostProcess] auto-created ${created} entities, skipped ${noise} noise names`);
  }
}
|
|
197805
|
+
/**
 * Merges entity names that are substrings of a longer declared entity name.
 *
 * Unique declared names are ordered longest first. Each name of at least
 * three characters is mapped to the first longer-or-equal-length name ahead
 * of it that contains it and has not itself been merged away. The mapping is
 * then applied to entities (with per-paragraph de-duplication by name),
 * relation endpoints and boundary member lists. Rewritten entity names and
 * relation endpoints are counted and, if any, the rules are logged.
 *
 * @param {Array<object>} sections - Sections whose `paragraphs[].atoms` are
 *   mutated in place.
 * @returns {void}
 */
function normalizeEntityNames(sections) {
  const declared = [];
  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      if (atoms.entities) {
        for (const entity of atoms.entities) {
          declared.push(entity.name);
        }
      }
    }
  }

  const longestFirst = [...new Set(declared)].sort((a, b) => b.length - a.length);
  const mergeMap = new Map();
  longestFirst.forEach((short, position) => {
    if (short.length < 3 || mergeMap.has(short)) {
      return;
    }
    const target = longestFirst
      .slice(0, position)
      .find((long) => !mergeMap.has(long) && long !== short && long.includes(short));
    if (target !== undefined) {
      mergeMap.set(short, target);
    }
  });
  if (mergeMap.size === 0) {
    return;
  }

  let normalized = 0;
  // Rewrites holder[key] to its canonical name when a merge rule exists.
  const rename = (holder, key) => {
    const canonical = mergeMap.get(holder[key]);
    if (canonical) {
      holder[key] = canonical;
      normalized += 1;
    }
  };
  const canonicalOf = (name) => mergeMap.get(name) ?? name;

  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      const entities = atoms.entities;
      if (entities) {
        entities.forEach((entity) => rename(entity, "name"));
        // Renaming can produce duplicates; keep the first of each name.
        const seen = new Set();
        atoms.entities = entities.filter((entity) => {
          if (seen.has(entity.name)) {
            return false;
          }
          seen.add(entity.name);
          return true;
        });
      }
      if (atoms.relations) {
        for (const relation of atoms.relations) {
          rename(relation, "from");
          rename(relation, "to");
        }
      }
      if (atoms.boundaries) {
        for (const boundary of atoms.boundaries) {
          boundary.contains = boundary.contains.map(canonicalOf);
          if (boundary.excludes) {
            boundary.excludes = boundary.excludes.map(canonicalOf);
          }
        }
      }
    }
  }

  if (normalized > 0) {
    console.log(`[docAtomPostProcess] normalized ${normalized} entity name references (${mergeMap.size} merge rules)`);
    for (const [short, long] of mergeMap) {
      console.log(` "${short}" → "${long}"`);
    }
  }
}
|
|
197888
|
+
/**
 * Logs warnings for atoms that reference undeclared states or behaviors.
 * Nothing is modified and nothing is thrown for a failed check.
 *
 * - Every transition's `from` and `to` must appear among the values of some
 *   declared state; skipped when no state values are declared at all.
 * - Every name in a role's `performs` must be a declared behavior name;
 *   skipped when no behaviors are declared at all.
 *
 * A final summary line is logged when at least one warning was emitted.
 *
 * @param {Array<object>} sections - Sections with `paragraphs[].atoms`.
 * @returns {void}
 */
function warnCrossRefIssues(sections) {
  const declaredStates = new Set();
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const states = para.atoms.states;
      if (states) {
        for (const s of states) {
          for (const v of s.values) {
            declaredStates.add(v);
          }
        }
      }
    }
  }

  // Behavior names do not change while checking, so they are collected once,
  // and only if some paragraph actually declares roles.
  let declaredBehaviors = null;
  const behaviorNames = () => {
    if (declaredBehaviors === null) {
      declaredBehaviors = new Set();
      for (const s of sections) {
        for (const p4 of s.paragraphs) {
          const behaviors = p4.atoms.behaviors;
          if (behaviors) {
            for (const b of behaviors) {
              declaredBehaviors.add(b.name);
            }
          }
        }
      }
    }
    return declaredBehaviors;
  };

  let warnings = 0;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const transitions = para.atoms.transitions;
      if (transitions) {
        for (const t4 of transitions) {
          if (declaredStates.size > 0 && !declaredStates.has(t4.from)) {
            console.warn(`[docAtomPostProcess] transition.from "${t4.from}" not in declared states`);
            warnings++;
          }
          if (declaredStates.size > 0 && !declaredStates.has(t4.to)) {
            console.warn(`[docAtomPostProcess] transition.to "${t4.to}" not in declared states`);
            warnings++;
          }
        }
      }

      const roles = para.atoms.roles;
      if (roles) {
        const allBehaviorNames = behaviorNames();
        for (const role of roles) {
          if (role.performs) {
            for (const p4 of role.performs) {
              if (allBehaviorNames.size > 0 && !allBehaviorNames.has(p4)) {
                console.warn(`[docAtomPostProcess] role.performs "${p4}" not in declared behaviors`);
                warnings++;
              }
            }
          }
        }
      }
    }
  }

  if (warnings > 0) {
    console.warn(`[docAtomPostProcess] ${warnings} cross-reference warnings (non-blocking)`);
  }
}
|
|
197954
|
+
/**
 * Selects entity names that look like extraction noise.
 *
 * A name is a candidate when it is one to four characters from the
 * U+4E00–U+9FFF range, or one to five ASCII letters whose first letter is
 * lower-case, or starts with "Kill", whitespace and a digit.
 *
 * @param {Iterable<string>} entityNames - Names to inspect.
 * @returns {string[]} Candidate names, in input order.
 */
function detectNoiseCandidates(entityNames) {
  const isShortCjk = (name) => /^[\u4e00-\u9fff]{1,4}$/.test(name);
  const isShortLowerInitialWord = (name) =>
    /^[a-zA-Z]{1,5}$/.test(name) && name[0] === name[0].toLowerCase();
  const isKillNumber = (name) => /^Kill\s+\d/.test(name);

  return Array.from(entityNames).filter(
    (name) => isShortCjk(name) || isShortLowerInitialWord(name) || isKillNumber(name)
  );
}
|
|
197972
|
+
/**
 * Summarises the atoms extracted across all sections.
 *
 * Only atom properties that are non-empty arrays are counted.
 *
 * @param {Array<object>} sections - Sections with `paragraphs[].atoms`.
 * @returns {{entityCount: number, relationCount: number,
 *   atomTypeCounts: Object<string, number>, uniqueEntityNames: string[],
 *   paragraphsWithAtoms: number, paragraphsTotal: number}}
 *   `entityCount` and `relationCount` are the totals of the `entities` and
 *   `relations` arrays; `atomTypeCounts` holds the total per atom type;
 *   `uniqueEntityNames` lists distinct entity names in first-seen order.
 */
function collectExtractionStats(sections) {
  const atomTypeCounts = {};
  const uniqueNames = new Set();
  let paragraphsWithAtoms = 0;
  let paragraphsTotal = 0;

  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      paragraphsTotal += 1;
      const populated = Object.entries(atoms).filter(
        ([, list]) => Array.isArray(list) && list.length > 0
      );
      for (const [atomType, list] of populated) {
        atomTypeCounts[atomType] = (atomTypeCounts[atomType] ?? 0) + list.length;
        if (atomType === "entities") {
          for (const entity of list) {
            uniqueNames.add(entity.name);
          }
        }
      }
      if (populated.length > 0) {
        paragraphsWithAtoms += 1;
      }
    }
  }

  return {
    entityCount: atomTypeCounts.entities ?? 0,
    relationCount: atomTypeCounts.relations ?? 0,
    atomTypeCounts,
    uniqueEntityNames: [...uniqueNames],
    paragraphsWithAtoms,
    paragraphsTotal
  };
}
|
|
198004
|
+
/**
 * Lists pairs of entity names that may refer to the same entity.
 *
 * Names are ordered longest first and every name is compared with each name
 * after it. The later name must have at least three characters and differ
 * from the earlier one. A pair is reported when the earlier name contains
 * the later one ("substring match"), when both are equal ignoring case
 * ("case-insensitive match"), or when the containment holds ignoring case
 * and the later name has at least four characters
 * ("case-insensitive substring").
 *
 * @param {Iterable<string>} entityNames - Names to compare.
 * @returns {Array<{short: string, long: string, reason: string}>}
 */
function detectResolutionCandidates(entityNames) {
  const longestFirst = [...entityNames].sort((a, b) => b.length - a.length);

  // Returns the reason two names are related, or null when they are not.
  const classify = (short, long) => {
    if (long.includes(short)) {
      return "substring match";
    }
    const lowerLong = long.toLowerCase();
    const lowerShort = short.toLowerCase();
    if (lowerLong === lowerShort) {
      return "case-insensitive match";
    }
    if (lowerLong.includes(lowerShort) && short.length >= 4) {
      return "case-insensitive substring";
    }
    return null;
  };

  const pairs = [];
  longestFirst.forEach((long, position) => {
    for (const short of longestFirst.slice(position + 1)) {
      if (short.length < 3 || short === long) {
        continue;
      }
      const reason = classify(short, long);
      if (reason !== null) {
        pairs.push({ short, long, reason });
      }
    }
  });
  return pairs;
}
|
|
198030
|
+
/**
 * Applies explicit entity-name merge rules to all sections.
 *
 * Each rule renames `from` to `to` in entity names (with per-paragraph
 * de-duplication by name), relation endpoints and boundary member lists.
 * Rewritten entity names and relation endpoints are counted; when the count
 * is positive the rules are logged.
 *
 * @param {Array<object>} sections - Sections whose `paragraphs[].atoms` are
 *   mutated in place.
 * @param {Array<{from: string, to: string}>} merges - Merge rules; for
 *   repeated `from` values the last rule wins.
 * @returns {number} Number of rewritten entity names and relation endpoints
 *   (0 when there are no rules).
 */
function applyEntityMerges(sections, merges) {
  if (merges.length === 0) {
    return 0;
  }
  const mergeMap = new Map();
  for (const rule of merges) {
    mergeMap.set(rule.from, rule.to);
  }

  let normalized = 0;
  // Rewrites holder[key] to its canonical name when a merge rule exists.
  const rename = (holder, key) => {
    const canonical = mergeMap.get(holder[key]);
    if (canonical) {
      holder[key] = canonical;
      normalized += 1;
    }
  };
  const canonicalOf = (name) => mergeMap.get(name) ?? name;

  for (const { paragraphs } of sections) {
    for (const { atoms } of paragraphs) {
      const entities = atoms.entities;
      if (entities) {
        entities.forEach((entity) => rename(entity, "name"));
        // Renaming can produce duplicates; keep the first of each name.
        const seen = new Set();
        atoms.entities = entities.filter((entity) => {
          if (seen.has(entity.name)) {
            return false;
          }
          seen.add(entity.name);
          return true;
        });
      }
      if (atoms.relations) {
        for (const relation of atoms.relations) {
          rename(relation, "from");
          rename(relation, "to");
        }
      }
      if (atoms.boundaries) {
        for (const boundary of atoms.boundaries) {
          boundary.contains = boundary.contains.map(canonicalOf);
          if (boundary.excludes) {
            boundary.excludes = boundary.excludes.map(canonicalOf);
          }
        }
      }
    }
  }

  if (normalized > 0) {
    console.log(`[docAtomPostProcess] LLM entity resolution: normalized ${normalized} references (${merges.length} merge rules)`);
    for (const rule of merges) {
      console.log(` "${rule.from}" → "${rule.to}"`);
    }
  }
  return normalized;
}
|
|
198089
|
+
/**
 * Removes entities flagged as noise from every paragraph of the digest.
 *
 * For each paragraph this mutates `atoms` in place:
 *   - entities whose `name` is listed are dropped,
 *   - relations with a listed name at either endpoint are dropped, so that no
 *     relation is left pointing at an entity that was just removed,
 *   - listed names are removed from boundary `contains` / `excludes` lists.
 *
 * @param {Array<{paragraphs: Array<{atoms: object}>}>} sections - Digest sections; mutated in place.
 * @param {string[]} names - Entity names to remove.
 * @returns {number} Number of entity instances removed (relations and boundary members are not counted).
 */
function removeNoiseEntities(sections, names) {
  if (!Array.isArray(names) || names.length === 0)
    return 0;

  const removeSet = new Set(names);
  const isKept = (name) => !removeSet.has(name);
  let removed = 0;

  for (const section of sections) {
    for (const para of section.paragraphs) {
      const atoms = para.atoms;

      if (atoms.entities) {
        const kept = atoms.entities.filter((entity) => isKept(entity.name));
        removed += atoms.entities.length - kept.length;
        atoms.entities = kept;
      }

      if (atoms.relations) {
        // Both endpoints must survive; a relation with one removed endpoint would dangle.
        atoms.relations = atoms.relations.filter((relation) => isKept(relation.from) && isKept(relation.to));
      }

      if (atoms.boundaries) {
        for (const boundary of atoms.boundaries) {
          boundary.contains = boundary.contains.filter(isKept);
          if (boundary.excludes)
            boundary.excludes = boundary.excludes.filter(isKept);
        }
      }
    }
  }

  if (removed > 0) {
    console.log(`[docAtomPostProcess] removed ${removed} noise entity instances (${names.length} names)`);
    for (const name of names) {
      console.log(`  ✕ "${name}"`);
    }
  }
  return removed;
}
|
|
198124
|
+
|
|
198125
|
+
// ../api/src/services/docIndexer.ts
|
|
196502
198126
|
var CHUNK_CONCURRENCY = 2;
|
|
196503
198127
|
var GLEANING_MAX_ROUNDS = 2;
|
|
196504
|
-
var
|
|
198128
|
+
// Minimum paragraph text length at which injectParagraphTags, for a paragraph that
// isPureCodeBlock accepts, emits skeletonizeCodeBlock(text) instead of the full text.
var CODE_BLOCK_MIN_LENGTH = 500;
|
|
196505
198129
|
function injectParagraphTags(chunk, sections) {
|
|
196506
198130
|
const parts = [];
|
|
196507
198131
|
if (chunk.breadcrumb.length > 0) {
|
|
@@ -196519,7 +198143,11 @@ function injectParagraphTags(chunk, sections) {
|
|
|
196519
198143
|
parts.push(`${"#".repeat(section.level)} ${section.heading}`);
|
|
196520
198144
|
}
|
|
196521
198145
|
}
|
|
196522
|
-
|
|
198146
|
+
if (p4.text.length >= CODE_BLOCK_MIN_LENGTH && isPureCodeBlock(p4.text)) {
|
|
198147
|
+
parts.push(`[P${i}] ${skeletonizeCodeBlock(p4.text)}`);
|
|
198148
|
+
} else {
|
|
198149
|
+
parts.push(`[P${i}] ${p4.text}`);
|
|
198150
|
+
}
|
|
196523
198151
|
}
|
|
196524
198152
|
return parts.join(`
|
|
196525
198153
|
|
|
@@ -196584,12 +198212,14 @@ Continue the JSON output from the exact point of truncation. Output ONLY the rem
|
|
|
196584
198212
|
});
|
|
196585
198213
|
const combined = trimmed + result.text.trim();
|
|
196586
198214
|
JSON.parse(jsonrepair(combined));
|
|
198215
|
+
console.log(`[docIndexer] continuation: merged T1 (${trimmed.length} chars) + continuation (${result.text.trim().length} chars) = ${combined.length} chars`);
|
|
196587
198216
|
return {
|
|
196588
198217
|
text: combined,
|
|
196589
198218
|
extraCalls: 1,
|
|
196590
198219
|
extraTokens: result.usage.totalTokens
|
|
196591
198220
|
};
|
|
196592
|
-
} catch {
|
|
198221
|
+
} catch (contErr) {
|
|
198222
|
+
console.warn(`[docIndexer] continuation: merge failed, returning original (${trimmed.length} chars). ` + `Error: ${contErr instanceof Error ? contErr.message : String(contErr)}`);
|
|
196593
198223
|
return { text: text2, extraCalls: 1, extraTokens: 0 };
|
|
196594
198224
|
}
|
|
196595
198225
|
}
|
|
@@ -196630,9 +198260,16 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
|
|
|
196630
198260
|
llmCalls += continued.extraCalls;
|
|
196631
198261
|
totalTokens += continued.extraTokens;
|
|
196632
198262
|
onStep?.("T1 done", llmCalls, totalTokens);
|
|
196633
|
-
|
|
198263
|
+
let parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
|
|
196634
198264
|
if (!parseResult.success) {
|
|
196635
|
-
|
|
198265
|
+
const preview = continued.text.slice(0, 500).replace(/\n/g, "\\n");
|
|
198266
|
+
console.warn(`[docIndexer] chunk ${chunkIndex} T1 strict parse failed, attempting lenient. ` + `Error: ${parseResult.error.message.slice(0, 200)}. ` + `LLM output preview: ${preview}`);
|
|
198267
|
+
const lenient = tryLenientParse(continued.text, chunkIndex);
|
|
198268
|
+
if (lenient) {
|
|
198269
|
+
parseResult = { success: true, data: lenient };
|
|
198270
|
+
} else {
|
|
198271
|
+
throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
|
|
198272
|
+
}
|
|
196636
198273
|
}
|
|
196637
198274
|
try {
|
|
196638
198275
|
const rawJson = JSON.parse(jsonrepair(continued.text));
|
|
@@ -196689,8 +198326,19 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
|
|
|
196689
198326
|
chunkText,
|
|
196690
198327
|
previousResult: parseResult.data
|
|
196691
198328
|
});
|
|
196692
|
-
const
|
|
196693
|
-
|
|
198329
|
+
const tableResult = await extractTableAtoms(chunk, sections, cumulativeResult, llmService);
|
|
198330
|
+
if (tableResult.extracted > 0) {
|
|
198331
|
+
onStep?.(`table extraction (${tableResult.extracted} tables)`, llmCalls + tableResult.llmCalls, totalTokens + tableResult.totalTokens);
|
|
198332
|
+
}
|
|
198333
|
+
llmCalls += tableResult.llmCalls;
|
|
198334
|
+
totalTokens += tableResult.totalTokens;
|
|
198335
|
+
const diagramResult = await extractDiagramAtoms(chunk, sections, cumulativeResult, llmService);
|
|
198336
|
+
if (diagramResult.extracted > 0) {
|
|
198337
|
+
onStep?.(`diagram extraction (${diagramResult.extracted} diagrams)`, llmCalls + diagramResult.llmCalls, totalTokens + diagramResult.totalTokens);
|
|
198338
|
+
}
|
|
198339
|
+
llmCalls += diagramResult.llmCalls;
|
|
198340
|
+
totalTokens += diagramResult.totalTokens;
|
|
198341
|
+
return { result: cumulativeResult, llmCalls, totalTokens };
|
|
196694
198342
|
}
|
|
196695
198343
|
function mapChunkResultToSections(chunk, chunkResult, sections) {
|
|
196696
198344
|
for (const p4 of chunkResult.paragraphs) {
|
|
@@ -196735,6 +198383,123 @@ function mapChunkResultToSections(chunk, chunkResult, sections) {
|
|
|
196735
198383
|
}
|
|
196736
198384
|
}
|
|
196737
198385
|
}
|
|
198386
|
+
// Property names that are accepted as atom collections when salvaging LLM output.
var ATOM_TYPE_KEYS = new Set([
  "entities",
  "relations",
  "behaviors",
  "attributes",
  "states",
  "rules",
  "transitions",
  "events",
  "decisions",
  "metrics",
  "roles",
  "constraints",
  "comparisons",
  "boundaries"
]);

/**
 * Reports whether a value looks like an atoms record, i.e. it has at least one
 * own key from ATOM_TYPE_KEYS whose value is an array.
 *
 * @param {unknown} obj - Candidate value.
 * @returns {boolean} `true` when such a key exists; `false` otherwise, including
 *   for `null`, `undefined` and primitives (which previously could throw).
 */
function looksLikeAtoms(obj) {
  if (obj === null || typeof obj !== "object")
    return false;
  return Object.keys(obj).some((key) => ATOM_TYPE_KEYS.has(key) && Array.isArray(obj[key]));
}
|
|
198405
|
+
// For each atom type, the fields an atom must carry (non-null and non-empty)
// to be kept by tryLenientParse.
var ATOM_REQUIRED_FIELDS = {
  entities: ["name"],
  relations: ["from", "to", "type"],
  behaviors: ["name"],
  attributes: ["name"],
  states: ["name"],
  rules: ["description"],
  transitions: ["from", "to"],
  events: ["name"],
  decisions: ["description"],
  metrics: ["name"],
  roles: ["name"],
  constraints: ["description"],
  comparisons: ["description"],
  boundaries: ["name"]
};
// Matches a paragraph tag such as "P0" or "P12".
var PARAGRAPH_TAG_RE2 = /^P\d+$/;

/**
 * Best-effort recovery of a chunk extraction result from LLM output that failed
 * strict parsing.
 *
 * After `jsonrepair` + `JSON.parse`, the following shapes are normalised to
 * `{ paragraphs: [...] }`:
 *   - a bare array (treated as the paragraph list),
 *   - an object whose keys are all paragraph tags (`{ P0: atoms, P1: atoms }`),
 *   - a bare atoms object (wrapped as the single paragraph `P0`).
 * Each paragraph then gets a valid tag (falling back to `P<index>`), unknown
 * atom types are discarded, and atoms missing a required field are dropped.
 * The salvaged structure must still pass `docChunkResultSchema`.
 *
 * @param {string} rawText - Raw LLM output.
 * @param {number} chunkIndex - Chunk index, used only in log messages.
 * @returns {object|null} The schema-validated result, or `null` when nothing could be salvaged.
 */
function tryLenientParse(rawText, chunkIndex) {
  try {
    let raw5 = JSON.parse(jsonrepair(rawText));

    if (Array.isArray(raw5)) {
      raw5 = { paragraphs: raw5 };
    }

    if (raw5 && typeof raw5 === "object" && !Array.isArray(raw5) && !raw5.paragraphs) {
      const keys = Object.keys(raw5);
      if (keys.length > 0 && keys.every((k) => PARAGRAPH_TAG_RE2.test(k))) {
        const byTag = raw5;
        const tagNumber = (tag) => Number.parseInt(tag.slice(1), 10);
        raw5 = {
          paragraphs: keys
            .sort((a, b) => tagNumber(a) - tagNumber(b))
            .map((tag) => ({ tag, atoms: byTag[tag] }))
        };
      }
    }

    if (!Array.isArray(raw5?.paragraphs) && raw5 && typeof raw5 === "object" && looksLikeAtoms(raw5)) {
      raw5 = { paragraphs: [{ tag: "P0", atoms: raw5 }] };
    }

    if (!raw5 || !Array.isArray(raw5.paragraphs))
      return null;

    const salvaged = { paragraphs: [] };
    let droppedAtoms = 0;
    let fixedTags = 0;

    for (let idx = 0; idx < raw5.paragraphs.length; idx++) {
      const rawPara = raw5.paragraphs[idx];
      if (!rawPara || typeof rawPara !== "object")
        continue;

      let tag2 = rawPara.tag;
      if (!tag2 || typeof tag2 !== "string" || !PARAGRAPH_TAG_RE2.test(tag2)) {
        // Missing or malformed tag: fall back to the paragraph's position.
        tag2 = `P${idx}`;
        fixedTags++;
      }

      // Atoms are either nested under `atoms` or spread directly on the paragraph.
      let atomsObj;
      if (rawPara.atoms && typeof rawPara.atoms === "object") {
        atomsObj = rawPara.atoms;
      } else if (looksLikeAtoms(rawPara)) {
        atomsObj = rawPara;
      } else {
        continue;
      }

      const cleanAtoms = {};
      for (const [atomType, atoms2] of Object.entries(atomsObj)) {
        if (!ATOM_TYPE_KEYS.has(atomType) || !Array.isArray(atoms2))
          continue;
        const requiredFields = ATOM_REQUIRED_FIELDS[atomType] ?? [];
        const kept = [];
        for (const atom of atoms2) {
          const usable = atom && typeof atom === "object"
            && requiredFields.every((f) => atom[f] != null && atom[f] !== "");
          if (usable) {
            kept.push(atom);
          } else {
            droppedAtoms++;
          }
        }
        if (kept.length > 0)
          cleanAtoms[atomType] = kept;
      }
      salvaged.paragraphs.push({ tag: tag2, atoms: cleanAtoms });
    }

    if (salvaged.paragraphs.length === 0)
      return null;

    const result = docChunkResultSchema.safeParse(salvaged);
    if (!result.success)
      return null;

    const fixes = [];
    if (fixedTags > 0)
      fixes.push(`${fixedTags} tags auto-assigned`);
    if (droppedAtoms > 0)
      fixes.push(`${droppedAtoms} invalid atoms dropped`);
    if (fixes.length > 0) {
      console.warn(`[docIndexer] chunk ${chunkIndex}: lenient parse salvaged — ${fixes.join(", ")}`);
    }
    return result.data;
  } catch (lenientErr) {
    // Salvaging is best-effort: report why it failed and let the caller decide.
    console.warn(`[docIndexer] chunk ${chunkIndex}: lenient parse failed. ` + `Error: ${lenientErr instanceof Error ? lenientErr.message : String(lenientErr)}`);
    return null;
  }
}
|
|
196738
198503
|
function ensureAtomConfidence(atoms2) {
|
|
196739
198504
|
const DEFAULT_DOC_CONFIDENCE = 0.7;
|
|
196740
198505
|
for (const atomList of Object.values(atoms2)) {
|
|
@@ -196758,90 +198523,62 @@ function countAtoms(sections) {
|
|
|
196758
198523
|
}
|
|
196759
198524
|
return counts;
|
|
196760
198525
|
}
|
|
196761
|
-
|
|
196762
|
-
const
|
|
196763
|
-
|
|
196764
|
-
|
|
196765
|
-
|
|
196766
|
-
|
|
196767
|
-
|
|
196768
|
-
|
|
196769
|
-
|
|
196770
|
-
|
|
196771
|
-
}
|
|
196772
|
-
}
|
|
196773
|
-
if (paragraphs.length === 0)
|
|
196774
|
-
return 0;
|
|
196775
|
-
const embeddings = [];
|
|
196776
|
-
const totalParagraphs = paragraphs.length;
|
|
196777
|
-
onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
|
|
196778
|
-
const warmupStart = Date.now();
|
|
196779
|
-
await embeddingService.getDimension();
|
|
196780
|
-
const warmupMs = Date.now() - warmupStart;
|
|
196781
|
-
if (warmupMs > 500) {
|
|
196782
|
-
onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
|
|
198526
|
+
/**
 * Builds the one-line extraction summary used in log output.
 *
 * Layout: "<entities> entities, <relations> relations, <with>/<total> paragraphs
 * with atoms | <type>:<count> ...", with atom types ordered by descending count.
 *
 * @param {{atomTypeCounts: Object<string, number>, uniqueEntityNames: string[],
 *   relationCount: number, paragraphsWithAtoms: number, paragraphsTotal: number}} stats
 * @returns {string} The formatted summary.
 */
function formatExtractionStats(stats) {
  const ranked = Object.entries(stats.atomTypeCounts);
  ranked.sort((left, right) => right[1] - left[1]);

  const labels = [];
  for (const [type, count] of ranked) {
    labels.push(`${type}:${count}`);
  }

  const headline = [
    `${stats.uniqueEntityNames.length} entities`,
    `${stats.relationCount} relations`,
    `${stats.paragraphsWithAtoms}/${stats.paragraphsTotal} paragraphs with atoms`
  ].join(", ");

  return `${headline} | ${labels.join(" ")}`;
}
|
|
198530
|
+
/**
 * Asks the LLM to resolve duplicate and noise entity names, then applies the
 * answer to the digest sections in place.
 *
 * The step is non-blocking: any failure is logged and reported through
 * `onProgress`, and the function still resolves. LLM usage is reported for
 * every outcome that happens after the LLM call returned, including failures
 * while interpreting or applying the response.
 *
 * @param {Array<object>} sections - Digest sections; mutated by the merge/remove steps.
 * @param {string[]} entityNames - Unique entity names extracted from the document.
 * @param {{generateText: Function}} llmService - LLM client.
 * @param {Function} [onProgress] - Optional progress callback.
 * @returns {Promise<{llmCalls: number, totalTokens: number}>} LLM usage incurred by this step.
 */
async function runEntityResolution(sections, entityNames, llmService, onProgress) {
  const candidates = detectResolutionCandidates(entityNames);
  const noiseCandidates = detectNoiseCandidates(entityNames);

  if (candidates.length === 0 && noiseCandidates.length === 0) {
    console.log("[docIndexer] entity resolution: no duplicates or noise candidates, skipping");
    onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.no_resolution" } });
    return { llmCalls: 0, totalTokens: 0 };
  }

  console.log(`[docIndexer] entity resolution: ${candidates.length} duplicate pairs, ${noiseCandidates.length} noise candidates`);
  onProgress?.({
    phase: "post-processing",
    progress: 83,
    message: { key: "index.doc.msg.resolving", params: { duplicates: candidates.length, noise: noiseCandidates.length } }
  });

  // Filled in as soon as the LLM call returns, so later failures still report it.
  let llmCalls = 0;
  let totalTokens = 0;

  try {
    const prompt = buildEntityResolutionPrompt({
      allNames: entityNames,
      candidates,
      ...noiseCandidates.length > 0 ? { noiseCandidates } : {}
    });
    const result = await llmService.generateText(prompt, {
      systemPrompt: ENTITY_RESOLUTION_SYSTEM_PROMPT
    });
    llmCalls = 1;
    totalTokens = result.usage.totalTokens;

    // Only a plain JSON object is a usable answer; anything else counts as a parse failure.
    let resolution = null;
    try {
      const parsed = JSON.parse(jsonrepair(result.text));
      if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed))
        resolution = parsed;
    } catch {
      // Unparseable output is reported just below.
    }
    if (resolution === null) {
      console.warn("[docIndexer] entity resolution: failed to parse LLM response, skipping");
      onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_parse_failed" } });
      return { llmCalls, totalTokens };
    }

    // The LLM may omit a list or return the wrong type; treat that as "nothing to do".
    const merges = Array.isArray(resolution.merges) ? resolution.merges : [];
    const remove = Array.isArray(resolution.remove) ? resolution.remove : [];
    const ambiguous = Array.isArray(resolution.ambiguous) ? resolution.ambiguous : [];

    const mergeCount = applyEntityMerges(sections, merges);
    const removeCount = removeNoiseEntities(sections, remove);

    onProgress?.({
      phase: "post-processing",
      progress: 84,
      message: {
        key: "index.doc.msg.resolution_result",
        params: {
          merges: merges.length,
          mergeRefs: mergeCount,
          removed: remove.length,
          removeRefs: removeCount,
          ambiguous: ambiguous.length
        }
      }
    });
    return { llmCalls, totalTokens };
  } catch (err2) {
    console.warn("[docIndexer] entity resolution LLM call failed (non-blocking):", err2);
    onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_failed" } });
    return { llmCalls, totalTokens };
  }
}
|
|
196847
198584
|
async function indexDocument(input) {
|
|
@@ -196854,16 +198591,22 @@ async function indexDocument(input) {
|
|
|
196854
198591
|
digestStore: digestStore2,
|
|
196855
198592
|
onProgress
|
|
196856
198593
|
} = input;
|
|
198594
|
+
const { content: normalizedContent, stats: normalizeStats } = normalizeMarkdown(content);
|
|
198595
|
+
const repairCount = Object.values(normalizeStats.repairs).reduce((a, b) => a + b, 0);
|
|
198596
|
+
if (repairCount > 0) {
|
|
198597
|
+
const repairSummary = Object.entries(normalizeStats.repairs).map(([k, v]) => `${k}:${v}`).join(" ");
|
|
198598
|
+
console.log(`[docIndexer] markdown normalized: ${repairCount} repairs (${repairSummary})`);
|
|
198599
|
+
}
|
|
196857
198600
|
onProgress?.({ phase: "chunking", progress: 3 });
|
|
196858
|
-
const chunks = chunkMarkdown(
|
|
196859
|
-
const parsedSections = parseSections(
|
|
198601
|
+
const chunks = chunkMarkdown(normalizedContent);
|
|
198602
|
+
const parsedSections = parseSections(normalizedContent);
|
|
196860
198603
|
if (chunks.length === 0) {
|
|
196861
198604
|
throw new Error("Document produced no chunks — content may be empty");
|
|
196862
198605
|
}
|
|
196863
|
-
onProgress?.({ phase: "chunking", progress: 8, message:
|
|
198606
|
+
onProgress?.({ phase: "chunking", progress: 8, message: { key: "index.doc.msg.chunking_result", params: { chunks: chunks.length, sections: parsedSections.length } } });
|
|
196864
198607
|
const totalChunks = chunks.length;
|
|
196865
|
-
const
|
|
196866
|
-
onProgress?.({ phase: "annotating", progress: 10, message:
|
|
198608
|
+
const annotateStartMsg = input.llmModel ? { key: "index.doc.msg.annotating_start_model", params: { n: totalChunks, model: input.llmModel } } : { key: "index.doc.msg.annotating_start", params: { n: totalChunks } };
|
|
198609
|
+
onProgress?.({ phase: "annotating", progress: 10, message: annotateStartMsg });
|
|
196867
198610
|
let completedChunks = 0;
|
|
196868
198611
|
let totalLlmCalls = 0;
|
|
196869
198612
|
let totalTokens = 0;
|
|
@@ -196873,7 +198616,7 @@ async function indexDocument(input) {
|
|
|
196873
198616
|
onProgress?.({
|
|
196874
198617
|
phase: "annotating",
|
|
196875
198618
|
progress: baseProgress,
|
|
196876
|
-
message:
|
|
198619
|
+
message: { key: "index.doc.msg.annotating_chunk", params: { current: completedChunks + 1, total: totalChunks, step, calls, tokens } }
|
|
196877
198620
|
});
|
|
196878
198621
|
});
|
|
196879
198622
|
completedChunks++;
|
|
@@ -196883,7 +198626,7 @@ async function indexDocument(input) {
|
|
|
196883
198626
|
onProgress?.({
|
|
196884
198627
|
phase: "annotating",
|
|
196885
198628
|
progress,
|
|
196886
|
-
message:
|
|
198629
|
+
message: { key: "index.doc.msg.annotating_chunk_done", params: { current: completedChunks, total: totalChunks, calls: totalLlmCalls, tokens: totalTokens } }
|
|
196887
198630
|
});
|
|
196888
198631
|
return result;
|
|
196889
198632
|
});
|
|
@@ -196894,17 +198637,25 @@ async function indexDocument(input) {
|
|
|
196894
198637
|
const sectionsMap = new Map;
|
|
196895
198638
|
for (let i = 0;i < parsedSections.length; i++) {
|
|
196896
198639
|
const s = parsedSections[i];
|
|
196897
|
-
|
|
196898
|
-
sectionsMap.set(sectionKey, {
|
|
198640
|
+
sectionsMap.set(`${i}`, {
|
|
196899
198641
|
heading: s.heading,
|
|
196900
198642
|
level: s.level,
|
|
196901
198643
|
paragraphs: new Map
|
|
196902
198644
|
});
|
|
196903
|
-
|
|
196904
|
-
|
|
196905
|
-
|
|
196906
|
-
|
|
196907
|
-
|
|
198645
|
+
}
|
|
198646
|
+
for (const chunk of chunks) {
|
|
198647
|
+
for (const cp of chunk.paragraphs) {
|
|
198648
|
+
const sectionKey = `${cp.sectionIndex}`;
|
|
198649
|
+
if (!sectionsMap.has(sectionKey)) {
|
|
198650
|
+
sectionsMap.set(sectionKey, { heading: "", level: 0, paragraphs: new Map });
|
|
198651
|
+
}
|
|
198652
|
+
const paragraphKey = `${cp.sectionIndex}:${cp.paragraphIndex}`;
|
|
198653
|
+
if (!sectionsMap.get(sectionKey).paragraphs.has(paragraphKey)) {
|
|
198654
|
+
sectionsMap.get(sectionKey).paragraphs.set(paragraphKey, {
|
|
198655
|
+
text: cp.text,
|
|
198656
|
+
atoms: {}
|
|
198657
|
+
});
|
|
198658
|
+
}
|
|
196908
198659
|
}
|
|
196909
198660
|
}
|
|
196910
198661
|
for (const success2 of chunkProcessResult.successes) {
|
|
@@ -196934,6 +198685,25 @@ async function indexDocument(input) {
|
|
|
196934
198685
|
ensureAtomConfidence(para.atoms);
|
|
196935
198686
|
}
|
|
196936
198687
|
}
|
|
198688
|
+
onProgress?.({ phase: "post-processing", progress: 81, message: { key: "index.doc.msg.post_process_start" } });
|
|
198689
|
+
postProcessDigestAtoms(digestSections);
|
|
198690
|
+
const preStats = collectExtractionStats(digestSections);
|
|
198691
|
+
const statsMsg = formatExtractionStats(preStats);
|
|
198692
|
+
console.log(`[docIndexer] extraction stats: ${statsMsg}`);
|
|
198693
|
+
onProgress?.({ phase: "post-processing", progress: 82, message: {
|
|
198694
|
+
key: "index.doc.msg.extraction_stats",
|
|
198695
|
+
params: {
|
|
198696
|
+
entities: preStats.uniqueEntityNames.length,
|
|
198697
|
+
relations: preStats.relationCount,
|
|
198698
|
+
withAtoms: preStats.paragraphsWithAtoms,
|
|
198699
|
+
totalParas: preStats.paragraphsTotal
|
|
198700
|
+
}
|
|
198701
|
+
} });
|
|
198702
|
+
if ((input.enableEntityResolution ?? true) && preStats.uniqueEntityNames.length > 1) {
|
|
198703
|
+
const resolutionResult = await runEntityResolution(digestSections, preStats.uniqueEntityNames, llmService, onProgress);
|
|
198704
|
+
totalLlmCalls += resolutionResult.llmCalls;
|
|
198705
|
+
totalTokens += resolutionResult.totalTokens;
|
|
198706
|
+
}
|
|
196937
198707
|
const atomCounts = countAtoms(sectionsMap);
|
|
196938
198708
|
const paragraphCount = digestSections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
196939
198709
|
if (paragraphCount === 0) {
|
|
@@ -196955,7 +198725,7 @@ async function indexDocument(input) {
|
|
|
196955
198725
|
processedAt: new Date().toISOString()
|
|
196956
198726
|
}
|
|
196957
198727
|
};
|
|
196958
|
-
const embedMsg = input.embeddingModel ?
|
|
198728
|
+
const embedMsg = input.embeddingModel ? { key: "index.doc.msg.embedding_model", params: { model: input.embeddingModel } } : undefined;
|
|
196959
198729
|
onProgress?.({ phase: "embedding", progress: 85, ...embedMsg ? { message: embedMsg } : {} });
|
|
196960
198730
|
let embeddingCount = 0;
|
|
196961
198731
|
if (input.embeddingService) {
|
|
@@ -197250,45 +199020,55 @@ async function runDocIndexPipeline(opts) {
|
|
|
197250
199020
|
const llmModelId = serverConfig2.llm[llmProvider]?.default_model ?? llmProvider;
|
|
197251
199021
|
const embProvider = serverConfig2.embedding?.provider;
|
|
197252
199022
|
const embModelId = embProvider ? serverConfig2.embedding[embProvider]?.model_id ?? embProvider : undefined;
|
|
199023
|
+
const fileTimeoutMs = serverConfig2.indexing?.file_timeout_ms ?? 15 * 60 * 1000;
|
|
199024
|
+
const abortSignal = indexTaskManager.getAbortSignal?.(sourceId) ?? null;
|
|
197253
199025
|
for (let fileIdx = 0;fileIdx < filesToIndex.length; fileIdx++) {
|
|
199026
|
+
if (abortSignal?.aborted) {
|
|
199027
|
+
const reason = typeof abortSignal.reason === "string" ? abortSignal.reason : "Task aborted";
|
|
199028
|
+
console.warn(`[runDocIndexPipeline] aborted before file ${fileIdx + 1}/${filesToIndex.length}: ${reason}`);
|
|
199029
|
+
break;
|
|
199030
|
+
}
|
|
197254
199031
|
const file2 = filesToIndex[fileIdx];
|
|
197255
|
-
const fileLabel = `[${fileIdx + 1}/${filesToIndex.length}] ${file2.sourcePath}`;
|
|
197256
199032
|
if (indexTaskManager.hasTask(sourceId)) {
|
|
197257
199033
|
indexTaskManager.updateProgress(sourceId, {
|
|
197258
|
-
stage: "
|
|
199034
|
+
stage: "chunking",
|
|
197259
199035
|
percent: 0,
|
|
197260
|
-
message:
|
|
199036
|
+
message: { key: "index.doc.msg.file_start", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
|
|
197261
199037
|
});
|
|
197262
199038
|
}
|
|
197263
199039
|
try {
|
|
197264
|
-
|
|
197265
|
-
|
|
197266
|
-
|
|
197267
|
-
|
|
197268
|
-
|
|
197269
|
-
|
|
197270
|
-
|
|
197271
|
-
|
|
197272
|
-
|
|
197273
|
-
|
|
197274
|
-
|
|
197275
|
-
|
|
197276
|
-
|
|
197277
|
-
|
|
197278
|
-
|
|
197279
|
-
|
|
197280
|
-
|
|
197281
|
-
|
|
197282
|
-
|
|
199040
|
+
const fileTimeout = new Promise((_, reject) => setTimeout(() => reject(new Error(`File timeout after ${Math.round(fileTimeoutMs / 60000)}min: ${file2.sourcePath}`)), fileTimeoutMs));
|
|
199041
|
+
await Promise.race([
|
|
199042
|
+
indexDocument({
|
|
199043
|
+
sourceId,
|
|
199044
|
+
hashId: file2.hashId,
|
|
199045
|
+
sourcePath: file2.sourcePath,
|
|
199046
|
+
content: file2.content,
|
|
199047
|
+
contentType: "markdown",
|
|
199048
|
+
llmService,
|
|
199049
|
+
embeddingService,
|
|
199050
|
+
vectorStore,
|
|
199051
|
+
digestStore: digestStore2,
|
|
199052
|
+
llmModel: `${llmProvider}/${llmModelId}`,
|
|
199053
|
+
...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
|
|
199054
|
+
onProgress: (p4) => {
|
|
199055
|
+
if (indexTaskManager.hasTask(sourceId)) {
|
|
199056
|
+
indexTaskManager.updateProgress(sourceId, {
|
|
199057
|
+
stage: p4.phase,
|
|
199058
|
+
percent: p4.progress,
|
|
199059
|
+
...p4.message != null ? { message: p4.message } : {}
|
|
199060
|
+
});
|
|
199061
|
+
}
|
|
197283
199062
|
}
|
|
197284
|
-
}
|
|
197285
|
-
|
|
199063
|
+
}),
|
|
199064
|
+
fileTimeout
|
|
199065
|
+
]);
|
|
197286
199066
|
stored.push({ hash_id: file2.hashId, status: "created" });
|
|
197287
199067
|
if (indexTaskManager.hasTask(sourceId)) {
|
|
197288
199068
|
indexTaskManager.updateProgress(sourceId, {
|
|
197289
199069
|
stage: "storing",
|
|
197290
199070
|
percent: 100,
|
|
197291
|
-
message:
|
|
199071
|
+
message: { key: "index.doc.msg.file_done", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
|
|
197292
199072
|
});
|
|
197293
199073
|
}
|
|
197294
199074
|
} catch (err2) {
|
|
@@ -197299,11 +199079,15 @@ async function runDocIndexPipeline(opts) {
|
|
|
197299
199079
|
indexTaskManager.updateProgress(sourceId, {
|
|
197300
199080
|
stage: "annotating",
|
|
197301
199081
|
percent: 0,
|
|
197302
|
-
message:
|
|
199082
|
+
message: { key: "index.doc.msg.file_error", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath, error: msg } }
|
|
197303
199083
|
});
|
|
197304
199084
|
}
|
|
197305
199085
|
}
|
|
197306
199086
|
}
|
|
199087
|
+
if (abortSignal?.aborted) {
|
|
199088
|
+
console.warn(`[runDocIndexPipeline] pipeline aborted for ${sourceId}, skipping completion`);
|
|
199089
|
+
return;
|
|
199090
|
+
}
|
|
197307
199091
|
if (stored.length === 0 && errors5.length > 0) {
|
|
197308
199092
|
const errorCode = errors5[0].code ?? "DOC_INDEX_LLM_EXHAUSTED" /* DOC_INDEX_LLM_EXHAUSTED */;
|
|
197309
199093
|
indexTaskManager.failTask(sourceId, errors5[0].error, errorCode);
|
|
@@ -197417,7 +199201,7 @@ async function handleDocIndex(c, storageProvider, source2) {
|
|
|
197417
199201
|
throw new C4AError("DOC_INDEX_EMBEDDING_UNAVAILABLE" /* DOC_INDEX_EMBEDDING_UNAVAILABLE */, "Embedding service not configured", null);
|
|
197418
199202
|
}
|
|
197419
199203
|
const modulePaths = modules?.map((m) => m.path);
|
|
197420
|
-
indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths);
|
|
199204
|
+
indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths, serverConfig2.indexing?.task_timeout_ms);
|
|
197421
199205
|
const hashToPath = new Map;
|
|
197422
199206
|
for (const sf of latestByPath.values()) {
|
|
197423
199207
|
hashToPath.set(sf.hash_id, sf.source_path ?? "");
|
|
@@ -199261,6 +201045,10 @@ function mergeServerConfig2(parsed) {
|
|
|
199261
201045
|
...isPlainObject5(input.llm?.google) ? input.llm?.google : {}
|
|
199262
201046
|
}
|
|
199263
201047
|
},
|
|
201048
|
+
indexing: {
|
|
201049
|
+
...defaults2.indexing,
|
|
201050
|
+
...isPlainObject5(input.indexing) ? input.indexing : {}
|
|
201051
|
+
},
|
|
199264
201052
|
embedding: {
|
|
199265
201053
|
...defaults2.embedding,
|
|
199266
201054
|
...isPlainObject5(input.embedding) ? input.embedding : {},
|
|
@@ -199935,7 +201723,8 @@ import path9 from "node:path";
|
|
|
199935
201723
|
import { fileURLToPath } from "node:url";
|
|
199936
201724
|
|
|
199937
201725
|
// ../server/src/indexTaskManager.ts
|
|
199938
|
-
var DEFAULT_INDEX_TASK_TIMEOUT_MS =
|
|
201726
|
+
var DEFAULT_INDEX_TASK_TIMEOUT_MS = 150 * 60 * 1000;
|
|
201727
|
+
var DEFAULT_FILE_TIMEOUT_MS = 15 * 60 * 1000;
|
|
199939
201728
|
|
|
199940
201729
|
class IndexTaskManager {
|
|
199941
201730
|
broadcaster;
|
|
@@ -199955,12 +201744,18 @@ class IndexTaskManager {
|
|
|
199955
201744
|
getTask(sourceId) {
|
|
199956
201745
|
return this.indexTasks.get(sourceId) ?? null;
|
|
199957
201746
|
}
|
|
199958
|
-
|
|
201747
|
+
getAbortSignal(sourceId) {
|
|
201748
|
+
return this.indexTasks.get(sourceId)?.abortController.signal ?? null;
|
|
201749
|
+
}
|
|
201750
|
+
createTask(sourceId, machineId, targetCommit, modules, timeoutMs) {
|
|
199959
201751
|
const existing = this.indexTasks.get(sourceId);
|
|
199960
201752
|
if (existing) {
|
|
199961
201753
|
clearTimeout(existing.timer);
|
|
201754
|
+
existing.abortController.abort("Task replaced by new task");
|
|
199962
201755
|
this.indexTasks.delete(sourceId);
|
|
199963
201756
|
}
|
|
201757
|
+
const abortController = new AbortController;
|
|
201758
|
+
const effectiveTimeout = timeoutMs ?? this.timeoutMs;
|
|
199964
201759
|
const task = {
|
|
199965
201760
|
sourceId,
|
|
199966
201761
|
machineId,
|
|
@@ -199968,8 +201763,10 @@ class IndexTaskManager {
|
|
|
199968
201763
|
startedAt: new Date,
|
|
199969
201764
|
timer: setTimeout(() => {
|
|
199970
201765
|
this.timeoutTask(sourceId);
|
|
199971
|
-
},
|
|
201766
|
+
}, effectiveTimeout),
|
|
201767
|
+
timeoutMs: effectiveTimeout,
|
|
199972
201768
|
progress: null,
|
|
201769
|
+
abortController,
|
|
199973
201770
|
...modules && modules.length > 0 ? { modules } : {}
|
|
199974
201771
|
};
|
|
199975
201772
|
this.indexTasks.set(sourceId, task);
|
|
@@ -200007,7 +201804,7 @@ class IndexTaskManager {
|
|
|
200007
201804
|
clearTimeout(task.timer);
|
|
200008
201805
|
task.timer = setTimeout(() => {
|
|
200009
201806
|
this.timeoutTask(sourceId);
|
|
200010
|
-
},
|
|
201807
|
+
}, task.timeoutMs);
|
|
200011
201808
|
nextPhase();
|
|
200012
201809
|
return;
|
|
200013
201810
|
}
|
|
@@ -200026,6 +201823,7 @@ class IndexTaskManager {
|
|
|
200026
201823
|
return;
|
|
200027
201824
|
this.pendingPhases.delete(sourceId);
|
|
200028
201825
|
clearTimeout(task.timer);
|
|
201826
|
+
task.abortController.abort(error40);
|
|
200029
201827
|
this.indexTasks.delete(sourceId);
|
|
200030
201828
|
this.broadcaster.error({
|
|
200031
201829
|
source_id: sourceId,
|
|
@@ -200040,6 +201838,7 @@ class IndexTaskManager {
|
|
|
200040
201838
|
return;
|
|
200041
201839
|
this.pendingPhases.delete(sourceId);
|
|
200042
201840
|
clearTimeout(task.timer);
|
|
201841
|
+
task.abortController.abort("Task timed out");
|
|
200043
201842
|
this.indexTasks.delete(sourceId);
|
|
200044
201843
|
this.broadcaster.timeout({
|
|
200045
201844
|
source_id: sourceId,
|
|
@@ -200055,6 +201854,7 @@ class IndexTaskManager {
|
|
|
200055
201854
|
destroy() {
|
|
200056
201855
|
for (const task of this.indexTasks.values()) {
|
|
200057
201856
|
clearTimeout(task.timer);
|
|
201857
|
+
task.abortController.abort("Manager destroyed");
|
|
200058
201858
|
}
|
|
200059
201859
|
this.indexTasks.clear();
|
|
200060
201860
|
this.pendingPhases.clear();
|