@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -5
- package/index.js +954 -85
- package/package.json +1 -1
- package/serve.js +2016 -216
- package/web/assets/ContentDetail--oZBzWh0.js +1 -0
- package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
- package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
- package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
- package/web/assets/ContentDetail-D-2xyerw.js +1 -0
- package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
- package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
- package/web/assets/ContentDetail-y0yi2qln.js +1 -0
- package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
- package/web/assets/EntityDetail-BI3etmj4.js +1 -0
- package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
- package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
- package/web/assets/EntityDetail-DiJPemDY.js +1 -0
- package/web/assets/EntityDetail-DihnDvhA.js +1 -0
- package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
- package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
- package/web/assets/RelationDetail-B2gHrceI.js +1 -0
- package/web/assets/RelationDetail-CEq9vopD.js +1 -0
- package/web/assets/RelationDetail-CaYrspaS.js +1 -0
- package/web/assets/RelationDetail-CpoGdy25.js +1 -0
- package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
- package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
- package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
- package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
- package/web/assets/index-BPMqeFze.js +111 -0
- package/web/assets/index-BgRuvBL5.js +111 -0
- package/web/assets/index-CcrkBEZl.js +111 -0
- package/web/assets/index-DGDx8sCs.js +111 -0
- package/web/assets/index-DIyAwnqE.js +111 -0
- package/web/assets/index-DW1cCA8v.js +111 -0
- package/web/assets/index-DiAYi5t8.css +1 -0
- package/web/assets/index-FOCWvgW_.css +1 -0
- package/web/assets/index-daOjyLzy.css +1 -0
- package/web/assets/index-moF8uSEi.js +111 -0
- package/web/assets/index-sPNyENFN.js +111 -0
- package/web/assets/index-uGqDxUnx.css +1 -0
- package/web/index.html +2 -2
package/index.js
CHANGED
|
@@ -40342,6 +40342,10 @@ var init_serverConfig = __esm(() => {
|
|
|
40342
40342
|
default_model: "gemini-3-pro-preview"
|
|
40343
40343
|
}
|
|
40344
40344
|
},
|
|
40345
|
+
indexing: {
|
|
40346
|
+
task_timeout_ms: 150 * 60 * 1000,
|
|
40347
|
+
file_timeout_ms: 15 * 60 * 1000
|
|
40348
|
+
},
|
|
40345
40349
|
embedding: {
|
|
40346
40350
|
provider: "huggingface",
|
|
40347
40351
|
huggingface: {
|
|
@@ -44401,7 +44405,7 @@ var init_atomsSchema = __esm(() => {
|
|
|
44401
44405
|
init_zod();
|
|
44402
44406
|
init_base();
|
|
44403
44407
|
init_baseSchema();
|
|
44404
|
-
confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
|
|
44408
|
+
confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
|
|
44405
44409
|
entityAtomSchema = exports_external.object({
|
|
44406
44410
|
name: exports_external.string(),
|
|
44407
44411
|
kind: kindSchema.optional().catch(undefined),
|
|
@@ -220955,14 +220959,21 @@ function isRetryableStatus(status) {
|
|
|
220955
220959
|
function isAuthStatus(status) {
|
|
220956
220960
|
return status === 401 || status === 403;
|
|
220957
220961
|
}
|
|
220958
|
-
function
|
|
220959
|
-
|
|
220962
|
+
function throwLlmError(error40, status) {
|
|
220963
|
+
const detail = toErrorMessage(error40);
|
|
220964
|
+
const statusTag = status ? ` [HTTP ${status}]` : "";
|
|
220965
|
+
if (isAuthStatus(status)) {
|
|
220966
|
+
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
|
|
220967
|
+
}
|
|
220968
|
+
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
|
|
220960
220969
|
}
|
|
220961
220970
|
|
|
220962
220971
|
class LlmServiceImpl {
|
|
220963
220972
|
options;
|
|
220973
|
+
supportsTemperature;
|
|
220964
220974
|
constructor(options) {
|
|
220965
220975
|
this.options = options;
|
|
220976
|
+
this.supportsTemperature = options.provider !== "openai";
|
|
220966
220977
|
}
|
|
220967
220978
|
async generateText(prompt, options) {
|
|
220968
220979
|
if (this.options.forceStream) {
|
|
@@ -220974,7 +220985,7 @@ class LlmServiceImpl {
|
|
|
220974
220985
|
model: this.options.languageModel,
|
|
220975
220986
|
prompt,
|
|
220976
220987
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
220977
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
220988
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
220978
220989
|
maxRetries: 0
|
|
220979
220990
|
};
|
|
220980
220991
|
if (options?.systemPrompt) {
|
|
@@ -221011,13 +221022,7 @@ class LlmServiceImpl {
|
|
|
221011
221022
|
durationMs,
|
|
221012
221023
|
error: toErrorMessage(error40)
|
|
221013
221024
|
});
|
|
221014
|
-
|
|
221015
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
221016
|
-
}
|
|
221017
|
-
if (isBadRequest(status)) {
|
|
221018
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221019
|
-
}
|
|
221020
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221025
|
+
throwLlmError(error40, status);
|
|
221021
221026
|
}
|
|
221022
221027
|
}
|
|
221023
221028
|
async generateTextViaStream(prompt, options) {
|
|
@@ -221027,7 +221032,7 @@ class LlmServiceImpl {
|
|
|
221027
221032
|
model: this.options.languageModel,
|
|
221028
221033
|
prompt,
|
|
221029
221034
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
221030
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
221035
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
221031
221036
|
maxRetries: 0
|
|
221032
221037
|
};
|
|
221033
221038
|
if (options?.systemPrompt) {
|
|
@@ -221065,13 +221070,7 @@ class LlmServiceImpl {
|
|
|
221065
221070
|
durationMs: Date.now() - startedAt,
|
|
221066
221071
|
error: toErrorMessage(error40)
|
|
221067
221072
|
});
|
|
221068
|
-
|
|
221069
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
221070
|
-
}
|
|
221071
|
-
if (isBadRequest(status)) {
|
|
221072
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221073
|
-
}
|
|
221074
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221073
|
+
throwLlmError(error40, status);
|
|
221075
221074
|
}
|
|
221076
221075
|
}
|
|
221077
221076
|
streamText(prompt, options) {
|
|
@@ -221094,7 +221093,7 @@ class LlmServiceImpl {
|
|
|
221094
221093
|
model: this.options.languageModel,
|
|
221095
221094
|
prompt,
|
|
221096
221095
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
221097
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
221096
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
221098
221097
|
maxRetries: 0,
|
|
221099
221098
|
onFinish: (event) => {
|
|
221100
221099
|
const finishEvent = event;
|
|
@@ -221140,13 +221139,7 @@ class LlmServiceImpl {
|
|
|
221140
221139
|
durationMs: Date.now() - startedAt,
|
|
221141
221140
|
error: toErrorMessage(error40)
|
|
221142
221141
|
});
|
|
221143
|
-
|
|
221144
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
221145
|
-
}
|
|
221146
|
-
if (isBadRequest(status)) {
|
|
221147
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221148
|
-
}
|
|
221149
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
221142
|
+
throwLlmError(error40, status);
|
|
221150
221143
|
}
|
|
221151
221144
|
}
|
|
221152
221145
|
}
|
|
@@ -221857,7 +221850,12 @@ function parseExtractionOutput(raw, schema) {
|
|
|
221857
221850
|
return { success: false, error: new Error("Empty output") };
|
|
221858
221851
|
}
|
|
221859
221852
|
const protocolParsed = tryParseProtocol(trimmed);
|
|
221860
|
-
|
|
221853
|
+
let parsed = protocolParsed ?? tryParseJson(trimmed);
|
|
221854
|
+
if (Array.isArray(parsed)) {
|
|
221855
|
+
parsed = { paragraphs: parsed };
|
|
221856
|
+
}
|
|
221857
|
+
parsed = normalizeFlatOutput(parsed);
|
|
221858
|
+
parsed = stripNulls(parsed);
|
|
221861
221859
|
const result = schema.safeParse(parsed);
|
|
221862
221860
|
if (!result.success) {
|
|
221863
221861
|
return { success: false, error: result.error };
|
|
@@ -221927,6 +221925,37 @@ function tryParseJson(raw) {
|
|
|
221927
221925
|
function repairAndParse(raw) {
|
|
221928
221926
|
return JSON.parse(jsonrepair(raw));
|
|
221929
221927
|
}
|
|
221928
|
+
var PARAGRAPH_TAG_RE = /^P\d+$/;
|
|
221929
|
+
function normalizeFlatOutput(parsed) {
|
|
221930
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
|
|
221931
|
+
return parsed;
|
|
221932
|
+
const obj = parsed;
|
|
221933
|
+
if ("paragraphs" in obj)
|
|
221934
|
+
return parsed;
|
|
221935
|
+
const keys = Object.keys(obj);
|
|
221936
|
+
if (keys.length === 0)
|
|
221937
|
+
return { paragraphs: [] };
|
|
221938
|
+
const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
|
|
221939
|
+
if (!allTags)
|
|
221940
|
+
return parsed;
|
|
221941
|
+
const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag) => ({ tag, atoms: obj[tag] }));
|
|
221942
|
+
return { paragraphs };
|
|
221943
|
+
}
|
|
221944
|
+
function stripNulls(value) {
|
|
221945
|
+
if (value === null)
|
|
221946
|
+
return;
|
|
221947
|
+
if (Array.isArray(value))
|
|
221948
|
+
return value.map(stripNulls);
|
|
221949
|
+
if (typeof value === "object" && value !== null) {
|
|
221950
|
+
const out = {};
|
|
221951
|
+
for (const [k, v] of Object.entries(value)) {
|
|
221952
|
+
if (v !== null)
|
|
221953
|
+
out[k] = stripNulls(v);
|
|
221954
|
+
}
|
|
221955
|
+
return out;
|
|
221956
|
+
}
|
|
221957
|
+
return value;
|
|
221958
|
+
}
|
|
221930
221959
|
function isRecord(value) {
|
|
221931
221960
|
return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
|
|
221932
221961
|
}
|
|
@@ -222219,30 +222248,32 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
|
|
|
222219
222248
|
${ATOM_TYPES_BLOCK}
|
|
222220
222249
|
|
|
222221
222250
|
## Output Format
|
|
222222
|
-
Return a single JSON object
|
|
222251
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
222223
222252
|
{
|
|
222224
|
-
"
|
|
222225
|
-
{
|
|
222226
|
-
|
|
222227
|
-
|
|
222228
|
-
|
|
222229
|
-
|
|
222230
|
-
|
|
222231
|
-
}
|
|
222232
|
-
}
|
|
222233
|
-
]
|
|
222253
|
+
"P0": {
|
|
222254
|
+
"entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
|
|
222255
|
+
"relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
|
|
222256
|
+
},
|
|
222257
|
+
"P3": {
|
|
222258
|
+
"rules": [{ "description": "User must be authenticated before access", "expression": "user.isAuthenticated == true", "confidence": 0.85 }]
|
|
222259
|
+
}
|
|
222234
222260
|
}
|
|
222235
222261
|
|
|
222236
222262
|
## Rules
|
|
222237
|
-
-
|
|
222263
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
222264
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
222238
222265
|
- Only include atom types that are actually found in a paragraph (all types are optional).
|
|
222239
222266
|
- Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
|
|
222240
222267
|
- **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
|
|
222241
222268
|
- Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
|
|
222242
222269
|
- **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
|
|
222270
|
+
- **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
|
|
222243
222271
|
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
|
|
222244
222272
|
- JSON structure keys (tag, atom type names, field names) must always be in English.
|
|
222245
222273
|
- Be thorough: extract ALL relevant atoms from each paragraph.
|
|
222274
|
+
- **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
|
|
222275
|
+
- **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
|
|
222276
|
+
- **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
|
|
222246
222277
|
- Do NOT include "claims" — they are system-generated and not part of document extraction.`;
|
|
222247
222278
|
function buildDocAtomAnnotationPrompt(chunkText) {
|
|
222248
222279
|
return `Extract all semantic atoms from the following document text.
|
|
@@ -222254,6 +222285,13 @@ ${chunkText}
|
|
|
222254
222285
|
|
|
222255
222286
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
222256
222287
|
}
|
|
222288
|
+
function toFlatFormat(result) {
|
|
222289
|
+
const flat = {};
|
|
222290
|
+
for (const p of result.paragraphs) {
|
|
222291
|
+
flat[p.tag] = p.atoms;
|
|
222292
|
+
}
|
|
222293
|
+
return flat;
|
|
222294
|
+
}
|
|
222257
222295
|
function buildDocGleaningPrompt(chunkText, previousResult) {
|
|
222258
222296
|
return `Review the following document text and the previously extracted atoms.
|
|
222259
222297
|
Check for any MISSING atoms that were not captured in the first pass.
|
|
@@ -222262,66 +222300,483 @@ Check for any MISSING atoms that were not captured in the first pass.
|
|
|
222262
222300
|
${chunkText}
|
|
222263
222301
|
|
|
222264
222302
|
## Previously Extracted Atoms
|
|
222265
|
-
${JSON.stringify(previousResult, null, 2)}
|
|
222303
|
+
${JSON.stringify(toFlatFormat(previousResult), null, 2)}
|
|
222266
222304
|
|
|
222267
222305
|
## Instructions
|
|
222268
|
-
- If you find missing atoms, output them in the same JSON format
|
|
222306
|
+
- If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
|
|
222269
222307
|
- Only include NEW atoms not already in the previous extraction.
|
|
222270
222308
|
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
222271
|
-
- If
|
|
222309
|
+
- **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
|
|
222310
|
+
- If nothing is missing, return: {}
|
|
222272
222311
|
- Respond in the same language as the input text.
|
|
222273
222312
|
|
|
222274
222313
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
222275
222314
|
}
|
|
222276
222315
|
var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
|
|
222316
|
+
// ../llm/src/prompts/entityResolution.ts
|
|
222317
|
+
var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
|
|
222318
|
+
|
|
222319
|
+
## Task 1: Merge Duplicates
|
|
222320
|
+
- Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
|
|
222321
|
+
- Prefer the LONGER, more descriptive name as the canonical name
|
|
222322
|
+
- Do NOT merge names that share a substring but refer to different things
|
|
222323
|
+
- When uncertain, do NOT merge — add to "ambiguous" instead
|
|
222324
|
+
- Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
|
|
222325
|
+
- Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
|
|
222326
|
+
|
|
222327
|
+
## Task 2: Remove Noise
|
|
222328
|
+
- Remove names that are NOT meaningful named entities — they are generic words, actions, or descriptions
|
|
222329
|
+
- Examples of noise: common verbs/nouns (登录, 路由, 直连), generic technical terms (Env, query), action descriptions (Kill 3001 进程)
|
|
222330
|
+
- Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台)
|
|
222331
|
+
- When uncertain, KEEP the name — only remove if clearly not a named entity
|
|
222332
|
+
|
|
222333
|
+
## Output
|
|
222334
|
+
Valid JSON only. No markdown fences, no explanation.`;
|
|
222335
|
+
function buildEntityResolutionPrompt(input) {
|
|
222336
|
+
const parts = [];
|
|
222337
|
+
parts.push(`## All Entity Names (${input.allNames.length} total)`);
|
|
222338
|
+
parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
|
|
222339
|
+
`));
|
|
222340
|
+
if (input.candidates.length > 0) {
|
|
222341
|
+
parts.push("");
|
|
222342
|
+
parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
|
|
222343
|
+
parts.push("Review each pair and decide whether to merge:");
|
|
222344
|
+
for (const c of input.candidates) {
|
|
222345
|
+
parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
|
|
222346
|
+
}
|
|
222347
|
+
}
|
|
222348
|
+
if (input.noiseCandidates && input.noiseCandidates.length > 0) {
|
|
222349
|
+
parts.push("");
|
|
222350
|
+
parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
|
|
222351
|
+
parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
|
|
222352
|
+
for (const n of input.noiseCandidates) {
|
|
222353
|
+
parts.push(`- "${n}"`);
|
|
222354
|
+
}
|
|
222355
|
+
}
|
|
222356
|
+
if (input.contextSnippets && input.contextSnippets.length > 0) {
|
|
222357
|
+
parts.push("");
|
|
222358
|
+
parts.push("## Context Snippets");
|
|
222359
|
+
for (const s of input.contextSnippets) {
|
|
222360
|
+
parts.push(`- **${s.name}**: ${s.snippet}`);
|
|
222361
|
+
}
|
|
222362
|
+
}
|
|
222363
|
+
parts.push("");
|
|
222364
|
+
parts.push(`## Output Format
|
|
222365
|
+
Return a JSON object:
|
|
222366
|
+
{
|
|
222367
|
+
"merges": [
|
|
222368
|
+
{ "from": "alias name", "to": "canonical name" }
|
|
222369
|
+
],
|
|
222370
|
+
"remove": ["noise_name_1", "noise_name_2"],
|
|
222371
|
+
"ambiguous": ["name1", "name2"]
|
|
222372
|
+
}
|
|
222373
|
+
|
|
222374
|
+
- "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
|
|
222375
|
+
- "remove": names confirmed as noise. They will be deleted from entity list.
|
|
222376
|
+
- "ambiguous": names you're unsure about (optional, for logging).
|
|
222377
|
+
|
|
222378
|
+
Return ONLY valid JSON. No markdown fences, no explanation.`);
|
|
222379
|
+
return parts.join(`
|
|
222380
|
+
`);
|
|
222381
|
+
}
|
|
222382
|
+
// ../llm/src/prompts/docTableAnnotation.ts
|
|
222383
|
+
init_src();
|
|
222384
|
+
var entityFields = zodObjectToPromptFields(entityAtomSchema);
|
|
222385
|
+
var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
|
|
222386
|
+
var relationFields = zodObjectToPromptFields(relationAtomSchema);
|
|
222387
|
+
var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
|
|
222388
|
+
var metricFields = zodObjectToPromptFields(metricAtomSchema);
|
|
222389
|
+
var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
|
|
222390
|
+
var eventFields = zodObjectToPromptFields(eventAtomSchema);
|
|
222391
|
+
var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
|
|
222392
|
+
var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
|
|
222393
|
+
var stateFields = zodObjectToPromptFields(stateAtomSchema);
|
|
222394
|
+
var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
|
|
222395
|
+
var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
|
|
222396
|
+
|
|
222397
|
+
Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
|
|
222398
|
+
|
|
222399
|
+
## Step 1: Classify the Table
|
|
222400
|
+
|
|
222401
|
+
Determine the table type by examining the relationship between rows:
|
|
222402
|
+
|
|
222403
|
+
### Type A: Collection / Record Table
|
|
222404
|
+
**Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
|
|
222405
|
+
- Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
|
|
222406
|
+
- Key signal: removing one row does not affect the meaning of other rows
|
|
222407
|
+
|
|
222408
|
+
### Type B: Single-Object Property Table
|
|
222409
|
+
**Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
|
|
222410
|
+
- Examples: API field definitions, configuration schema, entity attribute lists
|
|
222411
|
+
- Key signal: all rows refer to the same parent entity
|
|
222412
|
+
|
|
222413
|
+
### Type C: Comparison / Evaluation Table
|
|
222414
|
+
**Rows or columns represent different subjects being compared** across the same dimensions.
|
|
222415
|
+
- Examples: technology selection, vendor evaluation, feature comparison
|
|
222416
|
+
- Key signal: multiple named subjects evaluated on shared criteria
|
|
222417
|
+
|
|
222418
|
+
### Type D: Matrix / Cross-Reference Table
|
|
222419
|
+
**Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
|
|
222420
|
+
- Examples: permission matrices (role × operation), compatibility matrices, dependency tables
|
|
222421
|
+
- Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
|
|
222422
|
+
|
|
222423
|
+
### Type E: Metrics / KPI Table
|
|
222424
|
+
**Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
|
|
222425
|
+
- Examples: SLA tables, performance baselines, capacity planning tables
|
|
222426
|
+
- Key signal: columns include target/threshold/unit/SLA-style values
|
|
222427
|
+
|
|
222428
|
+
### Type F: Timeline / Process Table
|
|
222429
|
+
**Rows represent ordered steps or phases** in a sequence.
|
|
222430
|
+
- Examples: deployment steps, approval workflows, version changelog, migration plans
|
|
222431
|
+
- Key signal: rows have implicit ordering, may have phase/step/date columns
|
|
222432
|
+
|
|
222433
|
+
## Step 2: Extract Atoms by Table Type
|
|
222434
|
+
|
|
222435
|
+
### Type A → Single attribute with row-object array
|
|
222436
|
+
1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
|
|
222437
|
+
Entity schema: ${entityFields}
|
|
222438
|
+
2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
|
|
222439
|
+
Attribute schema: ${attributeFields}
|
|
222440
|
+
Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
|
|
222441
|
+
3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
|
|
222442
|
+
4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
|
|
222443
|
+
State schema: ${stateFields}
|
|
222444
|
+
Rule schema: ${ruleFields}
|
|
222445
|
+
|
|
222446
|
+
### Type B → Multiple attribute atoms
|
|
222447
|
+
1. Create ONE entity for the parent structure.
|
|
222448
|
+
Entity schema: ${entityFields}
|
|
222449
|
+
2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
|
|
222450
|
+
Attribute schema: ${attributeFields}
|
|
222451
|
+
3. Extract constraints from "required" or "validation" columns.
|
|
222452
|
+
Constraint schema: ${constraintFields}
|
|
222453
|
+
|
|
222454
|
+
### Type C → Comparison atom
|
|
222455
|
+
1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
|
|
222456
|
+
Comparison schema: ${comparisonFields}
|
|
222457
|
+
2. Extract "decisions" atoms if the table leads to a conclusion.
|
|
222458
|
+
|
|
222459
|
+
### Type D → Relations or table attribute
|
|
222460
|
+
1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
|
|
222461
|
+
Relation schema: ${relationFields}
|
|
222462
|
+
Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
|
|
222463
|
+
2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
|
|
222464
|
+
3. Create entities for both row headers and column headers if they are named concepts.
|
|
222465
|
+
|
|
222466
|
+
### Type E → Metrics atoms
|
|
222467
|
+
1. Create one "metrics" atom per row.
|
|
222468
|
+
Metric schema: ${metricFields}
|
|
222469
|
+
2. Also create the parent entity if named (e.g., "SLA Requirements").
|
|
222470
|
+
|
|
222471
|
+
### Type F → Behaviors/Events/Transitions
|
|
222472
|
+
1. Create one "behaviors" atom per step/phase.
|
|
222473
|
+
Behavior schema: ${behaviorFields}
|
|
222474
|
+
2. If there are triggers: extract "events" atoms.
|
|
222475
|
+
Event schema: ${eventFields}
|
|
222476
|
+
3. If there are state changes: extract "transitions" atoms.
|
|
222477
|
+
Transition schema: ${transitionFields}
|
|
222478
|
+
4. Create the parent entity for the process/workflow.
|
|
222479
|
+
|
|
222480
|
+
## Output Format
|
|
222481
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
222482
|
+
{
|
|
222483
|
+
"P0": {
|
|
222484
|
+
"tableType": "A",
|
|
222485
|
+
"entities": [...],
|
|
222486
|
+
"attributes": [...]
|
|
222487
|
+
},
|
|
222488
|
+
"P3": {
|
|
222489
|
+
"tableType": "C",
|
|
222490
|
+
"comparisons": [...]
|
|
222491
|
+
}
|
|
222492
|
+
}
|
|
222493
|
+
|
|
222494
|
+
## Rules
|
|
222495
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
222496
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
222497
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
222498
|
+
- The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
|
|
222499
|
+
- Only include atom types that are actually extracted.
|
|
222500
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
222501
|
+
- JSON structure keys must always be in English.
|
|
222502
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
222503
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
222504
|
+
function buildDocTableAnnotationPrompt(tableText) {
|
|
222505
|
+
return `Classify and extract atoms from the following table paragraphs.
|
|
222506
|
+
Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.
|
|
222507
|
+
|
|
222508
|
+
---
|
|
222509
|
+
${tableText}
|
|
222510
|
+
---
|
|
222511
|
+
|
|
222512
|
+
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
222513
|
+
}
|
|
222514
|
+
var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;
|
|
222515
|
+
// ../llm/src/prompts/docDiagramAnnotation.ts
|
|
222516
|
+
init_src();
|
|
222517
|
+
var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
|
|
222518
|
+
var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
|
|
222519
|
+
var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
|
|
222520
|
+
var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
|
|
222521
|
+
var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
|
|
222522
|
+
var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
|
|
222523
|
+
var roleFields = zodObjectToPromptFields(roleAtomSchema);
|
|
222524
|
+
var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
|
|
222525
|
+
var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
|
|
222526
|
+
var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
|
|
222527
|
+
var DIAGRAM_FENCE_TAGS = [
|
|
222528
|
+
"mermaid",
|
|
222529
|
+
"plantuml",
|
|
222530
|
+
"puml",
|
|
222531
|
+
"dot",
|
|
222532
|
+
"graphviz",
|
|
222533
|
+
"viz",
|
|
222534
|
+
"d2",
|
|
222535
|
+
"c4plantuml",
|
|
222536
|
+
"ditaa",
|
|
222537
|
+
"nomnoml",
|
|
222538
|
+
"wavedrom",
|
|
222539
|
+
"vega",
|
|
222540
|
+
"vega-lite"
|
|
222541
|
+
];
|
|
222542
|
+
var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
|
|
222543
|
+
var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
|
|
222544
|
+
|
|
222545
|
+
Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
|
|
222546
|
+
|
|
222547
|
+
## Step 1: Identify the Diagram Format and Type
|
|
222548
|
+
|
|
222549
|
+
### Formats
|
|
222550
|
+
- **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
|
|
222551
|
+
- **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
|
|
222552
|
+
- **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
|
|
222553
|
+
- **D2**: modern declarative diagrams with shape/connection syntax
|
|
222554
|
+
- **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
|
|
222555
|
+
|
|
222556
|
+
### Diagram Types (by semantic content)
|
|
222557
|
+
- **Flowchart / Process**: decision trees, algorithms, business process flows
|
|
222558
|
+
- **Sequence**: interaction between participants over time (API calls, protocols)
|
|
222559
|
+
- **State Machine**: states and transitions triggered by events/guards
|
|
222560
|
+
- **Class / ER**: data models, entity relationships, inheritance hierarchies
|
|
222561
|
+
- **Architecture**: system components, containers, deployment topology
|
|
222562
|
+
- **Gantt / Timeline**: project schedules, milestones, phases
|
|
222563
|
+
- **Pie / Data Viz**: statistical distributions, metrics visualization
|
|
222564
|
+
|
|
222565
|
+
## Step 2: Extract Atoms by Diagram Type
|
|
222566
|
+
|
|
222567
|
+
### Flowchart / Process → entities + relations + behaviors + decisions
|
|
222568
|
+
1. Extract each node as an entity.
|
|
222569
|
+
Entity schema: ${entityFields2}
|
|
222570
|
+
2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
|
|
222571
|
+
Relation schema: ${relationFields2}
|
|
222572
|
+
3. Extract action nodes as behaviors (what the process does at each step).
|
|
222573
|
+
Behavior schema: ${behaviorFields2}
|
|
222574
|
+
4. Extract diamond/decision nodes as decisions.
|
|
222575
|
+
Decision schema: ${decisionFields}
|
|
222576
|
+
|
|
222577
|
+
### Sequence → entities + relations + behaviors + events
|
|
222578
|
+
1. Extract each participant/actor as an entity (or role if it's a person/team).
|
|
222579
|
+
Entity schema: ${entityFields2}
|
|
222580
|
+
Role schema: ${roleFields}
|
|
222581
|
+
2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
|
|
222582
|
+
Relation schema: ${relationFields2}
|
|
222583
|
+
3. Extract significant interactions as behaviors.
|
|
222584
|
+
Behavior schema: ${behaviorFields2}
|
|
222585
|
+
4. Extract triggers, responses, and async messages as events.
|
|
222586
|
+
Event schema: ${eventFields2}
|
|
222587
|
+
|
|
222588
|
+
### State Machine → entities + states + transitions + events
|
|
222589
|
+
1. Extract the state machine subject as an entity.
|
|
222590
|
+
Entity schema: ${entityFields2}
|
|
222591
|
+
2. Extract each state as a state atom.
|
|
222592
|
+
State schema: ${stateFields2}
|
|
222593
|
+
3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
|
|
222594
|
+
Transition schema: ${transitionFields2}
|
|
222595
|
+
4. Extract triggers as events.
|
|
222596
|
+
Event schema: ${eventFields2}
|
|
222597
|
+
|
|
222598
|
+
### Class / ER → entities + attributes + relations
|
|
222599
|
+
1. Extract each class/entity as an entity.
|
|
222600
|
+
Entity schema: ${entityFields2}
|
|
222601
|
+
2. Extract fields/properties as attributes.
|
|
222602
|
+
Attribute schema: ${attributeFields2}
|
|
222603
|
+
3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
|
|
222604
|
+
Relation schema: ${relationFields2}
|
|
222605
|
+
|
|
222606
|
+
### Architecture → entities + relations + constraints
|
|
222607
|
+
1. Extract each system/service/container/component as an entity.
|
|
222608
|
+
Entity schema: ${entityFields2}
|
|
222609
|
+
2. Extract connections between components as relations.
|
|
222610
|
+
Relation schema: ${relationFields2}
|
|
222611
|
+
3. Extract deployment constraints, technology choices.
|
|
222612
|
+
Constraint schema: ${constraintFields2}
|
|
222613
|
+
|
|
222614
|
+
### Gantt / Timeline → behaviors + events + constraints
|
|
222615
|
+
1. Extract each task/phase as a behavior.
|
|
222616
|
+
Behavior schema: ${behaviorFields2}
|
|
222617
|
+
2. Extract milestones and deadlines as events.
|
|
222618
|
+
Event schema: ${eventFields2}
|
|
222619
|
+
3. Extract dependencies and critical path constraints.
|
|
222620
|
+
Constraint schema: ${constraintFields2}
|
|
222621
|
+
|
|
222622
|
+
### Pie / Data Viz → attributes (summary only)
|
|
222623
|
+
1. Extract the chart title as an entity.
|
|
222624
|
+
Entity schema: ${entityFields2}
|
|
222625
|
+
2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
|
|
222626
|
+
Attribute schema: ${attributeFields2}
|
|
222627
|
+
|
|
222628
|
+
## Additional Extraction: Diagram Description
|
|
222629
|
+
|
|
222630
|
+
For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
|
|
222631
|
+
- \`name\`: "diagram_description"
|
|
222632
|
+
- \`type\`: "description"
|
|
222633
|
+
- \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
|
|
222634
|
+
|
|
222635
|
+
This description is critical for downstream AI consumers who cannot render the diagram.
|
|
222636
|
+
|
|
222637
|
+
## Output Format
|
|
222638
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
222639
|
+
{
|
|
222640
|
+
"P0": {
|
|
222641
|
+
"diagramFormat": "mermaid",
|
|
222642
|
+
"diagramType": "sequence",
|
|
222643
|
+
"entities": [...],
|
|
222644
|
+
"relations": [...]
|
|
222645
|
+
}
|
|
222646
|
+
}
|
|
222647
|
+
|
|
222648
|
+
## Rules
|
|
222649
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
222650
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
222651
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
222652
|
+
- The "diagramFormat" and "diagramType" fields are required for each paragraph.
|
|
222653
|
+
- Only include atom types that are actually extracted.
|
|
222654
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
222655
|
+
- JSON structure keys must always be in English.
|
|
222656
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
222657
|
+
- **Extract ALL nodes and edges** — do not sample or skip.
|
|
222658
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
222659
|
+
function buildDocDiagramAnnotationPrompt(diagramText) {
|
|
222660
|
+
return `Analyze and extract atoms from the following diagram paragraphs.
|
|
222661
|
+
Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.
|
|
222662
|
+
|
|
222663
|
+
---
|
|
222664
|
+
${diagramText}
|
|
222665
|
+
---
|
|
222666
|
+
|
|
222667
|
+
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
222668
|
+
}
|
|
222669
|
+
var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
|
|
222277
222670
|
// ../llm/src/chunking/markdownChunker.ts
|
|
222278
222671
|
var DEFAULT_MAX_TOKENS2 = 4000;
|
|
222672
|
+
var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
|
|
222279
222673
|
function estimateTokens(text2) {
|
|
222280
222674
|
return Math.ceil(text2.length / 4);
|
|
222281
222675
|
}
|
|
222676
|
+
function findCodeBlockRanges(content) {
|
|
222677
|
+
const ranges = [];
|
|
222678
|
+
const fenceRe = /^(`{3,}|~{3,})/gm;
|
|
222679
|
+
let openStart = -1;
|
|
222680
|
+
let openFence = "";
|
|
222681
|
+
let match;
|
|
222682
|
+
while ((match = fenceRe.exec(content)) !== null) {
|
|
222683
|
+
const fence = match[1];
|
|
222684
|
+
if (openStart === -1) {
|
|
222685
|
+
openStart = match.index;
|
|
222686
|
+
openFence = fence[0].repeat(fence.length);
|
|
222687
|
+
} else if (fence[0] === openFence[0] && fence.length >= openFence.length) {
|
|
222688
|
+
ranges.push({ start: openStart, end: match.index + match[0].length });
|
|
222689
|
+
openStart = -1;
|
|
222690
|
+
openFence = "";
|
|
222691
|
+
}
|
|
222692
|
+
}
|
|
222693
|
+
if (openStart !== -1) {
|
|
222694
|
+
ranges.push({ start: openStart, end: content.length });
|
|
222695
|
+
}
|
|
222696
|
+
return ranges;
|
|
222697
|
+
}
|
|
222698
|
+
function isInsideCodeBlock(pos, ranges) {
|
|
222699
|
+
for (const r of ranges) {
|
|
222700
|
+
if (pos >= r.start && pos < r.end)
|
|
222701
|
+
return true;
|
|
222702
|
+
if (r.start > pos)
|
|
222703
|
+
break;
|
|
222704
|
+
}
|
|
222705
|
+
return false;
|
|
222706
|
+
}
|
|
222282
222707
|
function parseSections(content) {
|
|
222283
|
-
const
|
|
222284
|
-
const sections = [];
|
|
222708
|
+
const codeRanges = findCodeBlockRanges(content);
|
|
222285
222709
|
const matches = [];
|
|
222286
|
-
|
|
222287
|
-
|
|
222288
|
-
|
|
222289
|
-
|
|
222290
|
-
|
|
222291
|
-
|
|
222292
|
-
|
|
222710
|
+
const atxRe = /^(#{1,6})\s+(.*)$/gm;
|
|
222711
|
+
let m;
|
|
222712
|
+
while ((m = atxRe.exec(content)) !== null) {
|
|
222713
|
+
if (!isInsideCodeBlock(m.index, codeRanges)) {
|
|
222714
|
+
matches.push({
|
|
222715
|
+
index: m.index,
|
|
222716
|
+
endIndex: m.index + m[0].length,
|
|
222717
|
+
level: m[1].length,
|
|
222718
|
+
heading: m[2].trim()
|
|
222719
|
+
});
|
|
222720
|
+
}
|
|
222293
222721
|
}
|
|
222722
|
+
const lines = content.split(`
|
|
222723
|
+
`);
|
|
222724
|
+
let offset = 0;
|
|
222725
|
+
for (let i = 0;i < lines.length; i++) {
|
|
222726
|
+
const line = lines[i];
|
|
222727
|
+
if (i > 0) {
|
|
222728
|
+
const prevLine = lines[i - 1].trim();
|
|
222729
|
+
const prevLineStart = offset - lines[i - 1].length - 1;
|
|
222730
|
+
if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
|
|
222731
|
+
if (/^={2,}\s*$/.test(line)) {
|
|
222732
|
+
matches.push({
|
|
222733
|
+
index: prevLineStart < 0 ? 0 : prevLineStart,
|
|
222734
|
+
endIndex: offset + line.length,
|
|
222735
|
+
level: 1,
|
|
222736
|
+
heading: prevLine
|
|
222737
|
+
});
|
|
222738
|
+
} else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
|
|
222739
|
+
matches.push({
|
|
222740
|
+
index: prevLineStart < 0 ? 0 : prevLineStart,
|
|
222741
|
+
endIndex: offset + line.length,
|
|
222742
|
+
level: 2,
|
|
222743
|
+
heading: prevLine
|
|
222744
|
+
});
|
|
222745
|
+
}
|
|
222746
|
+
}
|
|
222747
|
+
}
|
|
222748
|
+
offset += line.length + 1;
|
|
222749
|
+
}
|
|
222750
|
+
matches.sort((a, b) => a.index - b.index);
|
|
222751
|
+
const deduped = [];
|
|
222752
|
+
for (const match of matches) {
|
|
222753
|
+
const last = deduped[deduped.length - 1];
|
|
222754
|
+
if (last && match.index < last.endIndex)
|
|
222755
|
+
continue;
|
|
222756
|
+
deduped.push(match);
|
|
222757
|
+
}
|
|
222758
|
+
return buildSectionsFromMatches(content, deduped);
|
|
222759
|
+
}
|
|
222760
|
+
function buildSectionsFromMatches(content, matches) {
|
|
222761
|
+
const sections = [];
|
|
222294
222762
|
if (matches.length === 0) {
|
|
222295
222763
|
const body = content.trim();
|
|
222296
222764
|
if (body) {
|
|
222297
|
-
sections.push({
|
|
222298
|
-
heading: "",
|
|
222299
|
-
level: 0,
|
|
222300
|
-
body,
|
|
222301
|
-
paragraphs: splitParagraphs(body)
|
|
222302
|
-
});
|
|
222765
|
+
sections.push({ heading: "", level: 0, body, paragraphs: splitParagraphs(body) });
|
|
222303
222766
|
}
|
|
222304
222767
|
return sections;
|
|
222305
222768
|
}
|
|
222306
222769
|
if (matches[0].index > 0) {
|
|
222307
222770
|
const preBody = content.slice(0, matches[0].index).trim();
|
|
222308
222771
|
if (preBody) {
|
|
222309
|
-
sections.push({
|
|
222310
|
-
heading: "",
|
|
222311
|
-
level: 0,
|
|
222312
|
-
body: preBody,
|
|
222313
|
-
paragraphs: splitParagraphs(preBody)
|
|
222314
|
-
});
|
|
222772
|
+
sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
|
|
222315
222773
|
}
|
|
222316
222774
|
}
|
|
222317
222775
|
for (let i = 0;i < matches.length; i++) {
|
|
222318
222776
|
const m = matches[i];
|
|
222319
|
-
const
|
|
222320
|
-
const
|
|
222321
|
-
const
|
|
222322
|
-
const headingLineEnd = fullText.indexOf(`
|
|
222323
|
-
`);
|
|
222324
|
-
const body = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
|
|
222777
|
+
const bodyStart = m.endIndex;
|
|
222778
|
+
const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
|
|
222779
|
+
const body = content.slice(bodyStart, bodyEnd).trim();
|
|
222325
222780
|
sections.push({
|
|
222326
222781
|
heading: m.heading,
|
|
222327
222782
|
level: m.level,
|
|
@@ -222336,6 +222791,128 @@ function splitParagraphs(text2) {
|
|
|
222336
222791
|
return [];
|
|
222337
222792
|
return text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
|
|
222338
222793
|
}
|
|
222794
|
+
function splitOversizedText(text2, maxTokens) {
|
|
222795
|
+
const doubleNewlineParts = text2.split(/\n\n+/).map((p) => p.trim()).filter(Boolean);
|
|
222796
|
+
if (doubleNewlineParts.length > 1) {
|
|
222797
|
+
const results = [];
|
|
222798
|
+
let acc = "";
|
|
222799
|
+
let accTokens = 0;
|
|
222800
|
+
for (const part of doubleNewlineParts) {
|
|
222801
|
+
const partTokens = estimateTokens(part);
|
|
222802
|
+
if (partTokens > maxTokens) {
|
|
222803
|
+
if (acc) {
|
|
222804
|
+
results.push(acc);
|
|
222805
|
+
acc = "";
|
|
222806
|
+
accTokens = 0;
|
|
222807
|
+
}
|
|
222808
|
+
results.push(...splitOversizedText(part, maxTokens));
|
|
222809
|
+
continue;
|
|
222810
|
+
}
|
|
222811
|
+
if (acc && accTokens + partTokens > maxTokens) {
|
|
222812
|
+
results.push(acc);
|
|
222813
|
+
acc = "";
|
|
222814
|
+
accTokens = 0;
|
|
222815
|
+
}
|
|
222816
|
+
acc = acc ? acc + `
|
|
222817
|
+
|
|
222818
|
+
` + part : part;
|
|
222819
|
+
accTokens += partTokens;
|
|
222820
|
+
}
|
|
222821
|
+
if (acc)
|
|
222822
|
+
results.push(acc);
|
|
222823
|
+
return results;
|
|
222824
|
+
}
|
|
222825
|
+
const lines = text2.split(`
|
|
222826
|
+
`);
|
|
222827
|
+
if (lines.length > 1) {
|
|
222828
|
+
const blocks = mergeAtomicBlocks(lines);
|
|
222829
|
+
const results = [];
|
|
222830
|
+
let acc = "";
|
|
222831
|
+
let accTokens = 0;
|
|
222832
|
+
for (const block of blocks) {
|
|
222833
|
+
const blockTokens = estimateTokens(block);
|
|
222834
|
+
if (blockTokens > maxTokens) {
|
|
222835
|
+
if (acc) {
|
|
222836
|
+
results.push(acc);
|
|
222837
|
+
acc = "";
|
|
222838
|
+
accTokens = 0;
|
|
222839
|
+
}
|
|
222840
|
+
results.push(block);
|
|
222841
|
+
continue;
|
|
222842
|
+
}
|
|
222843
|
+
if (acc && accTokens + blockTokens > maxTokens) {
|
|
222844
|
+
results.push(acc);
|
|
222845
|
+
acc = "";
|
|
222846
|
+
accTokens = 0;
|
|
222847
|
+
}
|
|
222848
|
+
acc = acc ? acc + `
|
|
222849
|
+
` + block : block;
|
|
222850
|
+
accTokens += blockTokens;
|
|
222851
|
+
}
|
|
222852
|
+
if (acc)
|
|
222853
|
+
results.push(acc);
|
|
222854
|
+
return results;
|
|
222855
|
+
}
|
|
222856
|
+
return forceBreakText(text2, maxTokens);
|
|
222857
|
+
}
|
|
222858
|
+
function mergeAtomicBlocks(lines) {
|
|
222859
|
+
const result = [];
|
|
222860
|
+
let i = 0;
|
|
222861
|
+
while (i < lines.length) {
|
|
222862
|
+
const line = lines[i];
|
|
222863
|
+
const trimmed = line.trimStart();
|
|
222864
|
+
if (/^(`{3,}|~{3,})/.test(trimmed)) {
|
|
222865
|
+
const fence = trimmed.match(/^(`{3,}|~{3,})/)[1];
|
|
222866
|
+
const fenceChar = fence[0];
|
|
222867
|
+
const fenceLen = fence.length;
|
|
222868
|
+
const blockLines = [line];
|
|
222869
|
+
i++;
|
|
222870
|
+
while (i < lines.length) {
|
|
222871
|
+
blockLines.push(lines[i]);
|
|
222872
|
+
const inner = lines[i].trimStart();
|
|
222873
|
+
if (inner.startsWith(fenceChar) && inner.match(new RegExp(`^${fenceChar === "`" ? "`" : "~"}{${fenceLen},}\\s*$`))) {
|
|
222874
|
+
i++;
|
|
222875
|
+
break;
|
|
222876
|
+
}
|
|
222877
|
+
i++;
|
|
222878
|
+
}
|
|
222879
|
+
result.push(blockLines.join(`
|
|
222880
|
+
`));
|
|
222881
|
+
continue;
|
|
222882
|
+
}
|
|
222883
|
+
if (trimmed.startsWith("|")) {
|
|
222884
|
+
const tableLines = [line];
|
|
222885
|
+
i++;
|
|
222886
|
+
while (i < lines.length && lines[i].trimStart().startsWith("|")) {
|
|
222887
|
+
tableLines.push(lines[i]);
|
|
222888
|
+
i++;
|
|
222889
|
+
}
|
|
222890
|
+
result.push(tableLines.join(`
|
|
222891
|
+
`));
|
|
222892
|
+
continue;
|
|
222893
|
+
}
|
|
222894
|
+
result.push(line);
|
|
222895
|
+
i++;
|
|
222896
|
+
}
|
|
222897
|
+
return result;
|
|
222898
|
+
}
|
|
222899
|
+
function forceBreakText(text2, maxTokens) {
|
|
222900
|
+
const maxChars = maxTokens * 4;
|
|
222901
|
+
const results = [];
|
|
222902
|
+
let remaining = text2;
|
|
222903
|
+
while (remaining.length > maxChars) {
|
|
222904
|
+
let breakAt = maxChars;
|
|
222905
|
+
const spaceIdx = remaining.lastIndexOf(" ", maxChars);
|
|
222906
|
+
if (spaceIdx > maxChars * 0.7) {
|
|
222907
|
+
breakAt = spaceIdx;
|
|
222908
|
+
}
|
|
222909
|
+
results.push(remaining.slice(0, breakAt).trim());
|
|
222910
|
+
remaining = remaining.slice(breakAt).trim();
|
|
222911
|
+
}
|
|
222912
|
+
if (remaining)
|
|
222913
|
+
results.push(remaining);
|
|
222914
|
+
return results;
|
|
222915
|
+
}
|
|
222339
222916
|
function buildBreadcrumb(sections, sectionIndex) {
|
|
222340
222917
|
const current = sections[sectionIndex];
|
|
222341
222918
|
if (current.level <= 0)
|
|
@@ -222364,11 +222941,53 @@ function sectionHeadingLine(section) {
|
|
|
222364
222941
|
return "";
|
|
222365
222942
|
return `${"#".repeat(section.level)} ${section.heading}`;
|
|
222366
222943
|
}
|
|
222944
|
+
function buildCoarseParagraphs(sections, paragraphMaxTokens) {
|
|
222945
|
+
const result = [];
|
|
222946
|
+
const rawEntries = [];
|
|
222947
|
+
for (let sIdx = 0;sIdx < sections.length; sIdx++) {
|
|
222948
|
+
const section = sections[sIdx];
|
|
222949
|
+
if (!section.body.trim())
|
|
222950
|
+
continue;
|
|
222951
|
+
const bodyTokens = estimateTokens(section.body);
|
|
222952
|
+
if (bodyTokens > paragraphMaxTokens) {
|
|
222953
|
+
const parts = splitOversizedText(section.body, paragraphMaxTokens);
|
|
222954
|
+
for (const part of parts) {
|
|
222955
|
+
rawEntries.push({ sectionIndex: sIdx, text: part, tokens: estimateTokens(part) });
|
|
222956
|
+
}
|
|
222957
|
+
} else {
|
|
222958
|
+
rawEntries.push({ sectionIndex: sIdx, text: section.body, tokens: bodyTokens });
|
|
222959
|
+
}
|
|
222960
|
+
}
|
|
222961
|
+
const MERGE_THRESHOLD = 150;
|
|
222962
|
+
const merged = [];
|
|
222963
|
+
for (const entry of rawEntries) {
|
|
222964
|
+
const last = merged[merged.length - 1];
|
|
222965
|
+
if (last && last.tokens < MERGE_THRESHOLD && entry.tokens < MERGE_THRESHOLD && last.tokens + entry.tokens <= paragraphMaxTokens) {
|
|
222966
|
+
last.text = last.text + `
|
|
222967
|
+
|
|
222968
|
+
` + entry.text;
|
|
222969
|
+
last.tokens += entry.tokens;
|
|
222970
|
+
} else {
|
|
222971
|
+
merged.push({ ...entry });
|
|
222972
|
+
}
|
|
222973
|
+
}
|
|
222974
|
+
let pIdx = 0;
|
|
222975
|
+
for (const entry of merged) {
|
|
222976
|
+
result.push({
|
|
222977
|
+
sectionIndex: entry.sectionIndex,
|
|
222978
|
+
paragraphIndex: pIdx++,
|
|
222979
|
+
text: entry.text
|
|
222980
|
+
});
|
|
222981
|
+
}
|
|
222982
|
+
return result;
|
|
222983
|
+
}
|
|
222367
222984
|
function chunkMarkdown(content, options = {}) {
|
|
222368
222985
|
const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
|
|
222986
|
+
const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
|
|
222369
222987
|
const sections = parseSections(content);
|
|
222370
222988
|
if (sections.length === 0)
|
|
222371
222989
|
return [];
|
|
222990
|
+
const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
|
|
222372
222991
|
const chunks = [];
|
|
222373
222992
|
let pendingSections = [];
|
|
222374
222993
|
let pendingTokens = 0;
|
|
@@ -222386,14 +223005,16 @@ function chunkMarkdown(content, options = {}) {
|
|
|
222386
223005
|
const heading = sectionHeadingLine(entry.section);
|
|
222387
223006
|
if (heading)
|
|
222388
223007
|
textParts.push(heading);
|
|
222389
|
-
|
|
222390
|
-
|
|
222391
|
-
|
|
222392
|
-
|
|
222393
|
-
|
|
222394
|
-
|
|
222395
|
-
|
|
222396
|
-
|
|
223008
|
+
const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === entry.sectionIndex);
|
|
223009
|
+
for (const p of sectionParas) {
|
|
223010
|
+
if (!paragraphs.some((existing) => existing.paragraphIndex === p.paragraphIndex && existing.text === p.text)) {
|
|
223011
|
+
textParts.push(p.text);
|
|
223012
|
+
paragraphs.push({
|
|
223013
|
+
sectionIndex: p.sectionIndex,
|
|
223014
|
+
paragraphIndex: p.paragraphIndex,
|
|
223015
|
+
text: p.text
|
|
223016
|
+
});
|
|
223017
|
+
}
|
|
222397
223018
|
}
|
|
222398
223019
|
}
|
|
222399
223020
|
chunks.push({
|
|
@@ -222416,7 +223037,7 @@ function chunkMarkdown(content, options = {}) {
|
|
|
222416
223037
|
` : "") + section.body);
|
|
222417
223038
|
if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
|
|
222418
223039
|
flushPending();
|
|
222419
|
-
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
|
|
223040
|
+
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
|
|
222420
223041
|
continue;
|
|
222421
223042
|
}
|
|
222422
223043
|
const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
|
|
@@ -222429,9 +223050,10 @@ function chunkMarkdown(content, options = {}) {
|
|
|
222429
223050
|
flushPending();
|
|
222430
223051
|
return chunks;
|
|
222431
223052
|
}
|
|
222432
|
-
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
|
|
223053
|
+
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
|
|
222433
223054
|
const headingLine = sectionHeadingLine(section);
|
|
222434
223055
|
const prefix = breadcrumbPrefix(breadcrumb);
|
|
223056
|
+
const sectionParas = coarseParagraphs.filter((p) => p.sectionIndex === sectionIndex);
|
|
222435
223057
|
let accParagraphs = [];
|
|
222436
223058
|
let accTextParts = [];
|
|
222437
223059
|
let accTokens = 0;
|
|
@@ -222458,18 +223080,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
|
|
|
222458
223080
|
accTokens = baseOverhead;
|
|
222459
223081
|
}
|
|
222460
223082
|
accTokens = baseOverhead;
|
|
222461
|
-
for (
|
|
222462
|
-
const
|
|
222463
|
-
const pTokens = estimateTokens(pText);
|
|
223083
|
+
for (const p of sectionParas) {
|
|
223084
|
+
const pTokens = estimateTokens(p.text);
|
|
222464
223085
|
if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
|
|
222465
223086
|
flushAcc();
|
|
222466
223087
|
}
|
|
222467
|
-
accParagraphs.push({ sectionIndex, paragraphIndex:
|
|
222468
|
-
accTextParts.push(
|
|
223088
|
+
accParagraphs.push({ sectionIndex, paragraphIndex: p.paragraphIndex, text: p.text });
|
|
223089
|
+
accTextParts.push(p.text);
|
|
222469
223090
|
accTokens += pTokens;
|
|
222470
223091
|
}
|
|
222471
223092
|
flushAcc();
|
|
222472
223093
|
}
|
|
223094
|
+
// ../llm/src/chunking/normalizeMarkdown.ts
|
|
223095
|
+
function normalizeMarkdown(content) {
|
|
223096
|
+
const stats = { repairs: {} };
|
|
223097
|
+
function count(category) {
|
|
223098
|
+
stats.repairs[category] = (stats.repairs[category] ?? 0) + 1;
|
|
223099
|
+
}
|
|
223100
|
+
let result = stripBomAndInvisible(content, count);
|
|
223101
|
+
result = normalizeLineEndings(result, count);
|
|
223102
|
+
const lines = result.split(`
|
|
223103
|
+
`);
|
|
223104
|
+
const output = processBlocks(lines, count);
|
|
223105
|
+
return { content: output.join(`
|
|
223106
|
+
`), stats };
|
|
223107
|
+
}
|
|
223108
|
+
function stripBomAndInvisible(text2, count) {
|
|
223109
|
+
const cleaned = text2.replace(/[\uFEFF\u200B\u200C\u200D]/g, "");
|
|
223110
|
+
if (cleaned.length !== text2.length) {
|
|
223111
|
+
count("invisible_chars");
|
|
223112
|
+
}
|
|
223113
|
+
return cleaned;
|
|
223114
|
+
}
|
|
223115
|
+
function normalizeLineEndings(text2, count) {
|
|
223116
|
+
if (text2.includes("\r")) {
|
|
223117
|
+
count("line_endings");
|
|
223118
|
+
return text2.replace(/\r\n?/g, `
|
|
223119
|
+
`);
|
|
223120
|
+
}
|
|
223121
|
+
return text2;
|
|
223122
|
+
}
|
|
223123
|
+
function processBlocks(inputLines, count) {
|
|
223124
|
+
const lines = splitInlineFences(inputLines, count);
|
|
223125
|
+
const output = [];
|
|
223126
|
+
let i = 0;
|
|
223127
|
+
while (i < lines.length) {
|
|
223128
|
+
const line = lines[i];
|
|
223129
|
+
const trimmed = line.trimStart();
|
|
223130
|
+
const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
|
|
223131
|
+
if (fenceMatch) {
|
|
223132
|
+
const result = handleCodeFence(lines, i, fenceMatch[1], count);
|
|
223133
|
+
output.push(...result.lines);
|
|
223134
|
+
i = result.nextIndex;
|
|
223135
|
+
continue;
|
|
223136
|
+
}
|
|
223137
|
+
if (looksLikeTableRow(trimmed)) {
|
|
223138
|
+
const result = handleTableBlock(lines, i, count);
|
|
223139
|
+
output.push(...result.lines);
|
|
223140
|
+
i = result.nextIndex;
|
|
223141
|
+
continue;
|
|
223142
|
+
}
|
|
223143
|
+
if (trimmed === "") {
|
|
223144
|
+
const result = handleBlankLines(lines, i, count);
|
|
223145
|
+
output.push(...result.lines);
|
|
223146
|
+
i = result.nextIndex;
|
|
223147
|
+
continue;
|
|
223148
|
+
}
|
|
223149
|
+
if (trimmed.startsWith("<!--")) {
|
|
223150
|
+
const result = handleHtmlComment(lines, i, count);
|
|
223151
|
+
output.push(...result.lines);
|
|
223152
|
+
i = result.nextIndex;
|
|
223153
|
+
continue;
|
|
223154
|
+
}
|
|
223155
|
+
if (looksLikeJsonBlockStart(trimmed)) {
|
|
223156
|
+
const result = handleUnfencedJson(lines, i, count);
|
|
223157
|
+
if (result) {
|
|
223158
|
+
output.push(...result.lines);
|
|
223159
|
+
i = result.nextIndex;
|
|
223160
|
+
continue;
|
|
223161
|
+
}
|
|
223162
|
+
}
|
|
223163
|
+
output.push(line);
|
|
223164
|
+
i++;
|
|
223165
|
+
}
|
|
223166
|
+
return output;
|
|
223167
|
+
}
|
|
223168
|
+
function handleCodeFence(lines, startIdx, fence, count) {
|
|
223169
|
+
const fenceChar = fence[0];
|
|
223170
|
+
const fenceLen = fence.length;
|
|
223171
|
+
const result = [lines[startIdx]];
|
|
223172
|
+
let i = startIdx + 1;
|
|
223173
|
+
while (i < lines.length) {
|
|
223174
|
+
const trimmed = lines[i].trimStart();
|
|
223175
|
+
result.push(lines[i]);
|
|
223176
|
+
const closingRe = new RegExp(`^${fenceChar === "`" ? "`" : "~"}{${fenceLen},}\\s*$`);
|
|
223177
|
+
if (closingRe.test(trimmed)) {
|
|
223178
|
+
return { lines: result, nextIndex: i + 1 };
|
|
223179
|
+
}
|
|
223180
|
+
i++;
|
|
223181
|
+
}
|
|
223182
|
+
count("unclosed_code_fence");
|
|
223183
|
+
result.push(fence);
|
|
223184
|
+
return { lines: result, nextIndex: i };
|
|
223185
|
+
}
|
|
223186
|
+
function handleTableBlock(lines, startIdx, count) {
|
|
223187
|
+
const tableLines = [];
|
|
223188
|
+
let i = startIdx;
|
|
223189
|
+
while (i < lines.length && looksLikeTableRow(lines[i].trimStart())) {
|
|
223190
|
+
tableLines.push(lines[i]);
|
|
223191
|
+
i++;
|
|
223192
|
+
}
|
|
223193
|
+
if (tableLines.length < 2) {
|
|
223194
|
+
return { lines: tableLines, nextIndex: i };
|
|
223195
|
+
}
|
|
223196
|
+
const normalized = tableLines.map((line) => {
|
|
223197
|
+
const trimmed = line.trimStart();
|
|
223198
|
+
if (!trimmed.startsWith("|") && trimmed.includes("|")) {
|
|
223199
|
+
count("table_leading_pipe");
|
|
223200
|
+
return "| " + trimmed + (trimmed.endsWith("|") ? "" : " |");
|
|
223201
|
+
}
|
|
223202
|
+
return line;
|
|
223203
|
+
});
|
|
223204
|
+
const hasSeparator = normalized.some((line) => /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/.test(line.trim()));
|
|
223205
|
+
if (!hasSeparator && normalized.length >= 2) {
|
|
223206
|
+
const firstRow = normalized[0].trim();
|
|
223207
|
+
const colCount = countPipes(firstRow) - 1;
|
|
223208
|
+
if (colCount >= 2) {
|
|
223209
|
+
const separator = "| " + Array(colCount).fill("---").join(" | ") + " |";
|
|
223210
|
+
count("table_missing_separator");
|
|
223211
|
+
const result = [normalized[0], separator, ...normalized.slice(1)];
|
|
223212
|
+
return { lines: result, nextIndex: i };
|
|
223213
|
+
}
|
|
223214
|
+
}
|
|
223215
|
+
return { lines: normalized, nextIndex: i };
|
|
223216
|
+
}
|
|
223217
|
+
function handleBlankLines(lines, startIdx, count) {
|
|
223218
|
+
let i = startIdx;
|
|
223219
|
+
while (i < lines.length && lines[i].trim() === "") {
|
|
223220
|
+
i++;
|
|
223221
|
+
}
|
|
223222
|
+
const blankCount = i - startIdx;
|
|
223223
|
+
if (blankCount > 2) {
|
|
223224
|
+
count("excessive_blank_lines");
|
|
223225
|
+
return { lines: [""], nextIndex: i };
|
|
223226
|
+
}
|
|
223227
|
+
return { lines: lines.slice(startIdx, i), nextIndex: i };
|
|
223228
|
+
}
|
|
223229
|
+
function handleHtmlComment(lines, startIdx, count) {
|
|
223230
|
+
const firstLine = lines[startIdx];
|
|
223231
|
+
if (firstLine.includes("-->")) {
|
|
223232
|
+
count("html_comment");
|
|
223233
|
+
return { lines: [], nextIndex: startIdx + 1 };
|
|
223234
|
+
}
|
|
223235
|
+
let i = startIdx + 1;
|
|
223236
|
+
while (i < lines.length) {
|
|
223237
|
+
if (lines[i].includes("-->")) {
|
|
223238
|
+
count("html_comment");
|
|
223239
|
+
return { lines: [], nextIndex: i + 1 };
|
|
223240
|
+
}
|
|
223241
|
+
i++;
|
|
223242
|
+
}
|
|
223243
|
+
return { lines: [firstLine], nextIndex: startIdx + 1 };
|
|
223244
|
+
}
|
|
223245
|
+
function looksLikeJsonBlockStart(trimmed) {
|
|
223246
|
+
return trimmed === "{" || trimmed === "[";
|
|
223247
|
+
}
|
|
223248
|
+
var MIN_JSON_BLOCK_LINES = 5;
|
|
223249
|
+
// Detects a bare (unfenced) JSON object/array starting at lines[startIdx]
// and, when it qualifies, wraps it in a ```json code fence.
// Returns { lines: fencedLines, nextIndex } on success, or null when the
// candidate is too short, malformed, or never balances its brackets.
// `count` is a metrics callback invoked once per rewritten block.
function handleUnfencedJson(lines, startIdx, count) {
  const opener = lines[startIdx].trimStart();
  // Caller guarantees the line begins with "{" or "[" (looksLikeJsonBlockStart).
  const openChar = opener[0];
  const closeChar = openChar === "{" ? "}" : "]";
  let depth = 0;
  let i = startIdx;
  // Tracks double-quoted strings so brackets inside string values are ignored.
  // NOTE(review): the flag deliberately carries across line boundaries, so an
  // unterminated string on one line suppresses bracket counting on the next.
  let inString = false;
  while (i < lines.length) {
    const line = lines[i];
    for (let c = 0;c < line.length; c++) {
      const ch = line[c];
      if (ch === "\\" && inString) {
        // Skip the escaped character (e.g. \" inside a string value).
        c++;
        continue;
      }
      if (ch === '"') {
        inString = !inString;
        continue;
      }
      if (inString)
        continue;
      if (ch === "/" && c + 1 < line.length && line[c + 1] === "/") {
        // "//" outside a string: treat the rest of the line as a comment.
        break;
      }
      if (ch === "{" || ch === "[")
        depth++;
      else if (ch === "}" || ch === "]")
        depth--;
    }
    i++;
    if (depth === 0) {
      // Brackets balanced at a line boundary: candidate block is complete.
      const blockLen = i - startIdx;
      if (blockLen < MIN_JSON_BLOCK_LINES) {
        // Too short to be worth fencing.
        return null;
      }
      const lastTrimmed = lines[i - 1].trimEnd();
      if (!lastTrimmed.endsWith(closeChar)) {
        // Balanced, but the final line does not end with the matching
        // closer — probably not a standalone JSON block.
        return null;
      }
      count("unfenced_json_block");
      const fenced = ["```json"];
      for (let j = startIdx;j < i; j++) {
        fenced.push(lines[j]);
      }
      fenced.push("```");
      return { lines: fenced, nextIndex: i };
    }
    if (depth < 0) {
      // More closers than openers: malformed input, give up.
      return null;
    }
  }
  // Ran out of lines before the brackets balanced.
  return null;
}
|
|
223302
|
+
// Splits lines where a code-fence opener/closer trails other content
// (e.g. "some text ```") into two lines: the text, then the fence on its
// own line. Lines that already start with a fence pass through untouched.
// `count` is a metrics callback invoked once per split performed.
function splitInlineFences(lines, count) {
  // Expand a single line into the one-or-two lines it should become.
  const expand = (line) => {
    const stripped = line.trimStart();
    // Lines that begin with a fence are already well-formed.
    if (/^(`{3,}|~{3,})/.test(stripped)) {
      return [line];
    }
    const match = stripped.match(/(`{3,}|~{3,})(\S*)\s*$/);
    if (!match) {
      return [line];
    }
    const fencePos = stripped.lastIndexOf(match[1]);
    const prefix = stripped.substring(0, fencePos);
    // Only split when there is real content in front of the fence.
    if (prefix.trim().length === 0) {
      return [line];
    }
    count("inline_code_fence");
    const indent = line.substring(0, line.length - stripped.length);
    return [indent + prefix.trimEnd(), stripped.substring(fencePos)];
  };
  return lines.flatMap(expand);
}
|
|
223327
|
+
// Heuristic: a trimmed line is a markdown table-row candidate when it is
// not a heading or a fence opener and contains at least one unescaped pipe.
function looksLikeTableRow(trimmed) {
  const nonRowPrefixes = ["#", "```", "~~~"];
  const isExcluded = nonRowPrefixes.some((prefix) => trimmed.startsWith(prefix));
  if (isExcluded) {
    return false;
  }
  return countPipes(trimmed) >= 1;
}
|
|
223333
|
+
// Counts "|" characters that are not immediately preceded by a backslash.
// Escaped pipes ("\|") are excluded so they don't register as table
// column separators.
function countPipes(text2) {
  let total = 0;
  let prev = "";
  for (const ch of text2) {
    if (ch === "|" && prev !== "\\") {
      total++;
    }
    prev = ch;
  }
  return total;
}
|
|
222473
223342
|
// ../llm/src/utils/mapConcurrent.ts
|
|
222474
223343
|
async function mapConcurrent(items, concurrency, fn) {
|
|
222475
223344
|
const results = [];
|