@c4a/server-cli 0.4.15-alpha.4 → 0.4.15-alpha.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -5
- package/index.js +965 -90
- package/package.json +1 -1
- package/serve.js +2027 -221
- package/web/assets/ContentDetail--oZBzWh0.js +1 -0
- package/web/assets/ContentDetail-B5s8bbFo.js +1 -0
- package/web/assets/ContentDetail-C0zfArPg.js +1 -0
- package/web/assets/ContentDetail-C3kXsx-i.js +1 -0
- package/web/assets/ContentDetail-CcLGF_Yi.js +1 -0
- package/web/assets/ContentDetail-D-2xyerw.js +1 -0
- package/web/assets/ContentDetail-DlQ8URkx.js +1 -0
- package/web/assets/ContentDetail-TPc0m0eM.js +1 -0
- package/web/assets/ContentDetail-y0yi2qln.js +1 -0
- package/web/assets/EntityDetail-3CFtMmgQ.js +1 -0
- package/web/assets/EntityDetail-BI3etmj4.js +1 -0
- package/web/assets/EntityDetail-C9k4cMVL.js +1 -0
- package/web/assets/EntityDetail-CoFb-qZW.js +1 -0
- package/web/assets/EntityDetail-D_WP7tD4.js +1 -0
- package/web/assets/EntityDetail-DiJPemDY.js +1 -0
- package/web/assets/EntityDetail-DihnDvhA.js +1 -0
- package/web/assets/EntityDetail-DyDH4GAw.js +1 -0
- package/web/assets/EntityDetail-dIZiNN2t.js +1 -0
- package/web/assets/RelationDetail-B2gHrceI.js +1 -0
- package/web/assets/RelationDetail-BK8C5waL.js +1 -0
- package/web/assets/RelationDetail-CEq9vopD.js +1 -0
- package/web/assets/RelationDetail-CaYrspaS.js +1 -0
- package/web/assets/RelationDetail-CpoGdy25.js +1 -0
- package/web/assets/RelationDetail-DU9ECyHi.js +1 -0
- package/web/assets/RelationDetail-Dz7HAlU5.js +1 -0
- package/web/assets/RelationDetail-Wh3IgNaF.js +1 -0
- package/web/assets/RelationDetail-zZ_ZfkYX.js +1 -0
- package/web/assets/index-BKETuM1m.js +111 -0
- package/web/assets/index-BPMqeFze.js +111 -0
- package/web/assets/index-BgRuvBL5.js +111 -0
- package/web/assets/index-C96WspeJ.css +1 -0
- package/web/assets/index-CcrkBEZl.js +111 -0
- package/web/assets/index-DGDx8sCs.js +111 -0
- package/web/assets/index-DIyAwnqE.js +111 -0
- package/web/assets/index-DW1cCA8v.js +111 -0
- package/web/assets/index-DiAYi5t8.css +1 -0
- package/web/assets/index-FOCWvgW_.css +1 -0
- package/web/assets/index-daOjyLzy.css +1 -0
- package/web/assets/index-moF8uSEi.js +111 -0
- package/web/assets/index-sPNyENFN.js +111 -0
- package/web/assets/index-uGqDxUnx.css +1 -0
- package/web/index.html +2 -2
package/serve.js
CHANGED
|
@@ -281,6 +281,10 @@ var init_serverConfig = __esm(() => {
|
|
|
281
281
|
default_model: "gemini-3-pro-preview"
|
|
282
282
|
}
|
|
283
283
|
},
|
|
284
|
+
indexing: {
|
|
285
|
+
task_timeout_ms: 150 * 60 * 1000,
|
|
286
|
+
file_timeout_ms: 15 * 60 * 1000
|
|
287
|
+
},
|
|
284
288
|
embedding: {
|
|
285
289
|
provider: "huggingface",
|
|
286
290
|
huggingface: {
|
|
@@ -4340,7 +4344,7 @@ var init_atomsSchema = __esm(() => {
|
|
|
4340
4344
|
init_zod();
|
|
4341
4345
|
init_base();
|
|
4342
4346
|
init_baseSchema();
|
|
4343
|
-
confidenceAtomSchema = exports_external.number().min(0).max(1).optional();
|
|
4347
|
+
confidenceAtomSchema = exports_external.number().min(0).max(1).optional().catch(undefined);
|
|
4344
4348
|
entityAtomSchema = exports_external.object({
|
|
4345
4349
|
name: exports_external.string(),
|
|
4346
4350
|
kind: kindSchema.optional().catch(undefined),
|
|
@@ -186001,6 +186005,10 @@ function mergeServerConfig(parsed) {
|
|
|
186001
186005
|
...isPlainObject3(input.llm?.google) ? input.llm?.google : {}
|
|
186002
186006
|
}
|
|
186003
186007
|
},
|
|
186008
|
+
indexing: {
|
|
186009
|
+
...defaults2.indexing,
|
|
186010
|
+
...isPlainObject3(input.indexing) ? input.indexing : {}
|
|
186011
|
+
},
|
|
186004
186012
|
embedding: {
|
|
186005
186013
|
...defaults2.embedding,
|
|
186006
186014
|
...isPlainObject3(input.embedding) ? input.embedding : {},
|
|
@@ -194956,14 +194964,21 @@ function isRetryableStatus(status) {
|
|
|
194956
194964
|
function isAuthStatus(status) {
|
|
194957
194965
|
return status === 401 || status === 403;
|
|
194958
194966
|
}
|
|
194959
|
-
function
|
|
194960
|
-
|
|
194967
|
+
function throwLlmError(error40, status) {
|
|
194968
|
+
const detail = toErrorMessage(error40);
|
|
194969
|
+
const statusTag = status ? ` [HTTP ${status}]` : "";
|
|
194970
|
+
if (isAuthStatus(status)) {
|
|
194971
|
+
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, `LLM 认证失败${statusTag}: ${detail}`, detail);
|
|
194972
|
+
}
|
|
194973
|
+
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, `LLM 调用失败${statusTag}: ${detail}`, detail);
|
|
194961
194974
|
}
|
|
194962
194975
|
|
|
194963
194976
|
class LlmServiceImpl {
|
|
194964
194977
|
options;
|
|
194978
|
+
supportsTemperature;
|
|
194965
194979
|
constructor(options) {
|
|
194966
194980
|
this.options = options;
|
|
194981
|
+
this.supportsTemperature = options.provider !== "openai";
|
|
194967
194982
|
}
|
|
194968
194983
|
async generateText(prompt, options) {
|
|
194969
194984
|
if (this.options.forceStream) {
|
|
@@ -194975,7 +194990,7 @@ class LlmServiceImpl {
|
|
|
194975
194990
|
model: this.options.languageModel,
|
|
194976
194991
|
prompt,
|
|
194977
194992
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
194978
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
194993
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
194979
194994
|
maxRetries: 0
|
|
194980
194995
|
};
|
|
194981
194996
|
if (options?.systemPrompt) {
|
|
@@ -195012,13 +195027,7 @@ class LlmServiceImpl {
|
|
|
195012
195027
|
durationMs,
|
|
195013
195028
|
error: toErrorMessage(error40)
|
|
195014
195029
|
});
|
|
195015
|
-
|
|
195016
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195017
|
-
}
|
|
195018
|
-
if (isBadRequest(status)) {
|
|
195019
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195020
|
-
}
|
|
195021
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195030
|
+
throwLlmError(error40, status);
|
|
195022
195031
|
}
|
|
195023
195032
|
}
|
|
195024
195033
|
async generateTextViaStream(prompt, options) {
|
|
@@ -195028,7 +195037,7 @@ class LlmServiceImpl {
|
|
|
195028
195037
|
model: this.options.languageModel,
|
|
195029
195038
|
prompt,
|
|
195030
195039
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
195031
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
195040
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
195032
195041
|
maxRetries: 0
|
|
195033
195042
|
};
|
|
195034
195043
|
if (options?.systemPrompt) {
|
|
@@ -195066,13 +195075,7 @@ class LlmServiceImpl {
|
|
|
195066
195075
|
durationMs: Date.now() - startedAt,
|
|
195067
195076
|
error: toErrorMessage(error40)
|
|
195068
195077
|
});
|
|
195069
|
-
|
|
195070
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195071
|
-
}
|
|
195072
|
-
if (isBadRequest(status)) {
|
|
195073
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195074
|
-
}
|
|
195075
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195078
|
+
throwLlmError(error40, status);
|
|
195076
195079
|
}
|
|
195077
195080
|
}
|
|
195078
195081
|
streamText(prompt, options) {
|
|
@@ -195095,7 +195098,7 @@ class LlmServiceImpl {
|
|
|
195095
195098
|
model: this.options.languageModel,
|
|
195096
195099
|
prompt,
|
|
195097
195100
|
maxOutputTokens: options?.maxTokens ?? DEFAULT_MAX_TOKENS,
|
|
195098
|
-
temperature: options?.temperature ?? DEFAULT_TEMPERATURE,
|
|
195101
|
+
...this.supportsTemperature ? { temperature: options?.temperature ?? DEFAULT_TEMPERATURE } : {},
|
|
195099
195102
|
maxRetries: 0,
|
|
195100
195103
|
onFinish: (event) => {
|
|
195101
195104
|
const finishEvent = event;
|
|
@@ -195141,13 +195144,7 @@ class LlmServiceImpl {
|
|
|
195141
195144
|
durationMs: Date.now() - startedAt,
|
|
195142
195145
|
error: toErrorMessage(error40)
|
|
195143
195146
|
});
|
|
195144
|
-
|
|
195145
|
-
throw new C4AError("LLM_AUTH_FAILED" /* LLM_AUTH_FAILED */, "LLM 认证失败", toErrorMessage(error40));
|
|
195146
|
-
}
|
|
195147
|
-
if (isBadRequest(status)) {
|
|
195148
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195149
|
-
}
|
|
195150
|
-
throw new C4AError("LLM_CALL_FAILED" /* LLM_CALL_FAILED */, "LLM 调用失败", toErrorMessage(error40));
|
|
195147
|
+
throwLlmError(error40, status);
|
|
195151
195148
|
}
|
|
195152
195149
|
}
|
|
195153
195150
|
}
|
|
@@ -195858,7 +195855,12 @@ function parseExtractionOutput(raw5, schema2) {
|
|
|
195858
195855
|
return { success: false, error: new Error("Empty output") };
|
|
195859
195856
|
}
|
|
195860
195857
|
const protocolParsed = tryParseProtocol(trimmed);
|
|
195861
|
-
|
|
195858
|
+
let parsed = protocolParsed ?? tryParseJson(trimmed);
|
|
195859
|
+
if (Array.isArray(parsed)) {
|
|
195860
|
+
parsed = { paragraphs: parsed };
|
|
195861
|
+
}
|
|
195862
|
+
parsed = normalizeFlatOutput(parsed);
|
|
195863
|
+
parsed = stripNulls(parsed);
|
|
195862
195864
|
const result = schema2.safeParse(parsed);
|
|
195863
195865
|
if (!result.success) {
|
|
195864
195866
|
return { success: false, error: result.error };
|
|
@@ -195928,6 +195930,37 @@ function tryParseJson(raw5) {
|
|
|
195928
195930
|
function repairAndParse(raw5) {
|
|
195929
195931
|
return JSON.parse(jsonrepair(raw5));
|
|
195930
195932
|
}
|
|
195933
|
+
var PARAGRAPH_TAG_RE = /^P\d+$/;
|
|
195934
|
+
function normalizeFlatOutput(parsed) {
|
|
195935
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed))
|
|
195936
|
+
return parsed;
|
|
195937
|
+
const obj = parsed;
|
|
195938
|
+
if ("paragraphs" in obj)
|
|
195939
|
+
return parsed;
|
|
195940
|
+
const keys = Object.keys(obj);
|
|
195941
|
+
if (keys.length === 0)
|
|
195942
|
+
return { paragraphs: [] };
|
|
195943
|
+
const allTags = keys.every((k) => PARAGRAPH_TAG_RE.test(k));
|
|
195944
|
+
if (!allTags)
|
|
195945
|
+
return parsed;
|
|
195946
|
+
const paragraphs = keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: obj[tag2] }));
|
|
195947
|
+
return { paragraphs };
|
|
195948
|
+
}
|
|
195949
|
+
function stripNulls(value) {
|
|
195950
|
+
if (value === null)
|
|
195951
|
+
return;
|
|
195952
|
+
if (Array.isArray(value))
|
|
195953
|
+
return value.map(stripNulls);
|
|
195954
|
+
if (typeof value === "object" && value !== null) {
|
|
195955
|
+
const out2 = {};
|
|
195956
|
+
for (const [k, v] of Object.entries(value)) {
|
|
195957
|
+
if (v !== null)
|
|
195958
|
+
out2[k] = stripNulls(v);
|
|
195959
|
+
}
|
|
195960
|
+
return out2;
|
|
195961
|
+
}
|
|
195962
|
+
return value;
|
|
195963
|
+
}
|
|
195931
195964
|
function isRecord(value) {
|
|
195932
195965
|
return !!value && typeof value === "object" && "key" in value && "value" in value && typeof value.key === "string";
|
|
195933
195966
|
}
|
|
@@ -196184,20 +196217,20 @@ class GleaningExtractor {
|
|
|
196184
196217
|
// ../llm/src/prompts/docAtomAnnotation.ts
|
|
196185
196218
|
init_src();
|
|
196186
196219
|
var DOC_ATOM_DEFS = [
|
|
196187
|
-
["entities", "Named things: systems, services, modules,
|
|
196220
|
+
["entities", "Named things with independent identity — something you can ask questions about ('What does X do?', 'Who owns X?'). Examples: systems, services, modules, APIs, products. If it is a value, address, path, or configuration detail, it is an attribute of an entity, not an entity itself. (NOT people/teams — use roles for those). kind: implementation=internal systems/services, external=third-party dependencies, concept=abstract/not-yet-implemented", entityAtomSchema],
|
|
196188
196221
|
["relations", "Connections between entities", relationAtomSchema],
|
|
196189
196222
|
["behaviors", "Actions/operations: functions, API calls, user actions, workflows", behaviorAtomSchema],
|
|
196190
196223
|
["attributes", "Properties of entities", attributeAtomSchema],
|
|
196191
196224
|
["states", "Possible states of entities", stateAtomSchema],
|
|
196192
|
-
["rules", "
|
|
196225
|
+
["rules", "Conditional business/domain logic: IF condition THEN consequence (e.g., 'IF user not authenticated THEN reject request')", ruleAtomSchema],
|
|
196193
196226
|
["transitions", "State changes: from→to triggered by events or guards", transitionAtomSchema],
|
|
196194
196227
|
["events", "Occurrences that trigger behaviors", eventAtomSchema],
|
|
196195
196228
|
["decisions", "Architectural or business decisions", decisionAtomSchema],
|
|
196196
196229
|
["metrics", "Measurable targets: SLA, throughput, error_rate, with thresholds", metricAtomSchema],
|
|
196197
|
-
["roles", "Actors: human roles, teams, personas that perform behaviors", roleAtomSchema],
|
|
196198
|
-
["constraints", "
|
|
196230
|
+
["roles", "Actors: human roles, teams, personas that perform behaviors. kind: human=individual role, team=group/department, persona=user archetype. System-triggered actions use entity relations, NOT roles", roleAtomSchema],
|
|
196231
|
+
["constraints", "Declarative requirements: 'X must/should/must-not Y' (e.g., 'passwords must be >= 8 chars'). Unlike rules, constraints have no IF-THEN condition — they are unconditional mandates or restrictions", constraintAtomSchema],
|
|
196199
196232
|
["comparisons", "Side-by-side evaluations", comparisonAtomSchema],
|
|
196200
|
-
["boundaries", "
|
|
196233
|
+
["boundaries", "Explicit scope declarations: what is included vs excluded. Only extract when the text explicitly declares scope (e.g., 'this product covers X but NOT Y'). Implicit containment (A runs inside B) is expressed via entity relations, not boundaries", boundaryAtomSchema]
|
|
196201
196234
|
];
|
|
196202
196235
|
function buildAtomTypesBlock() {
|
|
196203
196236
|
return DOC_ATOM_DEFS.map(([name21, desc, schema2], i) => {
|
|
@@ -196220,30 +196253,38 @@ Each atom type has specific required fields. Fields with "?" suffix are optional
|
|
|
196220
196253
|
${ATOM_TYPES_BLOCK}
|
|
196221
196254
|
|
|
196222
196255
|
## Output Format
|
|
196223
|
-
Return a single JSON object
|
|
196256
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196224
196257
|
{
|
|
196225
|
-
"
|
|
196226
|
-
{
|
|
196227
|
-
|
|
196228
|
-
|
|
196229
|
-
|
|
196230
|
-
|
|
196231
|
-
|
|
196232
|
-
|
|
196233
|
-
}
|
|
196234
|
-
]
|
|
196258
|
+
"P0": {
|
|
196259
|
+
"entities": [{ "name": "UserService", "kind": "implementation", "confidence": 0.95 }],
|
|
196260
|
+
"relations": [{ "from": "UserService", "to": "Database", "type": "DEPENDS_ON", "confidence": 0.9 }]
|
|
196261
|
+
},
|
|
196262
|
+
"P3": {
|
|
196263
|
+
"constraints": [{ "description": "User must be authenticated before access", "severity": "must", "confidence": 0.9 }],
|
|
196264
|
+
"rules": [{ "description": "Reject request if user is not authenticated", "expression": "IF !user.isAuthenticated THEN reject", "confidence": 0.85 }]
|
|
196265
|
+
}
|
|
196235
196266
|
}
|
|
196236
196267
|
|
|
196237
196268
|
## Rules
|
|
196238
|
-
-
|
|
196269
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196270
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196239
196271
|
- Only include atom types that are actually found in a paragraph (all types are optional).
|
|
196240
196272
|
- Every atom MUST include all required fields for its type (see schemas above). Fields with "?" suffix are optional.
|
|
196241
196273
|
- **Enum fields MUST use ONLY the listed values.** For example, entity.kind must be one of "implementation"|"external"|"concept" — do NOT use values from other atom types (e.g., do NOT put "team" or "human" in entity.kind; those belong to roles.kind).
|
|
196242
196274
|
- Every atom MUST include a "confidence" field (0.0-1.0) indicating how confident you are in the extraction. Use higher values (0.85-1.0) for explicitly stated facts and lower values (0.5-0.7) for inferred or ambiguous information.
|
|
196243
196275
|
- **Classify correctly:** People, teams, and personas → "roles" (not "entities"). Technical systems, services, modules → "entities".
|
|
196276
|
+
- **Entity reference consistency (CRITICAL):** Every entity name referenced in relation.from, relation.to, behavior.subject, or any other cross-reference field MUST also appear in the "entities" array of the SAME paragraph (or a preceding paragraph in the same chunk). If an entity is mentioned for the first time in a relation, you MUST also extract it as an entity. This ensures no "dangling references" — every name used in relations has a corresponding entity declaration.
|
|
196277
|
+
- **Cross-atom reference consistency:** transitions[].from and transitions[].to values MUST exist in states[].values of the same entity. roles[].performs values MUST match names declared in behaviors[].name.
|
|
196278
|
+
- **Constraints vs rules distinction:** Use "constraints" for unconditional declarative mandates ('X must Y'). Use "rules" for conditional logic ('IF X THEN Y'). Do not mix them — a requirement with no condition is a constraint, a requirement triggered by a condition is a rule. Do NOT invent a rule for every constraint — only create a rule when the text explicitly states conditional logic.
|
|
196279
|
+
- **One statement, multiple atoms:** A single sentence can produce several atom types simultaneously. Do NOT force a choice — extract all that apply. Example: "system uptime must be ≥ 99.9%" → constraint (severity: must) + metric (threshold: "≥ 99.9%").
|
|
196280
|
+
- **Relation types:** Use standard relation types when possible: CONTAINS (parent→child composition), DEPENDS_ON (runtime dependency), IMPLEMENTS (code/component→spec realization), PRODUCES (process→output), TRIGGERS (event/process triggering), REFERENCES (weak cross-reference). Only invent a new type when none of these fit.
|
|
196281
|
+
- **Decisions:** Extract as "decisions" when the text records a deliberate choice between alternatives with rationale (e.g., "we chose X because Y", "after evaluating A/B/C, selected B"). Do not extract routine descriptions as decisions.
|
|
196244
196282
|
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions, English input → English descriptions).
|
|
196245
196283
|
- JSON structure keys (tag, atom type names, field names) must always be in English.
|
|
196246
196284
|
- Be thorough: extract ALL relevant atoms from each paragraph.
|
|
196285
|
+
- **Tables: basic extraction only.** For paragraphs containing markdown tables, extract the table heading as an entity and a brief summary attribute. Detailed table modeling (row-level data, comparisons, metrics) is handled by a dedicated table extraction pass — do NOT attempt exhaustive table column extraction here.
|
|
196286
|
+
- **Diagrams: basic extraction only.** For paragraphs containing text-based diagrams (e.g. \`\`\`mermaid, \`\`\`plantuml, \`\`\`dot, etc.), extract the diagram title as an entity and a brief summary attribute describing what the diagram shows. Detailed diagram modeling (nodes, edges, states, transitions) is handled by a dedicated diagram extraction pass — do NOT attempt exhaustive diagram parsing here.
|
|
196287
|
+
- **Metrics checklist:** If the text mentions performance targets, SLA, response times, throughput, error rates, port numbers, timeouts, capacity limits, or any numeric thresholds, extract them as "metrics" atoms.
|
|
196247
196288
|
- Do NOT include "claims" — they are system-generated and not part of document extraction.`;
|
|
196248
196289
|
function buildDocAtomAnnotationPrompt(chunkText) {
|
|
196249
196290
|
return `Extract all semantic atoms from the following document text.
|
|
@@ -196255,6 +196296,13 @@ ${chunkText}
|
|
|
196255
196296
|
|
|
196256
196297
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
196257
196298
|
}
|
|
196299
|
+
function toFlatFormat(result) {
|
|
196300
|
+
const flat = {};
|
|
196301
|
+
for (const p4 of result.paragraphs) {
|
|
196302
|
+
flat[p4.tag] = p4.atoms;
|
|
196303
|
+
}
|
|
196304
|
+
return flat;
|
|
196305
|
+
}
|
|
196258
196306
|
function buildDocGleaningPrompt(chunkText, previousResult) {
|
|
196259
196307
|
return `Review the following document text and the previously extracted atoms.
|
|
196260
196308
|
Check for any MISSING atoms that were not captured in the first pass.
|
|
@@ -196263,66 +196311,483 @@ Check for any MISSING atoms that were not captured in the first pass.
|
|
|
196263
196311
|
${chunkText}
|
|
196264
196312
|
|
|
196265
196313
|
## Previously Extracted Atoms
|
|
196266
|
-
${JSON.stringify(previousResult, null, 2)}
|
|
196314
|
+
${JSON.stringify(toFlatFormat(previousResult), null, 2)}
|
|
196267
196315
|
|
|
196268
196316
|
## Instructions
|
|
196269
|
-
- If you find missing atoms, output them in the same JSON format
|
|
196317
|
+
- If you find missing atoms, output them in the same flat JSON format keyed by paragraph tags (e.g. {"P0": {"entities": [...]}, "P3": {"rules": [...]}}).
|
|
196270
196318
|
- Only include NEW atoms not already in the previous extraction.
|
|
196271
196319
|
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196272
|
-
- If
|
|
196320
|
+
- **Entity reference consistency:** If you add a new relation whose from/to references an entity not yet declared in the previous extraction or your current output, you MUST also add that entity to the "entities" array.
|
|
196321
|
+
- If nothing is missing, return: {}
|
|
196273
196322
|
- Respond in the same language as the input text.
|
|
196274
196323
|
|
|
196275
196324
|
Return ONLY a valid JSON object. No markdown fences, no explanation.`;
|
|
196276
196325
|
}
|
|
196277
196326
|
var DOC_ANNOTATION_SYSTEM_PROMPT = SYSTEM_PROMPT;
|
|
196327
|
+
// ../llm/src/prompts/entityResolution.ts
|
|
196328
|
+
var ENTITY_RESOLUTION_SYSTEM_PROMPT = `You are an entity resolution assistant. You review a list of entity names extracted from a technical document and perform two tasks:
|
|
196329
|
+
|
|
196330
|
+
## Task 1: Merge Duplicates
|
|
196331
|
+
- Only merge names that clearly refer to the same entity (same system, service, tool, etc.)
|
|
196332
|
+
- Prefer the LONGER, more descriptive name as the canonical name
|
|
196333
|
+
- Do NOT merge names that share a substring but refer to different things
|
|
196334
|
+
- When uncertain, do NOT merge — add to "ambiguous" instead
|
|
196335
|
+
- Chinese and English names for the same entity SHOULD be merged (e.g. "Vmok" → "Vmok 微模块框架")
|
|
196336
|
+
- Abbreviations should be merged with their full forms (e.g. "AGW" → "API Gateway")
|
|
196337
|
+
|
|
196338
|
+
## Task 2: Remove Noise
|
|
196339
|
+
- Apply the **identity test**: a real entity is something you can discuss independently ("What is X?", "How does X work?", "Who owns X?"). Names that fail this test — values, addresses, actions, generic descriptions — are noise.
|
|
196340
|
+
- Remove names that are NOT meaningful named entities: generic words, action descriptions, or things that are attributes/values rather than independent subjects
|
|
196341
|
+
- Examples of REAL entities to KEEP: product names (TTAstra, Gulux), tools (nvm, Rush), services (Op Main 服务), platforms (AGW 平台) — these all pass the identity test
|
|
196342
|
+
- When uncertain, KEEP the name — only remove if it clearly fails the identity test
|
|
196343
|
+
|
|
196344
|
+
## Output
|
|
196345
|
+
Valid JSON only. No markdown fences, no explanation.`;
|
|
196346
|
+
function buildEntityResolutionPrompt(input) {
|
|
196347
|
+
const parts = [];
|
|
196348
|
+
parts.push(`## All Entity Names (${input.allNames.length} total)`);
|
|
196349
|
+
parts.push(input.allNames.map((n, i) => `${i + 1}. ${n}`).join(`
|
|
196350
|
+
`));
|
|
196351
|
+
if (input.candidates.length > 0) {
|
|
196352
|
+
parts.push("");
|
|
196353
|
+
parts.push(`## Suspected Duplicates (${input.candidates.length} pairs)`);
|
|
196354
|
+
parts.push("Review each pair and decide whether to merge:");
|
|
196355
|
+
for (const c of input.candidates) {
|
|
196356
|
+
parts.push(`- "${c.short}" ↔ "${c.long}" — ${c.reason}`);
|
|
196357
|
+
}
|
|
196358
|
+
}
|
|
196359
|
+
if (input.noiseCandidates && input.noiseCandidates.length > 0) {
|
|
196360
|
+
parts.push("");
|
|
196361
|
+
parts.push(`## Suspected Noise (${input.noiseCandidates.length} names)`);
|
|
196362
|
+
parts.push("Review each name — remove if NOT a meaningful named entity, keep if it IS:");
|
|
196363
|
+
for (const n of input.noiseCandidates) {
|
|
196364
|
+
parts.push(`- "${n}"`);
|
|
196365
|
+
}
|
|
196366
|
+
}
|
|
196367
|
+
if (input.contextSnippets && input.contextSnippets.length > 0) {
|
|
196368
|
+
parts.push("");
|
|
196369
|
+
parts.push("## Context Snippets");
|
|
196370
|
+
for (const s of input.contextSnippets) {
|
|
196371
|
+
parts.push(`- **${s.name}**: ${s.snippet}`);
|
|
196372
|
+
}
|
|
196373
|
+
}
|
|
196374
|
+
parts.push("");
|
|
196375
|
+
parts.push(`## Output Format
|
|
196376
|
+
Return a JSON object:
|
|
196377
|
+
{
|
|
196378
|
+
"merges": [
|
|
196379
|
+
{ "from": "alias name", "to": "canonical name" }
|
|
196380
|
+
],
|
|
196381
|
+
"remove": ["noise_name_1", "noise_name_2"],
|
|
196382
|
+
"ambiguous": ["name1", "name2"]
|
|
196383
|
+
}
|
|
196384
|
+
|
|
196385
|
+
- "merges": confirmed duplicate pairs. "from" will be replaced by "to" everywhere.
|
|
196386
|
+
- "remove": names confirmed as noise. They will be deleted from entity list.
|
|
196387
|
+
- "ambiguous": names you're unsure about (optional, for logging).
|
|
196388
|
+
|
|
196389
|
+
Return ONLY valid JSON. No markdown fences, no explanation.`);
|
|
196390
|
+
return parts.join(`
|
|
196391
|
+
`);
|
|
196392
|
+
}
|
|
196393
|
+
// ../llm/src/prompts/docTableAnnotation.ts
|
|
196394
|
+
init_src();
|
|
196395
|
+
var entityFields = zodObjectToPromptFields(entityAtomSchema);
|
|
196396
|
+
var attributeFields = zodObjectToPromptFields(attributeAtomSchema);
|
|
196397
|
+
var relationFields = zodObjectToPromptFields(relationAtomSchema);
|
|
196398
|
+
var comparisonFields = zodObjectToPromptFields(comparisonAtomSchema);
|
|
196399
|
+
var metricFields = zodObjectToPromptFields(metricAtomSchema);
|
|
196400
|
+
var behaviorFields = zodObjectToPromptFields(behaviorAtomSchema);
|
|
196401
|
+
var eventFields = zodObjectToPromptFields(eventAtomSchema);
|
|
196402
|
+
var transitionFields = zodObjectToPromptFields(transitionAtomSchema);
|
|
196403
|
+
var constraintFields = zodObjectToPromptFields(constraintAtomSchema);
|
|
196404
|
+
var stateFields = zodObjectToPromptFields(stateAtomSchema);
|
|
196405
|
+
var ruleFields = zodObjectToPromptFields(ruleAtomSchema);
|
|
196406
|
+
var TABLE_SYSTEM_PROMPT = `You are a table data modeling assistant. Your task is to extract structured semantic atoms from markdown tables in documents.
|
|
196407
|
+
|
|
196408
|
+
Each table paragraph is tagged with [P0], [P1], etc. You must classify the table type FIRST, then apply the corresponding extraction rules.
|
|
196409
|
+
|
|
196410
|
+
## Step 1: Classify the Table
|
|
196411
|
+
|
|
196412
|
+
Determine the table type by examining the relationship between rows:
|
|
196413
|
+
|
|
196414
|
+
### Type A: Collection / Record Table
|
|
196415
|
+
**Rows are peer instances of the same concept.** Each row is an independent record; columns describe different facets of the same instance.
|
|
196416
|
+
- Examples: code→name mappings, enum definitions, config parameter lists, reference data tables
|
|
196417
|
+
- Key signal: removing one row does not affect the meaning of other rows
|
|
196418
|
+
|
|
196419
|
+
### Type B: Single-Object Property Table
|
|
196420
|
+
**Rows describe properties/fields of ONE entity.** First column is property name, other columns are its type/value/description.
|
|
196421
|
+
- Examples: API field definitions, configuration schema, entity attribute lists
|
|
196422
|
+
- Key signal: all rows refer to the same parent entity
|
|
196423
|
+
|
|
196424
|
+
### Type C: Comparison / Evaluation Table
|
|
196425
|
+
**Rows or columns represent different subjects being compared** across the same dimensions.
|
|
196426
|
+
- Examples: technology selection, vendor evaluation, feature comparison
|
|
196427
|
+
- Key signal: multiple named subjects evaluated on shared criteria
|
|
196428
|
+
|
|
196429
|
+
### Type D: Matrix / Cross-Reference Table
|
|
196430
|
+
**Both row headers and column headers are dimensions.** Cells represent the relationship at the intersection.
|
|
196431
|
+
- Examples: permission matrices (role × operation), compatibility matrices, dependency tables
|
|
196432
|
+
- Key signal: both axes are meaningful dimensions, cells are binary/rating/relationship values
|
|
196433
|
+
|
|
196434
|
+
### Type E: Metrics / KPI Table
|
|
196435
|
+
**Rows are measurable indicators** with numeric targets, thresholds, or SLA values.
|
|
196436
|
+
- Examples: SLA tables, performance baselines, capacity planning tables
|
|
196437
|
+
- Key signal: columns include target/threshold/unit/SLA-style values
|
|
196438
|
+
|
|
196439
|
+
### Type F: Timeline / Process Table
|
|
196440
|
+
**Rows represent ordered steps or phases** in a sequence.
|
|
196441
|
+
- Examples: deployment steps, approval workflows, version changelog, migration plans
|
|
196442
|
+
- Key signal: rows have implicit ordering, may have phase/step/date columns
|
|
196443
|
+
|
|
196444
|
+
## Step 2: Extract Atoms by Table Type
|
|
196445
|
+
|
|
196446
|
+
### Type A → Single attribute with row-object array
|
|
196447
|
+
1. Create ONE entity for the abstract concept (table heading or the concept rows represent).
|
|
196448
|
+
Entity schema: ${entityFields}
|
|
196449
|
+
2. Create ONE attribute with \`type: "table"\` and \`value\` as an array of row objects. Each row object uses column headers as keys.
|
|
196450
|
+
Attribute schema: ${attributeFields}
|
|
196451
|
+
Example: \`{ "name": "Region Code Mapping", "type": "table", "value": [{"Code": "1001", "Name": "CN_North", "Region": "CN-NORTH"}, ...] }\`
|
|
196452
|
+
3. **Extract ALL rows — do not sample.** If a table has 30 rows, the value array must contain all 30 objects.
|
|
196453
|
+
4. Extract structural patterns: status indicators (DEPRECATED, enabled/disabled) → "states" + "rules" atoms.
|
|
196454
|
+
State schema: ${stateFields}
|
|
196455
|
+
Rule schema: ${ruleFields}
|
|
196456
|
+
|
|
196457
|
+
### Type B → Multiple attribute atoms
|
|
196458
|
+
1. Create ONE entity for the parent structure.
|
|
196459
|
+
Entity schema: ${entityFields}
|
|
196460
|
+
2. Create one attribute per row: \`name\` = property name, \`type\` = property type, \`value\` = default/example.
|
|
196461
|
+
Attribute schema: ${attributeFields}
|
|
196462
|
+
3. Extract constraints from "required" or "validation" columns.
|
|
196463
|
+
Constraint schema: ${constraintFields}
|
|
196464
|
+
|
|
196465
|
+
### Type C → Comparison atom
|
|
196466
|
+
1. Use "comparisons" atom. Subjects = compared items, dimensions = evaluation criteria.
|
|
196467
|
+
Comparison schema: ${comparisonFields}
|
|
196468
|
+
2. Extract "decisions" atoms if the table leads to a conclusion.
|
|
196469
|
+
|
|
196470
|
+
### Type D → Relations or table attribute
|
|
196471
|
+
1. If cells are simple (yes/no, allowed/denied): extract as "relations" atoms.
|
|
196472
|
+
Relation schema: ${relationFields}
|
|
196473
|
+
Map each cell to a relation: row header → \`from\`, column header → \`to\`, cell value → \`type\` or \`description\`.
|
|
196474
|
+
2. If cells are complex: use Type A approach (single attribute with \`type: "table"\`).
|
|
196475
|
+
3. Create entities for both row headers and column headers if they are named concepts.
|
|
196476
|
+
|
|
196477
|
+
### Type E → Metrics atoms
|
|
196478
|
+
1. Create one "metrics" atom per row.
|
|
196479
|
+
Metric schema: ${metricFields}
|
|
196480
|
+
2. Also create the parent entity if named (e.g., "SLA Requirements").
|
|
196481
|
+
|
|
196482
|
+
### Type F → Behaviors/Events/Transitions
|
|
196483
|
+
1. Create one "behaviors" atom per step/phase.
|
|
196484
|
+
Behavior schema: ${behaviorFields}
|
|
196485
|
+
2. If there are triggers: extract "events" atoms.
|
|
196486
|
+
Event schema: ${eventFields}
|
|
196487
|
+
3. If there are state changes: extract "transitions" atoms.
|
|
196488
|
+
Transition schema: ${transitionFields}
|
|
196489
|
+
4. Create the parent entity for the process/workflow.
|
|
196490
|
+
|
|
196491
|
+
## Output Format
|
|
196492
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196493
|
+
{
|
|
196494
|
+
"P0": {
|
|
196495
|
+
"tableType": "A",
|
|
196496
|
+
"entities": [...],
|
|
196497
|
+
"attributes": [...]
|
|
196498
|
+
},
|
|
196499
|
+
"P3": {
|
|
196500
|
+
"tableType": "C",
|
|
196501
|
+
"comparisons": [...]
|
|
196502
|
+
}
|
|
196503
|
+
}
|
|
196504
|
+
|
|
196505
|
+
## Rules
|
|
196506
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196507
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196508
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196509
|
+
- The "tableType" field is required for each paragraph (one of "A", "B", "C", "D", "E", "F").
|
|
196510
|
+
- Only include atom types that are actually extracted.
|
|
196511
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
196512
|
+
- JSON structure keys must always be in English.
|
|
196513
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
196514
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
196515
|
+
/**
 * Builds the user prompt for table-paragraph atom extraction.
 * @param {string} tableText - Tagged table paragraphs ([P0], [P1], ...).
 * @returns {string} The assembled prompt text.
 */
function buildDocTableAnnotationPrompt(tableText) {
  const parts = [
    "Classify and extract atoms from the following table paragraphs.",
    "Each paragraph is tagged with [P0], [P1], etc. First classify each table, then extract atoms accordingly.",
    "",
    "---",
    tableText,
    "---",
    "",
    "Return ONLY a valid JSON object. No markdown fences, no explanation."
  ];
  return parts.join("\n");
}
|
|
196525
|
+
// Table-annotation system prompt is an alias for the shared table prompt.
var DOC_TABLE_ANNOTATION_SYSTEM_PROMPT = TABLE_SYSTEM_PROMPT;

// ../llm/src/prompts/docDiagramAnnotation.ts
init_src();
// Render each atom schema as a field list for interpolation into the
// diagram prompt below. NOTE(review): zodObjectToPromptFields and the
// *AtomSchema values are defined elsewhere in this bundle.
var entityFields2 = zodObjectToPromptFields(entityAtomSchema);
var attributeFields2 = zodObjectToPromptFields(attributeAtomSchema);
var relationFields2 = zodObjectToPromptFields(relationAtomSchema);
var behaviorFields2 = zodObjectToPromptFields(behaviorAtomSchema);
var transitionFields2 = zodObjectToPromptFields(transitionAtomSchema);
var stateFields2 = zodObjectToPromptFields(stateAtomSchema);
var roleFields = zodObjectToPromptFields(roleAtomSchema);
var eventFields2 = zodObjectToPromptFields(eventAtomSchema);
var decisionFields = zodObjectToPromptFields(decisionAtomSchema);
var constraintFields2 = zodObjectToPromptFields(constraintAtomSchema);
// Code-fence language tags that mark a fenced block as a text-based diagram.
var DIAGRAM_FENCE_TAGS = [
  "mermaid",
  "plantuml",
  "puml",
  "dot",
  "graphviz",
  "viz",
  "d2",
  "c4plantuml",
  "ditaa",
  "nomnoml",
  "wavedrom",
  "vega",
  "vega-lite"
];
// Matches an opening ``` fence line tagged with any diagram format (case-insensitive).
var DIAGRAM_FENCE_REGEX = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "i");
|
|
196554
|
+
var DIAGRAM_SYSTEM_PROMPT = `You are a diagram analysis assistant. Your task is to extract structured semantic atoms from text-based diagrams (Mermaid, PlantUML, Graphviz, D2, etc.) embedded in documents.
|
|
196555
|
+
|
|
196556
|
+
Each diagram paragraph is tagged with [P0], [P1], etc. You must classify the diagram type FIRST, then extract atoms accordingly.
|
|
196557
|
+
|
|
196558
|
+
## Step 1: Identify the Diagram Format and Type
|
|
196559
|
+
|
|
196560
|
+
### Formats
|
|
196561
|
+
- **Mermaid**: flowchart/graph, sequenceDiagram, stateDiagram, classDiagram, erDiagram, gantt, pie, gitgraph
|
|
196562
|
+
- **PlantUML / C4-PlantUML**: @startuml/@enduml blocks, all UML types, C4 architecture (System_Context, Container, Component)
|
|
196563
|
+
- **Graphviz (DOT)**: digraph/graph, general directed/undirected graphs
|
|
196564
|
+
- **D2**: modern declarative diagrams with shape/connection syntax
|
|
196565
|
+
- **Others**: ditaa (ASCII art), nomnoml (UML), wavedrom (timing), vega/vega-lite (data viz)
|
|
196566
|
+
|
|
196567
|
+
### Diagram Types (by semantic content)
|
|
196568
|
+
- **Flowchart / Process**: decision trees, algorithms, business process flows
|
|
196569
|
+
- **Sequence**: interaction between participants over time (API calls, protocols)
|
|
196570
|
+
- **State Machine**: states and transitions triggered by events/guards
|
|
196571
|
+
- **Class / ER**: data models, entity relationships, inheritance hierarchies
|
|
196572
|
+
- **Architecture**: system components, containers, deployment topology
|
|
196573
|
+
- **Gantt / Timeline**: project schedules, milestones, phases
|
|
196574
|
+
- **Pie / Data Viz**: statistical distributions, metrics visualization
|
|
196575
|
+
|
|
196576
|
+
## Step 2: Extract Atoms by Diagram Type
|
|
196577
|
+
|
|
196578
|
+
### Flowchart / Process → entities + relations + behaviors + decisions
|
|
196579
|
+
1. Extract each node as an entity.
|
|
196580
|
+
Entity schema: ${entityFields2}
|
|
196581
|
+
2. Extract each arrow/edge as a relation. Use edge labels as \`type\` or \`description\`.
|
|
196582
|
+
Relation schema: ${relationFields2}
|
|
196583
|
+
3. Extract action nodes as behaviors (what the process does at each step).
|
|
196584
|
+
Behavior schema: ${behaviorFields2}
|
|
196585
|
+
4. Extract diamond/condition nodes: if it represents a deliberate choice with rationale → "decisions"; if it represents conditional branching logic (IF-THEN) → "rules".
|
|
196586
|
+
Decision schema: ${decisionFields}
|
|
196587
|
+
|
|
196588
|
+
### Sequence → entities + relations + behaviors + events
|
|
196589
|
+
1. Extract each participant/actor as an entity (or role if it's a person/team).
|
|
196590
|
+
Entity schema: ${entityFields2}
|
|
196591
|
+
Role schema: ${roleFields}
|
|
196592
|
+
2. Extract each message/call as a relation (\`from\` = caller, \`to\` = callee, \`type\` = message label).
|
|
196593
|
+
Relation schema: ${relationFields2}
|
|
196594
|
+
3. Extract significant interactions as behaviors.
|
|
196595
|
+
Behavior schema: ${behaviorFields2}
|
|
196596
|
+
4. Extract triggers, responses, and async messages as events.
|
|
196597
|
+
Event schema: ${eventFields2}
|
|
196598
|
+
|
|
196599
|
+
### State Machine → entities + states + transitions + events
|
|
196600
|
+
1. Extract the state machine subject as an entity.
|
|
196601
|
+
Entity schema: ${entityFields2}
|
|
196602
|
+
2. Extract each state as a state atom.
|
|
196603
|
+
State schema: ${stateFields2}
|
|
196604
|
+
3. Extract each arrow as a transition (\`from\` = source state, \`to\` = target state, \`trigger\` = event/guard).
|
|
196605
|
+
Transition schema: ${transitionFields2}
|
|
196606
|
+
4. Extract triggers as events.
|
|
196607
|
+
Event schema: ${eventFields2}
|
|
196608
|
+
|
|
196609
|
+
### Class / ER → entities + attributes + relations
|
|
196610
|
+
1. Extract each class/entity as an entity.
|
|
196611
|
+
Entity schema: ${entityFields2}
|
|
196612
|
+
2. Extract fields/properties as attributes.
|
|
196613
|
+
Attribute schema: ${attributeFields2}
|
|
196614
|
+
3. Extract associations, inheritance, composition as relations (\`type\` = "INHERITS", "CONTAINS", "REFERENCES", etc.).
|
|
196615
|
+
Relation schema: ${relationFields2}
|
|
196616
|
+
|
|
196617
|
+
### Architecture → entities + relations + constraints
|
|
196618
|
+
1. Extract each system/service/container/component as an entity. Use kind to indicate origin: "implementation" for internal systems/services, "external" for third-party dependencies (databases, cloud services, external APIs).
|
|
196619
|
+
Entity schema: ${entityFields2}
|
|
196620
|
+
2. Extract connections between components as relations. Use standard types: CONTAINS (parent→child), DEPENDS_ON (runtime dependency), TRIGGERS (event/process triggering).
|
|
196621
|
+
Relation schema: ${relationFields2}
|
|
196622
|
+
3. Extract deployment constraints, technology choices.
|
|
196623
|
+
Constraint schema: ${constraintFields2}
|
|
196624
|
+
|
|
196625
|
+
### Gantt / Timeline → behaviors + events + constraints
|
|
196626
|
+
1. Extract each task/phase as a behavior.
|
|
196627
|
+
Behavior schema: ${behaviorFields2}
|
|
196628
|
+
2. Extract milestones and deadlines as events.
|
|
196629
|
+
Event schema: ${eventFields2}
|
|
196630
|
+
3. Extract dependencies and critical path constraints.
|
|
196631
|
+
Constraint schema: ${constraintFields2}
|
|
196632
|
+
|
|
196633
|
+
### Pie / Data Viz → attributes (summary only)
|
|
196634
|
+
1. Extract the chart title as an entity.
|
|
196635
|
+
Entity schema: ${entityFields2}
|
|
196636
|
+
2. Extract each slice/data point as an attribute (\`name\` = label, \`value\` = amount/percentage, \`type\` = "metric").
|
|
196637
|
+
Attribute schema: ${attributeFields2}
|
|
196638
|
+
|
|
196639
|
+
## Additional Extraction: Diagram Description
|
|
196640
|
+
|
|
196641
|
+
For EVERY diagram, also extract a "description" attribute on the diagram's primary entity:
|
|
196642
|
+
- \`name\`: "diagram_description"
|
|
196643
|
+
- \`type\`: "description"
|
|
196644
|
+
- \`value\`: A 1-3 sentence natural language summary of what the diagram communicates.
|
|
196645
|
+
|
|
196646
|
+
This description is critical for downstream AI consumers who cannot render the diagram.
|
|
196647
|
+
|
|
196648
|
+
## Output Format
|
|
196649
|
+
Return a single JSON object keyed by paragraph tags. Only include paragraphs that have atoms — skip empty ones:
|
|
196650
|
+
{
|
|
196651
|
+
"P0": {
|
|
196652
|
+
"diagramFormat": "mermaid",
|
|
196653
|
+
"diagramType": "sequence",
|
|
196654
|
+
"entities": [...],
|
|
196655
|
+
"relations": [...]
|
|
196656
|
+
}
|
|
196657
|
+
}
|
|
196658
|
+
|
|
196659
|
+
## Rules
|
|
196660
|
+
- Keys are paragraph tags (P0, P1, ...) corresponding to the tagged paragraphs in the input.
|
|
196661
|
+
- Skip paragraphs with no atoms — do NOT emit empty objects.
|
|
196662
|
+
- Every atom MUST include a "confidence" field (0.0-1.0).
|
|
196663
|
+
- The "diagramFormat" and "diagramType" fields are required for each paragraph.
|
|
196664
|
+
- Only include atom types that are actually extracted.
|
|
196665
|
+
- Respond in the same language as the input text (e.g., Chinese input → Chinese descriptions).
|
|
196666
|
+
- JSON structure keys must always be in English.
|
|
196667
|
+
- **Entity reference consistency:** Every entity name referenced in relations must also appear in the "entities" array.
|
|
196668
|
+
- **Extract ALL nodes and edges** — do not sample or skip.
|
|
196669
|
+
- Do NOT include "claims" — they are system-generated.`;
|
|
196670
|
+
/**
 * Builds the user prompt for diagram-paragraph atom extraction.
 * @param {string} diagramText - Tagged diagram paragraphs ([P0], [P1], ...).
 * @returns {string} The assembled prompt text.
 */
function buildDocDiagramAnnotationPrompt(diagramText) {
  const parts = [
    "Analyze and extract atoms from the following diagram paragraphs.",
    "Each paragraph is tagged with [P0], [P1], etc. First identify the diagram format and type, then extract atoms accordingly.",
    "",
    "---",
    diagramText,
    "---",
    "",
    "Return ONLY a valid JSON object. No markdown fences, no explanation."
  ];
  return parts.join("\n");
}
|
|
196680
|
+
// Diagram-annotation system prompt is an alias for DIAGRAM_SYSTEM_PROMPT above.
var DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT = DIAGRAM_SYSTEM_PROMPT;
|
|
196278
196681
|
// ../llm/src/chunking/markdownChunker.ts
|
|
196279
196682
|
// Default token budget for a whole markdown chunk (~4 chars/token heuristic).
var DEFAULT_MAX_TOKENS2 = 4000;
// Default token budget for a single coarse paragraph.
var DEFAULT_PARAGRAPH_MAX_TOKENS = 500;
|
|
196280
196684
|
/**
 * Rough token estimate: approximately four characters per token, rounded up.
 * @param {string} text2 - Text to estimate.
 * @returns {number} Estimated token count (0 for the empty string).
 */
function estimateTokens(text2) {
  const APPROX_CHARS_PER_TOKEN = 4;
  return Math.ceil(text2.length / APPROX_CHARS_PER_TOKEN);
}
|
|
196687
|
+
/**
 * Finds the character ranges of fenced code blocks (``` or ~~~) in content.
 * A fence closes only on a line starting with the same fence character,
 * at least as long as the opener. An unclosed fence runs to end of content.
 * @param {string} content - Full markdown text.
 * @returns {{start: number, end: number}[]} Sorted, non-overlapping ranges.
 */
function findCodeBlockRanges(content) {
  const fencePattern = /^(`{3,}|~{3,})/gm;
  const found = [];
  let pendingStart = -1;
  let pendingChar = "";
  let pendingLen = 0;
  for (let hit = fencePattern.exec(content); hit !== null; hit = fencePattern.exec(content)) {
    const marker = hit[1];
    if (pendingStart < 0) {
      // No open fence yet: this marker opens one.
      pendingStart = hit.index;
      pendingChar = marker[0];
      pendingLen = marker.length;
      continue;
    }
    // Close only on a matching, sufficiently long fence of the same character.
    if (marker[0] === pendingChar && marker.length >= pendingLen) {
      found.push({ start: pendingStart, end: hit.index + hit[0].length });
      pendingStart = -1;
      pendingChar = "";
      pendingLen = 0;
    }
  }
  if (pendingStart >= 0) {
    found.push({ start: pendingStart, end: content.length });
  }
  return found;
}
|
|
196709
|
+
/**
 * Reports whether a character position falls inside any fenced-code range.
 * Ranges are assumed sorted by start (as produced by findCodeBlockRanges),
 * which allows an early exit once a range starts past the position.
 * @param {number} pos - Character offset to test.
 * @param {{start: number, end: number}[]} ranges - Sorted code-block ranges.
 * @returns {boolean} True when pos lies in [start, end) of some range.
 */
function isInsideCodeBlock(pos, ranges) {
  for (const range of ranges) {
    if (range.start > pos)
      return false; // sorted: no later range can contain pos
    if (pos < range.end)
      return true;
  }
  return false;
}
|
|
196283
196718
|
// Parses markdown into heading-delimited sections. Detects both ATX
// headings (# .. ######) and setext headings (a line underlined with ===
// or ---), ignoring any heading-looking text inside fenced code blocks.
// Overlapping detections are deduplicated before sections are built.
function parseSections(content) {
  const codeRanges = findCodeBlockRanges(content);
  const matches = [];
  // ATX headings: 1-6 '#' characters, whitespace, then the heading text.
  const atxRe = /^(#{1,6})\s+(.*)$/gm;
  let m;
  while ((m = atxRe.exec(content)) !== null) {
    if (!isInsideCodeBlock(m.index, codeRanges)) {
      matches.push({
        index: m.index,
        endIndex: m.index + m[0].length,
        level: m[1].length,
        heading: m[2].trim()
      });
    }
  }
  // Setext headings: scan line by line, tracking the character offset of
  // each line so matches can be recorded as content offsets.
  const lines = content.split(`
`);
  let offset = 0;
  for (let i = 0;i < lines.length; i++) {
    const line = lines[i];
    if (i > 0) {
      const prevLine = lines[i - 1].trim();
      // Start offset of the previous line (the -1 accounts for its newline).
      const prevLineStart = offset - lines[i - 1].length - 1;
      if (prevLine && !isInsideCodeBlock(prevLineStart, codeRanges)) {
        if (/^={2,}\s*$/.test(line)) {
          // "===" underline -> level-1 heading over the previous line.
          matches.push({
            index: prevLineStart < 0 ? 0 : prevLineStart,
            endIndex: offset + line.length,
            level: 1,
            heading: prevLine
          });
        } else if (/^-{2,}\s*$/.test(line) && !/^-{3,}\s*$/.test(prevLine)) {
          // "---" underline -> level-2 heading, unless the previous line is
          // itself a dash run (which would be a horizontal rule, not a title).
          matches.push({
            index: prevLineStart < 0 ? 0 : prevLineStart,
            endIndex: offset + line.length,
            level: 2,
            heading: prevLine
          });
        }
      }
    }
    offset += line.length + 1;
  }
  // Order by position, then drop any match that starts inside an earlier
  // match's span (e.g. ATX/setext detections overlapping the same lines).
  matches.sort((a, b) => a.index - b.index);
  const deduped = [];
  for (const match2 of matches) {
    const last = deduped[deduped.length - 1];
    if (last && match2.index < last.endIndex)
      continue;
    deduped.push(match2);
  }
  return buildSectionsFromMatches(content, deduped);
}
|
|
196771
|
+
function buildSectionsFromMatches(content, matches) {
|
|
196772
|
+
const sections = [];
|
|
196295
196773
|
if (matches.length === 0) {
|
|
196296
196774
|
const body2 = content.trim();
|
|
196297
196775
|
if (body2) {
|
|
196298
|
-
sections.push({
|
|
196299
|
-
heading: "",
|
|
196300
|
-
level: 0,
|
|
196301
|
-
body: body2,
|
|
196302
|
-
paragraphs: splitParagraphs(body2)
|
|
196303
|
-
});
|
|
196776
|
+
sections.push({ heading: "", level: 0, body: body2, paragraphs: splitParagraphs(body2) });
|
|
196304
196777
|
}
|
|
196305
196778
|
return sections;
|
|
196306
196779
|
}
|
|
196307
196780
|
if (matches[0].index > 0) {
|
|
196308
196781
|
const preBody = content.slice(0, matches[0].index).trim();
|
|
196309
196782
|
if (preBody) {
|
|
196310
|
-
sections.push({
|
|
196311
|
-
heading: "",
|
|
196312
|
-
level: 0,
|
|
196313
|
-
body: preBody,
|
|
196314
|
-
paragraphs: splitParagraphs(preBody)
|
|
196315
|
-
});
|
|
196783
|
+
sections.push({ heading: "", level: 0, body: preBody, paragraphs: splitParagraphs(preBody) });
|
|
196316
196784
|
}
|
|
196317
196785
|
}
|
|
196318
196786
|
for (let i = 0;i < matches.length; i++) {
|
|
196319
196787
|
const m = matches[i];
|
|
196320
|
-
const
|
|
196321
|
-
const
|
|
196322
|
-
const
|
|
196323
|
-
const headingLineEnd = fullText.indexOf(`
|
|
196324
|
-
`);
|
|
196325
|
-
const body2 = headingLineEnd === -1 ? "" : fullText.slice(headingLineEnd + 1).trim();
|
|
196788
|
+
const bodyStart = m.endIndex;
|
|
196789
|
+
const bodyEnd = i + 1 < matches.length ? matches[i + 1].index : content.length;
|
|
196790
|
+
const body2 = content.slice(bodyStart, bodyEnd).trim();
|
|
196326
196791
|
sections.push({
|
|
196327
196792
|
heading: m.heading,
|
|
196328
196793
|
level: m.level,
|
|
@@ -196337,6 +196802,128 @@ function splitParagraphs(text2) {
|
|
|
196337
196802
|
return [];
|
|
196338
196803
|
return text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
|
|
196339
196804
|
}
|
|
196805
|
+
/**
 * Splits text exceeding maxTokens into smaller pieces, trying boundaries
 * in priority order: blank-line paragraphs first, then atomic line blocks
 * (fenced code / tables kept whole), finally hard character breaks.
 * @param {string} text2 - Text to split.
 * @param {number} maxTokens - Token budget per piece.
 * @returns {string[]} Pieces, each at or under the budget where possible.
 */
function splitOversizedText(text2, maxTokens) {
  // Greedily packs units into pieces of at most maxTokens, delegating any
  // single unit that is itself over budget to `overflow`.
  const pack = (units, joiner, overflow) => {
    const pieces = [];
    let buffer = "";
    let bufferTokens = 0;
    const flush = () => {
      if (buffer) {
        pieces.push(buffer);
        buffer = "";
        bufferTokens = 0;
      }
    };
    for (const unit of units) {
      const unitTokens = estimateTokens(unit);
      if (unitTokens > maxTokens) {
        flush();
        pieces.push(...overflow(unit));
        continue;
      }
      if (buffer && bufferTokens + unitTokens > maxTokens) {
        flush();
      }
      buffer = buffer ? buffer + joiner + unit : unit;
      bufferTokens += unitTokens;
    }
    flush();
    return pieces;
  };
  // Preferred: split on blank lines; oversized paragraphs recurse.
  const paragraphs = text2.split(/\n\n+/).map((p4) => p4.trim()).filter(Boolean);
  if (paragraphs.length > 1) {
    return pack(paragraphs, "\n\n", (part) => splitOversizedText(part, maxTokens));
  }
  // Next: split on single newlines, keeping atomic blocks intact even
  // when an individual block exceeds the budget.
  const lines = text2.split("\n");
  if (lines.length > 1) {
    return pack(mergeAtomicBlocks(lines), "\n", (block) => [block]);
  }
  // Last resort: hard character-level breaking.
  return forceBreakText(text2, maxTokens);
}
|
|
196869
|
+
/**
 * Groups raw lines into "atomic" multi-line blocks that must never be
 * split across chunks: fenced code blocks (```/~~~) and pipe-table runs.
 * All other lines pass through one by one.
 * @param {string[]} lines - Individual text lines.
 * @returns {string[]} Lines and joined atomic blocks, in original order.
 */
function mergeAtomicBlocks(lines) {
  const blocks = [];
  let idx = 0;
  while (idx < lines.length) {
    const current = lines[idx];
    const stripped = current.trimStart();
    const fenceHit = stripped.match(/^(`{3,}|~{3,})/);
    if (fenceHit) {
      // Collect everything up to (and including) a matching closing fence:
      // same character, at least as long as the opener, nothing else on the line.
      const marker = fenceHit[1];
      const closeRe = new RegExp(`^${marker[0] === "`" ? "`" : "~"}{${marker.length},}\\s*$`);
      const fenced = [current];
      idx++;
      while (idx < lines.length) {
        fenced.push(lines[idx]);
        const candidate = lines[idx].trimStart();
        idx++;
        if (candidate.startsWith(marker[0]) && candidate.match(closeRe)) {
          break;
        }
      }
      blocks.push(fenced.join("\n"));
      continue;
    }
    if (stripped.startsWith("|")) {
      // Consecutive pipe-prefixed lines form one table block.
      const tableRows = [current];
      idx++;
      while (idx < lines.length && lines[idx].trimStart().startsWith("|")) {
        tableRows.push(lines[idx]);
        idx++;
      }
      blocks.push(tableRows.join("\n"));
      continue;
    }
    blocks.push(current);
    idx++;
  }
  return blocks;
}
|
|
196910
|
+
/**
 * Hard-breaks text into pieces of at most maxTokens (~4 chars per token),
 * preferring to break at a space when one falls in the last 30% of the
 * window. Used as a last resort after paragraph/line-level splitting.
 *
 * Fix: clamp the character window to at least 1. In the original,
 * maxTokens <= 0 produced maxChars = 0, so slice(0, 0) made no progress
 * and the while loop never terminated.
 *
 * @param {string} text2 - Text to break up.
 * @param {number} maxTokens - Token budget per piece.
 * @returns {string[]} Trimmed, non-empty pieces.
 */
function forceBreakText(text2, maxTokens) {
  const maxChars = Math.max(1, maxTokens * 4); // guarantee forward progress
  const results = [];
  let remaining = text2;
  while (remaining.length > maxChars) {
    let breakAt = maxChars;
    // Prefer a word boundary if one lies close enough to the limit.
    const spaceIdx = remaining.lastIndexOf(" ", maxChars);
    if (spaceIdx > maxChars * 0.7) {
      breakAt = spaceIdx;
    }
    results.push(remaining.slice(0, breakAt).trim());
    remaining = remaining.slice(breakAt).trim();
  }
  if (remaining)
    results.push(remaining);
  return results;
}
|
|
196340
196927
|
function buildBreadcrumb(sections, sectionIndex) {
|
|
196341
196928
|
const current = sections[sectionIndex];
|
|
196342
196929
|
if (current.level <= 0)
|
|
@@ -196365,11 +196952,53 @@ function sectionHeadingLine(section) {
|
|
|
196365
196952
|
return "";
|
|
196366
196953
|
return `${"#".repeat(section.level)} ${section.heading}`;
|
|
196367
196954
|
}
|
|
196955
|
+
/**
 * Builds coarse paragraph entries from parsed sections: one entry per
 * non-empty section body, splitting oversized bodies and then greedily
 * merging adjacent small entries to reduce fragmentation.
 * @param {Array<{body: string}>} sections - Parsed markdown sections.
 * @param {number} paragraphMaxTokens - Token budget per coarse paragraph.
 * @returns {Array<{sectionIndex: number, paragraphIndex: number, text: string}>}
 */
function buildCoarseParagraphs(sections, paragraphMaxTokens) {
  const SMALL_ENTRY_TOKENS = 150;
  // Pass 1: raw entries, with oversized section bodies pre-split so no
  // single entry exceeds the paragraph budget on its own.
  const raw = [];
  sections.forEach((section, sIdx) => {
    if (!section.body.trim())
      return;
    const bodyTokens = estimateTokens(section.body);
    if (bodyTokens > paragraphMaxTokens) {
      for (const piece of splitOversizedText(section.body, paragraphMaxTokens)) {
        raw.push({ sectionIndex: sIdx, text: piece, tokens: estimateTokens(piece) });
      }
    } else {
      raw.push({ sectionIndex: sIdx, text: section.body, tokens: bodyTokens });
    }
  });
  // Pass 2: greedily merge adjacent small entries (both under the
  // threshold) while staying within the budget; a merged entry keeps the
  // sectionIndex of its first member.
  const merged = [];
  for (const entry of raw) {
    const tail = merged[merged.length - 1];
    const canMerge = tail && tail.tokens < SMALL_ENTRY_TOKENS && entry.tokens < SMALL_ENTRY_TOKENS && tail.tokens + entry.tokens <= paragraphMaxTokens;
    if (canMerge) {
      tail.text = tail.text + "\n\n" + entry.text;
      tail.tokens += entry.tokens;
    } else {
      merged.push({ ...entry });
    }
  }
  // Pass 3: assign sequential paragraph indices.
  return merged.map((entry, pIdx) => ({
    sectionIndex: entry.sectionIndex,
    paragraphIndex: pIdx,
    text: entry.text
  }));
}
|
|
196368
196995
|
function chunkMarkdown(content, options = {}) {
|
|
196369
196996
|
const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS2;
|
|
196997
|
+
const paragraphMaxTokens = options.paragraphMaxTokens ?? DEFAULT_PARAGRAPH_MAX_TOKENS;
|
|
196370
196998
|
const sections = parseSections(content);
|
|
196371
196999
|
if (sections.length === 0)
|
|
196372
197000
|
return [];
|
|
197001
|
+
const coarseParagraphs = buildCoarseParagraphs(sections, paragraphMaxTokens);
|
|
196373
197002
|
const chunks = [];
|
|
196374
197003
|
let pendingSections = [];
|
|
196375
197004
|
let pendingTokens = 0;
|
|
@@ -196387,14 +197016,16 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196387
197016
|
const heading = sectionHeadingLine(entry.section);
|
|
196388
197017
|
if (heading)
|
|
196389
197018
|
textParts.push(heading);
|
|
196390
|
-
|
|
196391
|
-
|
|
196392
|
-
|
|
196393
|
-
|
|
196394
|
-
|
|
196395
|
-
|
|
196396
|
-
|
|
196397
|
-
|
|
197019
|
+
const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === entry.sectionIndex);
|
|
197020
|
+
for (const p4 of sectionParas) {
|
|
197021
|
+
if (!paragraphs.some((existing) => existing.paragraphIndex === p4.paragraphIndex && existing.text === p4.text)) {
|
|
197022
|
+
textParts.push(p4.text);
|
|
197023
|
+
paragraphs.push({
|
|
197024
|
+
sectionIndex: p4.sectionIndex,
|
|
197025
|
+
paragraphIndex: p4.paragraphIndex,
|
|
197026
|
+
text: p4.text
|
|
197027
|
+
});
|
|
197028
|
+
}
|
|
196398
197029
|
}
|
|
196399
197030
|
}
|
|
196400
197031
|
chunks.push({
|
|
@@ -196417,7 +197048,7 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196417
197048
|
` : "") + section.body);
|
|
196418
197049
|
if (sectionTokens > maxTokens && section.paragraphs.length > 1) {
|
|
196419
197050
|
flushPending();
|
|
196420
|
-
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks);
|
|
197051
|
+
splitSectionByParagraphs(section, sIdx, breadcrumb, maxTokens, chunks, coarseParagraphs);
|
|
196421
197052
|
continue;
|
|
196422
197053
|
}
|
|
196423
197054
|
const crumbTokens = pendingSections.length === 0 ? estimateTokens(breadcrumbPrefix(breadcrumb)) : 0;
|
|
@@ -196430,9 +197061,10 @@ function chunkMarkdown(content, options = {}) {
|
|
|
196430
197061
|
flushPending();
|
|
196431
197062
|
return chunks;
|
|
196432
197063
|
}
|
|
196433
|
-
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks) {
|
|
197064
|
+
function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens, chunks, coarseParagraphs) {
|
|
196434
197065
|
const headingLine = sectionHeadingLine(section);
|
|
196435
197066
|
const prefix = breadcrumbPrefix(breadcrumb);
|
|
197067
|
+
const sectionParas = coarseParagraphs.filter((p4) => p4.sectionIndex === sectionIndex);
|
|
196436
197068
|
let accParagraphs = [];
|
|
196437
197069
|
let accTextParts = [];
|
|
196438
197070
|
let accTokens = 0;
|
|
@@ -196459,18 +197091,265 @@ function splitSectionByParagraphs(section, sectionIndex, breadcrumb, maxTokens,
|
|
|
196459
197091
|
accTokens = baseOverhead;
|
|
196460
197092
|
}
|
|
196461
197093
|
accTokens = baseOverhead;
|
|
196462
|
-
for (
|
|
196463
|
-
const
|
|
196464
|
-
const pTokens = estimateTokens(pText);
|
|
197094
|
+
for (const p4 of sectionParas) {
|
|
197095
|
+
const pTokens = estimateTokens(p4.text);
|
|
196465
197096
|
if (accParagraphs.length > 0 && accTokens + pTokens > maxTokens) {
|
|
196466
197097
|
flushAcc();
|
|
196467
197098
|
}
|
|
196468
|
-
accParagraphs.push({ sectionIndex, paragraphIndex:
|
|
196469
|
-
accTextParts.push(
|
|
197099
|
+
accParagraphs.push({ sectionIndex, paragraphIndex: p4.paragraphIndex, text: p4.text });
|
|
197100
|
+
accTextParts.push(p4.text);
|
|
196470
197101
|
accTokens += pTokens;
|
|
196471
197102
|
}
|
|
196472
197103
|
flushAcc();
|
|
196473
197104
|
}
|
|
197105
|
+
// ../llm/src/chunking/normalizeMarkdown.ts
|
|
197106
|
+
/**
 * Normalizes raw markdown before chunking: strips BOM/zero-width chars,
 * converts CR/CRLF to LF, then repairs block-level issues line by line.
 * @param {string} content - Raw markdown text.
 * @returns {{content: string, stats: {repairs: Object<string, number>}}}
 *   Normalized text plus a per-category count of repairs performed.
 */
function normalizeMarkdown(content) {
  const stats = { repairs: {} };
  // Repair handlers call this to tally each fix by category.
  const tally = (category) => {
    stats.repairs[category] = (stats.repairs[category] ?? 0) + 1;
  };
  const cleaned = normalizeLineEndings(stripBomAndInvisible(content, tally), tally);
  const repairedLines = processBlocks(cleaned.split("\n"), tally);
  return { content: repairedLines.join("\n"), stats };
}
|
|
197119
|
+
/**
 * Removes BOM and zero-width characters (FEFF, 200B-200D), tallying one
 * "invisible_chars" repair if anything was removed.
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Repair tally callback.
 * @returns {string} Cleaned text.
 */
function stripBomAndInvisible(text2, count) {
  const INVISIBLE_RE = /[\uFEFF\u200B\u200C\u200D]/g;
  const cleaned = text2.replace(INVISIBLE_RE, "");
  if (cleaned.length === text2.length)
    return cleaned;
  count("invisible_chars");
  return cleaned;
}
|
|
197126
|
+
/**
 * Converts CRLF and bare CR line endings to LF, tallying one
 * "line_endings" repair when any CR is present.
 * @param {string} text2 - Input text.
 * @param {(category: string) => void} count - Repair tally callback.
 * @returns {string} Text with LF-only line endings.
 */
function normalizeLineEndings(text2, count) {
  if (!text2.includes("\r"))
    return text2;
  count("line_endings");
  return text2.replace(/\r\n?/g, "\n");
}
|
|
197134
|
+
// Block-level repair dispatcher. Walks the lines and delegates each
// recognized block type to its handler; each handler returns the repaired
// lines plus the index to resume at. Unrecognized lines pass through.
// Dispatch order matters: fences first so their contents are protected
// from the later table/blank/comment/JSON handling.
// NOTE(review): splitInlineFences is defined elsewhere in this file —
// presumably it normalizes fences sharing a line with other text; confirm.
function processBlocks(inputLines, count) {
  const lines = splitInlineFences(inputLines, count);
  const output = [];
  let i = 0;
  while (i < lines.length) {
    const line = lines[i];
    const trimmed = line.trimStart();
    // Fenced code blocks are copied verbatim (and closed if unterminated).
    const fenceMatch = trimmed.match(/^(`{3,}|~{3,})/);
    if (fenceMatch) {
      const result = handleCodeFence(lines, i, fenceMatch[1], count);
      output.push(...result.lines);
      i = result.nextIndex;
      continue;
    }
    // Pipe tables: re-add missing pipes / separator rows.
    if (looksLikeTableRow(trimmed)) {
      const result = handleTableBlock(lines, i, count);
      output.push(...result.lines);
      i = result.nextIndex;
      continue;
    }
    // Runs of blank lines may be collapsed.
    if (trimmed === "") {
      const result = handleBlankLines(lines, i, count);
      output.push(...result.lines);
      i = result.nextIndex;
      continue;
    }
    // HTML comments are dropped (single- or multi-line).
    if (trimmed.startsWith("<!--")) {
      const result = handleHtmlComment(lines, i, count);
      output.push(...result.lines);
      i = result.nextIndex;
      continue;
    }
    // Bare "{" / "[" may open an unfenced JSON block; the handler returns
    // null/undefined when it declines, in which case the line falls through.
    if (looksLikeJsonBlockStart(trimmed)) {
      const result = handleUnfencedJson(lines, i, count);
      if (result) {
        output.push(...result.lines);
        i = result.nextIndex;
        continue;
      }
    }
    output.push(line);
    i++;
  }
  return output;
}
|
|
197179
|
+
/**
 * Copies a fenced code block verbatim, starting at the opening fence line.
 * If no closing fence is found, appends a synthetic closing fence and
 * tallies an "unclosed_code_fence" repair.
 * @param {string[]} lines - All input lines.
 * @param {number} startIdx - Index of the opening fence line.
 * @param {string} fence - The opening fence marker (``` or ~~~ run).
 * @param {(category: string) => void} count - Repair tally callback.
 * @returns {{lines: string[], nextIndex: number}} Block lines and resume index.
 */
function handleCodeFence(lines, startIdx, fence, count) {
  // Closes on a line that is only the same fence character, repeated at
  // least as many times as the opener (plus optional trailing whitespace).
  const closingRe = new RegExp(`^${fence[0] === "`" ? "`" : "~"}{${fence.length},}\\s*$`);
  const collected = [lines[startIdx]];
  for (let cursor = startIdx + 1; cursor < lines.length; cursor++) {
    collected.push(lines[cursor]);
    if (closingRe.test(lines[cursor].trimStart())) {
      return { lines: collected, nextIndex: cursor + 1 };
    }
  }
  count("unclosed_code_fence");
  collected.push(fence);
  return { lines: collected, nextIndex: lines.length };
}
|
|
197197
|
+
/**
 * Repairs a contiguous run of table-like lines: re-adds missing leading/
 * trailing pipes, and inserts a "---" header separator row when one is
 * absent and the table has at least two columns.
 * @param {string[]} lines - All input lines.
 * @param {number} startIdx - Index of the first table-like line.
 * @param {(category: string) => void} count - Repair tally callback.
 * @returns {{lines: string[], nextIndex: number}} Repaired rows and resume index.
 */
function handleTableBlock(lines, startIdx, count) {
  let end = startIdx;
  const rows = [];
  while (end < lines.length && looksLikeTableRow(lines[end].trimStart())) {
    rows.push(lines[end]);
    end++;
  }
  // A lone table-ish line is not a table; pass it through untouched.
  if (rows.length < 2) {
    return { lines: rows, nextIndex: end };
  }
  const withPipes = rows.map((row) => {
    const stripped = row.trimStart();
    if (!stripped.startsWith("|") && stripped.includes("|")) {
      count("table_leading_pipe");
      return "| " + stripped + (stripped.endsWith("|") ? "" : " |");
    }
    return row;
  });
  const separatorRe = /^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/;
  const hasSeparator = withPipes.some((row) => separatorRe.test(row.trim()));
  if (!hasSeparator && withPipes.length >= 2) {
    const columnCount = countPipes(withPipes[0].trim()) - 1;
    if (columnCount >= 2) {
      count("table_missing_separator");
      const separator = "| " + Array(columnCount).fill("---").join(" | ") + " |";
      return { lines: [withPipes[0], separator, ...withPipes.slice(1)], nextIndex: end };
    }
  }
  return { lines: withPipes, nextIndex: end };
}
|
|
197228
|
+
/**
 * Collapses a run of more than two blank lines into a single blank line,
 * tallying an "excessive_blank_lines" repair; shorter runs pass through.
 * @param {string[]} lines - All input lines.
 * @param {number} startIdx - Index of the first blank line.
 * @param {(category: string) => void} count - Repair tally callback.
 * @returns {{lines: string[], nextIndex: number}} Output lines and resume index.
 */
function handleBlankLines(lines, startIdx, count) {
  let end = startIdx;
  while (end < lines.length && lines[end].trim() === "") {
    end++;
  }
  const runLength = end - startIdx;
  if (runLength > 2) {
    count("excessive_blank_lines");
    return { lines: [""], nextIndex: end };
  }
  return { lines: lines.slice(startIdx, end), nextIndex: end };
}
|
|
197240
|
+
/**
 * Drop an HTML comment starting at startIdx. Single-line comments and
 * multi-line comments that close on a later line are removed entirely
 * (counted as "html_comment"); an unterminated comment is kept as plain
 * text so no content is lost.
 */
function handleHtmlComment(lines, startIdx, count) {
  const opener = lines[startIdx];
  if (opener.includes("-->")) {
    count("html_comment");
    return { lines: [], nextIndex: startIdx + 1 };
  }
  for (let scan = startIdx + 1; scan < lines.length; scan++) {
    if (lines[scan].includes("-->")) {
      count("html_comment");
      return { lines: [], nextIndex: scan + 1 };
    }
  }
  // Never closed: emit the opening line verbatim and move on.
  return { lines: [opener], nextIndex: startIdx + 1 };
}
|
|
197256
|
+
// A paragraph line that is exactly "{" or "[" likely opens a bare JSON block.
function looksLikeJsonBlockStart(trimmed) {
  return ["{", "["].includes(trimmed);
}
|
|
197259
|
+
// Minimum number of lines before a bare JSON object/array is worth fencing.
var MIN_JSON_BLOCK_LINES = 5;
/**
 * Detect a bare (unfenced) JSON object or array starting at startIdx using
 * brace/bracket depth tracking that is string- and escape-aware and
 * tolerates JSONC-style trailing "//" comments, and wrap it in a ```json
 * fence (counted as "unfenced_json_block"). Returns null when the block is
 * too short, unbalanced, does not end on its closing character, or never
 * closes before the input runs out.
 */
function handleUnfencedJson(lines, startIdx, count) {
  const firstChar = lines[startIdx].trimStart()[0];
  const expectedClose = firstChar === "{" ? "}" : "]";
  let nesting = 0;
  let insideString = false;
  let idx = startIdx;
  while (idx < lines.length) {
    const current = lines[idx];
    for (let pos = 0; pos < current.length; pos++) {
      const ch = current[pos];
      if (insideString && ch === "\\") {
        pos++; // skip the escaped character
        continue;
      }
      if (ch === '"') {
        insideString = !insideString;
        continue;
      }
      if (insideString) continue;
      // Ignore the rest of the line after a // comment (JSONC tolerance).
      if (ch === "/" && current[pos + 1] === "/") break;
      if (ch === "{" || ch === "[") nesting++;
      else if (ch === "}" || ch === "]") nesting--;
    }
    idx++;
    if (nesting === 0) {
      if (idx - startIdx < MIN_JSON_BLOCK_LINES) return null;
      if (!lines[idx - 1].trimEnd().endsWith(expectedClose)) return null;
      count("unfenced_json_block");
      return {
        lines: ["```json", ...lines.slice(startIdx, idx), "```"],
        nextIndex: idx
      };
    }
    if (nesting < 0) return null;
  }
  return null;
}
|
|
197313
|
+
/**
 * Split lines where a code-fence opener trails prose on the same line
 * (e.g. "Example: ```js") into a prose line plus a clean fence line, so
 * downstream fence matching works (counted as "inline_code_fence"). Lines
 * already starting with a fence are left alone.
 */
function splitInlineFences(lines, count) {
  const output = [];
  for (const original of lines) {
    const stripped = original.trimStart();
    if (/^(`{3,}|~{3,})/.test(stripped)) {
      output.push(original);
      continue;
    }
    const tail = stripped.match(/(`{3,}|~{3,})(\S*)\s*$/);
    if (tail) {
      const fence = tail[1];
      const splitAt = stripped.lastIndexOf(fence);
      const prefix = stripped.substring(0, splitAt);
      if (prefix.trim().length > 0) {
        const indent = original.substring(0, original.length - stripped.length);
        count("inline_code_fence");
        output.push(indent + prefix.trimEnd());
        output.push(stripped.substring(splitAt));
        continue;
      }
    }
    output.push(original);
  }
  return output;
}
|
|
197338
|
+
/**
 * Heuristic: a line is a potential table row if it contains at least one
 * unescaped pipe and is not a heading or a code fence.
 */
function looksLikeTableRow(trimmed) {
  const isHeadingOrFence =
    trimmed.startsWith("#") || trimmed.startsWith("```") || trimmed.startsWith("~~~");
  return isHeadingOrFence ? false : countPipes(trimmed) >= 1;
}
/** Count unescaped "|" characters in text2 (a pipe preceded by "\" is ignored). */
function countPipes(text2) {
  let total = 0;
  for (let pos = 0; pos < text2.length; pos++) {
    const escaped = pos > 0 && text2[pos - 1] === "\\";
    if (text2[pos] === "|" && !escaped) {
      total++;
    }
  }
  return total;
}
|
|
196474
197353
|
// ../llm/src/utils/mapConcurrent.ts
|
|
196475
197354
|
async function mapConcurrent(items, concurrency, fn) {
|
|
196476
197355
|
const results = [];
|
|
@@ -196499,9 +197378,760 @@ async function mapConcurrent(items, concurrency, fn) {
|
|
|
196499
197378
|
}
|
|
196500
197379
|
// ../api/src/services/docIndexer.ts
|
|
196501
197380
|
init_src();
|
|
197381
|
+
|
|
197382
|
+
// ../api/src/services/docEmbedding.ts
|
|
197383
|
+
// Number of paragraph texts sent per embedBatch() call in generateEmbeddings().
var EMBEDDING_BATCH_SIZE = 20;
// Per-paragraph token budget; truncateForEmbedding() enforces it at ~4 chars/token.
var EMBEDDING_MAX_TOKENS = 480;
|
|
197385
|
+
/**
 * Heuristic test for paragraphs that are purely code: either a fully
 * fenced ``` block, or at least three non-empty lines whose indentation
 * and punctuation density look like source code rather than prose.
 */
function isPureCodeBlock(text2) {
  const body = text2.trim();
  if (/^```[\s\S]*```\s*$/.test(body)) {
    return true;
  }
  const rows = body.split("\n").filter(Boolean);
  if (rows.length < 3) {
    return false;
  }
  const indented = rows.filter((row) => /^\s{2,}/.test(row)).length;
  const indentShare = indented / rows.length;
  if (indentShare > 0.8) {
    return true;
  }
  const punctuation = (body.match(/[{}();=><|&![\]]/g) || []).length;
  const punctuationShare = punctuation / body.length;
  return punctuationShare > 0.15 && indentShare > 0.6;
}
|
|
197403
|
+
// Hard caps applied to a skeletonized code block.
var CODE_SKELETON_MAX_LINES = 20;
var CODE_SKELETON_MAX_CHARS = 800;
/**
 * Reduce a code block to a structural skeleton: keep only lines indented
 * at most two indent-units deep, replace each elided run of deeper lines
 * with a "..." marker, then cap the result at CODE_SKELETON_MAX_LINES
 * lines and CODE_SKELETON_MAX_CHARS characters (appending "[...]" when
 * truncated).
 */
function skeletonizeCodeBlock(text2) {
  const lines = text2.split(`
`);
  // Infer the indent unit from the first indented line; default to 2.
  let indentUnit = 2;
  for (const line of lines) {
    const match2 = line.match(/^(\s+)\S/);
    if (match2) {
      // NOTE(review): replacing tabs with a single space leaves the length
      // unchanged — possibly intended to expand tabs wider; confirm.
      const spaces = match2[1].replace(/\t/g, " ").length;
      if (spaces > 0) {
        indentUnit = spaces;
        break;
      }
    }
  }
  const maxIndent = indentUnit * 2;
  const kept = [];
  let lastWasElided = false;
  for (const line of lines) {
    const trimmed = line.trimStart();
    if (trimmed === "")
      continue;
    const leadingSpaces = line.replace(/\t/g, " ").length - trimmed.length;
    if (leadingSpaces <= maxIndent) {
      // Emit a single "..." marker for the elided run preceding this kept line.
      if (lastWasElided) {
        kept.push(" ...");
        lastWasElided = false;
      }
      kept.push(line);
    } else {
      lastWasElided = true;
    }
  }
  if (lastWasElided)
    kept.push(" ...");
  let result = kept;
  if (result.length > CODE_SKELETON_MAX_LINES) {
    result = result.slice(0, CODE_SKELETON_MAX_LINES);
    result.push("[...]");
  }
  let joined = result.join(`
`);
  if (joined.length > CODE_SKELETON_MAX_CHARS) {
    joined = joined.slice(0, CODE_SKELETON_MAX_CHARS) + `
[...]`;
  }
  return joined;
}
|
|
197452
|
+
/**
 * Cap text2 at roughly maxTokens tokens (~4 chars per token), preferring
 * to break at the last space when one falls in the final 20% of the
 * character budget; otherwise cut hard at the budget.
 */
function truncateForEmbedding(text2, maxTokens) {
  const budget = maxTokens * 4;
  if (text2.length <= budget) {
    return text2;
  }
  const lastSpace = text2.lastIndexOf(" ", budget);
  const cut = lastSpace > budget * 0.8 ? lastSpace : budget;
  return text2.slice(0, cut);
}
|
|
197460
|
+
/**
 * Embed every prose paragraph of a digest and attach the vectors as
 * digest.embeddings. Paragraphs that look like pure code (isPureCodeBlock)
 * are skipped; remaining texts are truncated to EMBEDDING_MAX_TOKENS.
 * Paragraphs are embedded in batches of EMBEDDING_BATCH_SIZE via
 * embedBatch(); when a whole batch fails, each of its paragraphs is
 * retried one-by-one via embed(), and individual failures are logged and
 * dropped. Progress (85–95) is reported through the optional onProgress
 * callback. Returns the number of embeddings produced (0 when there is
 * nothing to embed).
 */
async function generateEmbeddings(digest, embeddingService, onProgress) {
  const paragraphs = [];
  let skippedCode = 0;
  // Flatten sections into a work list, skipping code-only paragraphs.
  for (let sIdx = 0;sIdx < digest.sections.length; sIdx++) {
    const section = digest.sections[sIdx];
    for (let pIdx = 0;pIdx < section.paragraphs.length; pIdx++) {
      const text2 = section.paragraphs[pIdx].text;
      if (isPureCodeBlock(text2)) {
        skippedCode++;
        continue;
      }
      paragraphs.push({
        sectionIndex: sIdx,
        paragraphIndex: pIdx,
        text: truncateForEmbedding(text2, EMBEDDING_MAX_TOKENS)
      });
    }
  }
  if (paragraphs.length === 0)
    return 0;
  if (skippedCode > 0) {
    onProgress?.({ phase: "embedding", progress: 85, message: `Skipped ${skippedCode} code-only paragraphs` });
  }
  const embeddings = [];
  const totalParagraphs = paragraphs.length;
  onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
  // Warm the model via getDimension() so per-batch timing excludes load time.
  const warmupStart = Date.now();
  await embeddingService.getDimension();
  const warmupMs = Date.now() - warmupStart;
  if (warmupMs > 500) {
    onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
  }
  const totalBatches = Math.ceil(totalParagraphs / EMBEDDING_BATCH_SIZE);
  for (let i = 0;i < totalParagraphs; i += EMBEDDING_BATCH_SIZE) {
    const batchIndex = Math.floor(i / EMBEDDING_BATCH_SIZE) + 1;
    const batch2 = paragraphs.slice(i, i + EMBEDDING_BATCH_SIZE);
    const texts = batch2.map((p4) => p4.text);
    const batchStart = Date.now();
    try {
      const vectors = await embeddingService.embedBatch(texts);
      // Vectors come back positionally aligned with the batch texts.
      for (let j = 0;j < batch2.length; j++) {
        embeddings.push({
          sectionIndex: batch2[j].sectionIndex,
          paragraphIndex: batch2[j].paragraphIndex,
          vector: vectors[j]
        });
      }
    } catch {
      // Batch call failed: fall back to embedding each paragraph individually.
      for (let fi = 0;fi < batch2.length; fi++) {
        const p4 = batch2[fi];
        try {
          const vector = await embeddingService.embed(p4.text);
          embeddings.push({
            sectionIndex: p4.sectionIndex,
            paragraphIndex: p4.paragraphIndex,
            vector
          });
        } catch {
          // Per-paragraph failure is non-fatal: log and move on.
          console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`);
        }
        const embedded2 = i + fi + 1;
        const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
        onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
      }
      continue;
    }
    const embedded = Math.min(i + EMBEDDING_BATCH_SIZE, totalParagraphs);
    const batchMs = Date.now() - batchStart;
    const progress = 86 + Math.round(embedded / totalParagraphs * 9);
    onProgress?.({ phase: "embedding", progress, message: `Batch ${batchIndex}/${totalBatches} (${embedded}/${totalParagraphs}, ${(batchMs / 1000).toFixed(1)}s)` });
  }
  digest.embeddings = embeddings;
  return embeddings.length;
}
|
|
197534
|
+
/**
 * Persist a digest's paragraph embeddings into the vector store under ids
 * "<hashId>:<sectionIndex>:<paragraphIndex>", first deleting any prior
 * entries with the same hashId prefix. Failures are logged and swallowed
 * (non-blocking) so indexing can proceed without vector search.
 */
async function writeToVectorStore(digest, vectorStore, hashId, sourceId, sourcePath) {
  if (digest.embeddings.length === 0) {
    return;
  }
  try {
    // Clear stale vectors for this document before re-adding.
    await vectorStore.deleteByPrefix(`${hashId}:`);
    const records = digest.embeddings.map((entry) => ({
      id: `${hashId}:${entry.sectionIndex}:${entry.paragraphIndex}`,
      embedding: entry.vector,
      metadata: {
        layer: "digest",
        sourceId,
        hashId,
        sourcePath,
        sectionIndex: entry.sectionIndex,
        paragraphIndex: entry.paragraphIndex
      }
    }));
    await vectorStore.add(records);
  } catch (err2) {
    console.warn(`[docIndexer] IVectorStore write failed (non-blocking):`, err2);
  }
}
|
|
197555
|
+
|
|
197556
|
+
// ../api/src/services/docTableExtractor.ts
|
|
197557
|
+
init_src();
|
|
197558
|
+
/**
 * Return the column count of a markdown table in text2, derived from its
 * separator row (e.g. "| --- | --- |"), or 0 when no separator exists.
 */
function detectTableColumnCount(text2) {
  const separator = text2.match(/^\|[\s:-]+(?:\|[\s:-]+)+\|?\s*$/m);
  if (!separator) {
    return 0;
  }
  const pipeCount = separator[0].match(/\|/g)?.length ?? 1;
  return pipeCount - 1;
}
|
|
197564
|
+
/**
 * Second-pass LLM extraction focused on markdown tables in a chunk.
 * Collects every paragraph containing a table of >= 2 columns, builds a
 * prompt containing each table with its section heading and the
 * immediately preceding prose paragraph as context, tags each table
 * [P<i>] within the prompt, and merges the parsed atoms back into
 * `result` under the paragraph's original P<chunkParaIndex> tag (atom
 * types present in the table result overwrite those on an existing
 * paragraph entry). Non-blocking: parse failures and thrown errors are
 * logged and reported as zero extractions.
 * Returns { extracted, llmCalls, totalTokens }.
 */
async function extractTableAtoms(chunk, sections, result, llmService) {
  const tableParagraphs = [];
  // Find paragraphs whose text contains a multi-column table.
  for (let i = 0;i < chunk.paragraphs.length; i++) {
    const p4 = chunk.paragraphs[i];
    const colCount = detectTableColumnCount(p4.text);
    if (colCount < 2)
      continue;
    const section = sections[p4.sectionIndex];
    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
    tableParagraphs.push({
      chunkParaIndex: i,
      colCount,
      text: p4.text,
      sectionHeading
    });
  }
  if (tableParagraphs.length === 0) {
    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
  }
  const parts = [];
  // Prompt tags are P<table index>; this map recovers the chunk paragraph index.
  const tagToChunkIndex = new Map;
  for (let ti = 0;ti < tableParagraphs.length; ti++) {
    const tp = tableParagraphs[ti];
    const tag2 = `P${ti}`;
    tagToChunkIndex.set(tag2, tp.chunkParaIndex);
    if (tp.sectionHeading) {
      parts.push(tp.sectionHeading);
    }
    // Include the preceding non-table paragraph as context for the table.
    if (tp.chunkParaIndex > 0) {
      const prevPara = chunk.paragraphs[tp.chunkParaIndex - 1];
      if (prevPara && detectTableColumnCount(prevPara.text) === 0) {
        parts.push(prevPara.text);
      }
    }
    parts.push(`[${tag2}] ${tp.text}`);
    parts.push("");
  }
  const tableText = parts.join(`

`);
  const prompt = buildDocTableAnnotationPrompt(tableText);
  try {
    const res = await llmService.generateText(prompt, {
      systemPrompt: DOC_TABLE_ANNOTATION_SYSTEM_PROMPT
    });
    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
    if (!parsed.success) {
      console.warn(`[docIndexer] table extraction: parse failed: ${parsed.error.message.slice(0, 200)}`);
      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
    }
    let extracted = 0;
    for (const tableP of parsed.data.paragraphs) {
      const chunkParaIndex = tagToChunkIndex.get(tableP.tag);
      if (chunkParaIndex === undefined)
        continue;
      // Re-key the result onto the chunk-level paragraph tag.
      const originalTag = `P${chunkParaIndex}`;
      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
      const tableAtomCount = Object.values(tableP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
      if (tableAtomCount === 0)
        continue;
      if (existing) {
        // Overwrite only the atom types the table pass produced.
        for (const [atomType, atoms2] of Object.entries(tableP.atoms)) {
          if (Array.isArray(atoms2) && atoms2.length > 0) {
            existing.atoms[atomType] = atoms2;
          }
        }
      } else {
        result.paragraphs.push({ ...tableP, tag: originalTag });
      }
      extracted++;
      const tp = tableParagraphs.find((t4) => t4.chunkParaIndex === chunkParaIndex);
      console.log(`[docIndexer] table extraction: ${originalTag} → ${tableAtomCount} atoms (table has ${tp?.colCount ?? "?"} cols)`);
    }
    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
  } catch (err2) {
    console.warn("[docIndexer] table extraction failed (non-blocking):", err2);
    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
  }
}
|
|
197643
|
+
|
|
197644
|
+
// ../api/src/services/docDiagramExtractor.ts
|
|
197645
|
+
init_src();
|
|
197646
|
+
// Matches an opening fence line tagged with a known diagram language (e.g. ```mermaid).
// NOTE(review): appears unused in this chunk — presumably consumed elsewhere; confirm.
var DIAGRAM_OPEN_RE = new RegExp(`^\`\`\`(?:${DIAGRAM_FENCE_TAGS.join("|")})\\s*$`, "im");
/**
 * If text2 is a complete fenced diagram block, return its lowercase
 * format tag (e.g. "mermaid"); otherwise return null.
 */
function detectDiagramFormat(text2) {
  const body = text2.trim();
  const openRe = new RegExp(`^\`\`\`(${DIAGRAM_FENCE_TAGS.join("|")})\\s*\\n`, "i");
  const opener = body.match(openRe);
  if (!opener) {
    return null;
  }
  // Require the closing fence so partial blocks are not treated as diagrams.
  return body.endsWith("```") ? opener[1].toLowerCase() : null;
}
|
|
197656
|
+
/**
 * Second-pass LLM extraction focused on fenced diagrams (mermaid etc.)
 * in a chunk. Mirrors extractTableAtoms: collects paragraphs that are
 * complete diagram blocks, builds a prompt with section heading and the
 * preceding non-diagram paragraph as context, tags each diagram [P<i>],
 * and merges parsed atoms back into `result` under the paragraph's
 * original P<chunkParaIndex> tag. Non-blocking: failures are logged and
 * reported as zero extractions.
 * Returns { extracted, llmCalls, totalTokens }.
 */
async function extractDiagramAtoms(chunk, sections, result, llmService) {
  const diagramParagraphs = [];
  // Find paragraphs that are complete fenced diagram blocks.
  for (let i = 0;i < chunk.paragraphs.length; i++) {
    const p4 = chunk.paragraphs[i];
    const format = detectDiagramFormat(p4.text);
    if (!format)
      continue;
    const section = sections[p4.sectionIndex];
    const sectionHeading = section?.heading ? `${"#".repeat(section.level)} ${section.heading}` : "";
    diagramParagraphs.push({
      chunkParaIndex: i,
      format,
      text: p4.text,
      sectionHeading
    });
  }
  if (diagramParagraphs.length === 0) {
    return { extracted: 0, llmCalls: 0, totalTokens: 0 };
  }
  const parts = [];
  // Prompt tags are P<diagram index>; this map recovers the chunk paragraph index.
  const tagToChunkIndex = new Map;
  for (let di = 0;di < diagramParagraphs.length; di++) {
    const dp = diagramParagraphs[di];
    const tag2 = `P${di}`;
    tagToChunkIndex.set(tag2, dp.chunkParaIndex);
    if (dp.sectionHeading) {
      parts.push(dp.sectionHeading);
    }
    // Include the preceding non-diagram paragraph as context.
    if (dp.chunkParaIndex > 0) {
      const prevPara = chunk.paragraphs[dp.chunkParaIndex - 1];
      if (prevPara && !detectDiagramFormat(prevPara.text)) {
        parts.push(prevPara.text);
      }
    }
    parts.push(`[${tag2}] ${dp.text}`);
    parts.push("");
  }
  const diagramText = parts.join(`

`);
  const prompt = buildDocDiagramAnnotationPrompt(diagramText);
  try {
    const res = await llmService.generateText(prompt, {
      systemPrompt: DOC_DIAGRAM_ANNOTATION_SYSTEM_PROMPT
    });
    const parsed = parseExtractionOutput(res.text, docChunkResultSchema);
    if (!parsed.success) {
      console.warn(`[docIndexer] diagram extraction: parse failed — ${parsed.error.message.slice(0, 200)}`);
      return { extracted: 0, llmCalls: 1, totalTokens: res.usage.totalTokens };
    }
    let extracted = 0;
    for (const diagramP of parsed.data.paragraphs) {
      const chunkParaIndex = tagToChunkIndex.get(diagramP.tag);
      if (chunkParaIndex === undefined)
        continue;
      // Re-key the result onto the chunk-level paragraph tag.
      const originalTag = `P${chunkParaIndex}`;
      const existing = result.paragraphs.find((rp) => rp.tag === originalTag);
      const diagramAtomCount = Object.values(diagramP.atoms).reduce((sum, arr) => sum + (Array.isArray(arr) ? arr.length : 0), 0);
      if (diagramAtomCount === 0)
        continue;
      if (existing) {
        // Overwrite only the atom types the diagram pass produced.
        for (const [atomType, atoms2] of Object.entries(diagramP.atoms)) {
          if (Array.isArray(atoms2) && atoms2.length > 0) {
            existing.atoms[atomType] = atoms2;
          }
        }
      } else {
        result.paragraphs.push({ ...diagramP, tag: originalTag });
      }
      extracted++;
      const dp = diagramParagraphs.find((d) => d.chunkParaIndex === chunkParaIndex);
      console.log(`[docIndexer] diagram extraction: ${originalTag} → ${diagramAtomCount} atoms (${dp?.format ?? "unknown"} diagram)`);
    }
    return { extracted, llmCalls: 1, totalTokens: res.usage.totalTokens };
  } catch (err2) {
    console.warn("[docIndexer] diagram extraction failed (non-blocking):", err2);
    return { extracted: 0, llmCalls: 1, totalTokens: 0 };
  }
}
|
|
197735
|
+
|
|
197736
|
+
// ../api/src/services/docAtomPostProcess.ts
|
|
197737
|
+
/**
 * Post-extraction cleanup pipeline for digest atoms, run in order:
 * 1. auto-create entities that relations/boundaries reference but never declare,
 * 2. merge entity-name variants onto a canonical (longer) form,
 * 3. log non-blocking warnings for dangling cross-references.
 */
function postProcessDigestAtoms(sections) {
  autoCompleteEntities(sections);
  normalizeEntityNames(sections);
  warnCrossRefIssues(sections);
}
|
|
197742
|
+
/**
 * Reject entity names that are extraction noise: empty/whitespace-only
 * strings, "$"-prefixed placeholders, names containing "+" or "=", and
 * names starting with a digit.
 */
function isNoiseEntityName(name21) {
  const candidate = name21.trim();
  if (candidate.length === 0) return true;
  if (candidate.startsWith("$")) return true;
  return /[+=]/.test(candidate) || /^\d/.test(candidate);
}
|
|
197754
|
+
/**
 * Ensure every entity name referenced by relation endpoints or boundary
 * contains/excludes lists has a declared entity atom. Missing names that
 * are not noise (isNoiseEntityName) are auto-created in the referencing
 * paragraph as kind "concept" with confidence 0.6. Mutates `sections`
 * in place and logs a summary when anything was created or skipped.
 */
function autoCompleteEntities(sections) {
  // Pass 1: gather every explicitly declared entity name.
  const declaredEntities = new Set;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const entities2 = para.atoms.entities;
      if (entities2) {
        for (const e of entities2)
          declaredEntities.add(e.name);
      }
    }
  }
  let autoCreated = 0;
  let skippedNoise = 0;
  // Pass 2: find names referenced by relations/boundaries but never declared.
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const referencedNames = new Set;
      const relations = para.atoms.relations;
      if (relations) {
        for (const r of relations) {
          referencedNames.add(r.from);
          referencedNames.add(r.to);
        }
      }
      const boundaries = para.atoms.boundaries;
      if (boundaries) {
        for (const b of boundaries) {
          for (const name21 of b.contains)
            referencedNames.add(name21);
          if (b.excludes)
            for (const name21 of b.excludes)
              referencedNames.add(name21);
        }
      }
      for (const name21 of referencedNames) {
        if (!declaredEntities.has(name21)) {
          if (isNoiseEntityName(name21)) {
            skippedNoise++;
            continue;
          }
          if (!para.atoms.entities) {
            para.atoms.entities = [];
          }
          // Auto-created entities get a generic kind and reduced confidence.
          para.atoms.entities.push({
            name: name21,
            kind: "concept",
            confidence: 0.6
          });
          // Mark as declared so later paragraphs don't create duplicates.
          declaredEntities.add(name21);
          autoCreated++;
        }
      }
    }
  }
  if (autoCreated > 0 || skippedNoise > 0) {
    console.log(`[docAtomPostProcess] auto-created ${autoCreated} entities, skipped ${skippedNoise} noise names`);
  }
}
|
|
197811
|
+
/**
 * Merge entity-name variants onto a canonical form: a name (>= 3 chars)
 * that is a substring of a strictly longer name is rewritten to the
 * first longest-first name that contains it and has not itself been
 * merged away. The mapping is applied to entity atoms (then deduplicated
 * per paragraph), relation endpoints, and boundary contains/excludes
 * lists. Merge rules are single-hop — chains are not resolved further.
 * Mutates `sections` in place and logs each applied rule.
 */
function normalizeEntityNames(sections) {
  const allNames = [];
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const entities2 = para.atoms.entities;
      if (entities2) {
        for (const e of entities2)
          allNames.push(e.name);
      }
    }
  }
  // Longest names first, so canonical targets are scanned before their variants.
  const uniqueNames = [...new Set(allNames)].sort((a, b) => b.length - a.length);
  const mergeMap = new Map;
  for (let i = 0;i < uniqueNames.length; i++) {
    const short = uniqueNames[i];
    if (short.length < 3)
      continue;
    if (mergeMap.has(short))
      continue;
    // Only names earlier in the list (longer or equal) can be canonical.
    for (let j = 0;j < i; j++) {
      const long = uniqueNames[j];
      if (mergeMap.has(long))
        continue;
      if (long.includes(short) && long !== short) {
        mergeMap.set(short, long);
        break;
      }
    }
  }
  if (mergeMap.size === 0)
    return;
  let normalized = 0;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const entities2 = para.atoms.entities;
      if (entities2) {
        for (const e of entities2) {
          const canonical = mergeMap.get(e.name);
          if (canonical) {
            e.name = canonical;
            normalized++;
          }
        }
        // Renaming can create duplicates within a paragraph; keep first occurrence.
        const seen = new Set;
        para.atoms.entities = entities2.filter((e) => {
          if (seen.has(e.name))
            return false;
          seen.add(e.name);
          return true;
        });
      }
      const relations = para.atoms.relations;
      if (relations) {
        for (const r of relations) {
          const fromCanonical = mergeMap.get(r.from);
          if (fromCanonical) {
            r.from = fromCanonical;
            normalized++;
          }
          const toCanonical = mergeMap.get(r.to);
          if (toCanonical) {
            r.to = toCanonical;
            normalized++;
          }
        }
      }
      const boundaries = para.atoms.boundaries;
      if (boundaries) {
        for (const b of boundaries) {
          b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
          if (b.excludes)
            b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
        }
      }
    }
  }
  if (normalized > 0) {
    console.log(`[docAtomPostProcess] normalized ${normalized} entity name references (${mergeMap.size} merge rules)`);
  }
}
|
|
197894
|
+
/**
 * Emit non-blocking console warnings for dangling cross-references in
 * extracted atoms:
 *  - transitions whose from/to value is not among any declared state values,
 *  - role.performs entries that name no declared behavior.
 * Each check is skipped entirely when its declaration set is empty, and a
 * summary line is emitted when any warnings fired.
 *
 * Fix: allBehaviorNames was previously rebuilt from ALL sections inside
 * the per-paragraph roles branch (accidental O(paragraphs × atoms)); the
 * set does not depend on the current paragraph, so it is built once up
 * front. The unused allEntityNames accumulator was also removed.
 */
function warnCrossRefIssues(sections) {
  // Every declared state value across all sections.
  const allStateValues = new Set;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const states = para.atoms.states;
      if (states) {
        for (const s of states)
          for (const v of s.values)
            allStateValues.add(v);
      }
    }
  }
  // Every declared behavior name across all sections (hoisted — see above).
  const allBehaviorNames = new Set;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const behaviors = para.atoms.behaviors;
      if (behaviors)
        for (const b of behaviors)
          allBehaviorNames.add(b.name);
    }
  }
  let warnings = 0;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      const transitions = para.atoms.transitions;
      if (transitions) {
        for (const t4 of transitions) {
          if (allStateValues.size > 0 && !allStateValues.has(t4.from)) {
            console.warn(`[docAtomPostProcess] transition.from "${t4.from}" not in declared states`);
            warnings++;
          }
          if (allStateValues.size > 0 && !allStateValues.has(t4.to)) {
            console.warn(`[docAtomPostProcess] transition.to "${t4.to}" not in declared states`);
            warnings++;
          }
        }
      }
      const roles = para.atoms.roles;
      if (roles) {
        for (const role of roles) {
          if (role.performs) {
            for (const p4 of role.performs) {
              if (allBehaviorNames.size > 0 && !allBehaviorNames.has(p4)) {
                console.warn(`[docAtomPostProcess] role.performs "${p4}" not in declared behaviors`);
                warnings++;
              }
            }
          }
        }
      }
    }
  }
  if (warnings > 0) {
    console.warn(`[docAtomPostProcess] ${warnings} cross-reference warnings (non-blocking)`);
  }
}
|
|
197960
|
+
/**
 * Flag entity names that look like extraction noise: very short CJK
 * fragments (1-4 chars), short all-ASCII words (≤5 chars) starting with a
 * lowercase letter, and "Kill <n>"-style artifacts. Input order is
 * preserved in the returned list.
 */
function detectNoiseCandidates(entityNames) {
  const noisePatterns = [
    /^[\u4e00-\u9fff]{1,4}$/,
    /^Kill\s+\d/
  ];
  const flagged = [];
  for (const name21 of entityNames) {
    if (noisePatterns.some((re) => re.test(name21))) {
      flagged.push(name21);
      continue;
    }
    if (/^[a-zA-Z]{1,5}$/.test(name21) && name21[0] === name21[0].toLowerCase()) {
      flagged.push(name21);
    }
  }
  return flagged;
}
|
|
197978
|
+
/**
 * Summarize extraction results across sections: total atoms per type,
 * the set of unique entity names, and paragraph coverage (how many
 * paragraphs carried at least one non-empty atom array).
 */
function collectExtractionStats(sections) {
  const atomTypeCounts = {};
  const entityNames = new Set;
  let paragraphsWithAtoms = 0;
  let paragraphsTotal = 0;
  for (const section of sections) {
    for (const para of section.paragraphs) {
      paragraphsTotal++;
      const nonEmpty = Object.entries(para.atoms)
        .filter(([, atoms2]) => Array.isArray(atoms2) && atoms2.length > 0);
      if (nonEmpty.length > 0) paragraphsWithAtoms++;
      for (const [atomType, atoms2] of nonEmpty) {
        atomTypeCounts[atomType] = (atomTypeCounts[atomType] ?? 0) + atoms2.length;
        if (atomType === "entities") {
          for (const e of atoms2) entityNames.add(e.name);
        }
      }
    }
  }
  return {
    entityCount: atomTypeCounts.entities ?? 0,
    relationCount: atomTypeCounts.relations ?? 0,
    atomTypeCounts,
    uniqueEntityNames: [...entityNames],
    paragraphsWithAtoms,
    paragraphsTotal
  };
}
|
|
198010
|
+
/**
 * Propose (short → long) entity-name merge candidates by comparing every
 * pair of names longest-first: exact substrings, case-insensitive
 * duplicates, and case-insensitive substrings (short name ≥ 4 chars).
 * Short names under 3 characters are never candidates.
 */
function detectResolutionCandidates(entityNames) {
  const byLengthDesc = [...entityNames].sort((a, b) => b.length - a.length);
  // Classify a pair; exact substring wins over case-insensitive checks.
  const classify = (short, long) => {
    if (long.includes(short)) return "substring match";
    const shortLc = short.toLowerCase();
    const longLc = long.toLowerCase();
    if (longLc === shortLc) return "case-insensitive match";
    if (longLc.includes(shortLc) && short.length >= 4) return "case-insensitive substring";
    return null;
  };
  const candidates = [];
  byLengthDesc.forEach((long, i) => {
    for (const short of byLengthDesc.slice(i + 1)) {
      if (short.length < 3 || short === long) continue;
      const reason = classify(short, long);
      if (reason) candidates.push({ short, long, reason });
    }
  });
  return candidates;
}
|
|
198036
|
+
function applyEntityMerges(sections, merges) {
|
|
198037
|
+
if (merges.length === 0)
|
|
198038
|
+
return 0;
|
|
198039
|
+
const mergeMap = new Map;
|
|
198040
|
+
for (const m of merges)
|
|
198041
|
+
mergeMap.set(m.from, m.to);
|
|
198042
|
+
let normalized = 0;
|
|
198043
|
+
for (const section of sections) {
|
|
198044
|
+
for (const para of section.paragraphs) {
|
|
198045
|
+
const entities2 = para.atoms.entities;
|
|
198046
|
+
if (entities2) {
|
|
198047
|
+
for (const e of entities2) {
|
|
198048
|
+
const canonical = mergeMap.get(e.name);
|
|
198049
|
+
if (canonical) {
|
|
198050
|
+
e.name = canonical;
|
|
198051
|
+
normalized++;
|
|
198052
|
+
}
|
|
198053
|
+
}
|
|
198054
|
+
const seen = new Set;
|
|
198055
|
+
para.atoms.entities = entities2.filter((e) => {
|
|
198056
|
+
if (seen.has(e.name))
|
|
198057
|
+
return false;
|
|
198058
|
+
seen.add(e.name);
|
|
198059
|
+
return true;
|
|
198060
|
+
});
|
|
198061
|
+
}
|
|
198062
|
+
const relations = para.atoms.relations;
|
|
198063
|
+
if (relations) {
|
|
198064
|
+
for (const r of relations) {
|
|
198065
|
+
const fromCanonical = mergeMap.get(r.from);
|
|
198066
|
+
if (fromCanonical) {
|
|
198067
|
+
r.from = fromCanonical;
|
|
198068
|
+
normalized++;
|
|
198069
|
+
}
|
|
198070
|
+
const toCanonical = mergeMap.get(r.to);
|
|
198071
|
+
if (toCanonical) {
|
|
198072
|
+
r.to = toCanonical;
|
|
198073
|
+
normalized++;
|
|
198074
|
+
}
|
|
198075
|
+
}
|
|
198076
|
+
}
|
|
198077
|
+
const boundaries = para.atoms.boundaries;
|
|
198078
|
+
if (boundaries) {
|
|
198079
|
+
for (const b of boundaries) {
|
|
198080
|
+
b.contains = b.contains.map((n) => mergeMap.get(n) ?? n);
|
|
198081
|
+
if (b.excludes)
|
|
198082
|
+
b.excludes = b.excludes.map((n) => mergeMap.get(n) ?? n);
|
|
198083
|
+
}
|
|
198084
|
+
}
|
|
198085
|
+
}
|
|
198086
|
+
}
|
|
198087
|
+
if (normalized > 0) {
|
|
198088
|
+
console.log(`[docAtomPostProcess] LLM entity resolution: normalized ${normalized} references (${merges.length} merge rules)`);
|
|
198089
|
+
for (const m of merges) {
|
|
198090
|
+
console.log(` "${m.from}" → "${m.to}"`);
|
|
198091
|
+
}
|
|
198092
|
+
}
|
|
198093
|
+
return normalized;
|
|
198094
|
+
}
|
|
198095
|
+
function removeNoiseEntities(sections, names) {
|
|
198096
|
+
if (names.length === 0)
|
|
198097
|
+
return 0;
|
|
198098
|
+
const removeSet = new Set(names);
|
|
198099
|
+
let removed = 0;
|
|
198100
|
+
for (const section of sections) {
|
|
198101
|
+
for (const para of section.paragraphs) {
|
|
198102
|
+
const entities2 = para.atoms.entities;
|
|
198103
|
+
if (entities2) {
|
|
198104
|
+
const before = entities2.length;
|
|
198105
|
+
para.atoms.entities = entities2.filter((e) => !removeSet.has(e.name));
|
|
198106
|
+
removed += before - para.atoms.entities.length;
|
|
198107
|
+
}
|
|
198108
|
+
const relations = para.atoms.relations;
|
|
198109
|
+
if (relations) {
|
|
198110
|
+
para.atoms.relations = relations.filter((r) => !removeSet.has(r.from) || !removeSet.has(r.to));
|
|
198111
|
+
}
|
|
198112
|
+
const boundaries = para.atoms.boundaries;
|
|
198113
|
+
if (boundaries) {
|
|
198114
|
+
for (const b of boundaries) {
|
|
198115
|
+
b.contains = b.contains.filter((n) => !removeSet.has(n));
|
|
198116
|
+
if (b.excludes)
|
|
198117
|
+
b.excludes = b.excludes.filter((n) => !removeSet.has(n));
|
|
198118
|
+
}
|
|
198119
|
+
}
|
|
198120
|
+
}
|
|
198121
|
+
}
|
|
198122
|
+
if (removed > 0) {
|
|
198123
|
+
console.log(`[docAtomPostProcess] removed ${removed} noise entity instances (${names.length} names)`);
|
|
198124
|
+
for (const n of names) {
|
|
198125
|
+
console.log(` ✕ "${n}"`);
|
|
198126
|
+
}
|
|
198127
|
+
}
|
|
198128
|
+
return removed;
|
|
198129
|
+
}
|
|
198130
|
+
|
|
198131
|
+
// ../api/src/services/docIndexer.ts
|
|
196502
198132
|
var CHUNK_CONCURRENCY = 2;
|
|
196503
198133
|
var GLEANING_MAX_ROUNDS = 2;
|
|
196504
|
-
var
|
|
198134
|
+
var CODE_BLOCK_MIN_LENGTH = 500;
|
|
196505
198135
|
function injectParagraphTags(chunk, sections) {
|
|
196506
198136
|
const parts = [];
|
|
196507
198137
|
if (chunk.breadcrumb.length > 0) {
|
|
@@ -196519,7 +198149,11 @@ function injectParagraphTags(chunk, sections) {
|
|
|
196519
198149
|
parts.push(`${"#".repeat(section.level)} ${section.heading}`);
|
|
196520
198150
|
}
|
|
196521
198151
|
}
|
|
196522
|
-
|
|
198152
|
+
if (p4.text.length >= CODE_BLOCK_MIN_LENGTH && isPureCodeBlock(p4.text)) {
|
|
198153
|
+
parts.push(`[P${i}] ${skeletonizeCodeBlock(p4.text)}`);
|
|
198154
|
+
} else {
|
|
198155
|
+
parts.push(`[P${i}] ${p4.text}`);
|
|
198156
|
+
}
|
|
196523
198157
|
}
|
|
196524
198158
|
return parts.join(`
|
|
196525
198159
|
|
|
@@ -196584,12 +198218,14 @@ Continue the JSON output from the exact point of truncation. Output ONLY the rem
|
|
|
196584
198218
|
});
|
|
196585
198219
|
const combined = trimmed + result.text.trim();
|
|
196586
198220
|
JSON.parse(jsonrepair(combined));
|
|
198221
|
+
console.log(`[docIndexer] continuation: merged T1 (${trimmed.length} chars) + continuation (${result.text.trim().length} chars) = ${combined.length} chars`);
|
|
196587
198222
|
return {
|
|
196588
198223
|
text: combined,
|
|
196589
198224
|
extraCalls: 1,
|
|
196590
198225
|
extraTokens: result.usage.totalTokens
|
|
196591
198226
|
};
|
|
196592
|
-
} catch {
|
|
198227
|
+
} catch (contErr) {
|
|
198228
|
+
console.warn(`[docIndexer] continuation: merge failed, returning original (${trimmed.length} chars). ` + `Error: ${contErr instanceof Error ? contErr.message : String(contErr)}`);
|
|
196593
198229
|
return { text: text2, extraCalls: 1, extraTokens: 0 };
|
|
196594
198230
|
}
|
|
196595
198231
|
}
|
|
@@ -196630,9 +198266,16 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
|
|
|
196630
198266
|
llmCalls += continued.extraCalls;
|
|
196631
198267
|
totalTokens += continued.extraTokens;
|
|
196632
198268
|
onStep?.("T1 done", llmCalls, totalTokens);
|
|
196633
|
-
|
|
198269
|
+
let parseResult = parseExtractionOutput(continued.text, docChunkResultSchema);
|
|
196634
198270
|
if (!parseResult.success) {
|
|
196635
|
-
|
|
198271
|
+
const preview = continued.text.slice(0, 500).replace(/\n/g, "\\n");
|
|
198272
|
+
console.warn(`[docIndexer] chunk ${chunkIndex} T1 strict parse failed, attempting lenient. ` + `Error: ${parseResult.error.message.slice(0, 200)}. ` + `LLM output preview: ${preview}`);
|
|
198273
|
+
const lenient = tryLenientParse(continued.text, chunkIndex);
|
|
198274
|
+
if (lenient) {
|
|
198275
|
+
parseResult = { success: true, data: lenient };
|
|
198276
|
+
} else {
|
|
198277
|
+
throw new Error(`Chunk ${chunkIndex} T1 parse failed: ${parseResult.error.message}`);
|
|
198278
|
+
}
|
|
196636
198279
|
}
|
|
196637
198280
|
try {
|
|
196638
198281
|
const rawJson = JSON.parse(jsonrepair(continued.text));
|
|
@@ -196689,8 +198332,19 @@ async function processChunk(chunk, chunkIndex, llmService, sections, onStep) {
|
|
|
196689
198332
|
chunkText,
|
|
196690
198333
|
previousResult: parseResult.data
|
|
196691
198334
|
});
|
|
196692
|
-
const
|
|
196693
|
-
|
|
198335
|
+
const tableResult = await extractTableAtoms(chunk, sections, cumulativeResult, llmService);
|
|
198336
|
+
if (tableResult.extracted > 0) {
|
|
198337
|
+
onStep?.(`table extraction (${tableResult.extracted} tables)`, llmCalls + tableResult.llmCalls, totalTokens + tableResult.totalTokens);
|
|
198338
|
+
}
|
|
198339
|
+
llmCalls += tableResult.llmCalls;
|
|
198340
|
+
totalTokens += tableResult.totalTokens;
|
|
198341
|
+
const diagramResult = await extractDiagramAtoms(chunk, sections, cumulativeResult, llmService);
|
|
198342
|
+
if (diagramResult.extracted > 0) {
|
|
198343
|
+
onStep?.(`diagram extraction (${diagramResult.extracted} diagrams)`, llmCalls + diagramResult.llmCalls, totalTokens + diagramResult.totalTokens);
|
|
198344
|
+
}
|
|
198345
|
+
llmCalls += diagramResult.llmCalls;
|
|
198346
|
+
totalTokens += diagramResult.totalTokens;
|
|
198347
|
+
return { result: cumulativeResult, llmCalls, totalTokens };
|
|
196694
198348
|
}
|
|
196695
198349
|
function mapChunkResultToSections(chunk, chunkResult, sections) {
|
|
196696
198350
|
for (const p4 of chunkResult.paragraphs) {
|
|
@@ -196735,6 +198389,123 @@ function mapChunkResultToSections(chunk, chunkResult, sections) {
|
|
|
196735
198389
|
}
|
|
196736
198390
|
}
|
|
196737
198391
|
}
|
|
198392
|
+
var ATOM_TYPE_KEYS = new Set([
|
|
198393
|
+
"entities",
|
|
198394
|
+
"relations",
|
|
198395
|
+
"behaviors",
|
|
198396
|
+
"attributes",
|
|
198397
|
+
"states",
|
|
198398
|
+
"rules",
|
|
198399
|
+
"transitions",
|
|
198400
|
+
"events",
|
|
198401
|
+
"decisions",
|
|
198402
|
+
"metrics",
|
|
198403
|
+
"roles",
|
|
198404
|
+
"constraints",
|
|
198405
|
+
"comparisons",
|
|
198406
|
+
"boundaries"
|
|
198407
|
+
]);
|
|
198408
|
+
function looksLikeAtoms(obj) {
|
|
198409
|
+
return Object.keys(obj).some((k) => ATOM_TYPE_KEYS.has(k) && Array.isArray(obj[k]));
|
|
198410
|
+
}
|
|
198411
|
+
var ATOM_REQUIRED_FIELDS = {
|
|
198412
|
+
entities: ["name"],
|
|
198413
|
+
relations: ["from", "to", "type"],
|
|
198414
|
+
behaviors: ["name"],
|
|
198415
|
+
attributes: ["name"],
|
|
198416
|
+
states: ["name"],
|
|
198417
|
+
rules: ["description"],
|
|
198418
|
+
transitions: ["from", "to"],
|
|
198419
|
+
events: ["name"],
|
|
198420
|
+
decisions: ["description"],
|
|
198421
|
+
metrics: ["name"],
|
|
198422
|
+
roles: ["name"],
|
|
198423
|
+
constraints: ["description"],
|
|
198424
|
+
comparisons: ["description"],
|
|
198425
|
+
boundaries: ["name"]
|
|
198426
|
+
};
|
|
198427
|
+
var PARAGRAPH_TAG_RE2 = /^P\d+$/;
|
|
198428
|
+
function tryLenientParse(rawText, chunkIndex) {
|
|
198429
|
+
try {
|
|
198430
|
+
let raw5 = JSON.parse(jsonrepair(rawText));
|
|
198431
|
+
if (Array.isArray(raw5)) {
|
|
198432
|
+
raw5 = { paragraphs: raw5 };
|
|
198433
|
+
}
|
|
198434
|
+
if (raw5 && typeof raw5 === "object" && !Array.isArray(raw5) && !raw5.paragraphs) {
|
|
198435
|
+
const keys = Object.keys(raw5);
|
|
198436
|
+
if (keys.length > 0 && keys.every((k) => PARAGRAPH_TAG_RE2.test(k))) {
|
|
198437
|
+
raw5 = {
|
|
198438
|
+
paragraphs: keys.sort((a, b) => parseInt(a.slice(1)) - parseInt(b.slice(1))).map((tag2) => ({ tag: tag2, atoms: raw5[tag2] }))
|
|
198439
|
+
};
|
|
198440
|
+
}
|
|
198441
|
+
}
|
|
198442
|
+
if (!Array.isArray(raw5?.paragraphs) && raw5 && typeof raw5 === "object" && looksLikeAtoms(raw5)) {
|
|
198443
|
+
raw5 = { paragraphs: [{ tag: "P0", atoms: raw5 }] };
|
|
198444
|
+
}
|
|
198445
|
+
if (!raw5 || !Array.isArray(raw5.paragraphs))
|
|
198446
|
+
return null;
|
|
198447
|
+
const salvaged = { paragraphs: [] };
|
|
198448
|
+
let droppedAtoms = 0;
|
|
198449
|
+
let fixedTags = 0;
|
|
198450
|
+
for (let idx = 0;idx < raw5.paragraphs.length; idx++) {
|
|
198451
|
+
const rawPara = raw5.paragraphs[idx];
|
|
198452
|
+
if (!rawPara || typeof rawPara !== "object")
|
|
198453
|
+
continue;
|
|
198454
|
+
let tag2 = rawPara.tag;
|
|
198455
|
+
if (!tag2 || typeof tag2 !== "string" || !/^P\d+$/.test(tag2)) {
|
|
198456
|
+
tag2 = `P${idx}`;
|
|
198457
|
+
fixedTags++;
|
|
198458
|
+
}
|
|
198459
|
+
let atomsObj;
|
|
198460
|
+
if (rawPara.atoms && typeof rawPara.atoms === "object") {
|
|
198461
|
+
atomsObj = rawPara.atoms;
|
|
198462
|
+
} else if (looksLikeAtoms(rawPara)) {
|
|
198463
|
+
atomsObj = rawPara;
|
|
198464
|
+
} else {
|
|
198465
|
+
continue;
|
|
198466
|
+
}
|
|
198467
|
+
const cleanAtoms = {};
|
|
198468
|
+
for (const [atomType, atoms2] of Object.entries(atomsObj)) {
|
|
198469
|
+
if (!ATOM_TYPE_KEYS.has(atomType) || !Array.isArray(atoms2))
|
|
198470
|
+
continue;
|
|
198471
|
+
const requiredFields = ATOM_REQUIRED_FIELDS[atomType] ?? [];
|
|
198472
|
+
const kept = [];
|
|
198473
|
+
for (const atom of atoms2) {
|
|
198474
|
+
if (!atom || typeof atom !== "object") {
|
|
198475
|
+
droppedAtoms++;
|
|
198476
|
+
continue;
|
|
198477
|
+
}
|
|
198478
|
+
const rec = atom;
|
|
198479
|
+
const hasRequired = requiredFields.every((f) => rec[f] != null && rec[f] !== "");
|
|
198480
|
+
if (hasRequired) {
|
|
198481
|
+
kept.push(atom);
|
|
198482
|
+
} else {
|
|
198483
|
+
droppedAtoms++;
|
|
198484
|
+
}
|
|
198485
|
+
}
|
|
198486
|
+
if (kept.length > 0)
|
|
198487
|
+
cleanAtoms[atomType] = kept;
|
|
198488
|
+
}
|
|
198489
|
+
salvaged.paragraphs.push({ tag: tag2, atoms: cleanAtoms });
|
|
198490
|
+
}
|
|
198491
|
+
if (salvaged.paragraphs.length === 0)
|
|
198492
|
+
return null;
|
|
198493
|
+
const result = docChunkResultSchema.safeParse(salvaged);
|
|
198494
|
+
if (!result.success)
|
|
198495
|
+
return null;
|
|
198496
|
+
const fixes = [];
|
|
198497
|
+
if (fixedTags > 0)
|
|
198498
|
+
fixes.push(`${fixedTags} tags auto-assigned`);
|
|
198499
|
+
if (droppedAtoms > 0)
|
|
198500
|
+
fixes.push(`${droppedAtoms} invalid atoms dropped`);
|
|
198501
|
+
if (fixes.length > 0) {
|
|
198502
|
+
console.warn(`[docIndexer] chunk ${chunkIndex}: lenient parse salvaged — ${fixes.join(", ")}`);
|
|
198503
|
+
}
|
|
198504
|
+
return result.data;
|
|
198505
|
+
} catch {
|
|
198506
|
+
return null;
|
|
198507
|
+
}
|
|
198508
|
+
}
|
|
196738
198509
|
function ensureAtomConfidence(atoms2) {
|
|
196739
198510
|
const DEFAULT_DOC_CONFIDENCE = 0.7;
|
|
196740
198511
|
for (const atomList of Object.values(atoms2)) {
|
|
@@ -196758,90 +198529,62 @@ function countAtoms(sections) {
|
|
|
196758
198529
|
}
|
|
196759
198530
|
return counts;
|
|
196760
198531
|
}
|
|
196761
|
-
|
|
196762
|
-
const
|
|
196763
|
-
|
|
196764
|
-
|
|
196765
|
-
|
|
196766
|
-
|
|
196767
|
-
|
|
196768
|
-
|
|
196769
|
-
|
|
196770
|
-
|
|
196771
|
-
}
|
|
196772
|
-
}
|
|
196773
|
-
if (paragraphs.length === 0)
|
|
196774
|
-
return 0;
|
|
196775
|
-
const embeddings = [];
|
|
196776
|
-
const totalParagraphs = paragraphs.length;
|
|
196777
|
-
onProgress?.({ phase: "embedding", progress: 85, message: `Loading model (${totalParagraphs} paragraphs)` });
|
|
196778
|
-
const warmupStart = Date.now();
|
|
196779
|
-
await embeddingService.getDimension();
|
|
196780
|
-
const warmupMs = Date.now() - warmupStart;
|
|
196781
|
-
if (warmupMs > 500) {
|
|
196782
|
-
onProgress?.({ phase: "embedding", progress: 86, message: `Model ready (${(warmupMs / 1000).toFixed(1)}s)` });
|
|
198532
|
+
function formatExtractionStats(stats) {
|
|
198533
|
+
const typeSummary = Object.entries(stats.atomTypeCounts).sort(([, a], [, b]) => b - a).map(([t4, c]) => `${t4}:${c}`).join(" ");
|
|
198534
|
+
return `${stats.uniqueEntityNames.length} entities, ${stats.relationCount} relations, ` + `${stats.paragraphsWithAtoms}/${stats.paragraphsTotal} paragraphs with atoms | ${typeSummary}`;
|
|
198535
|
+
}
|
|
198536
|
+
async function runEntityResolution(sections, entityNames, llmService, onProgress) {
|
|
198537
|
+
const candidates = detectResolutionCandidates(entityNames);
|
|
198538
|
+
const noiseCandidates = detectNoiseCandidates(entityNames);
|
|
198539
|
+
if (candidates.length === 0 && noiseCandidates.length === 0) {
|
|
198540
|
+
console.log("[docIndexer] entity resolution: no duplicates or noise candidates, skipping");
|
|
198541
|
+
onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.no_resolution" } });
|
|
198542
|
+
return { llmCalls: 0, totalTokens: 0 };
|
|
196783
198543
|
}
|
|
196784
|
-
|
|
196785
|
-
|
|
196786
|
-
|
|
196787
|
-
|
|
196788
|
-
|
|
196789
|
-
|
|
198544
|
+
console.log(`[docIndexer] entity resolution: ${candidates.length} duplicate pairs, ${noiseCandidates.length} noise candidates`);
|
|
198545
|
+
onProgress?.({
|
|
198546
|
+
phase: "post-processing",
|
|
198547
|
+
progress: 83,
|
|
198548
|
+
message: { key: "index.doc.msg.resolving", params: { duplicates: candidates.length, noise: noiseCandidates.length } }
|
|
198549
|
+
});
|
|
198550
|
+
try {
|
|
198551
|
+
const prompt = buildEntityResolutionPrompt({
|
|
198552
|
+
allNames: entityNames,
|
|
198553
|
+
candidates,
|
|
198554
|
+
...noiseCandidates.length > 0 ? { noiseCandidates } : {}
|
|
198555
|
+
});
|
|
198556
|
+
const result = await llmService.generateText(prompt, {
|
|
198557
|
+
systemPrompt: ENTITY_RESOLUTION_SYSTEM_PROMPT
|
|
198558
|
+
});
|
|
198559
|
+
let resolution;
|
|
196790
198560
|
try {
|
|
196791
|
-
|
|
196792
|
-
for (let j = 0;j < batch2.length; j++) {
|
|
196793
|
-
embeddings.push({
|
|
196794
|
-
sectionIndex: batch2[j].sectionIndex,
|
|
196795
|
-
paragraphIndex: batch2[j].paragraphIndex,
|
|
196796
|
-
vector: vectors[j]
|
|
196797
|
-
});
|
|
196798
|
-
}
|
|
198561
|
+
resolution = JSON.parse(jsonrepair(result.text));
|
|
196799
198562
|
} catch {
|
|
196800
|
-
|
|
196801
|
-
|
|
196802
|
-
|
|
196803
|
-
const vector = await embeddingService.embed(p4.text);
|
|
196804
|
-
embeddings.push({
|
|
196805
|
-
sectionIndex: p4.sectionIndex,
|
|
196806
|
-
paragraphIndex: p4.paragraphIndex,
|
|
196807
|
-
vector
|
|
196808
|
-
});
|
|
196809
|
-
} catch {
|
|
196810
|
-
console.warn(`[docIndexer] embedding failed for section ${p4.sectionIndex} paragraph ${p4.paragraphIndex}`);
|
|
196811
|
-
}
|
|
196812
|
-
const embedded2 = i + fi + 1;
|
|
196813
|
-
const progress2 = 86 + Math.round(embedded2 / totalParagraphs * 9);
|
|
196814
|
-
onProgress?.({ phase: "embedding", progress: progress2, message: `Fallback ${embedded2}/${totalParagraphs}` });
|
|
196815
|
-
}
|
|
196816
|
-
continue;
|
|
198563
|
+
console.warn("[docIndexer] entity resolution: failed to parse LLM response, skipping");
|
|
198564
|
+
onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_parse_failed" } });
|
|
198565
|
+
return { llmCalls: 1, totalTokens: result.usage.totalTokens };
|
|
196817
198566
|
}
|
|
196818
|
-
const
|
|
196819
|
-
const
|
|
196820
|
-
|
|
196821
|
-
|
|
196822
|
-
|
|
196823
|
-
|
|
196824
|
-
|
|
196825
|
-
|
|
196826
|
-
|
|
196827
|
-
|
|
196828
|
-
|
|
196829
|
-
|
|
196830
|
-
|
|
196831
|
-
|
|
196832
|
-
id: `${hashId}:${e.sectionIndex}:${e.paragraphIndex}`,
|
|
196833
|
-
embedding: e.vector,
|
|
196834
|
-
metadata: {
|
|
196835
|
-
layer: "digest",
|
|
196836
|
-
sourceId,
|
|
196837
|
-
hashId,
|
|
196838
|
-
sourcePath,
|
|
196839
|
-
sectionIndex: e.sectionIndex,
|
|
196840
|
-
paragraphIndex: e.paragraphIndex
|
|
198567
|
+
const mergeCount = applyEntityMerges(sections, resolution.merges ?? []);
|
|
198568
|
+
const removeCount = removeNoiseEntities(sections, resolution.remove ?? []);
|
|
198569
|
+
onProgress?.({
|
|
198570
|
+
phase: "post-processing",
|
|
198571
|
+
progress: 84,
|
|
198572
|
+
message: {
|
|
198573
|
+
key: "index.doc.msg.resolution_result",
|
|
198574
|
+
params: {
|
|
198575
|
+
merges: resolution.merges?.length ?? 0,
|
|
198576
|
+
mergeRefs: mergeCount,
|
|
198577
|
+
removed: resolution.remove?.length ?? 0,
|
|
198578
|
+
removeRefs: removeCount,
|
|
198579
|
+
ambiguous: resolution.ambiguous?.length ?? 0
|
|
198580
|
+
}
|
|
196841
198581
|
}
|
|
196842
|
-
})
|
|
198582
|
+
});
|
|
198583
|
+
return { llmCalls: 1, totalTokens: result.usage.totalTokens };
|
|
196843
198584
|
} catch (err2) {
|
|
196844
|
-
console.warn(
|
|
198585
|
+
console.warn("[docIndexer] entity resolution LLM call failed (non-blocking):", err2);
|
|
198586
|
+
onProgress?.({ phase: "post-processing", progress: 84, message: { key: "index.doc.msg.resolution_failed" } });
|
|
198587
|
+
return { llmCalls: 0, totalTokens: 0 };
|
|
196845
198588
|
}
|
|
196846
198589
|
}
|
|
196847
198590
|
async function indexDocument(input) {
|
|
@@ -196854,16 +198597,22 @@ async function indexDocument(input) {
|
|
|
196854
198597
|
digestStore: digestStore2,
|
|
196855
198598
|
onProgress
|
|
196856
198599
|
} = input;
|
|
198600
|
+
const { content: normalizedContent, stats: normalizeStats } = normalizeMarkdown(content);
|
|
198601
|
+
const repairCount = Object.values(normalizeStats.repairs).reduce((a, b) => a + b, 0);
|
|
198602
|
+
if (repairCount > 0) {
|
|
198603
|
+
const repairSummary = Object.entries(normalizeStats.repairs).map(([k, v]) => `${k}:${v}`).join(" ");
|
|
198604
|
+
console.log(`[docIndexer] markdown normalized: ${repairCount} repairs (${repairSummary})`);
|
|
198605
|
+
}
|
|
196857
198606
|
onProgress?.({ phase: "chunking", progress: 3 });
|
|
196858
|
-
const chunks = chunkMarkdown(
|
|
196859
|
-
const parsedSections = parseSections(
|
|
198607
|
+
const chunks = chunkMarkdown(normalizedContent);
|
|
198608
|
+
const parsedSections = parseSections(normalizedContent);
|
|
196860
198609
|
if (chunks.length === 0) {
|
|
196861
198610
|
throw new Error("Document produced no chunks — content may be empty");
|
|
196862
198611
|
}
|
|
196863
|
-
onProgress?.({ phase: "chunking", progress: 8, message:
|
|
198612
|
+
onProgress?.({ phase: "chunking", progress: 8, message: { key: "index.doc.msg.chunking_result", params: { chunks: chunks.length, sections: parsedSections.length } } });
|
|
196864
198613
|
const totalChunks = chunks.length;
|
|
196865
|
-
const
|
|
196866
|
-
onProgress?.({ phase: "annotating", progress: 10, message:
|
|
198614
|
+
const annotateStartMsg = input.llmModel ? { key: "index.doc.msg.annotating_start_model", params: { n: totalChunks, model: input.llmModel } } : { key: "index.doc.msg.annotating_start", params: { n: totalChunks } };
|
|
198615
|
+
onProgress?.({ phase: "annotating", progress: 10, message: annotateStartMsg });
|
|
196867
198616
|
let completedChunks = 0;
|
|
196868
198617
|
let totalLlmCalls = 0;
|
|
196869
198618
|
let totalTokens = 0;
|
|
@@ -196873,7 +198622,7 @@ async function indexDocument(input) {
|
|
|
196873
198622
|
onProgress?.({
|
|
196874
198623
|
phase: "annotating",
|
|
196875
198624
|
progress: baseProgress,
|
|
196876
|
-
message:
|
|
198625
|
+
message: { key: "index.doc.msg.annotating_chunk", params: { current: completedChunks + 1, total: totalChunks, step, calls, tokens } }
|
|
196877
198626
|
});
|
|
196878
198627
|
});
|
|
196879
198628
|
completedChunks++;
|
|
@@ -196883,7 +198632,7 @@ async function indexDocument(input) {
|
|
|
196883
198632
|
onProgress?.({
|
|
196884
198633
|
phase: "annotating",
|
|
196885
198634
|
progress,
|
|
196886
|
-
message:
|
|
198635
|
+
message: { key: "index.doc.msg.annotating_chunk_done", params: { current: completedChunks, total: totalChunks, calls: totalLlmCalls, tokens: totalTokens } }
|
|
196887
198636
|
});
|
|
196888
198637
|
return result;
|
|
196889
198638
|
});
|
|
@@ -196894,17 +198643,25 @@ async function indexDocument(input) {
|
|
|
196894
198643
|
const sectionsMap = new Map;
|
|
196895
198644
|
for (let i = 0;i < parsedSections.length; i++) {
|
|
196896
198645
|
const s = parsedSections[i];
|
|
196897
|
-
|
|
196898
|
-
sectionsMap.set(sectionKey, {
|
|
198646
|
+
sectionsMap.set(`${i}`, {
|
|
196899
198647
|
heading: s.heading,
|
|
196900
198648
|
level: s.level,
|
|
196901
198649
|
paragraphs: new Map
|
|
196902
198650
|
});
|
|
196903
|
-
|
|
196904
|
-
|
|
196905
|
-
|
|
196906
|
-
|
|
196907
|
-
|
|
198651
|
+
}
|
|
198652
|
+
for (const chunk of chunks) {
|
|
198653
|
+
for (const cp of chunk.paragraphs) {
|
|
198654
|
+
const sectionKey = `${cp.sectionIndex}`;
|
|
198655
|
+
if (!sectionsMap.has(sectionKey)) {
|
|
198656
|
+
sectionsMap.set(sectionKey, { heading: "", level: 0, paragraphs: new Map });
|
|
198657
|
+
}
|
|
198658
|
+
const paragraphKey = `${cp.sectionIndex}:${cp.paragraphIndex}`;
|
|
198659
|
+
if (!sectionsMap.get(sectionKey).paragraphs.has(paragraphKey)) {
|
|
198660
|
+
sectionsMap.get(sectionKey).paragraphs.set(paragraphKey, {
|
|
198661
|
+
text: cp.text,
|
|
198662
|
+
atoms: {}
|
|
198663
|
+
});
|
|
198664
|
+
}
|
|
196908
198665
|
}
|
|
196909
198666
|
}
|
|
196910
198667
|
for (const success2 of chunkProcessResult.successes) {
|
|
@@ -196934,6 +198691,25 @@ async function indexDocument(input) {
|
|
|
196934
198691
|
ensureAtomConfidence(para.atoms);
|
|
196935
198692
|
}
|
|
196936
198693
|
}
|
|
198694
|
+
onProgress?.({ phase: "post-processing", progress: 81, message: { key: "index.doc.msg.post_process_start" } });
|
|
198695
|
+
postProcessDigestAtoms(digestSections);
|
|
198696
|
+
const preStats = collectExtractionStats(digestSections);
|
|
198697
|
+
const statsMsg = formatExtractionStats(preStats);
|
|
198698
|
+
console.log(`[docIndexer] extraction stats: ${statsMsg}`);
|
|
198699
|
+
onProgress?.({ phase: "post-processing", progress: 82, message: {
|
|
198700
|
+
key: "index.doc.msg.extraction_stats",
|
|
198701
|
+
params: {
|
|
198702
|
+
entities: preStats.uniqueEntityNames.length,
|
|
198703
|
+
relations: preStats.relationCount,
|
|
198704
|
+
withAtoms: preStats.paragraphsWithAtoms,
|
|
198705
|
+
totalParas: preStats.paragraphsTotal
|
|
198706
|
+
}
|
|
198707
|
+
} });
|
|
198708
|
+
if ((input.enableEntityResolution ?? true) && preStats.uniqueEntityNames.length > 1) {
|
|
198709
|
+
const resolutionResult = await runEntityResolution(digestSections, preStats.uniqueEntityNames, llmService, onProgress);
|
|
198710
|
+
totalLlmCalls += resolutionResult.llmCalls;
|
|
198711
|
+
totalTokens += resolutionResult.totalTokens;
|
|
198712
|
+
}
|
|
196937
198713
|
const atomCounts = countAtoms(sectionsMap);
|
|
196938
198714
|
const paragraphCount = digestSections.reduce((sum, s) => sum + s.paragraphs.length, 0);
|
|
196939
198715
|
if (paragraphCount === 0) {
|
|
@@ -196955,7 +198731,7 @@ async function indexDocument(input) {
|
|
|
196955
198731
|
processedAt: new Date().toISOString()
|
|
196956
198732
|
}
|
|
196957
198733
|
};
|
|
196958
|
-
const embedMsg = input.embeddingModel ?
|
|
198734
|
+
const embedMsg = input.embeddingModel ? { key: "index.doc.msg.embedding_model", params: { model: input.embeddingModel } } : undefined;
|
|
196959
198735
|
onProgress?.({ phase: "embedding", progress: 85, ...embedMsg ? { message: embedMsg } : {} });
|
|
196960
198736
|
let embeddingCount = 0;
|
|
196961
198737
|
if (input.embeddingService) {
|
|
@@ -197250,45 +199026,55 @@ async function runDocIndexPipeline(opts) {
|
|
|
197250
199026
|
const llmModelId = serverConfig2.llm[llmProvider]?.default_model ?? llmProvider;
|
|
197251
199027
|
const embProvider = serverConfig2.embedding?.provider;
|
|
197252
199028
|
const embModelId = embProvider ? serverConfig2.embedding[embProvider]?.model_id ?? embProvider : undefined;
|
|
199029
|
+
const fileTimeoutMs = serverConfig2.indexing?.file_timeout_ms ?? 15 * 60 * 1000;
|
|
199030
|
+
const abortSignal = indexTaskManager.getAbortSignal?.(sourceId) ?? null;
|
|
197253
199031
|
for (let fileIdx = 0;fileIdx < filesToIndex.length; fileIdx++) {
|
|
199032
|
+
if (abortSignal?.aborted) {
|
|
199033
|
+
const reason = typeof abortSignal.reason === "string" ? abortSignal.reason : "Task aborted";
|
|
199034
|
+
console.warn(`[runDocIndexPipeline] aborted before file ${fileIdx + 1}/${filesToIndex.length}: ${reason}`);
|
|
199035
|
+
break;
|
|
199036
|
+
}
|
|
197254
199037
|
const file2 = filesToIndex[fileIdx];
|
|
197255
|
-
const fileLabel = `[${fileIdx + 1}/${filesToIndex.length}] ${file2.sourcePath}`;
|
|
197256
199038
|
if (indexTaskManager.hasTask(sourceId)) {
|
|
197257
199039
|
indexTaskManager.updateProgress(sourceId, {
|
|
197258
|
-
stage: "
|
|
199040
|
+
stage: "chunking",
|
|
197259
199041
|
percent: 0,
|
|
197260
|
-
message:
|
|
199042
|
+
message: { key: "index.doc.msg.file_start", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
|
|
197261
199043
|
});
|
|
197262
199044
|
}
|
|
197263
199045
|
try {
|
|
197264
|
-
|
|
197265
|
-
|
|
197266
|
-
|
|
197267
|
-
|
|
197268
|
-
|
|
197269
|
-
|
|
197270
|
-
|
|
197271
|
-
|
|
197272
|
-
|
|
197273
|
-
|
|
197274
|
-
|
|
197275
|
-
|
|
197276
|
-
|
|
197277
|
-
|
|
197278
|
-
|
|
197279
|
-
|
|
197280
|
-
|
|
197281
|
-
|
|
197282
|
-
|
|
199046
|
+
const fileTimeout = new Promise((_, reject) => setTimeout(() => reject(new Error(`File timeout after ${Math.round(fileTimeoutMs / 60000)}min: ${file2.sourcePath}`)), fileTimeoutMs));
|
|
199047
|
+
await Promise.race([
|
|
199048
|
+
indexDocument({
|
|
199049
|
+
sourceId,
|
|
199050
|
+
hashId: file2.hashId,
|
|
199051
|
+
sourcePath: file2.sourcePath,
|
|
199052
|
+
content: file2.content,
|
|
199053
|
+
contentType: "markdown",
|
|
199054
|
+
llmService,
|
|
199055
|
+
embeddingService,
|
|
199056
|
+
vectorStore,
|
|
199057
|
+
digestStore: digestStore2,
|
|
199058
|
+
llmModel: `${llmProvider}/${llmModelId}`,
|
|
199059
|
+
...embModelId ? { embeddingModel: `${embProvider}/${embModelId}` } : {},
|
|
199060
|
+
onProgress: (p4) => {
|
|
199061
|
+
if (indexTaskManager.hasTask(sourceId)) {
|
|
199062
|
+
indexTaskManager.updateProgress(sourceId, {
|
|
199063
|
+
stage: p4.phase,
|
|
199064
|
+
percent: p4.progress,
|
|
199065
|
+
...p4.message != null ? { message: p4.message } : {}
|
|
199066
|
+
});
|
|
199067
|
+
}
|
|
197283
199068
|
}
|
|
197284
|
-
}
|
|
197285
|
-
|
|
199069
|
+
}),
|
|
199070
|
+
fileTimeout
|
|
199071
|
+
]);
|
|
197286
199072
|
stored.push({ hash_id: file2.hashId, status: "created" });
|
|
197287
199073
|
if (indexTaskManager.hasTask(sourceId)) {
|
|
197288
199074
|
indexTaskManager.updateProgress(sourceId, {
|
|
197289
199075
|
stage: "storing",
|
|
197290
199076
|
percent: 100,
|
|
197291
|
-
message:
|
|
199077
|
+
message: { key: "index.doc.msg.file_done", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath } }
|
|
197292
199078
|
});
|
|
197293
199079
|
}
|
|
197294
199080
|
} catch (err2) {
|
|
@@ -197299,11 +199085,15 @@ async function runDocIndexPipeline(opts) {
|
|
|
197299
199085
|
indexTaskManager.updateProgress(sourceId, {
|
|
197300
199086
|
stage: "annotating",
|
|
197301
199087
|
percent: 0,
|
|
197302
|
-
message:
|
|
199088
|
+
message: { key: "index.doc.msg.file_error", params: { idx: fileIdx + 1, total: filesToIndex.length, file: file2.sourcePath, error: msg } }
|
|
197303
199089
|
});
|
|
197304
199090
|
}
|
|
197305
199091
|
}
|
|
197306
199092
|
}
|
|
199093
|
+
if (abortSignal?.aborted) {
|
|
199094
|
+
console.warn(`[runDocIndexPipeline] pipeline aborted for ${sourceId}, skipping completion`);
|
|
199095
|
+
return;
|
|
199096
|
+
}
|
|
197307
199097
|
if (stored.length === 0 && errors5.length > 0) {
|
|
197308
199098
|
const errorCode = errors5[0].code ?? "DOC_INDEX_LLM_EXHAUSTED" /* DOC_INDEX_LLM_EXHAUSTED */;
|
|
197309
199099
|
indexTaskManager.failTask(sourceId, errors5[0].error, errorCode);
|
|
@@ -197417,7 +199207,7 @@ async function handleDocIndex(c, storageProvider, source2) {
|
|
|
197417
199207
|
throw new C4AError("DOC_INDEX_EMBEDDING_UNAVAILABLE" /* DOC_INDEX_EMBEDDING_UNAVAILABLE */, "Embedding service not configured", null);
|
|
197418
199208
|
}
|
|
197419
199209
|
const modulePaths = modules?.map((m) => m.path);
|
|
197420
|
-
indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths);
|
|
199210
|
+
indexTaskManager.createTask(source2.id, "server", source2.id, modulePaths, serverConfig2.indexing?.task_timeout_ms);
|
|
197421
199211
|
const hashToPath = new Map;
|
|
197422
199212
|
for (const sf of latestByPath.values()) {
|
|
197423
199213
|
hashToPath.set(sf.hash_id, sf.source_path ?? "");
|
|
@@ -199261,6 +201051,10 @@ function mergeServerConfig2(parsed) {
|
|
|
199261
201051
|
...isPlainObject5(input.llm?.google) ? input.llm?.google : {}
|
|
199262
201052
|
}
|
|
199263
201053
|
},
|
|
201054
|
+
indexing: {
|
|
201055
|
+
...defaults2.indexing,
|
|
201056
|
+
...isPlainObject5(input.indexing) ? input.indexing : {}
|
|
201057
|
+
},
|
|
199264
201058
|
embedding: {
|
|
199265
201059
|
...defaults2.embedding,
|
|
199266
201060
|
...isPlainObject5(input.embedding) ? input.embedding : {},
|
|
@@ -199935,7 +201729,8 @@ import path9 from "node:path";
|
|
|
199935
201729
|
import { fileURLToPath } from "node:url";
|
|
199936
201730
|
|
|
199937
201731
|
// ../server/src/indexTaskManager.ts
|
|
199938
|
-
var DEFAULT_INDEX_TASK_TIMEOUT_MS =
|
|
201732
|
+
var DEFAULT_INDEX_TASK_TIMEOUT_MS = 150 * 60 * 1000;
|
|
201733
|
+
var DEFAULT_FILE_TIMEOUT_MS = 15 * 60 * 1000;
|
|
199939
201734
|
|
|
199940
201735
|
class IndexTaskManager {
|
|
199941
201736
|
broadcaster;
|
|
@@ -199955,12 +201750,18 @@ class IndexTaskManager {
|
|
|
199955
201750
|
getTask(sourceId) {
|
|
199956
201751
|
return this.indexTasks.get(sourceId) ?? null;
|
|
199957
201752
|
}
|
|
199958
|
-
|
|
201753
|
+
getAbortSignal(sourceId) {
|
|
201754
|
+
return this.indexTasks.get(sourceId)?.abortController.signal ?? null;
|
|
201755
|
+
}
|
|
201756
|
+
createTask(sourceId, machineId, targetCommit, modules, timeoutMs) {
|
|
199959
201757
|
const existing = this.indexTasks.get(sourceId);
|
|
199960
201758
|
if (existing) {
|
|
199961
201759
|
clearTimeout(existing.timer);
|
|
201760
|
+
existing.abortController.abort("Task replaced by new task");
|
|
199962
201761
|
this.indexTasks.delete(sourceId);
|
|
199963
201762
|
}
|
|
201763
|
+
const abortController = new AbortController;
|
|
201764
|
+
const effectiveTimeout = timeoutMs ?? this.timeoutMs;
|
|
199964
201765
|
const task = {
|
|
199965
201766
|
sourceId,
|
|
199966
201767
|
machineId,
|
|
@@ -199968,8 +201769,10 @@ class IndexTaskManager {
|
|
|
199968
201769
|
startedAt: new Date,
|
|
199969
201770
|
timer: setTimeout(() => {
|
|
199970
201771
|
this.timeoutTask(sourceId);
|
|
199971
|
-
},
|
|
201772
|
+
}, effectiveTimeout),
|
|
201773
|
+
timeoutMs: effectiveTimeout,
|
|
199972
201774
|
progress: null,
|
|
201775
|
+
abortController,
|
|
199973
201776
|
...modules && modules.length > 0 ? { modules } : {}
|
|
199974
201777
|
};
|
|
199975
201778
|
this.indexTasks.set(sourceId, task);
|
|
@@ -200007,7 +201810,7 @@ class IndexTaskManager {
|
|
|
200007
201810
|
clearTimeout(task.timer);
|
|
200008
201811
|
task.timer = setTimeout(() => {
|
|
200009
201812
|
this.timeoutTask(sourceId);
|
|
200010
|
-
},
|
|
201813
|
+
}, task.timeoutMs);
|
|
200011
201814
|
nextPhase();
|
|
200012
201815
|
return;
|
|
200013
201816
|
}
|
|
@@ -200026,6 +201829,7 @@ class IndexTaskManager {
|
|
|
200026
201829
|
return;
|
|
200027
201830
|
this.pendingPhases.delete(sourceId);
|
|
200028
201831
|
clearTimeout(task.timer);
|
|
201832
|
+
task.abortController.abort(error40);
|
|
200029
201833
|
this.indexTasks.delete(sourceId);
|
|
200030
201834
|
this.broadcaster.error({
|
|
200031
201835
|
source_id: sourceId,
|
|
@@ -200040,6 +201844,7 @@ class IndexTaskManager {
|
|
|
200040
201844
|
return;
|
|
200041
201845
|
this.pendingPhases.delete(sourceId);
|
|
200042
201846
|
clearTimeout(task.timer);
|
|
201847
|
+
task.abortController.abort("Task timed out");
|
|
200043
201848
|
this.indexTasks.delete(sourceId);
|
|
200044
201849
|
this.broadcaster.timeout({
|
|
200045
201850
|
source_id: sourceId,
|
|
@@ -200055,6 +201860,7 @@ class IndexTaskManager {
|
|
|
200055
201860
|
destroy() {
|
|
200056
201861
|
for (const task of this.indexTasks.values()) {
|
|
200057
201862
|
clearTimeout(task.timer);
|
|
201863
|
+
task.abortController.abort("Manager destroyed");
|
|
200058
201864
|
}
|
|
200059
201865
|
this.indexTasks.clear();
|
|
200060
201866
|
this.pendingPhases.clear();
|