@semiont/jobs 0.4.19 → 0.4.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -4
- package/dist/index.d.ts +42 -224
- package/dist/index.js +1518 -12463
- package/dist/index.js.map +1 -1
- package/dist/smelter-main.d.ts +2 -0
- package/dist/smelter-main.js +10063 -0
- package/dist/smelter-main.js.map +1 -0
- package/dist/worker-main.d.ts +2 -0
- package/dist/worker-main.js +1646 -0
- package/dist/worker-main.js.map +1 -0
- package/package.json +12 -2
package/dist/worker-main.js
@@ -0,0 +1,1646 @@
import { InMemorySessionStorage, setStoredSession, SemiontSession, createJobClaimAdapter, validateAndCorrectOffsets, getLocaleEnglishName } from '@semiont/api-client';
import { createTomlConfigLoader, RESOURCE_BROADCAST_TYPES, didToAgent } from '@semiont/core';
import { deriveStorageUri } from '@semiont/content';
import { generateAnnotationId } from '@semiont/event-sourcing';
import { createInferenceClient } from '@semiont/inference';
import { createServer } from 'http';
import { existsSync, readFileSync } from 'fs';
import { homedir, hostname } from 'os';
import { join } from 'path';
import winston from 'winston';

// src/worker-process.ts

// src/workers/detection/motivation-prompts.ts
var MotivationPrompts = class {
  /**
   * Build a prompt for detecting comment-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param tone - Optional tone guidance (e.g., "academic", "conversational")
   * @param density - Optional target number of comments per 2000 words
   * @returns Formatted prompt string
   */
  static buildCommentPrompt(content, instructions, tone, density) {
    let prompt;
    if (instructions) {
      const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
      const densityGuidance = density ? `

Aim for approximately ${density} comments per 2000 words of text.` : "";
      prompt = `Add comments to passages in this text following these instructions:

${instructions}${toneGuidance}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of comments. Each comment must have:
- "exact": the exact text passage being commented on (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "comment": your comment following the instructions above

Respond with a valid JSON array.

Example:
[
{"exact": "the quarterly review meeting", "start": 142, "end": 169, "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
]`;
    } else {
      const toneGuidance = tone ? `

Tone: Use a ${tone} style in your comments.` : "";
      const densityGuidance = density ? `
- Aim for approximately ${density} comments per 2000 words` : `
- Aim for 3-8 comments per 2000 words (not too sparse or dense)`;
      prompt = `Identify passages in this text that would benefit from explanatory comments.
For each passage, provide contextual information, clarification, or background.${toneGuidance}

Guidelines:
- Select passages that reference technical terms, historical figures, complex concepts, or unclear references
- Provide comments that ADD VALUE beyond restating the text
- Focus on explanation, background, or connections to other ideas
- Avoid obvious or trivial comments
- Keep comments concise (1-3 sentences typically)${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of comments. Each comment should have:
- "exact": the exact text passage being commented on (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)

Respond with a valid JSON array.

Example format:
[
{"exact": "Ouranos", "start": 52, "end": 59, "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting highlight-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param density - Optional target number of highlights per 2000 words
   * @returns Formatted prompt string
   */
  static buildHighlightPrompt(content, instructions, density) {
    let prompt;
    if (instructions) {
      const densityGuidance = density ? `

Aim for approximately ${density} highlights per 2000 words of text.` : "";
      prompt = `Identify passages in this text to highlight following these instructions:

${instructions}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of highlights. Each highlight must have:
- "exact": the exact text passage to highlight (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example:
[
{"exact": "revenue grew 45% year-over-year", "start": 142, "end": 174, "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
]`;
    } else {
      const densityGuidance = density ? `
- Aim for approximately ${density} highlights per 2000 words` : `
- Aim for 3-8 highlights per 2000 words (be selective)`;
      prompt = `Identify passages in this text that merit highlighting for their importance or salience.
Focus on content that readers should notice and remember.

Guidelines:
- Highlight key claims, findings, or conclusions
- Highlight important definitions, terminology, or concepts
- Highlight notable quotes or particularly striking statements
- Highlight critical decisions, action items, or turning points
- Select passages that are SIGNIFICANT, not just interesting
- Avoid trivial or obvious content${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of highlights. Each highlight should have:
- "exact": the exact text passage to highlight (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example format:
[
{"exact": "we will discontinue support for legacy systems by March 2025", "start": 52, "end": 113, "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting assessment-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param tone - Optional tone guidance (e.g., "critical", "supportive")
   * @param density - Optional target number of assessments per 2000 words
   * @returns Formatted prompt string
   */
  static buildAssessmentPrompt(content, instructions, tone, density) {
    let prompt;
    if (instructions) {
      const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
      const densityGuidance = density ? `

Aim for approximately ${density} assessments per 2000 words of text.` : "";
      prompt = `Assess passages in this text following these instructions:

${instructions}${toneGuidance}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of assessments. Each assessment must have:
- "exact": the exact text passage being assessed (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "assessment": your assessment following the instructions above

Respond with a valid JSON array.

Example:
[
{"exact": "the quarterly revenue target", "start": 142, "end": 169, "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
]`;
    } else {
      const toneGuidance = tone ? `

Tone: Use a ${tone} style in your assessments.` : "";
      const densityGuidance = density ? `
- Aim for approximately ${density} assessments per 2000 words` : `
- Aim for 2-6 assessments per 2000 words (focus on key passages)`;
      prompt = `Identify passages in this text that merit critical assessment or evaluation.
For each passage, provide analysis of its validity, strength, or implications.${toneGuidance}

Guidelines:
- Select passages containing claims, arguments, conclusions, or assertions
- Assess evidence quality, logical soundness, or practical implications
- Provide assessments that ADD INSIGHT beyond restating the text
- Focus on passages where evaluation would help readers form judgments
- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of assessments. Each assessment should have:
- "exact": the exact text passage being assessed (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)

Respond with a valid JSON array.

Example format:
[
{"exact": "AI will replace most jobs by 2030", "start": 52, "end": 89, "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting structural tags
   *
   * @param content - The full text content to analyze (NOT truncated for structural analysis)
   * @param category - The specific category to detect
   * @param schemaName - Human-readable schema name
   * @param schemaDescription - Schema description
   * @param schemaDomain - Schema domain
   * @param categoryDescription - Category description
   * @param categoryExamples - Example questions/guidance for this category
   * @returns Formatted prompt string
   */
  static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples) {
    const prompt = `You are analyzing a text using the ${schemaName} framework.

Schema: ${schemaDescription}
Domain: ${schemaDomain}

Your task: Identify passages that serve the structural role of "${category}".

Category: ${category}
Description: ${categoryDescription}
Key questions:
${categoryExamples.map((ex) => `- ${ex}`).join("\n")}

Guidelines:
- Focus on STRUCTURAL FUNCTION, not semantic content
- A passage serves the "${category}" role if it performs this function in the document's structure
- Look for passages that explicitly fulfill this role
- Passages can be sentences, paragraphs, or sections
- Aim for precision - only tag passages that clearly serve this structural role
- Typical documents have 1-5 instances of each category (some may have 0)

Text to analyze:
---
${content}
---

Return a JSON array of tags. Each tag should have:
- "exact": the exact text passage (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example format:
[
{"exact": "What duty did the defendant owe?", "start": 142, "end": 175, "prefix": "The central question is: ", "suffix": " This question must be"},
{"exact": "In tort law, a duty of care is established when...", "start": 412, "end": 520, "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
]`;
    return prompt;
  }
};
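
// Editorial note: all four builders return a plain prompt string and ask the
// model for the same span bookkeeping plus one motivation-specific payload
// field. As a reading aid, the requested item shape can be summarized in a
// single TypeScript interface. This interface is ours, not something the
// package exports; field meanings come from the prompt text above.
interface DetectedSpan {
  exact: string;        // passage quoted verbatim from the source
  start: number;        // character offset where the passage starts
  end: number;          // character offset where the passage ends
  prefix?: string;      // up to 32 chars immediately before the passage
  suffix?: string;      // up to 32 chars immediately after the passage
  comment?: string;     // only for buildCommentPrompt responses
  assessment?: string;  // only for buildAssessmentPrompt responses
}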
function extractObjectsFromArray(response) {
  let cleaned = response.trim();
  if (cleaned.startsWith("```")) {
    cleaned = cleaned.replace(/^```(?:json)?\s*\n?/, "").replace(/\n?```\s*$/, "");
  }
  try {
    const parsed = JSON.parse(cleaned);
    return Array.isArray(parsed) ? parsed : [];
  } catch {
  }
  const start = cleaned.indexOf("[");
  if (start === -1) return [];
  const endBracket = cleaned.lastIndexOf("]");
  const end = endBracket > start ? endBracket : cleaned.length;
  const inner = cleaned.slice(start + 1, end);
  const objects = [];
  let depth = 0;
  let objStart = -1;
  let inString = false;
  let escape = false;
  for (let i = 0; i < inner.length; i++) {
    const ch = inner[i];
    if (escape) {
      escape = false;
      continue;
    }
    if (ch === "\\") {
      escape = true;
      continue;
    }
    if (ch === '"') {
      inString = !inString;
      continue;
    }
    if (inString) continue;
    if (ch === "{") {
      if (depth === 0) objStart = i;
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0 && objStart !== -1) {
        try {
          objects.push(JSON.parse(inner.slice(objStart, i + 1)));
        } catch {
        }
        objStart = -1;
      }
    }
  }
  return objects;
}
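
// Editorial note: extractObjectsFromArray first tries a straight JSON.parse;
// the brace-matching fallback only matters when the model's reply is
// malformed or truncated mid-array, in which case it salvages every complete
// top-level object and drops the rest. A quick illustration with a made-up,
// truncated reply (the fenced reply is cut off inside the second object, so
// JSON.parse throws; the fallback still recovers the first object):
//
//   const reply = '```json\n[{"exact": "Alice", "start": 0, "end": 5}, {"exact": "Par';
//   extractObjectsFromArray(reply);
//   // => [{ exact: "Alice", start: 0, end: 5 }]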
var MotivationParsers = class {
  /**
   * Parse and validate AI response for comment detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated comment matches
   */
  static parseComments(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      const valid = parsed.filter(
        (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.start === "number" && typeof c.end === "number" && typeof c.comment === "string" && c.comment.trim().length > 0
      );
      console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
      const validatedComments = [];
      for (const comment of valid) {
        try {
          const validated = validateAndCorrectOffsets(content, comment.start, comment.end, comment.exact);
          validatedComments.push({
            ...comment,
            start: validated.start,
            end: validated.end,
            prefix: validated.prefix,
            suffix: validated.suffix
          });
        } catch (error) {
          console.warn(`[MotivationParsers] Skipping invalid comment "${comment.exact}":`, error);
        }
      }
      return validatedComments;
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI comment response:", error);
      return [];
    }
  }
  /**
   * Parse and validate AI response for highlight detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated highlight matches
   */
  static parseHighlights(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      const highlights = parsed.filter(
        (h) => !!h && typeof h === "object" && typeof h.exact === "string" && typeof h.start === "number" && typeof h.end === "number"
      );
      const validatedHighlights = [];
      for (const highlight of highlights) {
        try {
          const validated = validateAndCorrectOffsets(content, highlight.start, highlight.end, highlight.exact);
          validatedHighlights.push({
            ...highlight,
            start: validated.start,
            end: validated.end,
            prefix: validated.prefix,
            suffix: validated.suffix
          });
        } catch (error) {
          console.warn(`[MotivationParsers] Skipping invalid highlight "${highlight.exact}":`, error);
        }
      }
      return validatedHighlights;
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI highlight response:", error);
      console.error("Raw response:", response);
      return [];
    }
  }
  /**
   * Parse and validate AI response for assessment detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated assessment matches
   */
  static parseAssessments(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      const assessments = parsed.filter(
        (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.start === "number" && typeof a.end === "number" && typeof a.assessment === "string"
      );
      const validatedAssessments = [];
      for (const assessment of assessments) {
        try {
          const validated = validateAndCorrectOffsets(content, assessment.start, assessment.end, assessment.exact);
          validatedAssessments.push({
            ...assessment,
            start: validated.start,
            end: validated.end,
            prefix: validated.prefix,
            suffix: validated.suffix
          });
        } catch (error) {
          console.warn(`[MotivationParsers] Skipping invalid assessment "${assessment.exact}":`, error);
        }
      }
      return validatedAssessments;
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI assessment response:", error);
      console.error("Raw response:", response);
      return [];
    }
  }
  /**
   * Parse and validate AI response for tag detection
   * Note: Does NOT validate offsets - caller must do that with content
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @returns Array of tag matches (offsets not yet validated)
   */
  static parseTags(response) {
    try {
      const parsed = extractObjectsFromArray(response);
      const valid = parsed.filter(
        (t) => !!t && typeof t === "object" && typeof t.exact === "string" && typeof t.start === "number" && typeof t.end === "number" && t.exact.trim().length > 0
      );
      console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
      return valid;
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI tag response:", error);
      return [];
    }
  }
  /**
   * Validate tag offsets against content and add category
   * Helper for tag detection after initial parsing
   *
   * @param tags - Parsed tags without validated offsets
   * @param content - Original content to validate against
   * @param category - Category to assign to validated tags
   * @returns Array of validated tag matches
   */
  static validateTagOffsets(tags, content, category) {
    const validatedTags = [];
    for (const tag of tags) {
      try {
        const validated = validateAndCorrectOffsets(content, tag.start, tag.end, tag.exact);
        validatedTags.push({
          ...tag,
          category,
          start: validated.start,
          end: validated.end,
          prefix: validated.prefix,
          suffix: validated.suffix
        });
      } catch (error) {
        console.warn(`[MotivationParsers] Skipping invalid tag for category "${category}":`, error);
      }
    }
    return validatedTags;
  }
};

// ../ontology/dist/index.js
var TAG_SCHEMAS = {
  "legal-irac": {
    id: "legal-irac",
    name: "Legal Analysis (IRAC)",
    description: "Issue, Rule, Application, Conclusion framework for legal reasoning",
    domain: "legal",
    tags: [
      {
        name: "Issue",
        description: "The legal question or problem to be resolved",
        examples: [
          "What is the central legal question?",
          "What must the court decide?",
          "What is the dispute about?"
        ]
      },
      {
        name: "Rule",
        description: "The relevant law, statute, or legal principle",
        examples: [
          "What law applies?",
          "What is the legal standard?",
          "What statute governs this case?"
        ]
      },
      {
        name: "Application",
        description: "How the rule applies to the specific facts",
        examples: [
          "How does the law apply to these facts?",
          "Analysis of the case",
          "How do the facts satisfy the legal standard?"
        ]
      },
      {
        name: "Conclusion",
        description: "The resolution or outcome based on the analysis",
        examples: [
          "What is the court's decision?",
          "What is the final judgment?",
          "What is the holding?"
        ]
      }
    ]
  },
  "scientific-imrad": {
    id: "scientific-imrad",
    name: "Scientific Paper (IMRAD)",
    description: "Introduction, Methods, Results, Discussion structure for research papers",
    domain: "scientific",
    tags: [
      {
        name: "Introduction",
        description: "Background, context, and research question",
        examples: [
          "What is the research question?",
          "Why is this important?",
          "What is the hypothesis?"
        ]
      },
      {
        name: "Methods",
        description: "Experimental design and procedures",
        examples: [
          "How was the study conducted?",
          "What methods were used?",
          "What was the experimental design?"
        ]
      },
      {
        name: "Results",
        description: "Findings and observations",
        examples: [
          "What did the study find?",
          "What are the data?",
          "What were the observations?"
        ]
      },
      {
        name: "Discussion",
        description: "Interpretation and implications of results",
        examples: [
          "What do the results mean?",
          "What are the implications?",
          "How do these findings relate to prior work?"
        ]
      }
    ]
  },
  "argument-toulmin": {
    id: "argument-toulmin",
    name: "Argument Structure (Toulmin)",
    description: "Claim, Evidence, Warrant, Counterargument, Rebuttal framework for argumentation",
    domain: "general",
    tags: [
      {
        name: "Claim",
        description: "The main assertion or thesis",
        examples: [
          "What is being argued?",
          "What is the main point?",
          "What position is being taken?"
        ]
      },
      {
        name: "Evidence",
        description: "Data or facts supporting the claim",
        examples: [
          "What supports this claim?",
          "What are the facts?",
          "What data is provided?"
        ]
      },
      {
        name: "Warrant",
        description: "Reasoning connecting evidence to claim",
        examples: [
          "Why does this evidence support the claim?",
          "What is the logic?",
          "How does this reasoning work?"
        ]
      },
      {
        name: "Counterargument",
        description: "Opposing viewpoints or objections",
        examples: [
          "What are the objections?",
          "What do critics say?",
          "What are alternative views?"
        ]
      },
      {
        name: "Rebuttal",
        description: "Response to counterarguments",
        examples: [
          "How is the objection addressed?",
          "Why is the counterargument wrong?",
          "How is the criticism answered?"
        ]
      }
    ]
  }
};
function getTagSchema(schemaId) {
  return TAG_SCHEMAS[schemaId] || null;
}
function getSchemaCategory(schemaId, categoryName) {
  const schema = getTagSchema(schemaId);
  if (!schema) return null;
  return schema.tags.find((tag) => tag.name === categoryName) || null;
}
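
// Editorial note: getTagSchema and getSchemaCategory are plain lookups over
// TAG_SCHEMAS, so a caller can validate a schema/category pair cheaply before
// spending an inference call. For example:
//
//   getTagSchema("legal-irac");                  // the IRAC schema object
//   getSchemaCategory("legal-irac", "Rule");     // { name: "Rule", description: ..., examples: [...] }
//   getSchemaCategory("legal-irac", "Holding");  // null - not a category of this schema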

// src/workers/annotation-detection.ts
var AnnotationDetection = class {
  /**
   * Fetch content from a ContentFetcher and read the stream to a string.
   * Shared helper for all workers.
   */
  static async fetchContent(contentFetcher, resourceId) {
    const stream = await contentFetcher(resourceId);
    if (!stream) {
      throw new Error(`Could not load content for resource ${resourceId}`);
    }
    const chunks = [];
    for await (const chunk of stream) {
      chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
    }
    return Buffer.concat(chunks).toString("utf-8");
  }
  /**
   * Detect comments in content
   */
  static async detectComments(content, client, instructions, tone, density) {
    const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density);
    const response = await client.generateText(prompt, 3e3, 0.4);
    return MotivationParsers.parseComments(response, content);
  }
  /**
   * Detect highlights in content
   */
  static async detectHighlights(content, client, instructions, density) {
    const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density);
    const response = await client.generateText(prompt, 2e3, 0.3);
    return MotivationParsers.parseHighlights(response, content);
  }
  /**
   * Detect assessments in content
   */
  static async detectAssessments(content, client, instructions, tone, density) {
    const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density);
    const response = await client.generateText(prompt, 3e3, 0.3);
    return MotivationParsers.parseAssessments(response, content);
  }
  /**
   * Detect tags in content for a specific category
   */
  static async detectTags(content, client, schemaId, category) {
    const schema = getTagSchema(schemaId);
    if (!schema) {
      throw new Error(`Invalid tag schema: ${schemaId}`);
    }
    const categoryInfo = getSchemaCategory(schemaId, category);
    if (!categoryInfo) {
      throw new Error(`Invalid category "${category}" for schema ${schemaId}`);
    }
    const prompt = MotivationPrompts.buildTagPrompt(
      content,
      category,
      schema.name,
      schema.description,
      schema.domain,
      categoryInfo.description,
      categoryInfo.examples
    );
    const response = await client.generateText(prompt, 4e3, 0.2);
    const parsedTags = MotivationParsers.parseTags(response);
    return MotivationParsers.validateTagOffsets(parsedTags, content, category);
  }
};
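
// Editorial note: each detector is the same three-step pipeline - build the
// motivation-specific prompt, call client.generateText(prompt, maxTokens,
// temperature), then parse and validate - differing only in budget and
// temperature (comments 3000/0.4, highlights 2000/0.3, assessments 3000/0.3,
// tags 4000/0.2). Driving one directly looks roughly like this; the resource
// id is illustrative, and `client`/`contentFetcher` are assumed inputs:
//
//   const text = await AnnotationDetection.fetchContent(contentFetcher, "resource-123");
//   const highlights = await AnnotationDetection.detectHighlights(text, client, undefined, 4);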

// src/workers/detection/entity-extractor.ts
async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger2) {
  const entityTypesDescription = entityTypes.map((et) => {
    if (typeof et === "string") {
      return et;
    }
    return et.examples && et.examples.length > 0 ? `${et.type} (examples: ${et.examples.slice(0, 3).join(", ")})` : et.type;
  }).join(", ");
  const descriptiveReferenceGuidance = includeDescriptiveReferences ? `
Include both:
- Direct mentions (names, proper nouns)
- Descriptive references (substantive phrases that refer to entities)

For descriptive references, include:
- Definite descriptions: "the Nobel laureate", "the tech giant", "the former president"
- Role-based references: "the CEO", "the physicist", "the author", "the owner", "the contractor"
- Epithets with context: "the Cupertino-based company", "the iPhone maker"
- References to entities even when identity is unknown or unspecified

Do NOT include:
- Simple pronouns alone: he, she, it, they, him, her, them
- Generic determiners alone: this, that, these, those
- Possessives without substance: his, her, their, its

Examples:
- For "Marie Curie", include "the Nobel laureate" and "the physicist" but NOT "she"
- For an unknown person, include "the owner" or "the contractor" (role-based references count even when identity is unspecified)
` : `
Find direct mentions only (names, proper nouns). Do not include pronouns or descriptive references.
`;
  const prompt = `Identify entity references in the following text. Look for mentions of: ${entityTypesDescription}.
${descriptiveReferenceGuidance}
Text to analyze:
"""
${exact}
"""

Respond with a JSON array of entities found. Each entity should have:
- exact: the exact text span from the input
- entityType: one of the provided entity types
- startOffset: character position where the entity starts (0-indexed)
- endOffset: character position where the entity ends
- prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
- suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)

If no entities are found, respond with an empty array [].

Example output:
[{"exact":"Alice","entityType":"Person","startOffset":0,"endOffset":5,"prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","startOffset":20,"endOffset":25,"prefix":"went to ","suffix":" yesterday"}]`;
  const response = await client.generateTextWithMetadata(
    prompt,
    4e3,
    // Increased to handle many entities without truncation
    0.3
    // Lower temperature for more consistent extraction
  );
  try {
    let jsonStr = response.text.trim();
    if (jsonStr.startsWith("```")) {
      jsonStr = jsonStr.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
    }
    const entities = JSON.parse(jsonStr);
    logger2?.debug("Parsed entities from AI response", { count: entities.length });
    if (response.stopReason === "max_tokens") {
      const errorMsg = `AI response truncated: Found ${entities.length} entities but response hit max_tokens limit. Increase max_tokens or reduce resource size.`;
      logger2?.error(errorMsg);
      throw new Error(errorMsg);
    }
    return entities.map((entity, idx) => {
      let startOffset = entity.startOffset;
      let endOffset = entity.endOffset;
      logger2?.debug("Processing entity", {
        index: idx + 1,
        total: entities.length,
        type: entity.entityType,
        text: entity.exact,
        offsetsFromAI: `[${startOffset}:${endOffset}]`
      });
      const extractedText = exact.substring(startOffset, endOffset);
      let anchorMethod;
      if (extractedText === entity.exact) {
        anchorMethod = "llm-exact";
        logger2?.debug("Entity anchored", {
          text: entity.exact,
          entityType: entity.entityType,
          anchorMethod
        });
      } else {
        logger2?.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
          expected: entity.exact,
          llmOffsets: `[${startOffset}:${endOffset}]`,
          foundAtLlmOffsets: extractedText
        });
        let occurrenceCount = 0;
        let firstOccurrence = -1;
        let searchPos = 0;
        while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
          if (firstOccurrence === -1) firstOccurrence = searchPos;
          occurrenceCount++;
          searchPos++;
        }
        if (occurrenceCount === 0) {
          anchorMethod = "dropped";
          logger2?.error("Entity text not found in resource \u2014 dropping", {
            text: entity.exact,
            entityType: entity.entityType,
            llmOffsets: `[${startOffset}:${endOffset}]`,
            anchorMethod,
            resourceStart: exact.substring(0, 200)
          });
          return null;
        }
        let recoveredOffset = -1;
        if (entity.prefix || entity.suffix) {
          let p = 0;
          while ((p = exact.indexOf(entity.exact, p)) !== -1) {
            const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
            const candidateSuffix = exact.substring(
              p + entity.exact.length,
              Math.min(exact.length, p + entity.exact.length + 32)
            );
            const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
            const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
            if (prefixMatch && suffixMatch) {
              recoveredOffset = p;
              break;
            }
            p++;
          }
        }
        if (recoveredOffset !== -1) {
          anchorMethod = "context-recovered";
          startOffset = recoveredOffset;
          endOffset = recoveredOffset + entity.exact.length;
          logger2?.debug("Entity anchored", {
            text: entity.exact,
            entityType: entity.entityType,
            anchorMethod,
            offsetDiff: recoveredOffset - entity.startOffset
          });
        } else if (occurrenceCount === 1) {
          anchorMethod = "unique-match";
          startOffset = firstOccurrence;
          endOffset = firstOccurrence + entity.exact.length;
          logger2?.debug("Entity anchored", {
            text: entity.exact,
            entityType: entity.entityType,
            anchorMethod,
            offsetDiff: firstOccurrence - entity.startOffset
          });
        } else {
          anchorMethod = "first-of-many";
          startOffset = firstOccurrence;
          endOffset = firstOccurrence + entity.exact.length;
          logger2?.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
            text: entity.exact,
            entityType: entity.entityType,
            anchorMethod,
            occurrenceCount,
            chosenOffset: firstOccurrence,
            llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
            hasPrefix: !!entity.prefix,
            hasSuffix: !!entity.suffix
          });
        }
      }
      return {
        exact: entity.exact,
        entityType: entity.entityType,
        startOffset,
        endOffset,
        prefix: entity.prefix,
        suffix: entity.suffix
      };
    }).filter((entity) => {
      if (entity === null) {
        logger2?.debug("Filtered entity: null");
        return false;
      }
      if (entity.startOffset === void 0 || entity.endOffset === void 0) {
        logger2?.warn("Filtered entity: missing offsets", { text: entity.exact });
        return false;
      }
      if (entity.startOffset < 0) {
        logger2?.warn("Filtered entity: negative startOffset", {
          text: entity.exact,
          startOffset: entity.startOffset
        });
        return false;
      }
      if (entity.endOffset > exact.length) {
        logger2?.warn("Filtered entity: endOffset exceeds text length", {
          text: entity.exact,
          endOffset: entity.endOffset,
          textLength: exact.length
        });
        return false;
      }
      const extractedText = exact.substring(entity.startOffset, entity.endOffset);
      if (extractedText !== entity.exact) {
        logger2?.warn("Filtered entity: offset mismatch", {
          expected: entity.exact,
          got: extractedText,
          offsets: `[${entity.startOffset}:${entity.endOffset}]`
        });
        return false;
      }
      logger2?.debug("Accepted entity", {
        text: entity.exact,
        offsets: `[${entity.startOffset}:${entity.endOffset}]`
      });
      return true;
    });
  } catch (error) {
    return [];
  }
}
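
// Editorial note: the re-anchoring cascade above tries, in order, to trust
// the model's offsets ("llm-exact"), relocate the span by its prefix/suffix
// context ("context-recovered"), accept a unique occurrence ("unique-match"),
// fall back to the first of several occurrences with a warning
// ("first-of-many"), and otherwise drop the entity. A worked example with
// made-up text: the model reports "Paris" at [10:15], but for
//
//   const text = "Flights to Paris and from Paris depart daily.";
//
// text.substring(10, 15) === " Pari", so the offsets are rejected. "Paris"
// occurs twice, so a unique match is impossible; with prefix "to " and
// suffix " and", the occurrence at offset 11 matches the context and the
// entity is anchored as "context-recovered".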
function getLanguageName(locale) {
  return getLocaleEnglishName(locale) || locale;
}
async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger2) {
  const finalTemperature = temperature ?? 0.7;
  const finalMaxTokens = maxTokens ?? 500;
  const languageInstruction = locale && locale !== "en" ? `

IMPORTANT: Write the entire resource in ${getLanguageName(locale)}.` : "";
  let annotationSection = "";
  if (context) {
    const parts = [];
    parts.push(`- Annotation motivation: ${context.annotation.motivation}`);
    parts.push(`- Source resource: ${context.sourceResource.name}`);
    const { motivation, body } = context.annotation;
    if (motivation === "commenting" || motivation === "assessing") {
      const bodyItem = Array.isArray(body) ? body[0] : body;
      if (bodyItem && "value" in bodyItem && bodyItem.value) {
        const label = motivation === "commenting" ? "Comment" : "Assessment";
        parts.push(`- ${label}: ${bodyItem.value}`);
      }
    }
    annotationSection = `

Annotation context:
${parts.join("\n")}`;
  }
  let contextSection = "";
  if (context?.sourceContext) {
    const { before, selected, after } = context.sourceContext;
    contextSection = `

Source document context:
---
${before ? `...${before}` : ""}
**[${selected}]**
${after ? `${after}...` : ""}
---
`;
  }
  let graphContextSection = "";
  if (context?.graphContext) {
    const gc = context.graphContext;
    const connections = gc.connections ?? [];
    const citedBy = gc.citedBy ?? [];
    const parts = [];
    if (connections.length > 0) {
      const connList = connections.map((c) => `${c.resourceName}${c.entityTypes?.length ? ` (${c.entityTypes.join(", ")})` : ""}`).join(", ");
      parts.push(`- Connected resources: ${connList}`);
    }
    if (gc.citedByCount && gc.citedByCount > 0) {
      const citedNames = citedBy.map((c) => c.resourceName).join(", ");
      parts.push(`- This resource is cited by ${gc.citedByCount} other resource${gc.citedByCount > 1 ? "s" : ""}${citedNames ? `: ${citedNames}` : ""}`);
    }
    if (gc.siblingEntityTypes && gc.siblingEntityTypes.length > 0) {
      parts.push(`- Related entity types in this document: ${gc.siblingEntityTypes.join(", ")}`);
    }
    if (gc.inferredRelationshipSummary) {
      parts.push(`- Relationship summary: ${gc.inferredRelationshipSummary}`);
    }
    if (parts.length > 0) {
      graphContextSection = `

Knowledge graph context:
${parts.join("\n")}`;
    }
  }
  const structureGuidance = finalMaxTokens >= 1e3 ? "organized into titled sections (## Section) with well-structured paragraphs" : "organized into well-structured paragraphs";
  const prompt = `Generate a concise, informative resource about "${topic}".
${entityTypes.length > 0 ? `Focus on these entity types: ${entityTypes.join(", ")}.` : ""}
${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${languageInstruction}

Requirements:
- Start with a clear heading (# Title)
- Aim for approximately ${finalMaxTokens} tokens of content, ${structureGuidance}
- Be factual and informative
- Use markdown formatting
- Write the response as markdown`;
  const parseResponse = (response2) => {
    let content = response2.trim();
    if (content.startsWith("```markdown") || content.startsWith("```md")) {
      content = content.slice(content.indexOf("\n") + 1);
      const endIndex = content.lastIndexOf("```");
      if (endIndex !== -1) {
        content = content.slice(0, endIndex);
      }
    } else if (content.startsWith("```")) {
      content = content.slice(3);
      const endIndex = content.lastIndexOf("```");
      if (endIndex !== -1) {
        content = content.slice(0, endIndex);
      }
    }
    content = content.trim();
    return {
      title: topic,
      content
    };
  };
  const response = await client.generateText(prompt, finalMaxTokens, finalTemperature);
  const result = parseResponse(response);
  return result;
}
function buildTextAnnotation(resourceId, userId, generator, motivation, match, body) {
  return {
    "@context": "http://www.w3.org/ns/anno.jsonld",
    "type": "Annotation",
    "id": generateAnnotationId(),
    motivation,
    creator: didToAgent(userId),
    generator,
    created: (/* @__PURE__ */ new Date()).toISOString(),
    target: {
      type: "SpecificResource",
      source: resourceId,
      selector: [
        { type: "TextPositionSelector", start: match.start, end: match.end },
        {
          type: "TextQuoteSelector",
          exact: match.exact,
          ...match.prefix && { prefix: match.prefix },
          ...match.suffix && { suffix: match.suffix }
        }
      ]
    },
    ...body !== void 0 ? { body } : {}
  };
}
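
// Editorial note: buildTextAnnotation emits a W3C Web Annotation whose target
// carries both a TextPositionSelector and a TextQuoteSelector, so consumers
// can re-anchor by quote if stored offsets drift. For a detected highlight
// the result looks roughly like the following; the id, creator, generator,
// and timestamp are illustrative placeholders, not real output:
//
//   {
//     "@context": "http://www.w3.org/ns/anno.jsonld",
//     "type": "Annotation",
//     "id": "<generated annotation id>",
//     "motivation": "highlighting",
//     "creator": { /* didToAgent(userId) */ },
//     "generator": { /* the worker's generator descriptor */ },
//     "created": "2024-01-01T00:00:00.000Z",
//     "target": {
//       "type": "SpecificResource",
//       "source": "<resourceId>",
//       "selector": [
//         { "type": "TextPositionSelector", "start": 142, "end": 174 },
//         { "type": "TextQuoteSelector", "exact": "revenue grew 45% year-over-year", "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts." }
//       ]
//     }
//   }
//
// Highlights carry no body; comments, assessments, and tags pass one via the
// optional `body` parameter, which is spread in only when defined.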
|
|
1072
|
+
async function processHighlightJob(content, inferenceClient, params, userId, generator, onProgress) {
|
|
1073
|
+
onProgress(10, "Loading resource...", "analyzing");
|
|
1074
|
+
onProgress(30, "Analyzing text...", "analyzing");
|
|
1075
|
+
const highlights = await AnnotationDetection.detectHighlights(
|
|
1076
|
+
content,
|
|
1077
|
+
inferenceClient,
|
|
1078
|
+
params.instructions,
|
|
1079
|
+
params.density
|
|
1080
|
+
);
|
|
1081
|
+
onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
|
|
1082
|
+
const annotations = highlights.map(
|
|
1083
|
+
(h) => buildTextAnnotation(params.resourceId, userId, generator, "highlighting", h)
|
|
1084
|
+
);
|
|
1085
|
+
onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
|
|
1086
|
+
return {
|
|
1087
|
+
annotations,
|
|
1088
|
+
result: { highlightsFound: highlights.length, highlightsCreated: annotations.length }
|
|
1089
|
+
};
|
|
1090
|
+
}
|
|
1091
|
+
async function processCommentJob(content, inferenceClient, params, userId, generator, onProgress) {
|
|
1092
|
+
onProgress(10, "Loading resource...", "analyzing");
|
|
1093
|
+
onProgress(30, "Analyzing text...", "analyzing");
|
|
1094
|
+
const comments = await AnnotationDetection.detectComments(
|
|
1095
|
+
content,
|
|
1096
|
+
inferenceClient,
|
|
1097
|
+
params.instructions,
|
|
1098
|
+
params.tone,
|
|
1099
|
+
params.density
|
|
1100
|
+
);
|
|
1101
|
+
onProgress(60, `Creating ${comments.length} annotations...`, "creating");
|
|
1102
|
+
const annotations = comments.map(
|
|
1103
|
+
(c) => (
|
|
1104
|
+
// Match the pre-#651 CommentAnnotationWorker: include format and
|
|
1105
|
+
// language on the body TextualBody. Optional in the schema, but
|
|
1106
|
+
// consumers that do language-aware rendering rely on them.
|
|
1107
|
+
buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
|
|
1108
|
+
{ type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: "en" }
|
|
1109
|
+
])
|
|
1110
|
+
)
|
|
1111
|
+
);
|
|
1112
|
+
onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
|
|
1113
|
+
return {
|
|
1114
|
+
annotations,
|
|
1115
|
+
result: { commentsFound: comments.length, commentsCreated: annotations.length }
|
|
1116
|
+
};
|
|
1117
|
+
}
|
|
1118
|
+
async function processAssessmentJob(content, inferenceClient, params, userId, generator, onProgress) {
|
|
1119
|
+
onProgress(10, "Loading resource...", "analyzing");
|
|
1120
|
+
onProgress(30, "Analyzing text...", "analyzing");
|
|
1121
|
+
const assessments = await AnnotationDetection.detectAssessments(
|
|
1122
|
+
content,
|
|
1123
|
+
inferenceClient,
|
|
1124
|
+
params.instructions,
|
|
1125
|
+
params.tone,
|
|
1126
|
+
params.density
|
|
1127
|
+
);
|
|
1128
|
+
onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
|
|
1129
|
+
const annotations = assessments.map(
|
|
1130
|
+
(a) => (
|
|
1131
|
+
// Single-object body with purpose aligned to motivation, matching the
|
|
1132
|
+
// pre-#651 AssessmentAnnotationWorker's shape and the majority of
|
|
1133
|
+
// persisted assessments. Do not switch to an array or to
|
|
1134
|
+
// purpose='describing' — that loses the "this is an assessment, not
|
|
1135
|
+
// a description" signal and breaks existing readers that access
|
|
1136
|
+
// `body.value` directly on the object.
|
|
1137
|
+
buildTextAnnotation(params.resourceId, userId, generator, "assessing", a, {
|
|
1138
|
+
type: "TextualBody",
|
|
1139
|
+
value: a.assessment,
|
|
1140
|
+
purpose: "assessing",
|
|
1141
|
+
format: "text/plain",
|
|
1142
|
+
language: "en"
|
|
1143
|
+
})
|
|
1144
|
+
)
|
|
1145
|
+
);
|
|
1146
|
+
onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
|
|
1147
|
+
return {
|
|
1148
|
+
annotations,
|
|
1149
|
+
result: { assessmentsFound: assessments.length, assessmentsCreated: annotations.length }
|
|
1150
|
+
};
|
|
1151
|
+
}
|
|
1152
|
+
async function processReferenceJob(content, inferenceClient, params, userId, generator, onProgress, logger2) {
|
|
1153
|
+
const entityTypeNames = params.entityTypes.map(String);
|
|
1154
|
+
const requestParams = [{ label: "Entity types", value: entityTypeNames.join(", ") }];
|
|
1155
|
+
const completedEntityTypes = [];
|
|
1156
|
+
let totalFound = 0;
|
|
1157
|
+
let totalEmitted = 0;
|
|
1158
|
+
let errors = 0;
|
|
1159
|
+
const allAnnotations = [];
|
|
1160
|
+
onProgress(10, "Loading resource...", "analyzing", { requestParams });
|
|
1161
|
+
for (let i = 0; i < entityTypeNames.length; i++) {
|
|
1162
|
+
const entityTypeName = entityTypeNames[i];
|
|
1163
|
+
if (!entityTypeName) continue;
|
|
1164
|
+
const pct = 20 + Math.round(i / entityTypeNames.length * 60);
|
|
1165
|
+
onProgress(pct, `Detecting ${entityTypeName} entities...`, "analyzing", {
|
|
1166
|
+
currentEntityType: entityTypeName,
|
|
1167
|
+
processedEntityTypes: i,
|
|
1168
|
+
totalEntityTypes: entityTypeNames.length,
|
|
1169
|
+
entitiesFound: totalFound,
|
|
1170
|
+
entitiesEmitted: totalEmitted,
|
|
1171
|
+
completedEntityTypes: [...completedEntityTypes],
|
|
1172
|
+
requestParams
|
|
1173
|
+
});
|
|
1174
|
+
const extractedEntities = await extractEntities(
|
|
1175
|
+
content,
|
|
1176
|
+
[entityTypeName],
|
|
1177
|
+
inferenceClient,
|
|
1178
|
+
params.includeDescriptiveReferences ?? false,
|
|
1179
|
+
logger2
|
|
1180
|
+
);
|
|
1181
|
+
totalFound += extractedEntities.length;
|
|
1182
|
+
completedEntityTypes.push({ entityType: entityTypeName, foundCount: extractedEntities.length });
|
|
1183
|
+
const unresolvedBody = [{ type: "TextualBody", value: entityTypeName, purpose: "tagging" }];
|
|
1184
|
+
for (const entity of extractedEntities) {
|
|
1185
|
+
try {
|
|
1186
|
+
const validated = validateAndCorrectOffsets(content, entity.startOffset, entity.endOffset, entity.exact);
|
|
1187
|
+
const ann = buildTextAnnotation(
|
|
1188
|
+
params.resourceId,
|
|
1189
|
+
userId,
|
|
1190
|
+
generator,
|
|
1191
|
+
"linking",
|
|
1192
|
+
validated,
|
|
1193
|
+
unresolvedBody
|
|
1194
|
+
);
|
|
1195
|
+
allAnnotations.push(ann);
|
|
1196
|
+
totalEmitted++;
|
|
1197
|
+
} catch {
|
|
1198
|
+
errors++;
|
|
1199
|
+
}
|
|
1200
|
+
}
|
|
1201
|
+
}
|
|
1202
|
+
onProgress(100, `Complete! Created ${totalEmitted} references`, "creating");
|
|
1203
|
+
return {
|
|
1204
|
+
annotations: allAnnotations,
|
|
1205
|
+
result: { totalFound, totalEmitted, errors }
|
|
1206
|
+
};
|
|
1207
|
+
}
|
|
+async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
+  onProgress(10, "Loading resource...", "analyzing");
+  onProgress(30, "Analyzing text for tags...", "analyzing");
+  const allTags = [];
+  for (const category of params.categories) {
+    const categoryTags = await AnnotationDetection.detectTags(
+      content,
+      inferenceClient,
+      params.schemaId,
+      category
+    );
+    allTags.push(...categoryTags);
+  }
+  const tags = allTags;
+  onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
+  const byCategory = {};
+  const annotations = tags.map((t) => {
+    const category = t.category ?? "unknown";
+    byCategory[category] = (byCategory[category] ?? 0) + 1;
+    return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
+      { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: "en" },
+      { type: "TextualBody", value: params.schemaId, purpose: "classifying", format: "text/plain" }
+    ]);
+  });
+  onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
+  return {
+    annotations,
+    result: { tagsFound: tags.length, tagsCreated: annotations.length, byCategory }
+  };
+}
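+// Resource generation: result.resourceId is intentionally left empty here;
+// handleJob fills it in after the generated resource is persisted.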
+async function processGenerationJob(inferenceClient, params, onProgress) {
+  onProgress(20, "Fetching context...", "fetching");
+  const title = params.title ?? "Untitled";
+  const entityTypes = (params.entityTypes ?? []).map(String);
+  onProgress(40, "Generating resource...", "generating");
+  const generated = await generateResourceFromTopic(
+    title,
+    entityTypes,
+    inferenceClient,
+    params.prompt,
+    params.language,
+    params.context,
+    params.temperature,
+    params.maxTokens
+  );
+  onProgress(85, "Creating resource...", "creating");
+  return {
+    content: generated.content,
+    title: generated.title ?? title,
+    format: "text/markdown",
+    result: {
+      resourceId: "",
+      resourceName: generated.title ?? title
+    }
+  };
+}
+
+// src/worker-process.ts
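+// For channels listed in RESOURCE_BROADCAST_TYPES the event is scoped to the
+// resource id (presumably so only watchers of that resource receive it);
+// everything else is emitted unscoped.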
+async function emitEvent(session, channel, payload) {
+  const isBroadcast = RESOURCE_BROADCAST_TYPES.includes(channel);
+  const resourceScope = isBroadcast ? payload.resourceId : void 0;
+  await session.client.actor.emit(channel, payload, resourceScope);
+}
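+// Subscribes to the claim adapter's active job stream; a failed job emits a
+// best-effort job:fail event before the claim is released via failJob.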
+function startWorkerProcess(config) {
+  const { session, logger: logger2 } = config;
+  const adapter = createJobClaimAdapter({
+    actor: session.client.actor,
+    jobTypes: config.jobTypes
+  });
+  adapter.activeJob$.subscribe((job) => {
+    if (!job) return;
+    logger2.info("Processing job", { jobId: job.jobId, type: job.type, resourceId: job.resourceId });
+    handleJob(adapter, config, job).catch((error) => {
+      const message = error instanceof Error ? error.message : String(error);
+      logger2.error("Job failed", { jobId: job.jobId, error: message, stack: error instanceof Error ? error.stack : void 0 });
+      const failAnnotationId = job.params.referenceId;
+      emitEvent(session, "job:fail", {
+        resourceId: job.resourceId,
+        userId: job.userId,
+        jobId: job.jobId,
+        jobType: job.type,
+        ...failAnnotationId ? { annotationId: failAnnotationId } : {},
+        error: message
+      }).catch(() => {
+      });
+      adapter.failJob(job.jobId, message);
+    });
+  });
+  adapter.start();
+  return adapter;
+}
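+// Per-job dispatch: emit job:start, resolve the engine for the job type,
+// report progress via job:report-progress, emit one mark:create per
+// annotation (or create the generated resource), then job:complete.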
+async function handleJob(adapter, config, job) {
+  const { session } = config;
+  const { resourceId, userId, jobId, type: jobType } = job;
+  const annotationId = job.params.referenceId;
+  const lifecycleBase = {
+    resourceId,
+    userId,
+    jobId,
+    jobType,
+    ...annotationId ? { annotationId } : {}
+  };
+  await emitEvent(session, "job:start", lifecycleBase);
+  const engine = config.engines[jobType];
+  if (!engine) {
+    adapter.failJob(jobId, `No inference engine configured for job type: ${jobType}`);
+    return;
+  }
+  const { inferenceClient, generator } = engine;
+  const onProgress = (percentage, message, stage, extra) => {
+    emitEvent(session, "job:report-progress", {
+      ...lifecycleBase,
+      percentage,
+      progress: {
+        stage,
+        percentage,
+        message,
+        ...annotationId ? { annotationId } : {},
+        ...extra ?? {}
+      }
+    }).catch(() => {
+    });
+  };
+  const fetchContent = async () => {
+    return await session.client.browse.resourceContent(resourceId);
+  };
+  if (jobType === "highlight-annotation") {
+    const content = await fetchContent();
+    const { annotations, result } = await processHighlightJob(
+      content,
+      inferenceClient,
+      job.params,
+      userId,
+      generator,
+      onProgress
+    );
+    for (const ann of annotations) {
+      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
+    }
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result
+    });
+    adapter.completeJob();
+  } else if (jobType === "comment-annotation") {
+    const content = await fetchContent();
+    const { annotations, result } = await processCommentJob(
+      content,
+      inferenceClient,
+      job.params,
+      userId,
+      generator,
+      onProgress
+    );
+    for (const ann of annotations) {
+      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
+    }
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result
+    });
+    adapter.completeJob();
+  } else if (jobType === "assessment-annotation") {
+    const content = await fetchContent();
+    const { annotations, result } = await processAssessmentJob(
+      content,
+      inferenceClient,
+      job.params,
+      userId,
+      generator,
+      onProgress
+    );
+    for (const ann of annotations) {
+      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
+    }
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result
+    });
+    adapter.completeJob();
+  } else if (jobType === "reference-annotation") {
+    const content = await fetchContent();
+    const { annotations, result } = await processReferenceJob(
+      content,
+      inferenceClient,
+      job.params,
+      userId,
+      generator,
+      onProgress
+    );
+    for (const ann of annotations) {
+      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
+    }
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result
+    });
+    adapter.completeJob();
+  } else if (jobType === "tag-annotation") {
+    const content = await fetchContent();
+    const { annotations, result } = await processTagJob(
+      content,
+      inferenceClient,
+      job.params,
+      userId,
+      generator,
+      onProgress
+    );
+    for (const ann of annotations) {
+      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
+    }
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result
+    });
+    adapter.completeJob();
+  } else if (jobType === "generation") {
+    const genResult = await processGenerationJob(
+      inferenceClient,
+      job.params,
+      onProgress
+    );
+    const genParams = job.params;
+    const storageUri = deriveStorageUri(genResult.title, genResult.format);
+    const { resourceId: newResourceId } = await session.client.yield.resource({
+      name: genResult.title,
+      file: Buffer.from(genResult.content),
+      format: genResult.format,
+      storageUri,
+      creationMethod: "generated",
+      sourceResourceId: resourceId,
+      ...genParams.referenceId ? { sourceAnnotationId: genParams.referenceId } : {},
+      ...genParams.prompt ? { generationPrompt: genParams.prompt } : {},
+      ...genParams.language ? { language: genParams.language } : {},
+      generator
+    });
+    await emitEvent(session, "job:complete", {
+      ...lifecycleBase,
+      result: { resourceId: newResourceId, resourceName: genResult.title }
+    });
+    adapter.completeJob();
+  } else {
+    adapter.failJob(jobId, `Unknown job type: ${jobType}`);
+  }
+}
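+// Console logger: LOG_LEVEL sets the level (default "info"); LOG_FORMAT=simple
+// switches from JSON lines to a human-readable single-line format.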
+function createProcessLogger(component) {
+  const level = process.env.LOG_LEVEL ?? "info";
+  const format = process.env.LOG_FORMAT === "simple" ? winston.format.combine(
+    winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
+    winston.format.errors({ stack: true }),
+    winston.format.printf(({ level: lvl, message, timestamp, ...meta }) => {
+      const metaStr = Object.keys(meta).length > 0 ? ` ${JSON.stringify(meta)}` : "";
+      return `${timestamp} [${lvl.toUpperCase()}] [${component}] ${message}${metaStr}`;
+    })
+  ) : winston.format.combine(
+    winston.format.timestamp(),
+    winston.format.errors({ stack: true }),
+    winston.format.json()
+  );
+  const logger2 = winston.createLogger({
+    level,
+    defaultMeta: { component },
+    format,
+    transports: [new winston.transports.Console()]
+  });
+  return logger2;
+}
+
+// src/worker-main.ts
+var ALL_JOB_TYPES = [
+  "reference-annotation",
+  "generation",
+  "highlight-annotation",
+  "assessment-annotation",
+  "comment-annotation",
+  "tag-annotation"
+];
+var configPath = join(homedir(), ".semiontconfig");
+var tomlReader = {
+  readIfExists: (p) => existsSync(p) ? readFileSync(p, "utf-8") : null
+};
+var envConfig = createTomlConfigLoader(
+  tomlReader,
+  configPath,
+  process.env
+)(null, "local");
+var workerInferenceMap = envConfig._metadata?.workers;
+if (!workerInferenceMap || Object.keys(workerInferenceMap).length === 0) {
+  throw new Error(
+    'No worker inference config found in ~/.semiontconfig. Add at least [environments.<env>.workers.default.inference] with type = "..." and model = "...".'
+  );
+}
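+// A minimal ~/.semiontconfig sketch matching the error message above
+// (placeholder values; the exact per-job-type key shape is an assumption):
+//
+//   [environments.local.workers.default.inference]
+//   type = "<provider>"
+//   model = "<model-id>"
+//
+// resolveWorker below prefers a job-type-specific entry (e.g.
+// workers."reference-annotation") and falls back to workers.default.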
+function resolveWorker(jobType) {
+  const specific = workerInferenceMap[jobType];
+  if (specific) return specific;
+  const def = workerInferenceMap["default"];
+  if (def) return def;
+  throw new Error(
+    `No inference config for worker '${jobType}' and no workers.default in ~/.semiontconfig.`
+  );
+}
+var backendPublicURL = envConfig.services?.backend?.publicURL;
+if (!backendPublicURL) {
+  throw new Error("services.backend.publicURL is required in ~/.semiontconfig");
+}
+var backendBaseUrl = backendPublicURL;
+var workerSecret = process.env.SEMIONT_WORKER_SECRET ?? "";
+var healthPort = 9090;
+var logger = createProcessLogger("worker");
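+// Clients are keyed by (type, model, apiKey, endpoint, baseURL), so job types
+// that resolve to identical inference configs share a single client instance.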
+function clientKey(w) {
+  return [w.type, w.model, w.apiKey ?? "", w.endpoint ?? "", w.baseURL ?? ""].join("|");
+}
+function toClientConfig(w) {
+  return {
+    type: w.type,
+    model: w.model,
+    ...w.endpoint && { endpoint: w.endpoint },
+    ...w.baseURL && { baseURL: w.baseURL },
+    ...w.apiKey && { apiKey: w.apiKey }
+  };
+}
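+// One engine per job type; the generator object is the provenance record
+// (SoftwareAgent, provider, model) stamped on work this engine produces.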
+var clientCache = /* @__PURE__ */ new Map();
+var engines = {};
+for (const jobType of ALL_JOB_TYPES) {
+  const w = resolveWorker(jobType);
+  const key = clientKey(w);
+  let client = clientCache.get(key);
+  if (!client) {
+    client = createInferenceClient(toClientConfig(w), logger);
+    clientCache.set(key, client);
+  }
+  const generator = {
+    "@type": "SoftwareAgent",
+    name: `worker-pool / ${w.type} ${w.model}`,
+    worker: "worker-pool",
+    inferenceProvider: w.type,
+    model: w.model
+  };
+  engines[jobType] = { inferenceClient: client, generator };
+}
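+// e.g. "https://api.example.com" -> { protocol: "https", host: "api.example.com", port: 443 }
+// (hypothetical URL; the default port is 443 for https and 80 for http).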
+function parseBackendUrl(url) {
+  const parsed = new URL(url);
+  const protocol = parsed.protocol.replace(":", "") === "https" ? "https" : "http";
+  const host = parsed.hostname;
+  const port = parsed.port ? Number(parsed.port) : protocol === "https" ? 443 : 80;
+  return { protocol, host, port };
+}
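+// Exchanges SEMIONT_WORKER_SECRET for an access token via POST
+// /api/tokens/worker; with no secret set it logs a warning and proceeds with
+// an empty token.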
+async function authenticate() {
+  if (!workerSecret) {
+    logger.warn("No SEMIONT_WORKER_SECRET set \u2014 using empty token");
+    return "";
+  }
+  const response = await fetch(`${backendBaseUrl}/api/tokens/worker`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ secret: workerSecret })
+  });
+  if (!response.ok) {
+    throw new Error(`Authentication failed: ${response.status} ${response.statusText}`);
+  }
+  const { token } = await response.json();
+  return token;
+}
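+// Startup sequence: authenticate, open a SemiontSession (refresh re-runs
+// authenticate), start the worker loop, expose a health endpoint, and wire
+// SIGTERM/SIGINT to a graceful shutdown.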
+async function main() {
+  logger.info("Authenticating", { baseUrl: backendBaseUrl });
+  const initialToken = await authenticate();
+  logger.info("Authenticated");
+  const { protocol, host, port } = parseBackendUrl(backendBaseUrl);
+  const kbId = `worker-${hostname()}`;
+  const kb = {
+    id: kbId,
+    label: `Worker pool @ ${host}`,
+    host,
+    port,
+    protocol,
+    email: `worker-pool@${host}`
+  };
+  const storage = new InMemorySessionStorage();
+  setStoredSession(storage, kbId, { access: initialToken, refresh: "" });
+  const session = new SemiontSession({
+    kb,
+    storage,
+    refresh: async () => {
+      try {
+        return await authenticate();
+      } catch (err) {
+        logger.error("Worker token refresh failed", {
+          error: err instanceof Error ? err.message : String(err)
+        });
+        return null;
+      }
+    },
+    // No validate callback — workers are service principals with no
+    // user record to fetch. `session.user$` stays null.
+    onError: (err) => {
+      logger.error("Session error", { code: err.code, message: err.message });
+    }
+  });
+  await session.ready;
+  const workerVm = startWorkerProcess({
+    session,
+    jobTypes: ALL_JOB_TYPES,
+    engines,
+    logger
+  });
+  logger.info("Connected", {
+    baseUrl: backendBaseUrl,
+    engines: Object.fromEntries(
+      Object.entries(engines).map(([jt, e]) => [jt, `${e.generator.inferenceProvider} / ${e.generator.model}`])
+    )
+  });
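+  // Liveness probe: GET http://<host>:9090/health returns {"status":"ok"};
+  // any other path returns 404.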
+  const health = createServer((req, res) => {
+    if (req.url === "/health") {
+      res.writeHead(200, { "Content-Type": "application/json" });
+      res.end(JSON.stringify({ status: "ok" }));
+    } else {
+      res.writeHead(404);
+      res.end();
+    }
+  });
+  health.listen(healthPort, () => {
+    logger.info("Health endpoint ready", { port: healthPort });
+  });
+  const shutdown = async () => {
+    logger.info("Shutting down");
+    workerVm.dispose();
+    await session.dispose();
+    health.close();
+    process.exit(0);
+  };
+  process.on("SIGTERM", shutdown);
+  process.on("SIGINT", shutdown);
+}
+main().catch((error) => {
+  logger.error("Fatal", { error: error instanceof Error ? error.message : String(error), stack: error instanceof Error ? error.stack : void 0 });
+  process.exit(1);
+});
+//# sourceMappingURL=worker-main.js.map
+//# sourceMappingURL=worker-main.js.map