@semiont/jobs 0.4.19 → 0.4.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1646 @@
1
+ import { InMemorySessionStorage, setStoredSession, SemiontSession, createJobClaimAdapter, validateAndCorrectOffsets, getLocaleEnglishName } from '@semiont/api-client';
2
+ import { createTomlConfigLoader, RESOURCE_BROADCAST_TYPES, didToAgent } from '@semiont/core';
3
+ import { deriveStorageUri } from '@semiont/content';
4
+ import { generateAnnotationId } from '@semiont/event-sourcing';
5
+ import { createInferenceClient } from '@semiont/inference';
6
+ import { createServer } from 'http';
7
+ import { existsSync, readFileSync } from 'fs';
8
+ import { homedir, hostname } from 'os';
9
+ import { join } from 'path';
10
+ import winston from 'winston';
11
+
12
+ // src/worker-process.ts
13
+
14
+ // src/workers/detection/motivation-prompts.ts
15
// Builds LLM prompt strings for annotation detection (comments, highlights,
// assessments, structural tags). Pure string assembly — no I/O and no model
// calls happen here.
// NOTE(review): the comment/highlight/assessment builders truncate content to
// 8000 chars (content.substring(0, 8e3)), so offsets the model returns can only
// land in that window; buildTagPrompt embeds the FULL content by design.
var MotivationPrompts = class {
  /**
   * Build a prompt for detecting comment-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param tone - Optional tone guidance (e.g., "academic", "conversational")
   * @param density - Optional target number of comments per 2000 words
   * @returns Formatted prompt string
   */
  static buildCommentPrompt(content, instructions, tone, density) {
    let prompt;
    // Caller supplied explicit instructions: build the prompt around them.
    if (instructions) {
      const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
      const densityGuidance = density ? `

Aim for approximately ${density} comments per 2000 words of text.` : "";
      prompt = `Add comments to passages in this text following these instructions:

${instructions}${toneGuidance}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of comments. Each comment must have:
- "exact": the exact text passage being commented on (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "comment": your comment following the instructions above

Respond with a valid JSON array.

Example:
[
{"exact": "the quarterly review meeting", "start": 142, "end": 169, "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
]`;
    } else {
      // No instructions: fall back to the default "explanatory comments" brief.
      const toneGuidance = tone ? `

Tone: Use a ${tone} style in your comments.` : "";
      const densityGuidance = density ? `
- Aim for approximately ${density} comments per 2000 words` : `
- Aim for 3-8 comments per 2000 words (not too sparse or dense)`;
      prompt = `Identify passages in this text that would benefit from explanatory comments.
For each passage, provide contextual information, clarification, or background.${toneGuidance}

Guidelines:
- Select passages that reference technical terms, historical figures, complex concepts, or unclear references
- Provide comments that ADD VALUE beyond restating the text
- Focus on explanation, background, or connections to other ideas
- Avoid obvious or trivial comments
- Keep comments concise (1-3 sentences typically)${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of comments. Each comment should have:
- "exact": the exact text passage being commented on (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)

Respond with a valid JSON array.

Example format:
[
{"exact": "Ouranos", "start": 52, "end": 59, "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting highlight-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param density - Optional target number of highlights per 2000 words
   * @returns Formatted prompt string
   */
  static buildHighlightPrompt(content, instructions, density) {
    let prompt;
    // Caller supplied explicit instructions: build the prompt around them.
    if (instructions) {
      const densityGuidance = density ? `

Aim for approximately ${density} highlights per 2000 words of text.` : "";
      prompt = `Identify passages in this text to highlight following these instructions:

${instructions}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of highlights. Each highlight must have:
- "exact": the exact text passage to highlight (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example:
[
{"exact": "revenue grew 45% year-over-year", "start": 142, "end": 174, "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
]`;
    } else {
      // No instructions: default "salience" brief.
      const densityGuidance = density ? `
- Aim for approximately ${density} highlights per 2000 words` : `
- Aim for 3-8 highlights per 2000 words (be selective)`;
      prompt = `Identify passages in this text that merit highlighting for their importance or salience.
Focus on content that readers should notice and remember.

Guidelines:
- Highlight key claims, findings, or conclusions
- Highlight important definitions, terminology, or concepts
- Highlight notable quotes or particularly striking statements
- Highlight critical decisions, action items, or turning points
- Select passages that are SIGNIFICANT, not just interesting
- Avoid trivial or obvious content${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of highlights. Each highlight should have:
- "exact": the exact text passage to highlight (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example format:
[
{"exact": "we will discontinue support for legacy systems by March 2025", "start": 52, "end": 113, "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting assessment-worthy passages
   *
   * @param content - The text content to analyze (will be truncated to 8000 chars)
   * @param instructions - Optional user-provided instructions
   * @param tone - Optional tone guidance (e.g., "critical", "supportive")
   * @param density - Optional target number of assessments per 2000 words
   * @returns Formatted prompt string
   */
  static buildAssessmentPrompt(content, instructions, tone, density) {
    let prompt;
    // Caller supplied explicit instructions: build the prompt around them.
    if (instructions) {
      const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
      const densityGuidance = density ? `

Aim for approximately ${density} assessments per 2000 words of text.` : "";
      prompt = `Assess passages in this text following these instructions:

${instructions}${toneGuidance}${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of assessments. Each assessment must have:
- "exact": the exact text passage being assessed (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "assessment": your assessment following the instructions above

Respond with a valid JSON array.

Example:
[
{"exact": "the quarterly revenue target", "start": 142, "end": 169, "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
]`;
    } else {
      // No instructions: default "critical evaluation" brief.
      const toneGuidance = tone ? `

Tone: Use a ${tone} style in your assessments.` : "";
      const densityGuidance = density ? `
- Aim for approximately ${density} assessments per 2000 words` : `
- Aim for 2-6 assessments per 2000 words (focus on key passages)`;
      prompt = `Identify passages in this text that merit critical assessment or evaluation.
For each passage, provide analysis of its validity, strength, or implications.${toneGuidance}

Guidelines:
- Select passages containing claims, arguments, conclusions, or assertions
- Assess evidence quality, logical soundness, or practical implications
- Provide assessments that ADD INSIGHT beyond restating the text
- Focus on passages where evaluation would help readers form judgments
- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}

Text to analyze:
---
${content.substring(0, 8e3)}
---

Return a JSON array of assessments. Each assessment should have:
- "exact": the exact text passage being assessed (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage
- "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)

Respond with a valid JSON array.

Example format:
[
{"exact": "AI will replace most jobs by 2030", "start": 52, "end": 89, "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
]`;
    }
    return prompt;
  }
  /**
   * Build a prompt for detecting structural tags
   *
   * @param content - The full text content to analyze (NOT truncated for structural analysis)
   * @param category - The specific category to detect
   * @param schemaName - Human-readable schema name
   * @param schemaDescription - Schema description
   * @param schemaDomain - Schema domain
   * @param categoryDescription - Category description
   * @param categoryExamples - Example questions/guidance for this category
   * @returns Formatted prompt string
   */
  static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples) {
    // Single-shot prompt: one category per call; callers loop over categories.
    const prompt = `You are analyzing a text using the ${schemaName} framework.

Schema: ${schemaDescription}
Domain: ${schemaDomain}

Your task: Identify passages that serve the structural role of "${category}".

Category: ${category}
Description: ${categoryDescription}
Key questions:
${categoryExamples.map((ex) => `- ${ex}`).join("\n")}

Guidelines:
- Focus on STRUCTURAL FUNCTION, not semantic content
- A passage serves the "${category}" role if it performs this function in the document's structure
- Look for passages that explicitly fulfill this role
- Passages can be sentences, paragraphs, or sections
- Aim for precision - only tag passages that clearly serve this structural role
- Typical documents have 1-5 instances of each category (some may have 0)

Text to analyze:
---
${content}
---

Return a JSON array of tags. Each tag should have:
- "exact": the exact text passage (quoted verbatim from source)
- "start": character offset where the passage starts
- "end": character offset where the passage ends
- "prefix": up to 32 characters of text immediately before the passage
- "suffix": up to 32 characters of text immediately after the passage

Respond with a valid JSON array.

Example format:
[
{"exact": "What duty did the defendant owe?", "start": 142, "end": 175, "prefix": "The central question is: ", "suffix": " This question must be"},
{"exact": "In tort law, a duty of care is established when...", "start": 412, "end": 520, "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
]`;
    return prompt;
  }
};
299
/**
 * Best-effort extraction of JSON objects from an LLM response that is
 * expected to contain a JSON array (possibly wrapped in a markdown fence).
 *
 * Strategy: strip any code fence, try a direct JSON.parse, and if that fails
 * scan the region between the first "[" and the last "]" (or end of string
 * when the array is truncated) for brace-balanced objects, parsing each one
 * individually and silently skipping any that are malformed.
 *
 * @param response - Raw model output
 * @returns Parsed objects; [] when nothing usable is found (including when the
 *          whole response parses to a non-array JSON value)
 */
function extractObjectsFromArray(response) {
  let text = response.trim();
  // Remove a surrounding ``` / ```json fence if present.
  if (text.startsWith("```")) {
    text = text.replace(/^```(?:json)?\s*\n?/, "").replace(/\n?```\s*$/, "");
  }
  // Fast path: the whole response is valid JSON.
  try {
    const direct = JSON.parse(text);
    return Array.isArray(direct) ? direct : [];
  } catch {
    // Fall through to object-by-object recovery below.
  }
  const open = text.indexOf("[");
  if (open === -1) return [];
  const close = text.lastIndexOf("]");
  // If the closing bracket is missing (truncated output), scan to the end.
  const inner = text.slice(open + 1, close > open ? close : text.length);
  const found = [];
  let braceDepth = 0;
  let objectStart = -1;
  let insideString = false;
  let escapeNext = false;
  for (let pos = 0; pos < inner.length; pos++) {
    const c = inner[pos];
    // Honor backslash escapes so quotes inside strings don't end the string.
    if (escapeNext) {
      escapeNext = false;
      continue;
    }
    if (c === "\\") {
      escapeNext = true;
      continue;
    }
    if (c === '"') {
      insideString = !insideString;
      continue;
    }
    if (insideString) continue;
    if (c === "{") {
      if (braceDepth === 0) objectStart = pos;
      braceDepth++;
    } else if (c === "}") {
      braceDepth--;
      if (braceDepth === 0 && objectStart !== -1) {
        // Parse each top-level object independently; drop malformed ones.
        try {
          found.push(JSON.parse(inner.slice(objectStart, pos + 1)));
        } catch {
        }
        objectStart = -1;
      }
    }
  }
  return found;
}
350
// Parses raw LLM responses into validated annotation candidates. All parsers
// are defensive: a malformed response yields [] rather than throwing, and
// candidates whose offsets cannot be validated/corrected are dropped with a
// warning.
var MotivationParsers = class MotivationParsers {
  /**
   * Internal helper shared by parseComments/parseHighlights/parseAssessments:
   * run validateAndCorrectOffsets() on each candidate and keep only those
   * whose offsets can be confirmed or corrected against the source content.
   * (Previously this loop was triplicated verbatim in the three parsers.)
   *
   * @param candidates - Candidates with exact/start/end fields
   * @param content - Original content to validate offsets against
   * @param kind - Label used in log messages ("comment" | "highlight" | "assessment")
   * @returns Candidates with corrected start/end and computed prefix/suffix
   */
  static _anchorCandidates(candidates, content, kind) {
    const anchored = [];
    for (const candidate of candidates) {
      try {
        const validated = validateAndCorrectOffsets(content, candidate.start, candidate.end, candidate.exact);
        anchored.push({
          ...candidate,
          start: validated.start,
          end: validated.end,
          prefix: validated.prefix,
          suffix: validated.suffix
        });
      } catch (error) {
        // Offsets could not be reconciled with the content — drop, don't fail the batch.
        console.warn(`[MotivationParsers] Skipping invalid ${kind} "${candidate.exact}":`, error);
      }
    }
    return anchored;
  }
  /**
   * Parse and validate AI response for comment detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated comment matches
   */
  static parseComments(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      // A comment must carry a non-empty "comment" string in addition to the span.
      const valid = parsed.filter(
        (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.start === "number" && typeof c.end === "number" && typeof c.comment === "string" && c.comment.trim().length > 0
      );
      console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
      return MotivationParsers._anchorCandidates(valid, content, "comment");
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI comment response:", error);
      // Log the raw response for debugging, consistent with the other parsers
      // (previously only highlights/assessments did this).
      console.error("Raw response:", response);
      return [];
    }
  }
  /**
   * Parse and validate AI response for highlight detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated highlight matches
   */
  static parseHighlights(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      // Highlights only need the span itself — no annotation body.
      const highlights = parsed.filter(
        (h) => !!h && typeof h === "object" && typeof h.exact === "string" && typeof h.start === "number" && typeof h.end === "number"
      );
      return MotivationParsers._anchorCandidates(highlights, content, "highlight");
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI highlight response:", error);
      console.error("Raw response:", response);
      return [];
    }
  }
  /**
   * Parse and validate AI response for assessment detection
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @param content - Original content to validate offsets against
   * @returns Array of validated assessment matches
   */
  static parseAssessments(response, content) {
    try {
      const parsed = extractObjectsFromArray(response);
      // An assessment must carry an "assessment" string in addition to the span.
      const assessments = parsed.filter(
        (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.start === "number" && typeof a.end === "number" && typeof a.assessment === "string"
      );
      return MotivationParsers._anchorCandidates(assessments, content, "assessment");
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI assessment response:", error);
      console.error("Raw response:", response);
      return [];
    }
  }
  /**
   * Parse and validate AI response for tag detection
   * Note: Does NOT validate offsets - caller must do that with content
   *
   * @param response - Raw AI response string (may include markdown code fences)
   * @returns Array of tag matches (offsets not yet validated)
   */
  static parseTags(response) {
    try {
      const parsed = extractObjectsFromArray(response);
      const valid = parsed.filter(
        (t) => !!t && typeof t === "object" && typeof t.exact === "string" && typeof t.start === "number" && typeof t.end === "number" && t.exact.trim().length > 0
      );
      console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
      return valid;
    } catch (error) {
      console.error("[MotivationParsers] Failed to parse AI tag response:", error);
      return [];
    }
  }
  /**
   * Validate tag offsets against content and add category
   * Helper for tag detection after initial parsing
   *
   * @param tags - Parsed tags without validated offsets
   * @param content - Original content to validate against
   * @param category - Category to assign to validated tags
   * @returns Array of validated tag matches
   */
  static validateTagOffsets(tags, content, category) {
    // Kept separate from _anchorCandidates: it injects the category field and
    // uses a different warning format (category instead of the exact text).
    const validatedTags = [];
    for (const tag of tags) {
      try {
        const validated = validateAndCorrectOffsets(content, tag.start, tag.end, tag.exact);
        validatedTags.push({
          ...tag,
          category,
          start: validated.start,
          end: validated.end,
          prefix: validated.prefix,
          suffix: validated.suffix
        });
      } catch (error) {
        console.warn(`[MotivationParsers] Skipping invalid tag for category "${category}":`, error);
      }
    }
    return validatedTags;
  }
};
505
+
506
+ // ../ontology/dist/index.js
507
// Built-in structural tagging schemas, keyed by schema id.
// Each schema defines tag categories; each category's description and example
// questions are passed through to the tag-detection prompt at runtime.
var TAG_SCHEMAS = {
  // Issue / Rule / Application / Conclusion — legal-reasoning structure.
  "legal-irac": {
    id: "legal-irac",
    name: "Legal Analysis (IRAC)",
    description: "Issue, Rule, Application, Conclusion framework for legal reasoning",
    domain: "legal",
    tags: [
      {
        name: "Issue",
        description: "The legal question or problem to be resolved",
        examples: [
          "What is the central legal question?",
          "What must the court decide?",
          "What is the dispute about?"
        ]
      },
      {
        name: "Rule",
        description: "The relevant law, statute, or legal principle",
        examples: [
          "What law applies?",
          "What is the legal standard?",
          "What statute governs this case?"
        ]
      },
      {
        name: "Application",
        description: "How the rule applies to the specific facts",
        examples: [
          "How does the law apply to these facts?",
          "Analysis of the case",
          "How do the facts satisfy the legal standard?"
        ]
      },
      {
        name: "Conclusion",
        description: "The resolution or outcome based on the analysis",
        examples: [
          "What is the court's decision?",
          "What is the final judgment?",
          "What is the holding?"
        ]
      }
    ]
  },
  // Introduction / Methods / Results / Discussion — research-paper structure.
  "scientific-imrad": {
    id: "scientific-imrad",
    name: "Scientific Paper (IMRAD)",
    description: "Introduction, Methods, Results, Discussion structure for research papers",
    domain: "scientific",
    tags: [
      {
        name: "Introduction",
        description: "Background, context, and research question",
        examples: [
          "What is the research question?",
          "Why is this important?",
          "What is the hypothesis?"
        ]
      },
      {
        name: "Methods",
        description: "Experimental design and procedures",
        examples: [
          "How was the study conducted?",
          "What methods were used?",
          "What was the experimental design?"
        ]
      },
      {
        name: "Results",
        description: "Findings and observations",
        examples: [
          "What did the study find?",
          "What are the data?",
          "What were the observations?"
        ]
      },
      {
        name: "Discussion",
        description: "Interpretation and implications of results",
        examples: [
          "What do the results mean?",
          "What are the implications?",
          "How do these findings relate to prior work?"
        ]
      }
    ]
  },
  // Toulmin model of argumentation.
  "argument-toulmin": {
    id: "argument-toulmin",
    name: "Argument Structure (Toulmin)",
    description: "Claim, Evidence, Warrant, Counterargument, Rebuttal framework for argumentation",
    domain: "general",
    tags: [
      {
        name: "Claim",
        description: "The main assertion or thesis",
        examples: [
          "What is being argued?",
          "What is the main point?",
          "What position is being taken?"
        ]
      },
      {
        name: "Evidence",
        description: "Data or facts supporting the claim",
        examples: [
          "What supports this claim?",
          "What are the facts?",
          "What data is provided?"
        ]
      },
      {
        name: "Warrant",
        description: "Reasoning connecting evidence to claim",
        examples: [
          "Why does this evidence support the claim?",
          "What is the logic?",
          "How does this reasoning work?"
        ]
      },
      {
        name: "Counterargument",
        description: "Opposing viewpoints or objections",
        examples: [
          "What are the objections?",
          "What do critics say?",
          "What are alternative views?"
        ]
      },
      {
        name: "Rebuttal",
        description: "Response to counterarguments",
        examples: [
          "How is the objection addressed?",
          "Why is the counterargument wrong?",
          "How is the criticism answered?"
        ]
      }
    ]
  }
};
650
/**
 * Look up a built-in tag schema by id.
 *
 * @param schemaId - Schema identifier, e.g. "legal-irac"
 * @returns The schema definition, or null when the id is unknown
 */
function getTagSchema(schemaId) {
  // Guard with hasOwnProperty: the previous `TAG_SCHEMAS[schemaId] || null`
  // leaked inherited Object.prototype keys — e.g. getTagSchema("toString")
  // returned Object.prototype.toString (truthy) instead of null.
  return Object.prototype.hasOwnProperty.call(TAG_SCHEMAS, schemaId) ? TAG_SCHEMAS[schemaId] : null;
}
653
/**
 * Find a category definition by name within a tag schema.
 *
 * @param schemaId - Schema identifier, e.g. "scientific-imrad"
 * @param categoryName - Category name within that schema, e.g. "Methods"
 * @returns The first matching category, or null when the schema or category
 *          is unknown
 */
function getSchemaCategory(schemaId, categoryName) {
  const schema = getTagSchema(schemaId);
  if (schema) {
    for (const tag of schema.tags) {
      if (tag.name === categoryName) return tag;
    }
  }
  return null;
}
658
+
659
+ // src/workers/annotation-detection.ts
660
+ var AnnotationDetection = class {
661
+ /**
662
+ * Fetch content from a ContentFetcher and read the stream to a string.
663
+ * Shared helper for all workers.
664
+ */
665
+ static async fetchContent(contentFetcher, resourceId) {
666
+ const stream = await contentFetcher(resourceId);
667
+ if (!stream) {
668
+ throw new Error(`Could not load content for resource ${resourceId}`);
669
+ }
670
+ const chunks = [];
671
+ for await (const chunk of stream) {
672
+ chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
673
+ }
674
+ return Buffer.concat(chunks).toString("utf-8");
675
+ }
676
+ /**
677
+ * Detect comments in content
678
+ */
679
+ static async detectComments(content, client, instructions, tone, density) {
680
+ const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density);
681
+ const response = await client.generateText(prompt, 3e3, 0.4);
682
+ return MotivationParsers.parseComments(response, content);
683
+ }
684
+ /**
685
+ * Detect highlights in content
686
+ */
687
+ static async detectHighlights(content, client, instructions, density) {
688
+ const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density);
689
+ const response = await client.generateText(prompt, 2e3, 0.3);
690
+ return MotivationParsers.parseHighlights(response, content);
691
+ }
692
+ /**
693
+ * Detect assessments in content
694
+ */
695
+ static async detectAssessments(content, client, instructions, tone, density) {
696
+ const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density);
697
+ const response = await client.generateText(prompt, 3e3, 0.3);
698
+ return MotivationParsers.parseAssessments(response, content);
699
+ }
700
+ /**
701
+ * Detect tags in content for a specific category
702
+ */
703
+ static async detectTags(content, client, schemaId, category) {
704
+ const schema = getTagSchema(schemaId);
705
+ if (!schema) {
706
+ throw new Error(`Invalid tag schema: ${schemaId}`);
707
+ }
708
+ const categoryInfo = getSchemaCategory(schemaId, category);
709
+ if (!categoryInfo) {
710
+ throw new Error(`Invalid category "${category}" for schema ${schemaId}`);
711
+ }
712
+ const prompt = MotivationPrompts.buildTagPrompt(
713
+ content,
714
+ category,
715
+ schema.name,
716
+ schema.description,
717
+ schema.domain,
718
+ categoryInfo.description,
719
+ categoryInfo.examples
720
+ );
721
+ const response = await client.generateText(prompt, 4e3, 0.2);
722
+ const parsedTags = MotivationParsers.parseTags(response);
723
+ return MotivationParsers.validateTagOffsets(parsedTags, content, category);
724
+ }
725
+ };
726
+
727
+ // src/workers/detection/entity-extractor.ts
728
+ async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger2) {
729
+ const entityTypesDescription = entityTypes.map((et) => {
730
+ if (typeof et === "string") {
731
+ return et;
732
+ }
733
+ return et.examples && et.examples.length > 0 ? `${et.type} (examples: ${et.examples.slice(0, 3).join(", ")})` : et.type;
734
+ }).join(", ");
735
+ const descriptiveReferenceGuidance = includeDescriptiveReferences ? `
736
+ Include both:
737
+ - Direct mentions (names, proper nouns)
738
+ - Descriptive references (substantive phrases that refer to entities)
739
+
740
+ For descriptive references, include:
741
+ - Definite descriptions: "the Nobel laureate", "the tech giant", "the former president"
742
+ - Role-based references: "the CEO", "the physicist", "the author", "the owner", "the contractor"
743
+ - Epithets with context: "the Cupertino-based company", "the iPhone maker"
744
+ - References to entities even when identity is unknown or unspecified
745
+
746
+ Do NOT include:
747
+ - Simple pronouns alone: he, she, it, they, him, her, them
748
+ - Generic determiners alone: this, that, these, those
749
+ - Possessives without substance: his, her, their, its
750
+
751
+ Examples:
752
+ - For "Marie Curie", include "the Nobel laureate" and "the physicist" but NOT "she"
753
+ - For an unknown person, include "the owner" or "the contractor" (role-based references count even when identity is unspecified)
754
+ ` : `
755
+ Find direct mentions only (names, proper nouns). Do not include pronouns or descriptive references.
756
+ `;
757
+ const prompt = `Identify entity references in the following text. Look for mentions of: ${entityTypesDescription}.
758
+ ${descriptiveReferenceGuidance}
759
+ Text to analyze:
760
+ """
761
+ ${exact}
762
+ """
763
+
764
+ Respond with a JSON array of entities found. Each entity should have:
765
+ - exact: the exact text span from the input
766
+ - entityType: one of the provided entity types
767
+ - startOffset: character position where the entity starts (0-indexed)
768
+ - endOffset: character position where the entity ends
769
+ - prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
770
+ - suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)
771
+
772
+ If no entities are found, respond with an empty array [].
773
+
774
+ Example output:
775
+ [{"exact":"Alice","entityType":"Person","startOffset":0,"endOffset":5,"prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","startOffset":20,"endOffset":25,"prefix":"went to ","suffix":" yesterday"}]`;
776
+ const response = await client.generateTextWithMetadata(
777
+ prompt,
778
+ 4e3,
779
+ // Increased to handle many entities without truncation
780
+ 0.3
781
+ // Lower temperature for more consistent extraction
782
+ );
783
+ try {
784
+ let jsonStr = response.text.trim();
785
+ if (jsonStr.startsWith("```")) {
786
+ jsonStr = jsonStr.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
787
+ }
788
+ const entities = JSON.parse(jsonStr);
789
+ logger2?.debug("Parsed entities from AI response", { count: entities.length });
790
+ if (response.stopReason === "max_tokens") {
791
+ const errorMsg = `AI response truncated: Found ${entities.length} entities but response hit max_tokens limit. Increase max_tokens or reduce resource size.`;
792
+ logger2?.error(errorMsg);
793
+ throw new Error(errorMsg);
794
+ }
795
+ return entities.map((entity, idx) => {
796
+ let startOffset = entity.startOffset;
797
+ let endOffset = entity.endOffset;
798
+ logger2?.debug("Processing entity", {
799
+ index: idx + 1,
800
+ total: entities.length,
801
+ type: entity.entityType,
802
+ text: entity.exact,
803
+ offsetsFromAI: `[${startOffset}:${endOffset}]`
804
+ });
805
+ const extractedText = exact.substring(startOffset, endOffset);
806
+ let anchorMethod;
807
+ if (extractedText === entity.exact) {
808
+ anchorMethod = "llm-exact";
809
+ logger2?.debug("Entity anchored", {
810
+ text: entity.exact,
811
+ entityType: entity.entityType,
812
+ anchorMethod
813
+ });
814
+ } else {
815
+ logger2?.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
816
+ expected: entity.exact,
817
+ llmOffsets: `[${startOffset}:${endOffset}]`,
818
+ foundAtLlmOffsets: extractedText
819
+ });
820
+ let occurrenceCount = 0;
821
+ let firstOccurrence = -1;
822
+ let searchPos = 0;
823
+ while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
824
+ if (firstOccurrence === -1) firstOccurrence = searchPos;
825
+ occurrenceCount++;
826
+ searchPos++;
827
+ }
828
+ if (occurrenceCount === 0) {
829
+ anchorMethod = "dropped";
830
+ logger2?.error("Entity text not found in resource \u2014 dropping", {
831
+ text: entity.exact,
832
+ entityType: entity.entityType,
833
+ llmOffsets: `[${startOffset}:${endOffset}]`,
834
+ anchorMethod,
835
+ resourceStart: exact.substring(0, 200)
836
+ });
837
+ return null;
838
+ }
839
+ let recoveredOffset = -1;
840
+ if (entity.prefix || entity.suffix) {
841
+ let p = 0;
842
+ while ((p = exact.indexOf(entity.exact, p)) !== -1) {
843
+ const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
844
+ const candidateSuffix = exact.substring(
845
+ p + entity.exact.length,
846
+ Math.min(exact.length, p + entity.exact.length + 32)
847
+ );
848
+ const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
849
+ const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
850
+ if (prefixMatch && suffixMatch) {
851
+ recoveredOffset = p;
852
+ break;
853
+ }
854
+ p++;
855
+ }
856
+ }
857
+ if (recoveredOffset !== -1) {
858
+ anchorMethod = "context-recovered";
859
+ startOffset = recoveredOffset;
860
+ endOffset = recoveredOffset + entity.exact.length;
861
+ logger2?.debug("Entity anchored", {
862
+ text: entity.exact,
863
+ entityType: entity.entityType,
864
+ anchorMethod,
865
+ offsetDiff: recoveredOffset - entity.startOffset
866
+ });
867
+ } else if (occurrenceCount === 1) {
868
+ anchorMethod = "unique-match";
869
+ startOffset = firstOccurrence;
870
+ endOffset = firstOccurrence + entity.exact.length;
871
+ logger2?.debug("Entity anchored", {
872
+ text: entity.exact,
873
+ entityType: entity.entityType,
874
+ anchorMethod,
875
+ offsetDiff: firstOccurrence - entity.startOffset
876
+ });
877
+ } else {
878
+ anchorMethod = "first-of-many";
879
+ startOffset = firstOccurrence;
880
+ endOffset = firstOccurrence + entity.exact.length;
881
+ logger2?.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
882
+ text: entity.exact,
883
+ entityType: entity.entityType,
884
+ anchorMethod,
885
+ occurrenceCount,
886
+ chosenOffset: firstOccurrence,
887
+ llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
888
+ hasPrefix: !!entity.prefix,
889
+ hasSuffix: !!entity.suffix
890
+ });
891
+ }
892
+ }
893
+ return {
894
+ exact: entity.exact,
895
+ entityType: entity.entityType,
896
+ startOffset,
897
+ endOffset,
898
+ prefix: entity.prefix,
899
+ suffix: entity.suffix
900
+ };
901
+ }).filter((entity) => {
902
+ if (entity === null) {
903
+ logger2?.debug("Filtered entity: null");
904
+ return false;
905
+ }
906
+ if (entity.startOffset === void 0 || entity.endOffset === void 0) {
907
+ logger2?.warn("Filtered entity: missing offsets", { text: entity.exact });
908
+ return false;
909
+ }
910
+ if (entity.startOffset < 0) {
911
+ logger2?.warn("Filtered entity: negative startOffset", {
912
+ text: entity.exact,
913
+ startOffset: entity.startOffset
914
+ });
915
+ return false;
916
+ }
917
+ if (entity.endOffset > exact.length) {
918
+ logger2?.warn("Filtered entity: endOffset exceeds text length", {
919
+ text: entity.exact,
920
+ endOffset: entity.endOffset,
921
+ textLength: exact.length
922
+ });
923
+ return false;
924
+ }
925
+ const extractedText = exact.substring(entity.startOffset, entity.endOffset);
926
+ if (extractedText !== entity.exact) {
927
+ logger2?.warn("Filtered entity: offset mismatch", {
928
+ expected: entity.exact,
929
+ got: extractedText,
930
+ offsets: `[${entity.startOffset}:${entity.endOffset}]`
931
+ });
932
+ return false;
933
+ }
934
+ logger2?.debug("Accepted entity", {
935
+ text: entity.exact,
936
+ offsets: `[${entity.startOffset}:${entity.endOffset}]`
937
+ });
938
+ return true;
939
+ });
940
+ } catch (error) {
941
+ return [];
942
+ }
943
+ }
944
function getLanguageName(locale) {
  // Prefer the locale's English display name; fall back to the raw locale
  // code for anything the lookup table does not recognise.
  const englishName = getLocaleEnglishName(locale);
  return englishName ? englishName : locale;
}
947
/**
 * Generate a new markdown resource about `topic` using the inference client.
 *
 * Builds one prompt out of up to four optional context sections (annotation
 * context, source-document excerpt, knowledge-graph facts, and a non-English
 * language instruction), sends it to the model, then strips any markdown code
 * fence the model wrapped around its reply.
 *
 * @param topic        Subject of the resource; also used as the returned title.
 * @param entityTypes  Entity type names to emphasise in the prompt (may be empty).
 * @param client       Inference client exposing generateText(prompt, maxTokens, temperature).
 * @param userPrompt   Optional free-form extra context from the user.
 * @param locale       Optional locale code; anything other than "en" adds a
 *                     write-in-this-language instruction.
 * @param context      Optional bundle with annotation / sourceContext / graphContext.
 * @param temperature  Sampling temperature (defaults to 0.7).
 * @param maxTokens    Target content length in tokens (defaults to 500).
 * @param logger2      Accepted for call-signature parity; not used in this function.
 * @returns {{ title: string, content: string }} topic as title plus cleaned markdown.
 */
async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger2) {
  const finalTemperature = temperature ?? 0.7;
  const finalMaxTokens = maxTokens ?? 500;
  // Only non-English locales get an explicit language instruction.
  const languageInstruction = locale && locale !== "en" ? `

IMPORTANT: Write the entire resource in ${getLanguageName(locale)}.` : "";
  let annotationSection = "";
  if (context) {
    const parts = [];
    parts.push(`- Annotation motivation: ${context.annotation.motivation}`);
    parts.push(`- Source resource: ${context.sourceResource.name}`);
    const { motivation, body } = context.annotation;
    // Comment/assessment annotations carry a textual body worth surfacing.
    if (motivation === "commenting" || motivation === "assessing") {
      const bodyItem = Array.isArray(body) ? body[0] : body;
      if (bodyItem && "value" in bodyItem && bodyItem.value) {
        const label = motivation === "commenting" ? "Comment" : "Assessment";
        parts.push(`- ${label}: ${bodyItem.value}`);
      }
    }
    annotationSection = `

Annotation context:
${parts.join("\n")}`;
  }
  let contextSection = "";
  if (context?.sourceContext) {
    // Excerpt of the source document with the selected span marked **[...]**.
    const { before, selected, after } = context.sourceContext;
    contextSection = `

Source document context:
---
${before ? `...${before}` : ""}
**[${selected}]**
${after ? `${after}...` : ""}
---
`;
  }
  let graphContextSection = "";
  if (context?.graphContext) {
    const gc = context.graphContext;
    const connections = gc.connections ?? [];
    const citedBy = gc.citedBy ?? [];
    const parts = [];
    if (connections.length > 0) {
      const connList = connections.map((c) => `${c.resourceName}${c.entityTypes?.length ? ` (${c.entityTypes.join(", ")})` : ""}`).join(", ");
      parts.push(`- Connected resources: ${connList}`);
    }
    if (gc.citedByCount && gc.citedByCount > 0) {
      const citedNames = citedBy.map((c) => c.resourceName).join(", ");
      parts.push(`- This resource is cited by ${gc.citedByCount} other resource${gc.citedByCount > 1 ? "s" : ""}${citedNames ? `: ${citedNames}` : ""}`);
    }
    if (gc.siblingEntityTypes && gc.siblingEntityTypes.length > 0) {
      parts.push(`- Related entity types in this document: ${gc.siblingEntityTypes.join(", ")}`);
    }
    if (gc.inferredRelationshipSummary) {
      parts.push(`- Relationship summary: ${gc.inferredRelationshipSummary}`);
    }
    // Only emit the section header when at least one fact is available.
    if (parts.length > 0) {
      graphContextSection = `

Knowledge graph context:
${parts.join("\n")}`;
    }
  }
  // Longer outputs (>= 1000 tokens) get section headings; short ones stay flat.
  const structureGuidance = finalMaxTokens >= 1e3 ? "organized into titled sections (## Section) with well-structured paragraphs" : "organized into well-structured paragraphs";
  const prompt = `Generate a concise, informative resource about "${topic}".
${entityTypes.length > 0 ? `Focus on these entity types: ${entityTypes.join(", ")}.` : ""}
${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${languageInstruction}

Requirements:
- Start with a clear heading (# Title)
- Aim for approximately ${finalMaxTokens} tokens of content, ${structureGuidance}
- Be factual and informative
- Use markdown formatting
- Write the response as markdown`;
  // Strip a wrapping ```markdown / ```md / ``` fence if the model added one.
  const parseResponse = (response2) => {
    let content = response2.trim();
    if (content.startsWith("```markdown") || content.startsWith("```md")) {
      content = content.slice(content.indexOf("\n") + 1);
      const endIndex = content.lastIndexOf("```");
      if (endIndex !== -1) {
        content = content.slice(0, endIndex);
      }
    } else if (content.startsWith("```")) {
      content = content.slice(3);
      const endIndex = content.lastIndexOf("```");
      if (endIndex !== -1) {
        content = content.slice(0, endIndex);
      }
    }
    content = content.trim();
    return {
      title: topic,
      content
    };
  };
  const response = await client.generateText(prompt, finalMaxTokens, finalTemperature);
  const result = parseResponse(response);
  return result;
}
1047
function buildTextAnnotation(resourceId, userId, generator, motivation, match, body) {
  // Assemble a W3C Web Annotation targeting a text span. The target carries
  // both a TextPositionSelector (character offsets) and a TextQuoteSelector
  // (exact text plus optional disambiguating prefix/suffix context).
  const quoteSelector = {
    type: "TextQuoteSelector",
    exact: match.exact,
    ...(match.prefix ? { prefix: match.prefix } : {}),
    ...(match.suffix ? { suffix: match.suffix } : {})
  };
  const target = {
    type: "SpecificResource",
    source: resourceId,
    selector: [
      { type: "TextPositionSelector", start: match.start, end: match.end },
      quoteSelector
    ]
  };
  return {
    "@context": "http://www.w3.org/ns/anno.jsonld",
    "type": "Annotation",
    "id": generateAnnotationId(),
    motivation,
    creator: didToAgent(userId),
    generator,
    created: new Date().toISOString(),
    target,
    // `body` is optional; omit the key entirely when the caller gave none.
    ...(body === void 0 ? {} : { body })
  };
}
1072
async function processHighlightJob(content, inferenceClient, params, userId, generator, onProgress) {
  // Detect highlight-worthy passages, then wrap each match in a
  // "highlighting" annotation (no body needed for highlights).
  onProgress(10, "Loading resource...", "analyzing");
  onProgress(30, "Analyzing text...", "analyzing");
  const matches = await AnnotationDetection.detectHighlights(
    content,
    inferenceClient,
    params.instructions,
    params.density
  );
  onProgress(60, `Creating ${matches.length} annotations...`, "creating");
  const annotations = [];
  for (const match of matches) {
    annotations.push(buildTextAnnotation(params.resourceId, userId, generator, "highlighting", match));
  }
  onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
  return {
    annotations,
    result: { highlightsFound: matches.length, highlightsCreated: annotations.length }
  };
}
1091
async function processCommentJob(content, inferenceClient, params, userId, generator, onProgress) {
  // Detect passages worth commenting on and emit one "commenting" annotation
  // per hit, each carrying the generated comment as its body.
  onProgress(10, "Loading resource...", "analyzing");
  onProgress(30, "Analyzing text...", "analyzing");
  const detected = await AnnotationDetection.detectComments(
    content,
    inferenceClient,
    params.instructions,
    params.tone,
    params.density
  );
  onProgress(60, `Creating ${detected.length} annotations...`, "creating");
  // Body keeps format and language fields to match the pre-#651
  // CommentAnnotationWorker shape: optional in the schema, but consumers
  // doing language-aware rendering rely on them.
  const annotations = [];
  for (const hit of detected) {
    const body = [
      { type: "TextualBody", value: hit.comment, purpose: "commenting", format: "text/plain", language: "en" }
    ];
    annotations.push(buildTextAnnotation(params.resourceId, userId, generator, "commenting", hit, body));
  }
  onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
  return {
    annotations,
    result: { commentsFound: detected.length, commentsCreated: annotations.length }
  };
}
1118
async function processAssessmentJob(content, inferenceClient, params, userId, generator, onProgress) {
  // Detect assessment-worthy passages and emit one "assessing" annotation
  // per hit.
  onProgress(10, "Loading resource...", "analyzing");
  onProgress(30, "Analyzing text...", "analyzing");
  const detected = await AnnotationDetection.detectAssessments(
    content,
    inferenceClient,
    params.instructions,
    params.tone,
    params.density
  );
  onProgress(60, `Creating ${detected.length} annotations...`, "creating");
  // Single-object body with purpose "assessing", matching the pre-#651
  // AssessmentAnnotationWorker shape and the majority of persisted
  // assessments. Keep it a plain object (not an array) and keep
  // purpose="assessing": existing readers access `body.value` directly and
  // rely on the "this is an assessment, not a description" signal.
  const annotations = [];
  for (const hit of detected) {
    const body = {
      type: "TextualBody",
      value: hit.assessment,
      purpose: "assessing",
      format: "text/plain",
      language: "en"
    };
    annotations.push(buildTextAnnotation(params.resourceId, userId, generator, "assessing", hit, body));
  }
  onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
  return {
    annotations,
    result: { assessmentsFound: detected.length, assessmentsCreated: annotations.length }
  };
}
1152
async function processReferenceJob(content, inferenceClient, params, userId, generator, onProgress, logger2) {
  // Run entity extraction once per requested entity type, then anchor each
  // accepted hit as an unresolved "linking" annotation tagged with its
  // entity type name.
  const entityTypeNames = params.entityTypes.map(String);
  const requestParams = [{ label: "Entity types", value: entityTypeNames.join(", ") }];
  const completedEntityTypes = [];
  const allAnnotations = [];
  let totalFound = 0;
  let totalEmitted = 0;
  let errors = 0;
  onProgress(10, "Loading resource...", "analyzing", { requestParams });
  for (const [i, entityTypeName] of entityTypeNames.entries()) {
    if (!entityTypeName) continue;
    // Progress spans 20..80% across the entity types.
    const pct = 20 + Math.round(i / entityTypeNames.length * 60);
    onProgress(pct, `Detecting ${entityTypeName} entities...`, "analyzing", {
      currentEntityType: entityTypeName,
      processedEntityTypes: i,
      totalEntityTypes: entityTypeNames.length,
      entitiesFound: totalFound,
      entitiesEmitted: totalEmitted,
      completedEntityTypes: [...completedEntityTypes],
      requestParams
    });
    const extracted = await extractEntities(
      content,
      [entityTypeName],
      inferenceClient,
      params.includeDescriptiveReferences ?? false,
      logger2
    );
    totalFound += extracted.length;
    completedEntityTypes.push({ entityType: entityTypeName, foundCount: extracted.length });
    // References start unresolved: body is just a tag naming the entity type.
    const unresolvedBody = [{ type: "TextualBody", value: entityTypeName, purpose: "tagging" }];
    for (const entity of extracted) {
      try {
        const validated = validateAndCorrectOffsets(content, entity.startOffset, entity.endOffset, entity.exact);
        allAnnotations.push(buildTextAnnotation(
          params.resourceId,
          userId,
          generator,
          "linking",
          validated,
          unresolvedBody
        ));
        totalEmitted++;
      } catch {
        // Unanchorable entities are counted but never abort the job.
        errors++;
      }
    }
  }
  onProgress(100, `Complete! Created ${totalEmitted} references`, "creating");
  return {
    annotations: allAnnotations,
    result: { totalFound, totalEmitted, errors }
  };
}
1208
async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
  // Detect tags for every requested category (in order), then emit one
  // "tagging" annotation per tag, recording per-category counts.
  onProgress(10, "Loading resource...", "analyzing");
  onProgress(30, "Analyzing text for tags...", "analyzing");
  const tags = [];
  for (const category of params.categories) {
    const detected = await AnnotationDetection.detectTags(
      content,
      inferenceClient,
      params.schemaId,
      category
    );
    tags.push(...detected);
  }
  onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
  const byCategory = {};
  const annotations = [];
  for (const tag of tags) {
    const category = tag.category ?? "unknown";
    byCategory[category] = (byCategory[category] ?? 0) + 1;
    // Two bodies: the category tag itself, plus the schema id that scoped it.
    annotations.push(buildTextAnnotation(params.resourceId, userId, generator, "tagging", tag, [
      { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: "en" },
      { type: "TextualBody", value: params.schemaId, purpose: "classifying", format: "text/plain" }
    ]));
  }
  onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
  return {
    annotations,
    result: { tagsFound: tags.length, tagsCreated: annotations.length, byCategory }
  };
}
1238
async function processGenerationJob(inferenceClient, params, onProgress) {
  // Drive resource generation: resolve title and entity types, ask the model
  // for markdown content, and hand the result back for the caller to persist
  // (resourceId is filled in by the caller after creation).
  onProgress(20, "Fetching context...", "fetching");
  const title = params.title ?? "Untitled";
  const entityTypes = (params.entityTypes ?? []).map(String);
  onProgress(40, "Generating resource...", "generating");
  const generated = await generateResourceFromTopic(
    title,
    entityTypes,
    inferenceClient,
    params.prompt,
    params.language,
    params.context,
    params.temperature,
    params.maxTokens
  );
  onProgress(85, "Creating resource...", "creating");
  const resourceName = generated.title ?? title;
  return {
    content: generated.content,
    title: resourceName,
    format: "text/markdown",
    result: {
      resourceId: "",
      resourceName
    }
  };
}
1264
+
1265
+ // src/worker-process.ts
1266
async function emitEvent(session, channel, payload) {
  // Broadcast-type channels are scoped to the resource they concern;
  // everything else is emitted without a resource scope.
  const scope = RESOURCE_BROADCAST_TYPES.includes(channel) ? payload.resourceId : void 0;
  await session.client.actor.emit(channel, payload, scope);
}
1271
function startWorkerProcess(config) {
  // Wire a job-claim adapter to the job handler: every claimed job is
  // processed, and any failure is reported both over the event channel
  // (best-effort) and to the adapter. Returns the adapter for disposal.
  const { session, logger: log } = config;
  const adapter = createJobClaimAdapter({
    actor: session.client.actor,
    jobTypes: config.jobTypes
  });
  adapter.activeJob$.subscribe((job) => {
    if (!job) return;
    log.info("Processing job", { jobId: job.jobId, type: job.type, resourceId: job.resourceId });
    handleJob(adapter, config, job).catch((error) => {
      const message = error instanceof Error ? error.message : String(error);
      const stack = error instanceof Error ? error.stack : void 0;
      log.error("Job failed", { jobId: job.jobId, error: message, stack });
      const annotationId = job.params.referenceId;
      const failPayload = {
        resourceId: job.resourceId,
        userId: job.userId,
        jobId: job.jobId,
        jobType: job.type,
        ...annotationId ? { annotationId } : {},
        error: message
      };
      // The failure broadcast is best-effort — never let it mask failJob.
      emitEvent(session, "job:fail", failPayload).catch(() => {
      });
      adapter.failJob(job.jobId, message);
    });
  });
  adapter.start();
  return adapter;
}
1299
// Processors for the five annotation-producing job types. They share one
// calling convention: (content, inferenceClient, params, userId, generator,
// onProgress[, logger]) -> { annotations, result }. Only processReferenceJob
// reads the trailing logger argument; the others simply ignore it.
const ANNOTATION_JOB_PROCESSORS = {
  "highlight-annotation": processHighlightJob,
  "comment-annotation": processCommentJob,
  "assessment-annotation": processAssessmentJob,
  "reference-annotation": processReferenceJob,
  "tag-annotation": processTagJob
};

/**
 * Process one claimed job end-to-end: emit job:start, dispatch to the right
 * processor, emit a mark:create per produced annotation (annotation jobs) or
 * persist a new resource (generation jobs), then emit job:complete and mark
 * the job done on the adapter. Unknown/unconfigured job types fail the job.
 *
 * @param adapter  Job-claim adapter (completeJob/failJob lifecycle calls).
 * @param config   Worker config ({ session, engines, logger, jobTypes }).
 * @param job      Claimed job ({ resourceId, userId, jobId, type, params }).
 */
async function handleJob(adapter, config, job) {
  const { session } = config;
  const { resourceId, userId, jobId, type: jobType } = job;
  const annotationId = job.params.referenceId;
  // Common fields attached to every lifecycle event for this job.
  const lifecycleBase = {
    resourceId,
    userId,
    jobId,
    jobType,
    ...annotationId ? { annotationId } : {}
  };
  await emitEvent(session, "job:start", lifecycleBase);
  const engine = config.engines[jobType];
  if (!engine) {
    adapter.failJob(jobId, `No inference engine configured for job type: ${jobType}`);
    return;
  }
  const { inferenceClient, generator } = engine;
  // Fire-and-forget progress reporting; a dropped progress event must never
  // fail the job itself.
  const onProgress = (percentage, message, stage, extra) => {
    emitEvent(session, "job:report-progress", {
      ...lifecycleBase,
      percentage,
      progress: {
        stage,
        percentage,
        message,
        ...annotationId ? { annotationId } : {},
        ...extra ?? {}
      }
    }).catch(() => {
    });
  };
  const fetchContent = async () => {
    return await session.client.browse.resourceContent(resourceId);
  };
  const processor = ANNOTATION_JOB_PROCESSORS[jobType];
  if (processor) {
    // All annotation jobs share one shape: load content, run the detector,
    // emit each annotation, then report completion. This was previously five
    // copy-pasted branches; additionally, processReferenceJob's logger
    // parameter was never supplied at the call site — pass config.logger so
    // its extraction diagnostics are no longer silently dropped.
    const content = await fetchContent();
    const { annotations, result } = await processor(
      content,
      inferenceClient,
      job.params,
      userId,
      generator,
      onProgress,
      config.logger
    );
    for (const ann of annotations) {
      await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
    }
    await emitEvent(session, "job:complete", {
      ...lifecycleBase,
      result
    });
    adapter.completeJob();
  } else if (jobType === "generation") {
    const genResult = await processGenerationJob(
      inferenceClient,
      job.params,
      onProgress
    );
    const genParams = job.params;
    const storageUri = deriveStorageUri(genResult.title, genResult.format);
    // Persist the generated content as a brand-new resource linked back to
    // the source resource (and the source annotation, when present).
    const { resourceId: newResourceId } = await session.client.yield.resource({
      name: genResult.title,
      file: Buffer.from(genResult.content),
      format: genResult.format,
      storageUri,
      creationMethod: "generated",
      sourceResourceId: resourceId,
      ...genParams.referenceId ? { sourceAnnotationId: genParams.referenceId } : {},
      ...genParams.prompt ? { generationPrompt: genParams.prompt } : {},
      ...genParams.language ? { language: genParams.language } : {},
      generator
    });
    await emitEvent(session, "job:complete", {
      ...lifecycleBase,
      result: { resourceId: newResourceId, resourceName: genResult.title }
    });
    adapter.completeJob();
  } else {
    adapter.failJob(jobId, `Unknown job type: ${jobType}`);
  }
}
1453
function createProcessLogger(component) {
  // Build a winston logger for a worker process. LOG_FORMAT=simple yields
  // human-readable single-line output; anything else emits structured JSON.
  // LOG_LEVEL defaults to "info". The component name is stamped on every
  // record via defaultMeta (and into the simple-format prefix).
  const level = process.env.LOG_LEVEL ?? "info";
  let format;
  if (process.env.LOG_FORMAT === "simple") {
    format = winston.format.combine(
      winston.format.timestamp({ format: "YYYY-MM-DD HH:mm:ss" }),
      winston.format.errors({ stack: true }),
      winston.format.printf(({ level: lvl, message, timestamp, ...meta }) => {
        const suffix = Object.keys(meta).length > 0 ? ` ${JSON.stringify(meta)}` : "";
        return `${timestamp} [${lvl.toUpperCase()}] [${component}] ${message}${suffix}`;
      })
    );
  } else {
    format = winston.format.combine(
      winston.format.timestamp(),
      winston.format.errors({ stack: true }),
      winston.format.json()
    );
  }
  return winston.createLogger({
    level,
    defaultMeta: { component },
    format,
    transports: [new winston.transports.Console()]
  });
}
1475
+
1476
+ // src/worker-main.ts
1477
// Every job type this worker pool can claim; each entry gets its own
// inference engine resolved from configuration further below.
var ALL_JOB_TYPES = [
  "reference-annotation",
  "generation",
  "highlight-annotation",
  "assessment-annotation",
  "comment-annotation",
  "tag-annotation"
];
// Worker configuration lives in the user-level TOML file ~/.semiontconfig.
var configPath = join(homedir(), ".semiontconfig");
// Minimal reader interface for the TOML loader: null when the file is absent.
var tomlReader = {
  readIfExists: (p) => existsSync(p) ? readFileSync(p, "utf-8") : null
};
// Load the "local" environment from the config file, with process.env
// available to the loader for overrides.
var envConfig = createTomlConfigLoader(
  tomlReader,
  configPath,
  process.env
)(null, "local");
// Map of worker name (a job type, or "default") -> inference settings.
var workerInferenceMap = envConfig._metadata?.workers;
if (!workerInferenceMap || Object.keys(workerInferenceMap).length === 0) {
  // Fail fast at startup: without at least a default inference config no
  // job could ever be processed.
  throw new Error(
    'No worker inference config found in ~/.semiontconfig. Add at least [environments.<env>.workers.default.inference] with type = "..." and model = "...".'
  );
}
1500
function resolveWorker(jobType) {
  // A job-type-specific inference config wins; otherwise fall back to the
  // workers.default entry. Having neither is a fatal misconfiguration.
  const resolved = workerInferenceMap[jobType] || workerInferenceMap["default"];
  if (resolved) {
    return resolved;
  }
  throw new Error(
    `No inference config for worker '${jobType}' and no workers.default in ~/.semiontconfig.`
  );
}
1509
// Public URL of the Semiont backend; mandatory — there is no sensible default.
var backendPublicURL = envConfig.services?.backend?.publicURL;
if (!backendPublicURL) {
  throw new Error("services.backend.publicURL is required in ~/.semiontconfig");
}
var backendBaseUrl = backendPublicURL;
// Shared secret exchanged for a worker token; empty string degrades to an
// unauthenticated empty token (see authenticate()).
var workerSecret = process.env.SEMIONT_WORKER_SECRET ?? "";
// Fixed port for the /health HTTP probe endpoint.
var healthPort = 9090;
var logger = createProcessLogger("worker");
1517
function clientKey(w) {
  // Cache key built from every field that influences inference client
  // construction, pipe-joined; unset optionals collapse to empty strings.
  const apiKey = w.apiKey ?? "";
  const endpoint = w.endpoint ?? "";
  const baseURL = w.baseURL ?? "";
  return `${w.type}|${w.model}|${apiKey}|${endpoint}|${baseURL}`;
}
1520
function toClientConfig(w) {
  // Copy only the fields the inference client understands; optional fields
  // that are unset (or empty/falsy) are omitted from the result entirely.
  return {
    type: w.type,
    model: w.model,
    ...(w.endpoint ? { endpoint: w.endpoint } : {}),
    ...(w.baseURL ? { baseURL: w.baseURL } : {}),
    ...(w.apiKey ? { apiKey: w.apiKey } : {})
  };
}
1529
// Inference clients are cached by their full connection key so job types
// that share identical settings reuse a single client instance.
var clientCache = /* @__PURE__ */ new Map();
// jobType -> { inferenceClient, generator } consumed by the worker process.
var engines = {};
for (const jobType of ALL_JOB_TYPES) {
  const w = resolveWorker(jobType);
  const key = clientKey(w);
  let client = clientCache.get(key);
  if (!client) {
    client = createInferenceClient(toClientConfig(w), logger);
    clientCache.set(key, client);
  }
  // Provenance record stamped onto every annotation/resource this engine
  // produces (W3C Annotation "generator" agent).
  const generator = {
    "@type": "SoftwareAgent",
    name: `worker-pool / ${w.type} ${w.model}`,
    worker: "worker-pool",
    inferenceProvider: w.type,
    model: w.model
  };
  engines[jobType] = { inferenceClient: client, generator };
}
1548
function parseBackendUrl(url) {
  // Split a backend URL into the pieces the session config needs. Any
  // scheme other than https is treated as plain http, and the standard
  // default port (443/80) is filled in when the URL omits one.
  const parsed = new URL(url);
  const protocol = parsed.protocol === "https:" ? "https" : "http";
  let port;
  if (parsed.port) {
    port = Number(parsed.port);
  } else {
    port = protocol === "https" ? 443 : 80;
  }
  return { protocol, host: parsed.hostname, port };
}
1555
async function authenticate() {
  // Exchange the shared worker secret for an access token from the backend.
  // A missing secret degrades to an empty token (with a warning) instead of
  // crashing, which supports running against an auth-less local backend.
  if (!workerSecret) {
    logger.warn("No SEMIONT_WORKER_SECRET set \u2014 using empty token");
    return "";
  }
  const res = await fetch(`${backendBaseUrl}/api/tokens/worker`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({ secret: workerSecret })
  });
  if (!res.ok) {
    throw new Error(`Authentication failed: ${res.status} ${res.statusText}`);
  }
  const data = await res.json();
  return data.token;
}
1571
/**
 * Worker entry point: authenticate against the backend, open a Semiont
 * session, start claiming jobs, expose a /health HTTP endpoint, and install
 * SIGTERM/SIGINT handlers for graceful shutdown.
 */
async function main() {
  logger.info("Authenticating", { baseUrl: backendBaseUrl });
  const initialToken = await authenticate();
  logger.info("Authenticated");
  const { protocol, host, port } = parseBackendUrl(backendBaseUrl);
  // Knowledge-base descriptor identifying this worker pool to the session.
  const kbId = `worker-${hostname()}`;
  const kb = {
    id: kbId,
    label: `Worker pool @ ${host}`,
    host,
    port,
    protocol,
    email: `worker-pool@${host}`
  };
  // Tokens live only in memory; the initial token is seeded before the
  // session is constructed.
  const storage = new InMemorySessionStorage();
  setStoredSession(storage, kbId, { access: initialToken, refresh: "" });
  const session = new SemiontSession({
    kb,
    storage,
    // Re-run the secret exchange when the access token expires; returning
    // null signals refresh failure without throwing into the session.
    refresh: async () => {
      try {
        return await authenticate();
      } catch (err) {
        logger.error("Worker token refresh failed", {
          error: err instanceof Error ? err.message : String(err)
        });
        return null;
      }
    },
    // No validate callback — workers are service principals with no
    // user record to fetch. `session.user$` stays null.
    onError: (err) => {
      logger.error("Session error", { code: err.code, message: err.message });
    }
  });
  await session.ready;
  // Begin claiming and processing jobs for every supported job type.
  const workerVm = startWorkerProcess({
    session,
    jobTypes: ALL_JOB_TYPES,
    engines,
    logger
  });
  logger.info("Connected", {
    baseUrl: backendBaseUrl,
    engines: Object.fromEntries(
      Object.entries(engines).map(([jt, e]) => [jt, `${e.generator.inferenceProvider} / ${e.generator.model}`])
    )
  });
  // Minimal liveness probe: 200 + JSON on /health, 404 for anything else.
  const health = createServer((req, res) => {
    if (req.url === "/health") {
      res.writeHead(200, { "Content-Type": "application/json" });
      res.end(JSON.stringify({ status: "ok" }));
    } else {
      res.writeHead(404);
      res.end();
    }
  });
  health.listen(healthPort, () => {
    logger.info("Health endpoint ready", { port: healthPort });
  });
  // Graceful shutdown: stop claiming jobs, tear down the session, close the
  // health server, then exit.
  const shutdown = async () => {
    logger.info("Shutting down");
    workerVm.dispose();
    await session.dispose();
    health.close();
    process.exit(0);
  };
  process.on("SIGTERM", shutdown);
  process.on("SIGINT", shutdown);
}
1641
// Top-level failure: log everything we know about the error and exit non-zero.
main().catch((error) => {
  const message = error instanceof Error ? error.message : String(error);
  const stack = error instanceof Error ? error.stack : void 0;
  logger.error("Fatal", { error: message, stack });
  process.exit(1);
});
1645
//# sourceMappingURL=worker-main.js.map