@semiont/jobs 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.d.ts +80 -11
- package/dist/index.js +99 -40
- package/dist/index.js.map +1 -1
- package/dist/worker-main.js +264 -149
- package/dist/worker-main.js.map +1 -1
- package/package.json +2 -8
- package/dist/smelter-main.d.ts +0 -2
- package/dist/smelter-main.js +0 -10076
- package/dist/smelter-main.js.map +0 -1
package/README.md
CHANGED
|
@@ -10,7 +10,7 @@ Job queue, worker infrastructure, and annotation workers for [Semiont](https://g
|
|
|
10
10
|
|
|
11
11
|
## Architecture Context
|
|
12
12
|
|
|
13
|
-
Workers run in a separate process and connect to the Knowledge System (KS) over HTTP/SSE using `
|
|
13
|
+
Workers run in a separate process and connect to the Knowledge System (KS) over HTTP/SSE using `WorkerStateUnit` from `@semiont/api-client`. Workers receive job assignments via SSE push, claim jobs atomically, and emit domain events back to the KS via HTTP. The KS ingests these events onto its EventBus for SSE delivery to the frontend.
|
|
14
14
|
|
|
15
15
|
## Installation
|
|
16
16
|
|
package/dist/index.d.ts
CHANGED
|
@@ -39,6 +39,26 @@ interface JobMetadata {
|
|
|
39
39
|
retryCount: number;
|
|
40
40
|
maxRetries: number;
|
|
41
41
|
}
|
|
42
|
+
/**
|
|
43
|
+
* Locale conventions for detection/generation params.
|
|
44
|
+
*
|
|
45
|
+
* Two independent locales flow through these jobs:
|
|
46
|
+
*
|
|
47
|
+
* - `language` — *annotation body* locale. The BCP-47 tag the LLM should
|
|
48
|
+
* write generated body text in (comment text, assessment text, generated
|
|
49
|
+
* resource content, tag category label). Sourced from the user's UI
|
|
50
|
+
* locale. Stamped onto the W3C `TextualBody.language` field.
|
|
51
|
+
*
|
|
52
|
+
* - `sourceLanguage` — *source resource* locale. The BCP-47 tag of the
|
|
53
|
+
* content being analyzed. Sourced from `ResourceDescriptor` (carried as
|
|
54
|
+
* `Representation.language` on the primary representation). Used in
|
|
55
|
+
* prompts so the LLM analyzes non-English source correctly even when
|
|
56
|
+
* the user's UI locale differs.
|
|
57
|
+
*
|
|
58
|
+
* Examples: a German user analyzing an English document → `language='de'`,
|
|
59
|
+
* `sourceLanguage='en'`. An English user detecting entities in a French
|
|
60
|
+
* document → `language='en'` (unused for entity references), `sourceLanguage='fr'`.
|
|
61
|
+
*/
|
|
42
62
|
/**
|
|
43
63
|
* Detection job parameters
|
|
44
64
|
*/
|
|
@@ -46,6 +66,10 @@ interface DetectionParams {
|
|
|
46
66
|
resourceId: ResourceId;
|
|
47
67
|
entityTypes: EntityType[];
|
|
48
68
|
includeDescriptiveReferences?: boolean;
|
|
69
|
+
/** Annotation body locale — see locale conventions above. */
|
|
70
|
+
language?: string;
|
|
71
|
+
/** Source-resource locale — see locale conventions above. */
|
|
72
|
+
sourceLanguage?: string;
|
|
49
73
|
}
|
|
50
74
|
/**
|
|
51
75
|
* Generation job parameters
|
|
@@ -58,7 +82,14 @@ interface GenerationParams {
|
|
|
58
82
|
prompt?: string;
|
|
59
83
|
title?: string;
|
|
60
84
|
entityTypes?: EntityType[];
|
|
85
|
+
/** Annotation body locale — language the *generated resource* is written in. */
|
|
61
86
|
language?: string;
|
|
87
|
+
/**
|
|
88
|
+
* Source-resource locale — language of the resource being referenced.
|
|
89
|
+
* Used in the prompt so the LLM understands the embedded source-context
|
|
90
|
+
* snippet correctly when source ≠ target language.
|
|
91
|
+
*/
|
|
92
|
+
sourceLanguage?: string;
|
|
62
93
|
context?: GatheredContext;
|
|
63
94
|
temperature?: number;
|
|
64
95
|
maxTokens?: number;
|
|
@@ -71,6 +102,8 @@ interface HighlightDetectionParams {
|
|
|
71
102
|
resourceId: ResourceId;
|
|
72
103
|
instructions?: string;
|
|
73
104
|
density?: number;
|
|
105
|
+
/** Source-resource locale — see locale conventions above. */
|
|
106
|
+
sourceLanguage?: string;
|
|
74
107
|
}
|
|
75
108
|
/**
|
|
76
109
|
* Assessment detection job parameters
|
|
@@ -80,7 +113,10 @@ interface AssessmentDetectionParams {
|
|
|
80
113
|
instructions?: string;
|
|
81
114
|
tone?: 'analytical' | 'critical' | 'balanced' | 'constructive';
|
|
82
115
|
density?: number;
|
|
116
|
+
/** Annotation body locale — see locale conventions above. */
|
|
83
117
|
language?: string;
|
|
118
|
+
/** Source-resource locale — see locale conventions above. */
|
|
119
|
+
sourceLanguage?: string;
|
|
84
120
|
}
|
|
85
121
|
/**
|
|
86
122
|
* Comment detection job parameters
|
|
@@ -90,7 +126,10 @@ interface CommentDetectionParams {
|
|
|
90
126
|
instructions?: string;
|
|
91
127
|
tone?: 'scholarly' | 'explanatory' | 'conversational' | 'technical';
|
|
92
128
|
density?: number;
|
|
129
|
+
/** Annotation body locale — see locale conventions above. */
|
|
93
130
|
language?: string;
|
|
131
|
+
/** Source-resource locale — see locale conventions above. */
|
|
132
|
+
sourceLanguage?: string;
|
|
94
133
|
}
|
|
95
134
|
/**
|
|
96
135
|
* Tag detection job parameters
|
|
@@ -99,6 +138,10 @@ interface TagDetectionParams {
|
|
|
99
138
|
resourceId: ResourceId;
|
|
100
139
|
schemaId: string;
|
|
101
140
|
categories: string[];
|
|
141
|
+
/** Annotation body locale — see locale conventions above. */
|
|
142
|
+
language?: string;
|
|
143
|
+
/** Source-resource locale — see locale conventions above. */
|
|
144
|
+
sourceLanguage?: string;
|
|
102
145
|
}
|
|
103
146
|
/**
|
|
104
147
|
* Detection job progress
|
|
@@ -521,21 +564,39 @@ declare class AnnotationDetection {
|
|
|
521
564
|
*/
|
|
522
565
|
static fetchContent(contentFetcher: ContentFetcher, resourceId: ResourceId): Promise<string>;
|
|
523
566
|
/**
|
|
524
|
-
* Detect comments in content
|
|
567
|
+
* Detect comments in content.
|
|
568
|
+
*
|
|
569
|
+
* `language` is the locale the LLM should write comment text in (annotation
|
|
570
|
+
* body locale). `sourceLanguage` is the locale of the content being analyzed
|
|
571
|
+
* (source-resource locale). See `types.ts` "Locale conventions" for the
|
|
572
|
+
* full discussion.
|
|
525
573
|
*/
|
|
526
|
-
static detectComments(content: string, client: InferenceClient, instructions?: string, tone?: string, density?: number): Promise<CommentMatch[]>;
|
|
574
|
+
static detectComments(content: string, client: InferenceClient, instructions?: string, tone?: string, density?: number, language?: string, sourceLanguage?: string): Promise<CommentMatch[]>;
|
|
527
575
|
/**
|
|
528
|
-
* Detect highlights in content
|
|
576
|
+
* Detect highlights in content.
|
|
577
|
+
*
|
|
578
|
+
* Highlights have no body — only `sourceLanguage` (source-resource locale)
|
|
579
|
+
* applies, used in the prompt so the LLM analyzes non-English source
|
|
580
|
+
* correctly.
|
|
529
581
|
*/
|
|
530
|
-
static detectHighlights(content: string, client: InferenceClient, instructions?: string, density?: number): Promise<HighlightMatch[]>;
|
|
582
|
+
static detectHighlights(content: string, client: InferenceClient, instructions?: string, density?: number, sourceLanguage?: string): Promise<HighlightMatch[]>;
|
|
531
583
|
/**
|
|
532
|
-
* Detect assessments in content
|
|
584
|
+
* Detect assessments in content.
|
|
585
|
+
*
|
|
586
|
+
* `language` is the locale the LLM should write assessment text in
|
|
587
|
+
* (annotation body locale). `sourceLanguage` is the locale of the content
|
|
588
|
+
* being analyzed (source-resource locale).
|
|
533
589
|
*/
|
|
534
|
-
static detectAssessments(content: string, client: InferenceClient, instructions?: string, tone?: string, density?: number): Promise<AssessmentMatch[]>;
|
|
590
|
+
static detectAssessments(content: string, client: InferenceClient, instructions?: string, tone?: string, density?: number, language?: string, sourceLanguage?: string): Promise<AssessmentMatch[]>;
|
|
535
591
|
/**
|
|
536
|
-
* Detect tags in content for a specific category
|
|
592
|
+
* Detect tags in content for a specific category.
|
|
593
|
+
*
|
|
594
|
+
* `sourceLanguage` is the locale of the content being analyzed. Body-locale
|
|
595
|
+
* (`language`) doesn't influence the tag prompt — categories are schema
|
|
596
|
+
* identifiers, not LLM-generated text — so it's consumed at the body-stamp
|
|
597
|
+
* site, not here.
|
|
537
598
|
*/
|
|
538
|
-
static detectTags(content: string, client: InferenceClient, schemaId: string, category: string): Promise<TagMatch[]>;
|
|
599
|
+
static detectTags(content: string, client: InferenceClient, schemaId: string, category: string, sourceLanguage?: string): Promise<TagMatch[]>;
|
|
539
600
|
}
|
|
540
601
|
|
|
541
602
|
/**
|
|
@@ -545,9 +606,17 @@ declare class AnnotationDetection {
|
|
|
545
606
|
*/
|
|
546
607
|
|
|
547
608
|
/**
|
|
548
|
-
* Generate resource content using inference
|
|
549
|
-
|
|
550
|
-
|
|
609
|
+
* Generate resource content using inference.
|
|
610
|
+
*
|
|
611
|
+
* Locale parameters: `locale` is the *body* locale — the language the
|
|
612
|
+
* generated resource should be written in (sourced from the user's UI
|
|
613
|
+
* locale). `sourceLanguage` is the *source* locale — the language of the
|
|
614
|
+
* referenced resource whose context (selected passage, surrounding text)
|
|
615
|
+
* is embedded into the prompt. They're independent: a German user can
|
|
616
|
+
* generate German content from an English source resource. See
|
|
617
|
+
* `types.ts` "Locale conventions" for the full discussion.
|
|
618
|
+
*/
|
|
619
|
+
declare function generateResourceFromTopic(topic: string, entityTypes: string[], client: InferenceClient, userPrompt?: string, locale?: string, context?: GatheredContext, temperature?: number, maxTokens?: number, logger?: Logger, sourceLanguage?: string): Promise<{
|
|
551
620
|
title: string;
|
|
552
621
|
content: string;
|
|
553
622
|
}>;
|
package/dist/index.js
CHANGED
|
@@ -435,8 +435,21 @@ function isFailedJob(job) {
|
|
|
435
435
|
function isCancelledJob(job) {
|
|
436
436
|
return job.status === "cancelled";
|
|
437
437
|
}
|
|
438
|
+
function languageName(tag) {
|
|
439
|
+
return getLocaleEnglishName(tag) || tag;
|
|
440
|
+
}
|
|
441
|
+
function sourceLanguageGuidance(sourceLanguage) {
|
|
442
|
+
if (!sourceLanguage) return "";
|
|
443
|
+
return `
|
|
444
|
+
|
|
445
|
+
Source text language: ${languageName(sourceLanguage)}.`;
|
|
446
|
+
}
|
|
447
|
+
function bodyLanguageGuidance(language, kind) {
|
|
448
|
+
if (!language || language === "en") return "";
|
|
449
|
+
return `
|
|
438
450
|
|
|
439
|
-
|
|
451
|
+
IMPORTANT: Write your ${kind} in ${languageName(language)}.`;
|
|
452
|
+
}
|
|
440
453
|
var MotivationPrompts = class {
|
|
441
454
|
/**
|
|
442
455
|
* Build a prompt for detecting comment-worthy passages
|
|
@@ -447,8 +460,10 @@ var MotivationPrompts = class {
|
|
|
447
460
|
* @param density - Optional target number of comments per 2000 words
|
|
448
461
|
* @returns Formatted prompt string
|
|
449
462
|
*/
|
|
450
|
-
static buildCommentPrompt(content, instructions, tone, density) {
|
|
463
|
+
static buildCommentPrompt(content, instructions, tone, density, language, sourceLanguage) {
|
|
451
464
|
let prompt;
|
|
465
|
+
const sourceLang = sourceLanguageGuidance(sourceLanguage);
|
|
466
|
+
const bodyLang = bodyLanguageGuidance(language, "comments");
|
|
452
467
|
if (instructions) {
|
|
453
468
|
const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
|
|
454
469
|
const densityGuidance = density ? `
|
|
@@ -456,7 +471,7 @@ var MotivationPrompts = class {
|
|
|
456
471
|
Aim for approximately ${density} comments per 2000 words of text.` : "";
|
|
457
472
|
prompt = `Add comments to passages in this text following these instructions:
|
|
458
473
|
|
|
459
|
-
${instructions}${toneGuidance}${densityGuidance}
|
|
474
|
+
${instructions}${toneGuidance}${densityGuidance}${sourceLang}${bodyLang}
|
|
460
475
|
|
|
461
476
|
Text to analyze:
|
|
462
477
|
---
|
|
@@ -492,7 +507,7 @@ Guidelines:
|
|
|
492
507
|
- Provide comments that ADD VALUE beyond restating the text
|
|
493
508
|
- Focus on explanation, background, or connections to other ideas
|
|
494
509
|
- Avoid obvious or trivial comments
|
|
495
|
-
- Keep comments concise (1-3 sentences typically)${densityGuidance}
|
|
510
|
+
- Keep comments concise (1-3 sentences typically)${densityGuidance}${sourceLang}${bodyLang}
|
|
496
511
|
|
|
497
512
|
Text to analyze:
|
|
498
513
|
---
|
|
@@ -524,15 +539,16 @@ Example format:
|
|
|
524
539
|
* @param density - Optional target number of highlights per 2000 words
|
|
525
540
|
* @returns Formatted prompt string
|
|
526
541
|
*/
|
|
527
|
-
static buildHighlightPrompt(content, instructions, density) {
|
|
542
|
+
static buildHighlightPrompt(content, instructions, density, sourceLanguage) {
|
|
528
543
|
let prompt;
|
|
544
|
+
const sourceLang = sourceLanguageGuidance(sourceLanguage);
|
|
529
545
|
if (instructions) {
|
|
530
546
|
const densityGuidance = density ? `
|
|
531
547
|
|
|
532
548
|
Aim for approximately ${density} highlights per 2000 words of text.` : "";
|
|
533
549
|
prompt = `Identify passages in this text to highlight following these instructions:
|
|
534
550
|
|
|
535
|
-
${instructions}${densityGuidance}
|
|
551
|
+
${instructions}${densityGuidance}${sourceLang}
|
|
536
552
|
|
|
537
553
|
Text to analyze:
|
|
538
554
|
---
|
|
@@ -565,7 +581,7 @@ Guidelines:
|
|
|
565
581
|
- Highlight notable quotes or particularly striking statements
|
|
566
582
|
- Highlight critical decisions, action items, or turning points
|
|
567
583
|
- Select passages that are SIGNIFICANT, not just interesting
|
|
568
|
-
- Avoid trivial or obvious content${densityGuidance}
|
|
584
|
+
- Avoid trivial or obvious content${densityGuidance}${sourceLang}
|
|
569
585
|
|
|
570
586
|
Text to analyze:
|
|
571
587
|
---
|
|
@@ -597,8 +613,10 @@ Example format:
|
|
|
597
613
|
* @param density - Optional target number of assessments per 2000 words
|
|
598
614
|
* @returns Formatted prompt string
|
|
599
615
|
*/
|
|
600
|
-
static buildAssessmentPrompt(content, instructions, tone, density) {
|
|
616
|
+
static buildAssessmentPrompt(content, instructions, tone, density, language, sourceLanguage) {
|
|
601
617
|
let prompt;
|
|
618
|
+
const sourceLang = sourceLanguageGuidance(sourceLanguage);
|
|
619
|
+
const bodyLang = bodyLanguageGuidance(language, "assessments");
|
|
602
620
|
if (instructions) {
|
|
603
621
|
const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
|
|
604
622
|
const densityGuidance = density ? `
|
|
@@ -606,7 +624,7 @@ Example format:
|
|
|
606
624
|
Aim for approximately ${density} assessments per 2000 words of text.` : "";
|
|
607
625
|
prompt = `Assess passages in this text following these instructions:
|
|
608
626
|
|
|
609
|
-
${instructions}${toneGuidance}${densityGuidance}
|
|
627
|
+
${instructions}${toneGuidance}${densityGuidance}${sourceLang}${bodyLang}
|
|
610
628
|
|
|
611
629
|
Text to analyze:
|
|
612
630
|
---
|
|
@@ -642,7 +660,7 @@ Guidelines:
|
|
|
642
660
|
- Assess evidence quality, logical soundness, or practical implications
|
|
643
661
|
- Provide assessments that ADD INSIGHT beyond restating the text
|
|
644
662
|
- Focus on passages where evaluation would help readers form judgments
|
|
645
|
-
- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}
|
|
663
|
+
- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}${sourceLang}${bodyLang}
|
|
646
664
|
|
|
647
665
|
Text to analyze:
|
|
648
666
|
---
|
|
@@ -678,7 +696,8 @@ Example format:
|
|
|
678
696
|
* @param categoryExamples - Example questions/guidance for this category
|
|
679
697
|
* @returns Formatted prompt string
|
|
680
698
|
*/
|
|
681
|
-
static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples) {
|
|
699
|
+
static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples, sourceLanguage) {
|
|
700
|
+
const sourceLang = sourceLanguageGuidance(sourceLanguage);
|
|
682
701
|
const prompt = `You are analyzing a text using the ${schemaName} framework.
|
|
683
702
|
|
|
684
703
|
Schema: ${schemaDescription}
|
|
@@ -697,7 +716,7 @@ Guidelines:
|
|
|
697
716
|
- Look for passages that explicitly fulfill this role
|
|
698
717
|
- Passages can be sentences, paragraphs, or sections
|
|
699
718
|
- Aim for precision - only tag passages that clearly serve this structural role
|
|
700
|
-
- Typical documents have 1-5 instances of each category (some may have 0)
|
|
719
|
+
- Typical documents have 1-5 instances of each category (some may have 0)${sourceLang}
|
|
701
720
|
|
|
702
721
|
Text to analyze:
|
|
703
722
|
---
|
|
@@ -1099,33 +1118,51 @@ var AnnotationDetection = class {
|
|
|
1099
1118
|
return Buffer.concat(chunks).toString("utf-8");
|
|
1100
1119
|
}
|
|
1101
1120
|
/**
|
|
1102
|
-
* Detect comments in content
|
|
1121
|
+
* Detect comments in content.
|
|
1122
|
+
*
|
|
1123
|
+
* `language` is the locale the LLM should write comment text in (annotation
|
|
1124
|
+
* body locale). `sourceLanguage` is the locale of the content being analyzed
|
|
1125
|
+
* (source-resource locale). See `types.ts` "Locale conventions" for the
|
|
1126
|
+
* full discussion.
|
|
1103
1127
|
*/
|
|
1104
|
-
static async detectComments(content, client, instructions, tone, density) {
|
|
1105
|
-
const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density);
|
|
1128
|
+
static async detectComments(content, client, instructions, tone, density, language, sourceLanguage) {
|
|
1129
|
+
const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density, language, sourceLanguage);
|
|
1106
1130
|
const response = await client.generateText(prompt, 3e3, 0.4);
|
|
1107
1131
|
return MotivationParsers.parseComments(response, content);
|
|
1108
1132
|
}
|
|
1109
1133
|
/**
|
|
1110
|
-
* Detect highlights in content
|
|
1134
|
+
* Detect highlights in content.
|
|
1135
|
+
*
|
|
1136
|
+
* Highlights have no body — only `sourceLanguage` (source-resource locale)
|
|
1137
|
+
* applies, used in the prompt so the LLM analyzes non-English source
|
|
1138
|
+
* correctly.
|
|
1111
1139
|
*/
|
|
1112
|
-
static async detectHighlights(content, client, instructions, density) {
|
|
1113
|
-
const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density);
|
|
1140
|
+
static async detectHighlights(content, client, instructions, density, sourceLanguage) {
|
|
1141
|
+
const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density, sourceLanguage);
|
|
1114
1142
|
const response = await client.generateText(prompt, 2e3, 0.3);
|
|
1115
1143
|
return MotivationParsers.parseHighlights(response, content);
|
|
1116
1144
|
}
|
|
1117
1145
|
/**
|
|
1118
|
-
* Detect assessments in content
|
|
1146
|
+
* Detect assessments in content.
|
|
1147
|
+
*
|
|
1148
|
+
* `language` is the locale the LLM should write assessment text in
|
|
1149
|
+
* (annotation body locale). `sourceLanguage` is the locale of the content
|
|
1150
|
+
* being analyzed (source-resource locale).
|
|
1119
1151
|
*/
|
|
1120
|
-
static async detectAssessments(content, client, instructions, tone, density) {
|
|
1121
|
-
const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density);
|
|
1152
|
+
static async detectAssessments(content, client, instructions, tone, density, language, sourceLanguage) {
|
|
1153
|
+
const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density, language, sourceLanguage);
|
|
1122
1154
|
const response = await client.generateText(prompt, 3e3, 0.3);
|
|
1123
1155
|
return MotivationParsers.parseAssessments(response, content);
|
|
1124
1156
|
}
|
|
1125
1157
|
/**
|
|
1126
|
-
* Detect tags in content for a specific category
|
|
1158
|
+
* Detect tags in content for a specific category.
|
|
1159
|
+
*
|
|
1160
|
+
* `sourceLanguage` is the locale of the content being analyzed. Body-locale
|
|
1161
|
+
* (`language`) doesn't influence the tag prompt — categories are schema
|
|
1162
|
+
* identifiers, not LLM-generated text — so it's consumed at the body-stamp
|
|
1163
|
+
* site, not here.
|
|
1127
1164
|
*/
|
|
1128
|
-
static async detectTags(content, client, schemaId, category) {
|
|
1165
|
+
static async detectTags(content, client, schemaId, category, sourceLanguage) {
|
|
1129
1166
|
const schema = getTagSchema(schemaId);
|
|
1130
1167
|
if (!schema) {
|
|
1131
1168
|
throw new Error(`Invalid tag schema: ${schemaId}`);
|
|
@@ -1141,16 +1178,15 @@ var AnnotationDetection = class {
|
|
|
1141
1178
|
schema.description,
|
|
1142
1179
|
schema.domain,
|
|
1143
1180
|
categoryInfo.description,
|
|
1144
|
-
categoryInfo.examples
|
|
1181
|
+
categoryInfo.examples,
|
|
1182
|
+
sourceLanguage
|
|
1145
1183
|
);
|
|
1146
1184
|
const response = await client.generateText(prompt, 4e3, 0.2);
|
|
1147
1185
|
const parsedTags = MotivationParsers.parseTags(response);
|
|
1148
1186
|
return MotivationParsers.validateTagOffsets(parsedTags, content, category);
|
|
1149
1187
|
}
|
|
1150
1188
|
};
|
|
1151
|
-
|
|
1152
|
-
// src/workers/detection/entity-extractor.ts
|
|
1153
|
-
async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger) {
|
|
1189
|
+
async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger, sourceLanguage) {
|
|
1154
1190
|
const entityTypesDescription = entityTypes.map((et) => {
|
|
1155
1191
|
if (typeof et === "string") {
|
|
1156
1192
|
return et;
|
|
@@ -1179,8 +1215,11 @@ Examples:
|
|
|
1179
1215
|
` : `
|
|
1180
1216
|
Find direct mentions only (names, proper nouns). Do not include pronouns or descriptive references.
|
|
1181
1217
|
`;
|
|
1218
|
+
const sourceLangGuidance = sourceLanguage ? `
|
|
1219
|
+
Source text language: ${getLocaleEnglishName(sourceLanguage) || sourceLanguage}.
|
|
1220
|
+
` : "";
|
|
1182
1221
|
const prompt = `Identify entity references in the following text. Look for mentions of: ${entityTypesDescription}.
|
|
1183
|
-
${descriptiveReferenceGuidance}
|
|
1222
|
+
${descriptiveReferenceGuidance}${sourceLangGuidance}
|
|
1184
1223
|
Text to analyze:
|
|
1185
1224
|
"""
|
|
1186
1225
|
${exact}
|
|
@@ -1374,12 +1413,13 @@ Example output:
|
|
|
1374
1413
|
function getLanguageName(locale) {
|
|
1375
1414
|
return getLocaleEnglishName(locale) || locale;
|
|
1376
1415
|
}
|
|
1377
|
-
async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger) {
|
|
1416
|
+
async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger, sourceLanguage) {
|
|
1378
1417
|
logger?.debug("Generating resource from topic", {
|
|
1379
1418
|
topicPreview: topic.substring(0, 100),
|
|
1380
1419
|
entityTypes,
|
|
1381
1420
|
hasUserPrompt: !!userPrompt,
|
|
1382
1421
|
locale,
|
|
1422
|
+
sourceLanguage,
|
|
1383
1423
|
hasContext: !!context,
|
|
1384
1424
|
temperature,
|
|
1385
1425
|
maxTokens
|
|
@@ -1389,6 +1429,9 @@ async function generateResourceFromTopic(topic, entityTypes, client, userPrompt,
|
|
|
1389
1429
|
const languageInstruction = locale && locale !== "en" ? `
|
|
1390
1430
|
|
|
1391
1431
|
IMPORTANT: Write the entire resource in ${getLanguageName(locale)}.` : "";
|
|
1432
|
+
const sourceLanguageInstruction = sourceLanguage ? `
|
|
1433
|
+
|
|
1434
|
+
The source resource and embedded context are in ${getLanguageName(sourceLanguage)}.` : "";
|
|
1392
1435
|
let annotationSection = "";
|
|
1393
1436
|
if (context) {
|
|
1394
1437
|
const parts = [];
|
|
@@ -1450,7 +1493,7 @@ ${parts.join("\n")}`;
|
|
|
1450
1493
|
const structureGuidance = finalMaxTokens >= 1e3 ? "organized into titled sections (## Section) with well-structured paragraphs" : "organized into well-structured paragraphs";
|
|
1451
1494
|
const prompt = `Generate a concise, informative resource about "${topic}".
|
|
1452
1495
|
${entityTypes.length > 0 ? `Focus on these entity types: ${entityTypes.join(", ")}.` : ""}
|
|
1453
|
-
${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${languageInstruction}
|
|
1496
|
+
${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${sourceLanguageInstruction}${languageInstruction}
|
|
1454
1497
|
|
|
1455
1498
|
Requirements:
|
|
1456
1499
|
- Start with a clear heading (# Title)
|
|
@@ -1527,7 +1570,8 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
|
|
|
1527
1570
|
content,
|
|
1528
1571
|
inferenceClient,
|
|
1529
1572
|
params.instructions,
|
|
1530
|
-
params.density
|
|
1573
|
+
params.density,
|
|
1574
|
+
params.sourceLanguage
|
|
1531
1575
|
);
|
|
1532
1576
|
onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
|
|
1533
1577
|
const annotations = highlights.map(
|
|
@@ -1547,16 +1591,19 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
|
|
|
1547
1591
|
inferenceClient,
|
|
1548
1592
|
params.instructions,
|
|
1549
1593
|
params.tone,
|
|
1550
|
-
params.density
|
|
1594
|
+
params.density,
|
|
1595
|
+
params.language,
|
|
1596
|
+
params.sourceLanguage
|
|
1551
1597
|
);
|
|
1552
1598
|
onProgress(60, `Creating ${comments.length} annotations...`, "creating");
|
|
1599
|
+
const bodyLanguage = params.language ?? "en";
|
|
1553
1600
|
const annotations = comments.map(
|
|
1554
1601
|
(c) => (
|
|
1555
1602
|
// Match the pre-#651 CommentAnnotationWorker: include format and
|
|
1556
1603
|
// language on the body TextualBody. Optional in the schema, but
|
|
1557
1604
|
// consumers that do language-aware rendering rely on them.
|
|
1558
1605
|
buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
|
|
1559
|
-
{ type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language:
|
|
1606
|
+
{ type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
|
|
1560
1607
|
])
|
|
1561
1608
|
)
|
|
1562
1609
|
);
|
|
@@ -1574,9 +1621,12 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
1574
1621
|
inferenceClient,
|
|
1575
1622
|
params.instructions,
|
|
1576
1623
|
params.tone,
|
|
1577
|
-
params.density
|
|
1624
|
+
params.density,
|
|
1625
|
+
params.language,
|
|
1626
|
+
params.sourceLanguage
|
|
1578
1627
|
);
|
|
1579
1628
|
onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
|
|
1629
|
+
const bodyLanguage = params.language ?? "en";
|
|
1580
1630
|
const annotations = assessments.map(
|
|
1581
1631
|
(a) => (
|
|
1582
1632
|
// Single-object body with purpose aligned to motivation, matching the
|
|
@@ -1590,7 +1640,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
1590
1640
|
value: a.assessment,
|
|
1591
1641
|
purpose: "assessing",
|
|
1592
1642
|
format: "text/plain",
|
|
1593
|
-
language:
|
|
1643
|
+
language: bodyLanguage
|
|
1594
1644
|
})
|
|
1595
1645
|
)
|
|
1596
1646
|
);
|
|
@@ -1609,6 +1659,7 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
|
|
|
1609
1659
|
let errors = 0;
|
|
1610
1660
|
const allAnnotations = [];
|
|
1611
1661
|
onProgress(10, "Loading resource...", "analyzing", { requestParams });
|
|
1662
|
+
const bodyLanguage = params.language ?? "en";
|
|
1612
1663
|
for (let i = 0; i < entityTypeNames.length; i++) {
|
|
1613
1664
|
const entityTypeName = entityTypeNames[i];
|
|
1614
1665
|
if (!entityTypeName) continue;
|
|
@@ -1627,11 +1678,14 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
|
|
|
1627
1678
|
[entityTypeName],
|
|
1628
1679
|
inferenceClient,
|
|
1629
1680
|
params.includeDescriptiveReferences ?? false,
|
|
1630
|
-
logger
|
|
1681
|
+
logger,
|
|
1682
|
+
params.sourceLanguage
|
|
1631
1683
|
);
|
|
1632
1684
|
totalFound += extractedEntities.length;
|
|
1633
1685
|
completedEntityTypes.push({ entityType: entityTypeName, foundCount: extractedEntities.length });
|
|
1634
|
-
const unresolvedBody = [
|
|
1686
|
+
const unresolvedBody = [
|
|
1687
|
+
{ type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
|
|
1688
|
+
];
|
|
1635
1689
|
for (const entity of extractedEntities) {
|
|
1636
1690
|
try {
|
|
1637
1691
|
const validated = validateAndCorrectOffsets(content, entity.startOffset, entity.endOffset, entity.exact);
|
|
@@ -1665,18 +1719,20 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
1665
1719
|
content,
|
|
1666
1720
|
inferenceClient,
|
|
1667
1721
|
params.schemaId,
|
|
1668
|
-
category
|
|
1722
|
+
category,
|
|
1723
|
+
params.sourceLanguage
|
|
1669
1724
|
);
|
|
1670
1725
|
allTags.push(...categoryTags);
|
|
1671
1726
|
}
|
|
1672
1727
|
const tags = allTags;
|
|
1673
1728
|
onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
|
|
1729
|
+
const bodyLanguage = params.language ?? "en";
|
|
1674
1730
|
const byCategory = {};
|
|
1675
1731
|
const annotations = tags.map((t) => {
|
|
1676
1732
|
const category = t.category ?? "unknown";
|
|
1677
1733
|
byCategory[category] = (byCategory[category] ?? 0) + 1;
|
|
1678
1734
|
return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
|
|
1679
|
-
{ type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language:
|
|
1735
|
+
{ type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
|
|
1680
1736
|
{ type: "TextualBody", value: params.schemaId, purpose: "classifying", format: "text/plain" }
|
|
1681
1737
|
]);
|
|
1682
1738
|
});
|
|
@@ -1699,7 +1755,10 @@ async function processGenerationJob(inferenceClient, params, onProgress) {
|
|
|
1699
1755
|
params.language,
|
|
1700
1756
|
params.context,
|
|
1701
1757
|
params.temperature,
|
|
1702
|
-
params.maxTokens
|
|
1758
|
+
params.maxTokens,
|
|
1759
|
+
void 0,
|
|
1760
|
+
// logger
|
|
1761
|
+
params.sourceLanguage
|
|
1703
1762
|
);
|
|
1704
1763
|
onProgress(85, "Creating resource...", "creating");
|
|
1705
1764
|
return {
|