@semiont/jobs 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.d.ts +92 -14
- package/dist/index.js +107 -201
- package/dist/index.js.map +1 -1
- package/dist/worker-main.js +273 -310
- package/dist/worker-main.js.map +1 -1
- package/package.json +2 -8
- package/dist/smelter-main.d.ts +0 -2
- package/dist/smelter-main.js +0 -10076
- package/dist/smelter-main.js.map +0 -1
package/dist/index.js
CHANGED
@@ -435,8 +435,21 @@ function isFailedJob(job) {
 function isCancelledJob(job) {
   return job.status === "cancelled";
 }
+function languageName(tag) {
+  return getLocaleEnglishName(tag) || tag;
+}
+function sourceLanguageGuidance(sourceLanguage) {
+  if (!sourceLanguage) return "";
+  return `
+
+Source text language: ${languageName(sourceLanguage)}.`;
+}
+function bodyLanguageGuidance(language, kind) {
+  if (!language || language === "en") return "";
+  return `
 
-
+IMPORTANT: Write your ${kind} in ${languageName(language)}.`;
+}
 var MotivationPrompts = class {
   /**
    * Build a prompt for detecting comment-worthy passages
@@ -447,8 +460,10 @@ var MotivationPrompts = class {
    * @param density - Optional target number of comments per 2000 words
    * @returns Formatted prompt string
    */
-  static buildCommentPrompt(content, instructions, tone, density) {
+  static buildCommentPrompt(content, instructions, tone, density, language, sourceLanguage) {
     let prompt;
+    const sourceLang = sourceLanguageGuidance(sourceLanguage);
+    const bodyLang = bodyLanguageGuidance(language, "comments");
     if (instructions) {
       const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
       const densityGuidance = density ? `
@@ -456,7 +471,7 @@ var MotivationPrompts = class {
 Aim for approximately ${density} comments per 2000 words of text.` : "";
       prompt = `Add comments to passages in this text following these instructions:
 
-${instructions}${toneGuidance}${densityGuidance}
+${instructions}${toneGuidance}${densityGuidance}${sourceLang}${bodyLang}
 
 Text to analyze:
 ---
@@ -492,7 +507,7 @@ Guidelines:
 - Provide comments that ADD VALUE beyond restating the text
 - Focus on explanation, background, or connections to other ideas
 - Avoid obvious or trivial comments
-- Keep comments concise (1-3 sentences typically)${densityGuidance}
+- Keep comments concise (1-3 sentences typically)${densityGuidance}${sourceLang}${bodyLang}
 
 Text to analyze:
 ---
@@ -524,15 +539,16 @@ Example format:
    * @param density - Optional target number of highlights per 2000 words
    * @returns Formatted prompt string
    */
-  static buildHighlightPrompt(content, instructions, density) {
+  static buildHighlightPrompt(content, instructions, density, sourceLanguage) {
     let prompt;
+    const sourceLang = sourceLanguageGuidance(sourceLanguage);
     if (instructions) {
       const densityGuidance = density ? `
 
 Aim for approximately ${density} highlights per 2000 words of text.` : "";
       prompt = `Identify passages in this text to highlight following these instructions:
 
-${instructions}${densityGuidance}
+${instructions}${densityGuidance}${sourceLang}
 
 Text to analyze:
 ---
@@ -565,7 +581,7 @@ Guidelines:
 - Highlight notable quotes or particularly striking statements
 - Highlight critical decisions, action items, or turning points
 - Select passages that are SIGNIFICANT, not just interesting
-- Avoid trivial or obvious content${densityGuidance}
+- Avoid trivial or obvious content${densityGuidance}${sourceLang}
 
 Text to analyze:
 ---
@@ -597,8 +613,10 @@ Example format:
    * @param density - Optional target number of assessments per 2000 words
    * @returns Formatted prompt string
    */
-  static buildAssessmentPrompt(content, instructions, tone, density) {
+  static buildAssessmentPrompt(content, instructions, tone, density, language, sourceLanguage) {
     let prompt;
+    const sourceLang = sourceLanguageGuidance(sourceLanguage);
+    const bodyLang = bodyLanguageGuidance(language, "assessments");
     if (instructions) {
       const toneGuidance = tone ? ` Use a ${tone} tone.` : "";
       const densityGuidance = density ? `
@@ -606,7 +624,7 @@ Example format:
 Aim for approximately ${density} assessments per 2000 words of text.` : "";
       prompt = `Assess passages in this text following these instructions:
 
-${instructions}${toneGuidance}${densityGuidance}
+${instructions}${toneGuidance}${densityGuidance}${sourceLang}${bodyLang}
 
 Text to analyze:
 ---
@@ -642,7 +660,7 @@ Guidelines:
 - Assess evidence quality, logical soundness, or practical implications
 - Provide assessments that ADD INSIGHT beyond restating the text
 - Focus on passages where evaluation would help readers form judgments
-- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}
+- Keep assessments concise yet substantive (1-3 sentences typically)${densityGuidance}${sourceLang}${bodyLang}
 
 Text to analyze:
 ---
@@ -678,7 +696,8 @@ Example format:
    * @param categoryExamples - Example questions/guidance for this category
    * @returns Formatted prompt string
    */
-  static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples) {
+  static buildTagPrompt(content, category, schemaName, schemaDescription, schemaDomain, categoryDescription, categoryExamples, sourceLanguage) {
+    const sourceLang = sourceLanguageGuidance(sourceLanguage);
     const prompt = `You are analyzing a text using the ${schemaName} framework.
 
 Schema: ${schemaDescription}
@@ -697,7 +716,7 @@ Guidelines:
 - Look for passages that explicitly fulfill this role
 - Passages can be sentences, paragraphs, or sections
 - Aim for precision - only tag passages that clearly serve this structural role
-- Typical documents have 1-5 instances of each category (some may have 0)
+- Typical documents have 1-5 instances of each category (some may have 0)${sourceLang}
 
 Text to analyze:
 ---
@@ -928,159 +947,6 @@ var MotivationParsers = class {
   }
 };
 
-// ../ontology/dist/index.js
-var TAG_SCHEMAS = {
-  "legal-irac": {
-    id: "legal-irac",
-    name: "Legal Analysis (IRAC)",
-    description: "Issue, Rule, Application, Conclusion framework for legal reasoning",
-    domain: "legal",
-    tags: [
-      {
-        name: "Issue",
-        description: "The legal question or problem to be resolved",
-        examples: [
-          "What is the central legal question?",
-          "What must the court decide?",
-          "What is the dispute about?"
-        ]
-      },
-      {
-        name: "Rule",
-        description: "The relevant law, statute, or legal principle",
-        examples: [
-          "What law applies?",
-          "What is the legal standard?",
-          "What statute governs this case?"
-        ]
-      },
-      {
-        name: "Application",
-        description: "How the rule applies to the specific facts",
-        examples: [
-          "How does the law apply to these facts?",
-          "Analysis of the case",
-          "How do the facts satisfy the legal standard?"
-        ]
-      },
-      {
-        name: "Conclusion",
-        description: "The resolution or outcome based on the analysis",
-        examples: [
-          "What is the court's decision?",
-          "What is the final judgment?",
-          "What is the holding?"
-        ]
-      }
-    ]
-  },
-  "scientific-imrad": {
-    id: "scientific-imrad",
-    name: "Scientific Paper (IMRAD)",
-    description: "Introduction, Methods, Results, Discussion structure for research papers",
-    domain: "scientific",
-    tags: [
-      {
-        name: "Introduction",
-        description: "Background, context, and research question",
-        examples: [
-          "What is the research question?",
-          "Why is this important?",
-          "What is the hypothesis?"
-        ]
-      },
-      {
-        name: "Methods",
-        description: "Experimental design and procedures",
-        examples: [
-          "How was the study conducted?",
-          "What methods were used?",
-          "What was the experimental design?"
-        ]
-      },
-      {
-        name: "Results",
-        description: "Findings and observations",
-        examples: [
-          "What did the study find?",
-          "What are the data?",
-          "What were the observations?"
-        ]
-      },
-      {
-        name: "Discussion",
-        description: "Interpretation and implications of results",
-        examples: [
-          "What do the results mean?",
-          "What are the implications?",
-          "How do these findings relate to prior work?"
-        ]
-      }
-    ]
-  },
-  "argument-toulmin": {
-    id: "argument-toulmin",
-    name: "Argument Structure (Toulmin)",
-    description: "Claim, Evidence, Warrant, Counterargument, Rebuttal framework for argumentation",
-    domain: "general",
-    tags: [
-      {
-        name: "Claim",
-        description: "The main assertion or thesis",
-        examples: [
-          "What is being argued?",
-          "What is the main point?",
-          "What position is being taken?"
-        ]
-      },
-      {
-        name: "Evidence",
-        description: "Data or facts supporting the claim",
-        examples: [
-          "What supports this claim?",
-          "What are the facts?",
-          "What data is provided?"
-        ]
-      },
-      {
-        name: "Warrant",
-        description: "Reasoning connecting evidence to claim",
-        examples: [
-          "Why does this evidence support the claim?",
-          "What is the logic?",
-          "How does this reasoning work?"
-        ]
-      },
-      {
-        name: "Counterargument",
-        description: "Opposing viewpoints or objections",
-        examples: [
-          "What are the objections?",
-          "What do critics say?",
-          "What are alternative views?"
-        ]
-      },
-      {
-        name: "Rebuttal",
-        description: "Response to counterarguments",
-        examples: [
-          "How is the objection addressed?",
-          "Why is the counterargument wrong?",
-          "How is the criticism answered?"
-        ]
-      }
-    ]
-  }
-};
-function getTagSchema(schemaId) {
-  return TAG_SCHEMAS[schemaId] || null;
-}
-function getSchemaCategory(schemaId, categoryName) {
-  const schema = getTagSchema(schemaId);
-  if (!schema) return null;
-  return schema.tags.find((tag) => tag.name === categoryName) || null;
-}
-
 // src/workers/annotation-detection.ts
 var AnnotationDetection = class {
   /**
@@ -1099,40 +965,58 @@ var AnnotationDetection = class {
     return Buffer.concat(chunks).toString("utf-8");
   }
   /**
-   * Detect comments in content
+   * Detect comments in content.
+   *
+   * `language` is the locale the LLM should write comment text in (annotation
+   * body locale). `sourceLanguage` is the locale of the content being analyzed
+   * (source-resource locale). See `types.ts` "Locale conventions" for the
+   * full discussion.
    */
-  static async detectComments(content, client, instructions, tone, density) {
-    const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density);
+  static async detectComments(content, client, instructions, tone, density, language, sourceLanguage) {
+    const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density, language, sourceLanguage);
     const response = await client.generateText(prompt, 3e3, 0.4);
     return MotivationParsers.parseComments(response, content);
   }
   /**
-   * Detect highlights in content
+   * Detect highlights in content.
+   *
+   * Highlights have no body — only `sourceLanguage` (source-resource locale)
+   * applies, used in the prompt so the LLM analyzes non-English source
+   * correctly.
    */
-  static async detectHighlights(content, client, instructions, density) {
-    const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density);
+  static async detectHighlights(content, client, instructions, density, sourceLanguage) {
+    const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density, sourceLanguage);
     const response = await client.generateText(prompt, 2e3, 0.3);
     return MotivationParsers.parseHighlights(response, content);
   }
   /**
-   * Detect assessments in content
+   * Detect assessments in content.
+   *
+   * `language` is the locale the LLM should write assessment text in
+   * (annotation body locale). `sourceLanguage` is the locale of the content
+   * being analyzed (source-resource locale).
   */
-  static async detectAssessments(content, client, instructions, tone, density) {
-    const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density);
+  static async detectAssessments(content, client, instructions, tone, density, language, sourceLanguage) {
+    const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density, language, sourceLanguage);
     const response = await client.generateText(prompt, 3e3, 0.3);
     return MotivationParsers.parseAssessments(response, content);
   }
   /**
-   * Detect tags in content for a specific category
+   * Detect tags in content for a specific category.
+   *
+   * The full `TagSchema` is supplied by the dispatcher (resolved against
+   * the per-KB tag-schema projection at job-creation time) so the worker
+   * is independent of the registry.
+   *
+   * `sourceLanguage` is the locale of the content being analyzed. Body-locale
+   * (`language`) doesn't influence the tag prompt — categories are schema
+   * identifiers, not LLM-generated text — so it's consumed at the body-stamp
+   * site, not here.
   */
-  static async detectTags(content, client,
-    const
-    if (!schema) {
-      throw new Error(`Invalid tag schema: ${schemaId}`);
-    }
-    const categoryInfo = getSchemaCategory(schemaId, category);
+  static async detectTags(content, client, schema, category, sourceLanguage) {
+    const categoryInfo = schema.tags.find((t) => t.name === category);
     if (!categoryInfo) {
-      throw new Error(`Invalid category "${category}" for schema ${
+      throw new Error(`Invalid category "${category}" for schema ${schema.id}`);
     }
     const prompt = MotivationPrompts.buildTagPrompt(
       content,
@@ -1141,16 +1025,15 @@ var AnnotationDetection = class {
       schema.description,
       schema.domain,
       categoryInfo.description,
-      categoryInfo.examples
+      categoryInfo.examples,
+      sourceLanguage
     );
     const response = await client.generateText(prompt, 4e3, 0.2);
     const parsedTags = MotivationParsers.parseTags(response);
     return MotivationParsers.validateTagOffsets(parsedTags, content, category);
   }
 };
-
-// src/workers/detection/entity-extractor.ts
-async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger) {
+async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences = false, logger, sourceLanguage) {
   const entityTypesDescription = entityTypes.map((et) => {
     if (typeof et === "string") {
       return et;
@@ -1179,8 +1062,11 @@ Examples:
 ` : `
 Find direct mentions only (names, proper nouns). Do not include pronouns or descriptive references.
 `;
+  const sourceLangGuidance = sourceLanguage ? `
+Source text language: ${getLocaleEnglishName(sourceLanguage) || sourceLanguage}.
+` : "";
   const prompt = `Identify entity references in the following text. Look for mentions of: ${entityTypesDescription}.
-${descriptiveReferenceGuidance}
+${descriptiveReferenceGuidance}${sourceLangGuidance}
 Text to analyze:
 """
 ${exact}
@@ -1374,12 +1260,13 @@ Example output:
 function getLanguageName(locale) {
   return getLocaleEnglishName(locale) || locale;
 }
-async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger) {
+async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens, logger, sourceLanguage) {
   logger?.debug("Generating resource from topic", {
     topicPreview: topic.substring(0, 100),
     entityTypes,
     hasUserPrompt: !!userPrompt,
     locale,
+    sourceLanguage,
     hasContext: !!context,
     temperature,
     maxTokens
@@ -1389,6 +1276,9 @@ async function generateResourceFromTopic(topic, entityTypes, client, userPrompt,
   const languageInstruction = locale && locale !== "en" ? `
 
 IMPORTANT: Write the entire resource in ${getLanguageName(locale)}.` : "";
+  const sourceLanguageInstruction = sourceLanguage ? `
+
+The source resource and embedded context are in ${getLanguageName(sourceLanguage)}.` : "";
   let annotationSection = "";
   if (context) {
     const parts = [];
@@ -1450,7 +1340,7 @@ ${parts.join("\n")}`;
   const structureGuidance = finalMaxTokens >= 1e3 ? "organized into titled sections (## Section) with well-structured paragraphs" : "organized into well-structured paragraphs";
   const prompt = `Generate a concise, informative resource about "${topic}".
 ${entityTypes.length > 0 ? `Focus on these entity types: ${entityTypes.join(", ")}.` : ""}
-${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${languageInstruction}
+${userPrompt ? `Additional context: ${userPrompt}` : ""}${annotationSection}${contextSection}${graphContextSection}${sourceLanguageInstruction}${languageInstruction}
 
 Requirements:
 - Start with a clear heading (# Title)
@@ -1527,7 +1417,8 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
     content,
     inferenceClient,
     params.instructions,
-    params.density
+    params.density,
+    params.sourceLanguage
   );
   onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
   const annotations = highlights.map(
@@ -1547,16 +1438,19 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
     inferenceClient,
     params.instructions,
     params.tone,
-    params.density
+    params.density,
+    params.language,
+    params.sourceLanguage
   );
   onProgress(60, `Creating ${comments.length} annotations...`, "creating");
+  const bodyLanguage = params.language ?? "en";
   const annotations = comments.map(
     (c) => (
       // Match the pre-#651 CommentAnnotationWorker: include format and
      // language on the body TextualBody. Optional in the schema, but
      // consumers that do language-aware rendering rely on them.
       buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
-        { type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language:
+        { type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
       ])
     )
   );
@@ -1574,9 +1468,12 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
     inferenceClient,
     params.instructions,
     params.tone,
-    params.density
+    params.density,
+    params.language,
+    params.sourceLanguage
   );
   onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
+  const bodyLanguage = params.language ?? "en";
   const annotations = assessments.map(
     (a) => (
       // Single-object body with purpose aligned to motivation, matching the
@@ -1590,7 +1487,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
         value: a.assessment,
         purpose: "assessing",
         format: "text/plain",
-        language:
+        language: bodyLanguage
       })
     )
   );
@@ -1609,6 +1506,7 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
   let errors = 0;
   const allAnnotations = [];
   onProgress(10, "Loading resource...", "analyzing", { requestParams });
+  const bodyLanguage = params.language ?? "en";
   for (let i = 0; i < entityTypeNames.length; i++) {
     const entityTypeName = entityTypeNames[i];
     if (!entityTypeName) continue;
@@ -1627,11 +1525,14 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
       [entityTypeName],
       inferenceClient,
       params.includeDescriptiveReferences ?? false,
-      logger
+      logger,
+      params.sourceLanguage
     );
     totalFound += extractedEntities.length;
     completedEntityTypes.push({ entityType: entityTypeName, foundCount: extractedEntities.length });
-    const unresolvedBody = [
+    const unresolvedBody = [
+      { type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
+    ];
     for (const entity of extractedEntities) {
       try {
         const validated = validateAndCorrectOffsets(content, entity.startOffset, entity.endOffset, entity.exact);
@@ -1664,20 +1565,22 @@ async function processTagJob(content, inferenceClient, params, userId, generator
     const categoryTags = await AnnotationDetection.detectTags(
       content,
       inferenceClient,
-      params.
-      category
+      params.schema,
+      category,
+      params.sourceLanguage
    );
     allTags.push(...categoryTags);
   }
   const tags = allTags;
   onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
+  const bodyLanguage = params.language ?? "en";
   const byCategory = {};
   const annotations = tags.map((t) => {
     const category = t.category ?? "unknown";
     byCategory[category] = (byCategory[category] ?? 0) + 1;
     return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
-      { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language:
-      { type: "TextualBody", value: params.
+      { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
+      { type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
     ]);
   });
   onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
@@ -1699,7 +1602,10 @@ async function processGenerationJob(inferenceClient, params, onProgress) {
     params.language,
     params.context,
     params.temperature,
-    params.maxTokens
+    params.maxTokens,
+    void 0,
+    // logger
+    params.sourceLanguage
   );
   onProgress(85, "Creating resource...", "creating");
   return {
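
A minimal usage sketch of the locale-aware detection API introduced in this version. The detectComments signature and the two guidance strings it injects ("Source text language: ..." and "IMPORTANT: Write your comments in ...") are taken from the diff above; the InferenceClient shape is inferred from the client.generateText(prompt, maxTokens, temperature) calls, and the assumption that AnnotationDetection is exported from the package entry point is illustrative only.

// Hypothetical sketch, not taken from the package documentation.
import { AnnotationDetection } from "@semiont/jobs";

// Inferred from the client.generateText(prompt, 3e3, 0.4) calls shown in the diff.
interface InferenceClient {
  generateText(prompt: string, maxTokens: number, temperature: number): Promise<string>;
}

async function annotateGermanArticle(content: string, client: InferenceClient) {
  // sourceLanguage describes the text being analyzed; language is the locale
  // the comment bodies should be written in (workers default it to "en").
  return AnnotationDetection.detectComments(
    content,
    client,
    undefined, // instructions
    undefined, // tone
    undefined, // density
    "de",      // language: comment bodies written in German
    "de"       // sourceLanguage: the analyzed text is German
  );
}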