@semiont/jobs 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/fs-job-queue.d.ts +79 -0
- package/dist/fs-job-queue.d.ts.map +1 -0
- package/dist/index.d.ts +20 -623
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +74 -218
- package/dist/index.js.map +1 -1
- package/dist/job-claim-adapter.d.ts +76 -0
- package/dist/job-claim-adapter.d.ts.map +1 -0
- package/dist/job-queue-interface.d.ts +19 -0
- package/dist/job-queue-interface.d.ts.map +1 -0
- package/dist/job-queue-state-unit.d.ts +26 -0
- package/dist/job-queue-state-unit.d.ts.map +1 -0
- package/dist/job-worker.d.ts +67 -0
- package/dist/job-worker.d.ts.map +1 -0
- package/dist/processors.d.ts +41 -0
- package/dist/processors.d.ts.map +1 -0
- package/dist/types.d.ts +319 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/worker-main.d.ts +22 -2
- package/dist/worker-main.d.ts.map +1 -0
- package/dist/worker-main.js +175 -276
- package/dist/worker-main.js.map +1 -1
- package/dist/worker-process.d.ts +47 -0
- package/dist/worker-process.d.ts.map +1 -0
- package/dist/workers/annotation-detection.d.ts +61 -0
- package/dist/workers/annotation-detection.d.ts.map +1 -0
- package/dist/workers/detection/entity-extractor.d.ts +42 -0
- package/dist/workers/detection/entity-extractor.d.ts.map +1 -0
- package/dist/workers/detection/motivation-parsers.d.ts +116 -0
- package/dist/workers/detection/motivation-parsers.d.ts.map +1 -0
- package/dist/workers/detection/motivation-prompts.d.ts +57 -0
- package/dist/workers/detection/motivation-prompts.d.ts.map +1 -0
- package/dist/workers/generation/resource-generation.d.ts +23 -0
- package/dist/workers/generation/resource-generation.d.ts.map +1 -0
- package/package.json +3 -3
package/dist/worker-main.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { createTomlConfigLoader, baseUrl, RESOURCE_BROADCAST_TYPES, resourceId, validateAndCorrectOffsets, didToAgent, getLocaleEnglishName } from '@semiont/core';
|
|
1
|
+
import { createTomlConfigLoader, softwareToAgent, baseUrl, RESOURCE_BROADCAST_TYPES, resourceId, validateAndCorrectOffsets, didToAgent, getLocaleEnglishName } from '@semiont/core';
|
|
2
2
|
import { deriveStorageUri } from '@semiont/content';
|
|
3
3
|
import { withSpan, SpanKind, recordJobOutcome } from '@semiont/observability';
|
|
4
4
|
import { generateAnnotationId } from '@semiont/event-sourcing';
|
|
@@ -6388,9 +6388,9 @@ var require_groupBy = __commonJS({
|
|
|
6388
6388
|
} else {
|
|
6389
6389
|
duration = elementOrOptions.duration, element = elementOrOptions.element, connector = elementOrOptions.connector;
|
|
6390
6390
|
}
|
|
6391
|
-
var
|
|
6391
|
+
var groups2 = /* @__PURE__ */ new Map();
|
|
6392
6392
|
var notify = function(cb) {
|
|
6393
|
-
|
|
6393
|
+
groups2.forEach(cb);
|
|
6394
6394
|
cb(subscriber);
|
|
6395
6395
|
};
|
|
6396
6396
|
var handleError = function(err) {
|
|
@@ -6403,9 +6403,9 @@ var require_groupBy = __commonJS({
|
|
|
6403
6403
|
var groupBySourceSubscriber = new OperatorSubscriber_1.OperatorSubscriber(subscriber, function(value) {
|
|
6404
6404
|
try {
|
|
6405
6405
|
var key_1 = keySelector(value);
|
|
6406
|
-
var group_1 =
|
|
6406
|
+
var group_1 = groups2.get(key_1);
|
|
6407
6407
|
if (!group_1) {
|
|
6408
|
-
|
|
6408
|
+
groups2.set(key_1, group_1 = connector ? connector() : new Subject_1.Subject());
|
|
6409
6409
|
var grouped = createGroupedObservable(key_1, group_1);
|
|
6410
6410
|
subscriber.next(grouped);
|
|
6411
6411
|
if (duration) {
|
|
@@ -6413,7 +6413,7 @@ var require_groupBy = __commonJS({
|
|
|
6413
6413
|
group_1.complete();
|
|
6414
6414
|
durationSubscriber_1 === null || durationSubscriber_1 === void 0 ? void 0 : durationSubscriber_1.unsubscribe();
|
|
6415
6415
|
}, void 0, void 0, function() {
|
|
6416
|
-
return
|
|
6416
|
+
return groups2.delete(key_1);
|
|
6417
6417
|
});
|
|
6418
6418
|
groupBySourceSubscriber.add(innerFrom_1.innerFrom(duration(grouped)).subscribe(durationSubscriber_1));
|
|
6419
6419
|
}
|
|
@@ -6427,7 +6427,7 @@ var require_groupBy = __commonJS({
|
|
|
6427
6427
|
return consumer.complete();
|
|
6428
6428
|
});
|
|
6429
6429
|
}, handleError, function() {
|
|
6430
|
-
return
|
|
6430
|
+
return groups2.clear();
|
|
6431
6431
|
}, function() {
|
|
6432
6432
|
teardownAttempted = true;
|
|
6433
6433
|
return activeGroups === 0;
|
|
@@ -9804,159 +9804,6 @@ var MotivationParsers = class {
|
|
|
9804
9804
|
}
|
|
9805
9805
|
};
|
|
9806
9806
|
|
|
9807
|
-
// ../ontology/dist/index.js
|
|
9808
|
-
var TAG_SCHEMAS = {
|
|
9809
|
-
"legal-irac": {
|
|
9810
|
-
id: "legal-irac",
|
|
9811
|
-
name: "Legal Analysis (IRAC)",
|
|
9812
|
-
description: "Issue, Rule, Application, Conclusion framework for legal reasoning",
|
|
9813
|
-
domain: "legal",
|
|
9814
|
-
tags: [
|
|
9815
|
-
{
|
|
9816
|
-
name: "Issue",
|
|
9817
|
-
description: "The legal question or problem to be resolved",
|
|
9818
|
-
examples: [
|
|
9819
|
-
"What is the central legal question?",
|
|
9820
|
-
"What must the court decide?",
|
|
9821
|
-
"What is the dispute about?"
|
|
9822
|
-
]
|
|
9823
|
-
},
|
|
9824
|
-
{
|
|
9825
|
-
name: "Rule",
|
|
9826
|
-
description: "The relevant law, statute, or legal principle",
|
|
9827
|
-
examples: [
|
|
9828
|
-
"What law applies?",
|
|
9829
|
-
"What is the legal standard?",
|
|
9830
|
-
"What statute governs this case?"
|
|
9831
|
-
]
|
|
9832
|
-
},
|
|
9833
|
-
{
|
|
9834
|
-
name: "Application",
|
|
9835
|
-
description: "How the rule applies to the specific facts",
|
|
9836
|
-
examples: [
|
|
9837
|
-
"How does the law apply to these facts?",
|
|
9838
|
-
"Analysis of the case",
|
|
9839
|
-
"How do the facts satisfy the legal standard?"
|
|
9840
|
-
]
|
|
9841
|
-
},
|
|
9842
|
-
{
|
|
9843
|
-
name: "Conclusion",
|
|
9844
|
-
description: "The resolution or outcome based on the analysis",
|
|
9845
|
-
examples: [
|
|
9846
|
-
"What is the court's decision?",
|
|
9847
|
-
"What is the final judgment?",
|
|
9848
|
-
"What is the holding?"
|
|
9849
|
-
]
|
|
9850
|
-
}
|
|
9851
|
-
]
|
|
9852
|
-
},
|
|
9853
|
-
"scientific-imrad": {
|
|
9854
|
-
id: "scientific-imrad",
|
|
9855
|
-
name: "Scientific Paper (IMRAD)",
|
|
9856
|
-
description: "Introduction, Methods, Results, Discussion structure for research papers",
|
|
9857
|
-
domain: "scientific",
|
|
9858
|
-
tags: [
|
|
9859
|
-
{
|
|
9860
|
-
name: "Introduction",
|
|
9861
|
-
description: "Background, context, and research question",
|
|
9862
|
-
examples: [
|
|
9863
|
-
"What is the research question?",
|
|
9864
|
-
"Why is this important?",
|
|
9865
|
-
"What is the hypothesis?"
|
|
9866
|
-
]
|
|
9867
|
-
},
|
|
9868
|
-
{
|
|
9869
|
-
name: "Methods",
|
|
9870
|
-
description: "Experimental design and procedures",
|
|
9871
|
-
examples: [
|
|
9872
|
-
"How was the study conducted?",
|
|
9873
|
-
"What methods were used?",
|
|
9874
|
-
"What was the experimental design?"
|
|
9875
|
-
]
|
|
9876
|
-
},
|
|
9877
|
-
{
|
|
9878
|
-
name: "Results",
|
|
9879
|
-
description: "Findings and observations",
|
|
9880
|
-
examples: [
|
|
9881
|
-
"What did the study find?",
|
|
9882
|
-
"What are the data?",
|
|
9883
|
-
"What were the observations?"
|
|
9884
|
-
]
|
|
9885
|
-
},
|
|
9886
|
-
{
|
|
9887
|
-
name: "Discussion",
|
|
9888
|
-
description: "Interpretation and implications of results",
|
|
9889
|
-
examples: [
|
|
9890
|
-
"What do the results mean?",
|
|
9891
|
-
"What are the implications?",
|
|
9892
|
-
"How do these findings relate to prior work?"
|
|
9893
|
-
]
|
|
9894
|
-
}
|
|
9895
|
-
]
|
|
9896
|
-
},
|
|
9897
|
-
"argument-toulmin": {
|
|
9898
|
-
id: "argument-toulmin",
|
|
9899
|
-
name: "Argument Structure (Toulmin)",
|
|
9900
|
-
description: "Claim, Evidence, Warrant, Counterargument, Rebuttal framework for argumentation",
|
|
9901
|
-
domain: "general",
|
|
9902
|
-
tags: [
|
|
9903
|
-
{
|
|
9904
|
-
name: "Claim",
|
|
9905
|
-
description: "The main assertion or thesis",
|
|
9906
|
-
examples: [
|
|
9907
|
-
"What is being argued?",
|
|
9908
|
-
"What is the main point?",
|
|
9909
|
-
"What position is being taken?"
|
|
9910
|
-
]
|
|
9911
|
-
},
|
|
9912
|
-
{
|
|
9913
|
-
name: "Evidence",
|
|
9914
|
-
description: "Data or facts supporting the claim",
|
|
9915
|
-
examples: [
|
|
9916
|
-
"What supports this claim?",
|
|
9917
|
-
"What are the facts?",
|
|
9918
|
-
"What data is provided?"
|
|
9919
|
-
]
|
|
9920
|
-
},
|
|
9921
|
-
{
|
|
9922
|
-
name: "Warrant",
|
|
9923
|
-
description: "Reasoning connecting evidence to claim",
|
|
9924
|
-
examples: [
|
|
9925
|
-
"Why does this evidence support the claim?",
|
|
9926
|
-
"What is the logic?",
|
|
9927
|
-
"How does this reasoning work?"
|
|
9928
|
-
]
|
|
9929
|
-
},
|
|
9930
|
-
{
|
|
9931
|
-
name: "Counterargument",
|
|
9932
|
-
description: "Opposing viewpoints or objections",
|
|
9933
|
-
examples: [
|
|
9934
|
-
"What are the objections?",
|
|
9935
|
-
"What do critics say?",
|
|
9936
|
-
"What are alternative views?"
|
|
9937
|
-
]
|
|
9938
|
-
},
|
|
9939
|
-
{
|
|
9940
|
-
name: "Rebuttal",
|
|
9941
|
-
description: "Response to counterarguments",
|
|
9942
|
-
examples: [
|
|
9943
|
-
"How is the objection addressed?",
|
|
9944
|
-
"Why is the counterargument wrong?",
|
|
9945
|
-
"How is the criticism answered?"
|
|
9946
|
-
]
|
|
9947
|
-
}
|
|
9948
|
-
]
|
|
9949
|
-
}
|
|
9950
|
-
};
|
|
9951
|
-
function getTagSchema(schemaId) {
|
|
9952
|
-
return TAG_SCHEMAS[schemaId] || null;
|
|
9953
|
-
}
|
|
9954
|
-
function getSchemaCategory(schemaId, categoryName) {
|
|
9955
|
-
const schema = getTagSchema(schemaId);
|
|
9956
|
-
if (!schema) return null;
|
|
9957
|
-
return schema.tags.find((tag) => tag.name === categoryName) || null;
|
|
9958
|
-
}
|
|
9959
|
-
|
|
9960
9807
|
// src/workers/annotation-detection.ts
|
|
9961
9808
|
var AnnotationDetection = class {
|
|
9962
9809
|
/**
|
|
@@ -9984,7 +9831,7 @@ var AnnotationDetection = class {
|
|
|
9984
9831
|
*/
|
|
9985
9832
|
static async detectComments(content, client, instructions, tone, density, language, sourceLanguage) {
|
|
9986
9833
|
const prompt = MotivationPrompts.buildCommentPrompt(content, instructions, tone, density, language, sourceLanguage);
|
|
9987
|
-
const response = await client.generateText(prompt, 3e3, 0.4);
|
|
9834
|
+
const response = await client.generateText(prompt, 3e3, 0.4, { format: "json" });
|
|
9988
9835
|
return MotivationParsers.parseComments(response, content);
|
|
9989
9836
|
}
|
|
9990
9837
|
/**
|
|
@@ -9996,7 +9843,7 @@ var AnnotationDetection = class {
|
|
|
9996
9843
|
*/
|
|
9997
9844
|
static async detectHighlights(content, client, instructions, density, sourceLanguage) {
|
|
9998
9845
|
const prompt = MotivationPrompts.buildHighlightPrompt(content, instructions, density, sourceLanguage);
|
|
9999
|
-
const response = await client.generateText(prompt, 2e3, 0.3);
|
|
9846
|
+
const response = await client.generateText(prompt, 2e3, 0.3, { format: "json" });
|
|
10000
9847
|
return MotivationParsers.parseHighlights(response, content);
|
|
10001
9848
|
}
|
|
10002
9849
|
/**
|
|
@@ -10008,25 +9855,25 @@ var AnnotationDetection = class {
|
|
|
10008
9855
|
*/
|
|
10009
9856
|
static async detectAssessments(content, client, instructions, tone, density, language, sourceLanguage) {
|
|
10010
9857
|
const prompt = MotivationPrompts.buildAssessmentPrompt(content, instructions, tone, density, language, sourceLanguage);
|
|
10011
|
-
const response = await client.generateText(prompt, 3e3, 0.3);
|
|
9858
|
+
const response = await client.generateText(prompt, 3e3, 0.3, { format: "json" });
|
|
10012
9859
|
return MotivationParsers.parseAssessments(response, content);
|
|
10013
9860
|
}
|
|
10014
9861
|
/**
|
|
10015
9862
|
* Detect tags in content for a specific category.
|
|
10016
9863
|
*
|
|
9864
|
+
* The full `TagSchema` is supplied by the dispatcher (resolved against
|
|
9865
|
+
* the per-KB tag-schema projection at job-creation time) so the worker
|
|
9866
|
+
* is independent of the registry.
|
|
9867
|
+
*
|
|
10017
9868
|
* `sourceLanguage` is the locale of the content being analyzed. Body-locale
|
|
10018
9869
|
* (`language`) doesn't influence the tag prompt — categories are schema
|
|
10019
9870
|
* identifiers, not LLM-generated text — so it's consumed at the body-stamp
|
|
10020
9871
|
* site, not here.
|
|
10021
9872
|
*/
|
|
10022
|
-
static async detectTags(content, client,
|
|
10023
|
-
const
|
|
10024
|
-
if (!schema) {
|
|
10025
|
-
throw new Error(`Invalid tag schema: ${schemaId}`);
|
|
10026
|
-
}
|
|
10027
|
-
const categoryInfo = getSchemaCategory(schemaId, category);
|
|
9873
|
+
static async detectTags(content, client, schema, category, sourceLanguage) {
|
|
9874
|
+
const categoryInfo = schema.tags.find((t) => t.name === category);
|
|
10028
9875
|
if (!categoryInfo) {
|
|
10029
|
-
throw new Error(`Invalid category "${category}" for schema ${
|
|
9876
|
+
throw new Error(`Invalid category "${category}" for schema ${schema.id}`);
|
|
10030
9877
|
}
|
|
10031
9878
|
const prompt = MotivationPrompts.buildTagPrompt(
|
|
10032
9879
|
content,
|
|
@@ -10038,12 +9885,12 @@ var AnnotationDetection = class {
|
|
|
10038
9885
|
categoryInfo.examples,
|
|
10039
9886
|
sourceLanguage
|
|
10040
9887
|
);
|
|
10041
|
-
const response = await client.generateText(prompt, 4e3, 0.2);
|
|
9888
|
+
const response = await client.generateText(prompt, 4e3, 0.2, { format: "json" });
|
|
10042
9889
|
const parsedTags = MotivationParsers.parseTags(response);
|
|
10043
9890
|
return MotivationParsers.validateTagOffsets(parsedTags, content, category);
|
|
10044
9891
|
}
|
|
10045
9892
|
};
|
|
10046
|
-
async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences
|
|
9893
|
+
async function extractEntities(exact, entityTypes, client, includeDescriptiveReferences, logger2, sourceLanguage) {
|
|
10047
9894
|
const entityTypesDescription = entityTypes.map((et) => {
|
|
10048
9895
|
if (typeof et === "string") {
|
|
10049
9896
|
return et;
|
|
@@ -10094,48 +9941,57 @@ If no entities are found, respond with an empty array [].
|
|
|
10094
9941
|
|
|
10095
9942
|
Example output:
|
|
10096
9943
|
[{"exact":"Alice","entityType":"Person","startOffset":0,"endOffset":5,"prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","startOffset":20,"endOffset":25,"prefix":"went to ","suffix":" yesterday"}]`;
|
|
9944
|
+
logger2.debug("Sending entity extraction request", { entityTypes: entityTypesDescription });
|
|
10097
9945
|
const response = await client.generateTextWithMetadata(
|
|
10098
9946
|
prompt,
|
|
10099
9947
|
4e3,
|
|
10100
9948
|
// Increased to handle many entities without truncation
|
|
10101
|
-
0.3
|
|
9949
|
+
0.3,
|
|
10102
9950
|
// Lower temperature for more consistent extraction
|
|
9951
|
+
// Force grammar-constrained JSON output. Without this, Ollama models
|
|
9952
|
+
// periodically emit malformed JSON (truncated brackets, mid-token
|
|
9953
|
+
// breaks at higher token counts) which silently parse-fails into
|
|
9954
|
+
// [] downstream. The prompt's schema (which keys, what types) still
|
|
9955
|
+
// governs *what* the JSON contains; `format: 'json'` governs that
|
|
9956
|
+
// it's syntactically valid.
|
|
9957
|
+
{ format: "json" }
|
|
10103
9958
|
);
|
|
9959
|
+
logger2.debug("Got entity extraction response", { responseLength: response.text.length });
|
|
10104
9960
|
try {
|
|
10105
9961
|
let jsonStr = response.text.trim();
|
|
10106
9962
|
if (jsonStr.startsWith("```")) {
|
|
10107
9963
|
jsonStr = jsonStr.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
|
|
10108
9964
|
}
|
|
10109
9965
|
const entities = JSON.parse(jsonStr);
|
|
10110
|
-
logger2
|
|
9966
|
+
logger2.debug("Parsed entities from AI response", { count: entities.length });
|
|
10111
9967
|
if (response.stopReason === "max_tokens") {
|
|
10112
9968
|
const errorMsg = `AI response truncated: Found ${entities.length} entities but response hit max_tokens limit. Increase max_tokens or reduce resource size.`;
|
|
10113
|
-
logger2
|
|
9969
|
+
logger2.error(errorMsg);
|
|
10114
9970
|
throw new Error(errorMsg);
|
|
10115
9971
|
}
|
|
10116
9972
|
return entities.map((entity, idx) => {
|
|
10117
|
-
let
|
|
10118
|
-
let
|
|
10119
|
-
logger2
|
|
9973
|
+
let start = entity.startOffset;
|
|
9974
|
+
let end = entity.endOffset;
|
|
9975
|
+
logger2.debug("Processing entity", {
|
|
10120
9976
|
index: idx + 1,
|
|
10121
9977
|
total: entities.length,
|
|
10122
9978
|
type: entity.entityType,
|
|
10123
9979
|
text: entity.exact,
|
|
10124
|
-
offsetsFromAI: `[${
|
|
9980
|
+
offsetsFromAI: `[${start}:${end}]`
|
|
10125
9981
|
});
|
|
10126
|
-
const extractedText = exact.substring(
|
|
9982
|
+
const extractedText = exact.substring(start, end);
|
|
10127
9983
|
let anchorMethod;
|
|
10128
9984
|
if (extractedText === entity.exact) {
|
|
10129
9985
|
anchorMethod = "llm-exact";
|
|
10130
|
-
logger2
|
|
9986
|
+
logger2.debug("Entity anchored", {
|
|
10131
9987
|
text: entity.exact,
|
|
10132
9988
|
entityType: entity.entityType,
|
|
10133
9989
|
anchorMethod
|
|
10134
9990
|
});
|
|
10135
9991
|
} else {
|
|
10136
|
-
logger2
|
|
9992
|
+
logger2.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
|
|
10137
9993
|
expected: entity.exact,
|
|
10138
|
-
llmOffsets: `[${
|
|
9994
|
+
llmOffsets: `[${start}:${end}]`,
|
|
10139
9995
|
foundAtLlmOffsets: extractedText
|
|
10140
9996
|
});
|
|
10141
9997
|
let occurrenceCount = 0;
|
|
@@ -10148,10 +10004,10 @@ Example output:
|
|
|
10148
10004
|
}
|
|
10149
10005
|
if (occurrenceCount === 0) {
|
|
10150
10006
|
anchorMethod = "dropped";
|
|
10151
|
-
logger2
|
|
10007
|
+
logger2.error("Entity text not found in resource \u2014 dropping", {
|
|
10152
10008
|
text: entity.exact,
|
|
10153
10009
|
entityType: entity.entityType,
|
|
10154
|
-
llmOffsets: `[${
|
|
10010
|
+
llmOffsets: `[${start}:${end}]`,
|
|
10155
10011
|
anchorMethod,
|
|
10156
10012
|
resourceStart: exact.substring(0, 200)
|
|
10157
10013
|
});
|
|
@@ -10177,9 +10033,9 @@ Example output:
|
|
|
10177
10033
|
}
|
|
10178
10034
|
if (recoveredOffset !== -1) {
|
|
10179
10035
|
anchorMethod = "context-recovered";
|
|
10180
|
-
|
|
10181
|
-
|
|
10182
|
-
logger2
|
|
10036
|
+
start = recoveredOffset;
|
|
10037
|
+
end = recoveredOffset + entity.exact.length;
|
|
10038
|
+
logger2.debug("Entity anchored", {
|
|
10183
10039
|
text: entity.exact,
|
|
10184
10040
|
entityType: entity.entityType,
|
|
10185
10041
|
anchorMethod,
|
|
@@ -10187,9 +10043,9 @@ Example output:
|
|
|
10187
10043
|
});
|
|
10188
10044
|
} else if (occurrenceCount === 1) {
|
|
10189
10045
|
anchorMethod = "unique-match";
|
|
10190
|
-
|
|
10191
|
-
|
|
10192
|
-
logger2
|
|
10046
|
+
start = firstOccurrence;
|
|
10047
|
+
end = firstOccurrence + entity.exact.length;
|
|
10048
|
+
logger2.debug("Entity anchored", {
|
|
10193
10049
|
text: entity.exact,
|
|
10194
10050
|
entityType: entity.entityType,
|
|
10195
10051
|
anchorMethod,
|
|
@@ -10197,9 +10053,9 @@ Example output:
|
|
|
10197
10053
|
});
|
|
10198
10054
|
} else {
|
|
10199
10055
|
anchorMethod = "first-of-many";
|
|
10200
|
-
|
|
10201
|
-
|
|
10202
|
-
logger2
|
|
10056
|
+
start = firstOccurrence;
|
|
10057
|
+
end = firstOccurrence + entity.exact.length;
|
|
10058
|
+
logger2.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
|
|
10203
10059
|
text: entity.exact,
|
|
10204
10060
|
entityType: entity.entityType,
|
|
10205
10061
|
anchorMethod,
|
|
@@ -10214,58 +10070,71 @@ Example output:
|
|
|
10214
10070
|
return {
|
|
10215
10071
|
exact: entity.exact,
|
|
10216
10072
|
entityType: entity.entityType,
|
|
10217
|
-
|
|
10218
|
-
|
|
10073
|
+
start,
|
|
10074
|
+
end,
|
|
10219
10075
|
prefix: entity.prefix,
|
|
10220
10076
|
suffix: entity.suffix
|
|
10221
10077
|
};
|
|
10222
10078
|
}).filter((entity) => {
|
|
10223
10079
|
if (entity === null) {
|
|
10224
|
-
logger2
|
|
10080
|
+
logger2.debug("Filtered entity: null");
|
|
10225
10081
|
return false;
|
|
10226
10082
|
}
|
|
10227
|
-
if (entity.
|
|
10228
|
-
logger2
|
|
10083
|
+
if (entity.start === void 0 || entity.end === void 0) {
|
|
10084
|
+
logger2.warn("Filtered entity: missing offsets", { text: entity.exact });
|
|
10229
10085
|
return false;
|
|
10230
10086
|
}
|
|
10231
|
-
if (entity.
|
|
10232
|
-
logger2
|
|
10087
|
+
if (entity.start < 0) {
|
|
10088
|
+
logger2.warn("Filtered entity: negative start", {
|
|
10233
10089
|
text: entity.exact,
|
|
10234
|
-
|
|
10090
|
+
start: entity.start
|
|
10235
10091
|
});
|
|
10236
10092
|
return false;
|
|
10237
10093
|
}
|
|
10238
|
-
if (entity.
|
|
10239
|
-
logger2
|
|
10094
|
+
if (entity.end > exact.length) {
|
|
10095
|
+
logger2.warn("Filtered entity: end exceeds text length", {
|
|
10240
10096
|
text: entity.exact,
|
|
10241
|
-
|
|
10097
|
+
end: entity.end,
|
|
10242
10098
|
textLength: exact.length
|
|
10243
10099
|
});
|
|
10244
10100
|
return false;
|
|
10245
10101
|
}
|
|
10246
|
-
const extractedText = exact.substring(entity.
|
|
10102
|
+
const extractedText = exact.substring(entity.start, entity.end);
|
|
10247
10103
|
if (extractedText !== entity.exact) {
|
|
10248
|
-
logger2
|
|
10104
|
+
logger2.warn("Filtered entity: offset mismatch", {
|
|
10249
10105
|
expected: entity.exact,
|
|
10250
10106
|
got: extractedText,
|
|
10251
|
-
offsets: `[${entity.
|
|
10107
|
+
offsets: `[${entity.start}:${entity.end}]`
|
|
10252
10108
|
});
|
|
10253
10109
|
return false;
|
|
10254
10110
|
}
|
|
10255
|
-
logger2
|
|
10111
|
+
logger2.debug("Accepted entity", {
|
|
10256
10112
|
text: entity.exact,
|
|
10257
|
-
offsets: `[${entity.
|
|
10113
|
+
offsets: `[${entity.start}:${entity.end}]`
|
|
10258
10114
|
});
|
|
10259
10115
|
return true;
|
|
10260
10116
|
});
|
|
10261
10117
|
} catch (error) {
|
|
10118
|
+
logger2.error("Failed to parse entity extraction response", {
|
|
10119
|
+
error: error instanceof Error ? error.message : String(error)
|
|
10120
|
+
});
|
|
10262
10121
|
return [];
|
|
10263
10122
|
}
|
|
10264
10123
|
}
|
|
10265
10124
|
function getLanguageName(locale) {
|
|
10266
10125
|
return getLocaleEnglishName(locale) || locale;
|
|
10267
10126
|
}
|
|
10268
|
-
async function generateResourceFromTopic(topic, entityTypes, client, userPrompt, locale, context, temperature, maxTokens,
|
|
10127
|
+
async function generateResourceFromTopic(topic, entityTypes, client, logger2, userPrompt, locale, context, temperature, maxTokens, sourceLanguage) {
|
|
10128
|
+
logger2.debug("Generating resource from topic", {
|
|
10129
|
+
topicPreview: topic.substring(0, 100),
|
|
10130
|
+
entityTypes,
|
|
10131
|
+
hasUserPrompt: !!userPrompt,
|
|
10132
|
+
locale,
|
|
10133
|
+
sourceLanguage,
|
|
10134
|
+
hasContext: !!context,
|
|
10135
|
+
temperature,
|
|
10136
|
+
maxTokens
|
|
10137
|
+
});
|
|
10269
10138
|
const finalTemperature = temperature ?? 0.7;
|
|
10270
10139
|
const finalMaxTokens = maxTokens ?? 500;
|
|
10271
10140
|
const languageInstruction = locale && locale !== "en" ? `
|
|
@@ -10364,18 +10233,33 @@ Requirements:
|
|
|
10364
10233
|
content
|
|
10365
10234
|
};
|
|
10366
10235
|
};
|
|
10236
|
+
logger2.debug("Sending prompt to inference", {
|
|
10237
|
+
promptLength: prompt.length,
|
|
10238
|
+
temperature: finalTemperature,
|
|
10239
|
+
maxTokens: finalMaxTokens
|
|
10240
|
+
});
|
|
10367
10241
|
const response = await client.generateText(prompt, finalMaxTokens, finalTemperature);
|
|
10242
|
+
logger2.debug("Got response from inference", { responseLength: response.length });
|
|
10368
10243
|
const result = parseResponse(response);
|
|
10244
|
+
logger2.debug("Parsed response", {
|
|
10245
|
+
hasTitle: !!result.title,
|
|
10246
|
+
titleLength: result.title?.length,
|
|
10247
|
+
hasContent: !!result.content,
|
|
10248
|
+
contentLength: result.content?.length
|
|
10249
|
+
});
|
|
10369
10250
|
return result;
|
|
10370
10251
|
}
|
|
10371
10252
|
function buildTextAnnotation(resourceId, userId, generator, motivation, match, body) {
|
|
10253
|
+
const creator = didToAgent(userId);
|
|
10254
|
+
const wasAttributedTo = creator["@id"] === generator["@id"] ? [generator] : [creator, generator];
|
|
10372
10255
|
return {
|
|
10373
10256
|
"@context": "http://www.w3.org/ns/anno.jsonld",
|
|
10374
10257
|
"type": "Annotation",
|
|
10375
10258
|
"id": generateAnnotationId(),
|
|
10376
10259
|
motivation,
|
|
10377
|
-
creator
|
|
10260
|
+
creator,
|
|
10378
10261
|
generator,
|
|
10262
|
+
wasAttributedTo,
|
|
10379
10263
|
created: (/* @__PURE__ */ new Date()).toISOString(),
|
|
10380
10264
|
target: {
|
|
10381
10265
|
type: "SpecificResource",
|
|
@@ -10518,7 +10402,7 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
|
|
|
10518
10402
|
];
|
|
10519
10403
|
for (const entity of extractedEntities) {
|
|
10520
10404
|
try {
|
|
10521
|
-
const validated = validateAndCorrectOffsets(content, entity.
|
|
10405
|
+
const validated = validateAndCorrectOffsets(content, entity.start, entity.end, entity.exact);
|
|
10522
10406
|
const ann = buildTextAnnotation(
|
|
10523
10407
|
params.resourceId,
|
|
10524
10408
|
userId,
|
|
@@ -10548,7 +10432,7 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
10548
10432
|
const categoryTags = await AnnotationDetection.detectTags(
|
|
10549
10433
|
content,
|
|
10550
10434
|
inferenceClient,
|
|
10551
|
-
params.
|
|
10435
|
+
params.schema,
|
|
10552
10436
|
category,
|
|
10553
10437
|
params.sourceLanguage
|
|
10554
10438
|
);
|
|
@@ -10563,7 +10447,7 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
10563
10447
|
byCategory[category] = (byCategory[category] ?? 0) + 1;
|
|
10564
10448
|
return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
|
|
10565
10449
|
{ type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
|
|
10566
|
-
{ type: "TextualBody", value: params.
|
|
10450
|
+
{ type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
|
|
10567
10451
|
]);
|
|
10568
10452
|
});
|
|
10569
10453
|
onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
|
|
@@ -10572,7 +10456,7 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
10572
10456
|
result: { tagsFound: tags.length, tagsCreated: annotations.length, byCategory }
|
|
10573
10457
|
};
|
|
10574
10458
|
}
|
|
10575
|
-
async function processGenerationJob(inferenceClient, params, onProgress) {
|
|
10459
|
+
async function processGenerationJob(inferenceClient, params, onProgress, logger2) {
|
|
10576
10460
|
onProgress(20, "Fetching context...", "fetching");
|
|
10577
10461
|
const title = params.title ?? "Untitled";
|
|
10578
10462
|
const entityTypes = (params.entityTypes ?? []).map(String);
|
|
@@ -10581,13 +10465,12 @@ async function processGenerationJob(inferenceClient, params, onProgress) {
|
|
|
10581
10465
|
title,
|
|
10582
10466
|
entityTypes,
|
|
10583
10467
|
inferenceClient,
|
|
10468
|
+
logger2,
|
|
10584
10469
|
params.prompt,
|
|
10585
10470
|
params.language,
|
|
10586
10471
|
params.context,
|
|
10587
10472
|
params.temperature,
|
|
10588
10473
|
params.maxTokens,
|
|
10589
|
-
void 0,
|
|
10590
|
-
// logger
|
|
10591
10474
|
params.sourceLanguage
|
|
10592
10475
|
);
|
|
10593
10476
|
onProgress(85, "Creating resource...", "creating");
|
|
@@ -10662,7 +10545,7 @@ async function handleJob(adapter, config, job) {
|
|
|
10662
10545
|
}
|
|
10663
10546
|
}
|
|
10664
10547
|
async function handleJobInner(adapter, config, job) {
|
|
10665
|
-
const { session } = config;
|
|
10548
|
+
const { session, inferenceClient, generator } = config;
|
|
10666
10549
|
const { resourceId, userId, jobId, type: jobType } = job;
|
|
10667
10550
|
const annotationId = job.params.referenceId;
|
|
10668
10551
|
const lifecycleBase = {
|
|
@@ -10673,12 +10556,10 @@ async function handleJobInner(adapter, config, job) {
|
|
|
10673
10556
|
...annotationId ? { annotationId } : {}
|
|
10674
10557
|
};
|
|
10675
10558
|
await emitEvent(session, "job:start", lifecycleBase);
|
|
10676
|
-
|
|
10677
|
-
|
|
10678
|
-
adapter.failJob(jobId, `No inference engine configured for job type: ${jobType}`);
|
|
10559
|
+
if (!config.jobTypes.includes(jobType)) {
|
|
10560
|
+
adapter.failJob(jobId, `Worker not configured for job type: ${jobType}`);
|
|
10679
10561
|
return;
|
|
10680
10562
|
}
|
|
10681
|
-
const { inferenceClient, generator } = engine;
|
|
10682
10563
|
const onProgress = (percentage, message, stage, extra) => {
|
|
10683
10564
|
emitEvent(session, "job:report-progress", {
|
|
10684
10565
|
...lifecycleBase,
|
|
@@ -10758,7 +10639,8 @@ async function handleJobInner(adapter, config, job) {
|
|
|
10758
10639
|
job.params,
|
|
10759
10640
|
userId,
|
|
10760
10641
|
generator,
|
|
10761
|
-
onProgress
|
|
10642
|
+
onProgress,
|
|
10643
|
+
config.logger
|
|
10762
10644
|
);
|
|
10763
10645
|
for (const ann of annotations) {
|
|
10764
10646
|
await emitEvent(session, "mark:create", { annotation: ann, userId, resourceId });
|
|
@@ -10790,7 +10672,8 @@ async function handleJobInner(adapter, config, job) {
|
|
|
10790
10672
|
const genResult = await processGenerationJob(
|
|
10791
10673
|
inferenceClient,
|
|
10792
10674
|
job.params,
|
|
10793
|
-
onProgress
|
|
10675
|
+
onProgress,
|
|
10676
|
+
config.logger
|
|
10794
10677
|
);
|
|
10795
10678
|
const genParams = job.params;
|
|
10796
10679
|
const storageUri = deriveStorageUri(genResult.title, genResult.format);
|
|
@@ -10799,11 +10682,11 @@ async function handleJobInner(adapter, config, job) {
|
|
|
10799
10682
|
file: Buffer.from(genResult.content),
|
|
10800
10683
|
format: genResult.format,
|
|
10801
10684
|
storageUri,
|
|
10802
|
-
creationMethod: "generated",
|
|
10803
10685
|
sourceResourceId: resourceId,
|
|
10804
10686
|
...genParams.referenceId ? { sourceAnnotationId: genParams.referenceId } : {},
|
|
10805
10687
|
...genParams.prompt ? { generationPrompt: genParams.prompt } : {},
|
|
10806
10688
|
...genParams.language ? { language: genParams.language } : {},
|
|
10689
|
+
...genParams.entityTypes && genParams.entityTypes.length > 0 ? { entityTypes: genParams.entityTypes } : {},
|
|
10807
10690
|
generator
|
|
10808
10691
|
});
|
|
10809
10692
|
await emitEvent(session, "job:complete", {
|
|
@@ -10870,24 +10753,20 @@ function toClientConfig(w) {
|
|
|
10870
10753
|
...w.apiKey && { apiKey: w.apiKey }
|
|
10871
10754
|
};
|
|
10872
10755
|
}
|
|
10873
|
-
var
|
|
10874
|
-
var engines = {};
|
|
10756
|
+
var groups = /* @__PURE__ */ new Map();
|
|
10875
10757
|
for (const jobType of ALL_JOB_TYPES) {
|
|
10876
|
-
const
|
|
10877
|
-
const key = clientKey(
|
|
10878
|
-
let
|
|
10879
|
-
if (!
|
|
10880
|
-
|
|
10881
|
-
|
|
10882
|
-
|
|
10883
|
-
|
|
10884
|
-
|
|
10885
|
-
|
|
10886
|
-
|
|
10887
|
-
|
|
10888
|
-
model: w.model
|
|
10889
|
-
};
|
|
10890
|
-
engines[jobType] = { inferenceClient: client, generator };
|
|
10758
|
+
const inference = resolveWorker(jobType);
|
|
10759
|
+
const key = clientKey(inference);
|
|
10760
|
+
let group = groups.get(key);
|
|
10761
|
+
if (!group) {
|
|
10762
|
+
group = {
|
|
10763
|
+
inference,
|
|
10764
|
+
jobTypes: [],
|
|
10765
|
+
client: createInferenceClient(toClientConfig(inference), logger)
|
|
10766
|
+
};
|
|
10767
|
+
groups.set(key, group);
|
|
10768
|
+
}
|
|
10769
|
+
group.jobTypes.push(jobType);
|
|
10891
10770
|
}
|
|
10892
10771
|
function parseBackendUrl(url) {
|
|
10893
10772
|
const parsed = new URL(url);
|
|
@@ -10896,35 +10775,35 @@ function parseBackendUrl(url) {
|
|
|
10896
10775
|
const port = parsed.port ? Number(parsed.port) : protocol === "https" ? 443 : 80;
|
|
10897
10776
|
return { protocol, host, port };
|
|
10898
10777
|
}
|
|
10899
|
-
async function
|
|
10778
|
+
async function authenticateAgent(provider, model) {
|
|
10900
10779
|
if (!workerSecret) {
|
|
10901
|
-
|
|
10902
|
-
return "";
|
|
10780
|
+
throw new Error("SEMIONT_WORKER_SECRET is required to authenticate worker agents");
|
|
10903
10781
|
}
|
|
10904
|
-
const response = await fetch(`${backendBaseUrl}/api/tokens/
|
|
10782
|
+
const response = await fetch(`${backendBaseUrl}/api/tokens/agent`, {
|
|
10905
10783
|
method: "POST",
|
|
10906
10784
|
headers: { "Content-Type": "application/json" },
|
|
10907
|
-
body: JSON.stringify({ secret: workerSecret })
|
|
10785
|
+
body: JSON.stringify({ secret: workerSecret, provider, model })
|
|
10908
10786
|
});
|
|
10909
10787
|
if (!response.ok) {
|
|
10910
|
-
throw new Error(`
|
|
10788
|
+
throw new Error(`Agent authentication failed for ${provider}:${model}: ${response.status} ${response.statusText}`);
|
|
10911
10789
|
}
|
|
10912
|
-
|
|
10913
|
-
return token;
|
|
10790
|
+
return await response.json();
|
|
10914
10791
|
}
|
|
10915
|
-
async function
|
|
10916
|
-
const {
|
|
10917
|
-
initObservabilityNode({ serviceName: "semiont-worker" });
|
|
10918
|
-
logger.info("Authenticating", { baseUrl: backendBaseUrl });
|
|
10919
|
-
const initialToken = await authenticate();
|
|
10920
|
-
logger.info("Authenticated");
|
|
10792
|
+
async function startAgentWorker(group) {
|
|
10793
|
+
const { inference } = group;
|
|
10921
10794
|
const { protocol, host, port } = parseBackendUrl(backendBaseUrl);
|
|
10922
|
-
const
|
|
10795
|
+
const { token: initialToken, did } = await authenticateAgent(inference.type, inference.model);
|
|
10796
|
+
const generator = softwareToAgent({
|
|
10797
|
+
domain: host,
|
|
10798
|
+
provider: inference.type,
|
|
10799
|
+
model: inference.model
|
|
10800
|
+
});
|
|
10801
|
+
const kbId = `agent-${inference.type}-${inference.model}-${hostname()}`;
|
|
10923
10802
|
const endpoint = { kind: "http", host, port, protocol };
|
|
10924
10803
|
const kb = {
|
|
10925
10804
|
id: kbId,
|
|
10926
|
-
label:
|
|
10927
|
-
email: `
|
|
10805
|
+
label: `${inference.type} / ${inference.model} @ ${host}`,
|
|
10806
|
+
email: `agent@${host}`,
|
|
10928
10807
|
endpoint
|
|
10929
10808
|
};
|
|
10930
10809
|
const storage = new InMemorySessionStorage();
|
|
@@ -10945,37 +10824,58 @@ async function main() {
|
|
|
10945
10824
|
token$,
|
|
10946
10825
|
refresh: async () => {
|
|
10947
10826
|
try {
|
|
10948
|
-
|
|
10827
|
+
const { token } = await authenticateAgent(inference.type, inference.model);
|
|
10828
|
+
return token;
|
|
10949
10829
|
} catch (err) {
|
|
10950
|
-
logger.error("
|
|
10951
|
-
error: err instanceof Error ? err.message : String(err)
|
|
10830
|
+
logger.error("Agent token refresh failed", {
|
|
10831
|
+
error: err instanceof Error ? err.message : String(err),
|
|
10832
|
+
agent: did
|
|
10952
10833
|
});
|
|
10953
10834
|
return null;
|
|
10954
10835
|
}
|
|
10955
10836
|
},
|
|
10956
|
-
// No validate callback — workers are service principals with no
|
|
10957
|
-
// user record to fetch. `session.user$` stays null.
|
|
10958
10837
|
onError: (err) => {
|
|
10959
|
-
logger.error("Session error", { code: err.code, message: err.message });
|
|
10838
|
+
logger.error("Session error", { code: err.code, message: err.message, agent: did });
|
|
10960
10839
|
}
|
|
10961
10840
|
});
|
|
10962
10841
|
await session.ready;
|
|
10963
|
-
const
|
|
10842
|
+
const adapter = startWorkerProcess({
|
|
10964
10843
|
session,
|
|
10965
|
-
jobTypes:
|
|
10966
|
-
|
|
10844
|
+
jobTypes: group.jobTypes,
|
|
10845
|
+
inferenceClient: group.client,
|
|
10846
|
+
generator,
|
|
10967
10847
|
logger
|
|
10968
10848
|
});
|
|
10969
|
-
logger.info("
|
|
10849
|
+
logger.info("Agent ready", {
|
|
10850
|
+
did,
|
|
10851
|
+
provider: inference.type,
|
|
10852
|
+
model: inference.model,
|
|
10853
|
+
jobTypes: group.jobTypes
|
|
10854
|
+
});
|
|
10855
|
+
return {
|
|
10856
|
+
session,
|
|
10857
|
+
dispose: async () => {
|
|
10858
|
+
adapter.dispose();
|
|
10859
|
+
await session.dispose();
|
|
10860
|
+
}
|
|
10861
|
+
};
|
|
10862
|
+
}
|
|
10863
|
+
async function main() {
|
|
10864
|
+
const { initObservabilityNode } = await import('@semiont/observability/node');
|
|
10865
|
+
initObservabilityNode({ serviceName: "semiont-worker" });
|
|
10866
|
+
logger.info("Starting agents", {
|
|
10970
10867
|
baseUrl: backendBaseUrl,
|
|
10971
|
-
|
|
10972
|
-
|
|
10973
|
-
|
|
10868
|
+
agents: Array.from(groups.values()).map((g) => ({
|
|
10869
|
+
provider: g.inference.type,
|
|
10870
|
+
model: g.inference.model,
|
|
10871
|
+
jobTypes: g.jobTypes
|
|
10872
|
+
}))
|
|
10974
10873
|
});
|
|
10874
|
+
const workers = await Promise.all(Array.from(groups.values()).map(startAgentWorker));
|
|
10975
10875
|
const health = createServer((req, res) => {
|
|
10976
10876
|
if (req.url === "/health") {
|
|
10977
10877
|
res.writeHead(200, { "Content-Type": "application/json" });
|
|
10978
|
-
res.end(JSON.stringify({ status: "ok" }));
|
|
10878
|
+
res.end(JSON.stringify({ status: "ok", agents: workers.length }));
|
|
10979
10879
|
} else {
|
|
10980
10880
|
res.writeHead(404);
|
|
10981
10881
|
res.end();
|
|
@@ -10986,8 +10886,7 @@ async function main() {
|
|
|
10986
10886
|
});
|
|
10987
10887
|
const shutdown = async () => {
|
|
10988
10888
|
logger.info("Shutting down");
|
|
10989
|
-
|
|
10990
|
-
await session.dispose();
|
|
10889
|
+
await Promise.all(workers.map((w) => w.dispose()));
|
|
10991
10890
|
health.close();
|
|
10992
10891
|
process.exit(0);
|
|
10993
10892
|
};
|