@semiont/jobs 0.5.5 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -42
- package/dist/index.d.ts +4 -70
- package/dist/index.js +224 -430
- package/dist/index.js.map +1 -1
- package/dist/worker-main.js +228 -288
- package/dist/worker-main.js.map +1 -1
- package/package.json +8 -4
package/dist/worker-main.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { createTomlConfigLoader, softwareToAgent, baseUrl, RESOURCE_BROADCAST_TYPES, resourceId, validateAndCorrectOffsets, didToAgent, getLocaleEnglishName } from '@semiont/core';
|
|
2
1
|
import { deriveStorageUri } from '@semiont/content';
|
|
3
2
|
import { withSpan, SpanKind, recordJobOutcome } from '@semiont/observability';
|
|
3
|
+
import { createTomlConfigLoader, softwareToAgent, baseUrl, reconcileSelector, didToAgent, getLocaleEnglishName } from '@semiont/core';
|
|
4
4
|
import { generateAnnotationId } from '@semiont/event-sourcing';
|
|
5
5
|
import { createInferenceClient } from '@semiont/inference';
|
|
6
6
|
import { createServer } from 'http';
|
|
@@ -9337,17 +9337,15 @@ ${content.substring(0, 8e3)}
|
|
|
9337
9337
|
|
|
9338
9338
|
Return a JSON array of comments. Each comment must have:
|
|
9339
9339
|
- "exact": the exact text passage being commented on (quoted verbatim from source)
|
|
9340
|
-
- "
|
|
9341
|
-
- "
|
|
9342
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9343
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9340
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9341
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9344
9342
|
- "comment": your comment following the instructions above
|
|
9345
9343
|
|
|
9346
9344
|
Respond with a valid JSON array.
|
|
9347
9345
|
|
|
9348
9346
|
Example:
|
|
9349
9347
|
[
|
|
9350
|
-
{"exact": "the quarterly review meeting", "
|
|
9348
|
+
{"exact": "the quarterly review meeting", "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
|
|
9351
9349
|
]`;
|
|
9352
9350
|
} else {
|
|
9353
9351
|
const toneGuidance = tone ? `
|
|
@@ -9373,17 +9371,15 @@ ${content.substring(0, 8e3)}
|
|
|
9373
9371
|
|
|
9374
9372
|
Return a JSON array of comments. Each comment should have:
|
|
9375
9373
|
- "exact": the exact text passage being commented on (quoted verbatim from source)
|
|
9376
|
-
- "
|
|
9377
|
-
- "
|
|
9378
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9379
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9374
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9375
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9380
9376
|
- "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)
|
|
9381
9377
|
|
|
9382
9378
|
Respond with a valid JSON array.
|
|
9383
9379
|
|
|
9384
9380
|
Example format:
|
|
9385
9381
|
[
|
|
9386
|
-
{"exact": "Ouranos", "
|
|
9382
|
+
{"exact": "Ouranos", "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
|
|
9387
9383
|
]`;
|
|
9388
9384
|
}
|
|
9389
9385
|
return prompt;
|
|
@@ -9414,16 +9410,14 @@ ${content.substring(0, 8e3)}
|
|
|
9414
9410
|
|
|
9415
9411
|
Return a JSON array of highlights. Each highlight must have:
|
|
9416
9412
|
- "exact": the exact text passage to highlight (quoted verbatim from source)
|
|
9417
|
-
- "
|
|
9418
|
-
- "
|
|
9419
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9420
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9413
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9414
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9421
9415
|
|
|
9422
9416
|
Respond with a valid JSON array.
|
|
9423
9417
|
|
|
9424
9418
|
Example:
|
|
9425
9419
|
[
|
|
9426
|
-
{"exact": "revenue grew 45% year-over-year", "
|
|
9420
|
+
{"exact": "revenue grew 45% year-over-year", "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
|
|
9427
9421
|
]`;
|
|
9428
9422
|
} else {
|
|
9429
9423
|
const densityGuidance = density ? `
|
|
@@ -9447,16 +9441,14 @@ ${content.substring(0, 8e3)}
|
|
|
9447
9441
|
|
|
9448
9442
|
Return a JSON array of highlights. Each highlight should have:
|
|
9449
9443
|
- "exact": the exact text passage to highlight (quoted verbatim from source)
|
|
9450
|
-
- "
|
|
9451
|
-
- "
|
|
9452
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9453
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9444
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9445
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9454
9446
|
|
|
9455
9447
|
Respond with a valid JSON array.
|
|
9456
9448
|
|
|
9457
9449
|
Example format:
|
|
9458
9450
|
[
|
|
9459
|
-
{"exact": "we will discontinue support for legacy systems by March 2025", "
|
|
9451
|
+
{"exact": "we will discontinue support for legacy systems by March 2025", "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
|
|
9460
9452
|
]`;
|
|
9461
9453
|
}
|
|
9462
9454
|
return prompt;
|
|
@@ -9490,17 +9482,15 @@ ${content.substring(0, 8e3)}
|
|
|
9490
9482
|
|
|
9491
9483
|
Return a JSON array of assessments. Each assessment must have:
|
|
9492
9484
|
- "exact": the exact text passage being assessed (quoted verbatim from source)
|
|
9493
|
-
- "
|
|
9494
|
-
- "
|
|
9495
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9496
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9485
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9486
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9497
9487
|
- "assessment": your assessment following the instructions above
|
|
9498
9488
|
|
|
9499
9489
|
Respond with a valid JSON array.
|
|
9500
9490
|
|
|
9501
9491
|
Example:
|
|
9502
9492
|
[
|
|
9503
|
-
{"exact": "the quarterly revenue target", "
|
|
9493
|
+
{"exact": "the quarterly revenue target", "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
|
|
9504
9494
|
]`;
|
|
9505
9495
|
} else {
|
|
9506
9496
|
const toneGuidance = tone ? `
|
|
@@ -9526,17 +9516,15 @@ ${content.substring(0, 8e3)}
|
|
|
9526
9516
|
|
|
9527
9517
|
Return a JSON array of assessments. Each assessment should have:
|
|
9528
9518
|
- "exact": the exact text passage being assessed (quoted verbatim from source)
|
|
9529
|
-
- "
|
|
9530
|
-
- "
|
|
9531
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9532
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9519
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9520
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9533
9521
|
- "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)
|
|
9534
9522
|
|
|
9535
9523
|
Respond with a valid JSON array.
|
|
9536
9524
|
|
|
9537
9525
|
Example format:
|
|
9538
9526
|
[
|
|
9539
|
-
{"exact": "AI will replace most jobs by 2030", "
|
|
9527
|
+
{"exact": "AI will replace most jobs by 2030", "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
|
|
9540
9528
|
]`;
|
|
9541
9529
|
}
|
|
9542
9530
|
return prompt;
|
|
@@ -9582,17 +9570,15 @@ ${content}
|
|
|
9582
9570
|
|
|
9583
9571
|
Return a JSON array of tags. Each tag should have:
|
|
9584
9572
|
- "exact": the exact text passage (quoted verbatim from source)
|
|
9585
|
-
- "
|
|
9586
|
-
- "
|
|
9587
|
-
- "prefix": up to 32 characters of text immediately before the passage
|
|
9588
|
-
- "suffix": up to 32 characters of text immediately after the passage
|
|
9573
|
+
- "prefix": up to 64 characters of text immediately before the passage
|
|
9574
|
+
- "suffix": up to 64 characters of text immediately after the passage
|
|
9589
9575
|
|
|
9590
9576
|
Respond with a valid JSON array.
|
|
9591
9577
|
|
|
9592
9578
|
Example format:
|
|
9593
9579
|
[
|
|
9594
|
-
{"exact": "What duty did the defendant owe?", "
|
|
9595
|
-
{"exact": "In tort law, a duty of care is established when...", "
|
|
9580
|
+
{"exact": "What duty did the defendant owe?", "prefix": "The central question is: ", "suffix": " This question must be"},
|
|
9581
|
+
{"exact": "In tort law, a duty of care is established when...", "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
|
|
9596
9582
|
]`;
|
|
9597
9583
|
return prompt;
|
|
9598
9584
|
}
|
|
@@ -9660,23 +9646,29 @@ var MotivationParsers = class {
|
|
|
9660
9646
|
try {
|
|
9661
9647
|
const parsed = extractObjectsFromArray(response);
|
|
9662
9648
|
const valid = parsed.filter(
|
|
9663
|
-
(c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.
|
|
9649
|
+
(c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.comment === "string" && c.comment.trim().length > 0
|
|
9664
9650
|
);
|
|
9665
9651
|
console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
|
|
9666
9652
|
const validatedComments = [];
|
|
9667
9653
|
for (const comment of valid) {
|
|
9668
|
-
|
|
9669
|
-
|
|
9670
|
-
|
|
9671
|
-
|
|
9672
|
-
|
|
9673
|
-
|
|
9674
|
-
|
|
9675
|
-
|
|
9676
|
-
|
|
9677
|
-
|
|
9678
|
-
|
|
9679
|
-
|
|
9654
|
+
const reconciled = reconcileSelector(content, {
|
|
9655
|
+
exact: comment.exact,
|
|
9656
|
+
...typeof comment.prefix === "string" ? { prefix: comment.prefix } : {},
|
|
9657
|
+
...typeof comment.suffix === "string" ? { suffix: comment.suffix } : {}
|
|
9658
|
+
});
|
|
9659
|
+
if (!reconciled) {
|
|
9660
|
+
console.warn(`[MotivationParsers] Dropped hallucinated comment "${comment.exact}"`);
|
|
9661
|
+
continue;
|
|
9662
|
+
}
|
|
9663
|
+
logAnchorMethod("comment", comment.exact, reconciled.anchorMethod);
|
|
9664
|
+
validatedComments.push({
|
|
9665
|
+
comment: comment.comment,
|
|
9666
|
+
exact: reconciled.exact,
|
|
9667
|
+
start: reconciled.start,
|
|
9668
|
+
end: reconciled.end,
|
|
9669
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
9670
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
9671
|
+
});
|
|
9680
9672
|
}
|
|
9681
9673
|
return validatedComments;
|
|
9682
9674
|
} catch (error) {
|
|
@@ -9695,22 +9687,27 @@ var MotivationParsers = class {
|
|
|
9695
9687
|
try {
|
|
9696
9688
|
const parsed = extractObjectsFromArray(response);
|
|
9697
9689
|
const highlights = parsed.filter(
|
|
9698
|
-
(h) => !!h && typeof h === "object" && typeof h.exact === "string"
|
|
9690
|
+
(h) => !!h && typeof h === "object" && typeof h.exact === "string"
|
|
9699
9691
|
);
|
|
9700
9692
|
const validatedHighlights = [];
|
|
9701
9693
|
for (const highlight of highlights) {
|
|
9702
|
-
|
|
9703
|
-
|
|
9704
|
-
|
|
9705
|
-
|
|
9706
|
-
|
|
9707
|
-
|
|
9708
|
-
|
|
9709
|
-
|
|
9710
|
-
|
|
9711
|
-
|
|
9712
|
-
|
|
9713
|
-
|
|
9694
|
+
const reconciled = reconcileSelector(content, {
|
|
9695
|
+
exact: highlight.exact,
|
|
9696
|
+
...typeof highlight.prefix === "string" ? { prefix: highlight.prefix } : {},
|
|
9697
|
+
...typeof highlight.suffix === "string" ? { suffix: highlight.suffix } : {}
|
|
9698
|
+
});
|
|
9699
|
+
if (!reconciled) {
|
|
9700
|
+
console.warn(`[MotivationParsers] Dropped hallucinated highlight "${highlight.exact}"`);
|
|
9701
|
+
continue;
|
|
9702
|
+
}
|
|
9703
|
+
logAnchorMethod("highlight", highlight.exact, reconciled.anchorMethod);
|
|
9704
|
+
validatedHighlights.push({
|
|
9705
|
+
exact: reconciled.exact,
|
|
9706
|
+
start: reconciled.start,
|
|
9707
|
+
end: reconciled.end,
|
|
9708
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
9709
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
9710
|
+
});
|
|
9714
9711
|
}
|
|
9715
9712
|
return validatedHighlights;
|
|
9716
9713
|
} catch (error) {
|
|
@@ -9730,22 +9727,28 @@ var MotivationParsers = class {
|
|
|
9730
9727
|
try {
|
|
9731
9728
|
const parsed = extractObjectsFromArray(response);
|
|
9732
9729
|
const assessments = parsed.filter(
|
|
9733
|
-
(a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.
|
|
9730
|
+
(a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.assessment === "string"
|
|
9734
9731
|
);
|
|
9735
9732
|
const validatedAssessments = [];
|
|
9736
9733
|
for (const assessment of assessments) {
|
|
9737
|
-
|
|
9738
|
-
|
|
9739
|
-
|
|
9740
|
-
|
|
9741
|
-
|
|
9742
|
-
|
|
9743
|
-
|
|
9744
|
-
|
|
9745
|
-
|
|
9746
|
-
|
|
9747
|
-
|
|
9748
|
-
|
|
9734
|
+
const reconciled = reconcileSelector(content, {
|
|
9735
|
+
exact: assessment.exact,
|
|
9736
|
+
...typeof assessment.prefix === "string" ? { prefix: assessment.prefix } : {},
|
|
9737
|
+
...typeof assessment.suffix === "string" ? { suffix: assessment.suffix } : {}
|
|
9738
|
+
});
|
|
9739
|
+
if (!reconciled) {
|
|
9740
|
+
console.warn(`[MotivationParsers] Dropped hallucinated assessment "${assessment.exact}"`);
|
|
9741
|
+
continue;
|
|
9742
|
+
}
|
|
9743
|
+
logAnchorMethod("assessment", assessment.exact, reconciled.anchorMethod);
|
|
9744
|
+
validatedAssessments.push({
|
|
9745
|
+
assessment: assessment.assessment,
|
|
9746
|
+
exact: reconciled.exact,
|
|
9747
|
+
start: reconciled.start,
|
|
9748
|
+
end: reconciled.end,
|
|
9749
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
9750
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
9751
|
+
});
|
|
9749
9752
|
}
|
|
9750
9753
|
return validatedAssessments;
|
|
9751
9754
|
} catch (error) {
|
|
@@ -9755,17 +9758,15 @@ var MotivationParsers = class {
|
|
|
9755
9758
|
}
|
|
9756
9759
|
}
|
|
9757
9760
|
/**
|
|
9758
|
-
* Parse
|
|
9759
|
-
*
|
|
9760
|
-
*
|
|
9761
|
-
* @param response - Raw AI response string (may include markdown code fences)
|
|
9762
|
-
* @returns Array of tag matches (offsets not yet validated)
|
|
9761
|
+
* Parse the LLM's tag response into raw, pre-reconciliation tag inputs.
|
|
9762
|
+
* Reconciliation happens in `validateTagOffsets`, which adds `start`/`end`
|
|
9763
|
+
* by anchoring `exact` against the source content.
|
|
9763
9764
|
*/
|
|
9764
9765
|
static parseTags(response) {
|
|
9765
9766
|
try {
|
|
9766
9767
|
const parsed = extractObjectsFromArray(response);
|
|
9767
9768
|
const valid = parsed.filter(
|
|
9768
|
-
(t) => !!t && typeof t === "object" && typeof t.exact === "string" &&
|
|
9769
|
+
(t) => !!t && typeof t === "object" && typeof t.exact === "string" && t.exact.trim().length > 0
|
|
9769
9770
|
);
|
|
9770
9771
|
console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
|
|
9771
9772
|
return valid;
|
|
@@ -9775,34 +9776,38 @@ var MotivationParsers = class {
|
|
|
9775
9776
|
}
|
|
9776
9777
|
}
|
|
9777
9778
|
/**
|
|
9778
|
-
*
|
|
9779
|
-
* Helper for tag detection after initial parsing
|
|
9780
|
-
*
|
|
9781
|
-
* @param tags - Parsed tags without validated offsets
|
|
9782
|
-
* @param content - Original content to validate against
|
|
9783
|
-
* @param category - Category to assign to validated tags
|
|
9784
|
-
* @returns Array of validated tag matches
|
|
9779
|
+
* Anchor raw tag inputs against source content and add category.
|
|
9785
9780
|
*/
|
|
9786
9781
|
static validateTagOffsets(tags, content, category) {
|
|
9787
9782
|
const validatedTags = [];
|
|
9788
9783
|
for (const tag of tags) {
|
|
9789
|
-
|
|
9790
|
-
|
|
9791
|
-
|
|
9792
|
-
|
|
9793
|
-
|
|
9794
|
-
|
|
9795
|
-
|
|
9796
|
-
|
|
9797
|
-
|
|
9798
|
-
|
|
9799
|
-
|
|
9800
|
-
|
|
9801
|
-
|
|
9784
|
+
const reconciled = reconcileSelector(content, {
|
|
9785
|
+
exact: tag.exact,
|
|
9786
|
+
...typeof tag.prefix === "string" ? { prefix: tag.prefix } : {},
|
|
9787
|
+
...typeof tag.suffix === "string" ? { suffix: tag.suffix } : {}
|
|
9788
|
+
});
|
|
9789
|
+
if (!reconciled) {
|
|
9790
|
+
console.warn(`[MotivationParsers] Dropped hallucinated tag "${tag.exact}" for category "${category}"`);
|
|
9791
|
+
continue;
|
|
9792
|
+
}
|
|
9793
|
+
logAnchorMethod("tag", tag.exact, reconciled.anchorMethod);
|
|
9794
|
+
validatedTags.push({
|
|
9795
|
+
category,
|
|
9796
|
+
exact: reconciled.exact,
|
|
9797
|
+
start: reconciled.start,
|
|
9798
|
+
end: reconciled.end,
|
|
9799
|
+
...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
|
|
9800
|
+
...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
|
|
9801
|
+
});
|
|
9802
9802
|
}
|
|
9803
9803
|
return validatedTags;
|
|
9804
9804
|
}
|
|
9805
9805
|
};
|
|
9806
|
+
function logAnchorMethod(motivation, exact, anchorMethod) {
|
|
9807
|
+
if (anchorMethod === "first-of-many" || anchorMethod === "fuzzy-match") {
|
|
9808
|
+
console.warn(`[MotivationParsers] ${motivation} anchored via ${anchorMethod}: "${exact}"`);
|
|
9809
|
+
}
|
|
9810
|
+
}
|
|
9806
9811
|
|
|
9807
9812
|
// src/workers/annotation-detection.ts
|
|
9808
9813
|
var AnnotationDetection = class {
|
|
@@ -9930,17 +9935,15 @@ ${exact}
|
|
|
9930
9935
|
"""
|
|
9931
9936
|
|
|
9932
9937
|
Respond with a JSON array of entities found. Each entity should have:
|
|
9933
|
-
- exact: the exact text span from the input
|
|
9938
|
+
- exact: the exact text span from the input (quoted verbatim \u2014 character-for-character)
|
|
9934
9939
|
- entityType: one of the provided entity types
|
|
9935
|
-
-
|
|
9936
|
-
-
|
|
9937
|
-
- prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
|
|
9938
|
-
- suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)
|
|
9940
|
+
- prefix: up to 64 characters of text immediately before the entity (used to disambiguate when the same text appears more than once)
|
|
9941
|
+
- suffix: up to 64 characters of text immediately after the entity (same purpose)
|
|
9939
9942
|
|
|
9940
9943
|
If no entities are found, respond with an empty array [].
|
|
9941
9944
|
|
|
9942
9945
|
Example output:
|
|
9943
|
-
[{"exact":"Alice","entityType":"Person","
|
|
9946
|
+
[{"exact":"Alice","entityType":"Person","prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","prefix":"went to ","suffix":" yesterday"}]`;
|
|
9944
9947
|
logger2.debug("Sending entity extraction request", { entityTypes: entityTypesDescription });
|
|
9945
9948
|
const response = await client.generateTextWithMetadata(
|
|
9946
9949
|
prompt,
|
|
@@ -9969,151 +9972,18 @@ Example output:
|
|
|
9969
9972
|
logger2.error(errorMsg);
|
|
9970
9973
|
throw new Error(errorMsg);
|
|
9971
9974
|
}
|
|
9972
|
-
return entities.
|
|
9973
|
-
|
|
9974
|
-
|
|
9975
|
-
|
|
9976
|
-
|
|
9977
|
-
|
|
9978
|
-
|
|
9979
|
-
|
|
9980
|
-
|
|
9981
|
-
}
|
|
9982
|
-
|
|
9983
|
-
|
|
9984
|
-
if (extractedText === entity.exact) {
|
|
9985
|
-
anchorMethod = "llm-exact";
|
|
9986
|
-
logger2.debug("Entity anchored", {
|
|
9987
|
-
text: entity.exact,
|
|
9988
|
-
entityType: entity.entityType,
|
|
9989
|
-
anchorMethod
|
|
9990
|
-
});
|
|
9991
|
-
} else {
|
|
9992
|
-
logger2.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
|
|
9993
|
-
expected: entity.exact,
|
|
9994
|
-
llmOffsets: `[${start}:${end}]`,
|
|
9995
|
-
foundAtLlmOffsets: extractedText
|
|
9996
|
-
});
|
|
9997
|
-
let occurrenceCount = 0;
|
|
9998
|
-
let firstOccurrence = -1;
|
|
9999
|
-
let searchPos = 0;
|
|
10000
|
-
while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
|
|
10001
|
-
if (firstOccurrence === -1) firstOccurrence = searchPos;
|
|
10002
|
-
occurrenceCount++;
|
|
10003
|
-
searchPos++;
|
|
10004
|
-
}
|
|
10005
|
-
if (occurrenceCount === 0) {
|
|
10006
|
-
anchorMethod = "dropped";
|
|
10007
|
-
logger2.error("Entity text not found in resource \u2014 dropping", {
|
|
10008
|
-
text: entity.exact,
|
|
10009
|
-
entityType: entity.entityType,
|
|
10010
|
-
llmOffsets: `[${start}:${end}]`,
|
|
10011
|
-
anchorMethod,
|
|
10012
|
-
resourceStart: exact.substring(0, 200)
|
|
10013
|
-
});
|
|
10014
|
-
return null;
|
|
10015
|
-
}
|
|
10016
|
-
let recoveredOffset = -1;
|
|
10017
|
-
if (entity.prefix || entity.suffix) {
|
|
10018
|
-
let p = 0;
|
|
10019
|
-
while ((p = exact.indexOf(entity.exact, p)) !== -1) {
|
|
10020
|
-
const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
|
|
10021
|
-
const candidateSuffix = exact.substring(
|
|
10022
|
-
p + entity.exact.length,
|
|
10023
|
-
Math.min(exact.length, p + entity.exact.length + 32)
|
|
10024
|
-
);
|
|
10025
|
-
const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
|
|
10026
|
-
const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
|
|
10027
|
-
if (prefixMatch && suffixMatch) {
|
|
10028
|
-
recoveredOffset = p;
|
|
10029
|
-
break;
|
|
10030
|
-
}
|
|
10031
|
-
p++;
|
|
10032
|
-
}
|
|
10033
|
-
}
|
|
10034
|
-
if (recoveredOffset !== -1) {
|
|
10035
|
-
anchorMethod = "context-recovered";
|
|
10036
|
-
start = recoveredOffset;
|
|
10037
|
-
end = recoveredOffset + entity.exact.length;
|
|
10038
|
-
logger2.debug("Entity anchored", {
|
|
10039
|
-
text: entity.exact,
|
|
10040
|
-
entityType: entity.entityType,
|
|
10041
|
-
anchorMethod,
|
|
10042
|
-
offsetDiff: recoveredOffset - entity.startOffset
|
|
10043
|
-
});
|
|
10044
|
-
} else if (occurrenceCount === 1) {
|
|
10045
|
-
anchorMethod = "unique-match";
|
|
10046
|
-
start = firstOccurrence;
|
|
10047
|
-
end = firstOccurrence + entity.exact.length;
|
|
10048
|
-
logger2.debug("Entity anchored", {
|
|
10049
|
-
text: entity.exact,
|
|
10050
|
-
entityType: entity.entityType,
|
|
10051
|
-
anchorMethod,
|
|
10052
|
-
offsetDiff: firstOccurrence - entity.startOffset
|
|
10053
|
-
});
|
|
10054
|
-
} else {
|
|
10055
|
-
anchorMethod = "first-of-many";
|
|
10056
|
-
start = firstOccurrence;
|
|
10057
|
-
end = firstOccurrence + entity.exact.length;
|
|
10058
|
-
logger2.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
|
|
10059
|
-
text: entity.exact,
|
|
10060
|
-
entityType: entity.entityType,
|
|
10061
|
-
anchorMethod,
|
|
10062
|
-
occurrenceCount,
|
|
10063
|
-
chosenOffset: firstOccurrence,
|
|
10064
|
-
llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
|
|
10065
|
-
hasPrefix: !!entity.prefix,
|
|
10066
|
-
hasSuffix: !!entity.suffix
|
|
10067
|
-
});
|
|
10068
|
-
}
|
|
10069
|
-
}
|
|
10070
|
-
return {
|
|
10071
|
-
exact: entity.exact,
|
|
10072
|
-
entityType: entity.entityType,
|
|
10073
|
-
start,
|
|
10074
|
-
end,
|
|
10075
|
-
prefix: entity.prefix,
|
|
10076
|
-
suffix: entity.suffix
|
|
10077
|
-
};
|
|
10078
|
-
}).filter((entity) => {
|
|
10079
|
-
if (entity === null) {
|
|
10080
|
-
logger2.debug("Filtered entity: null");
|
|
10081
|
-
return false;
|
|
10082
|
-
}
|
|
10083
|
-
if (entity.start === void 0 || entity.end === void 0) {
|
|
10084
|
-
logger2.warn("Filtered entity: missing offsets", { text: entity.exact });
|
|
10085
|
-
return false;
|
|
10086
|
-
}
|
|
10087
|
-
if (entity.start < 0) {
|
|
10088
|
-
logger2.warn("Filtered entity: negative start", {
|
|
10089
|
-
text: entity.exact,
|
|
10090
|
-
start: entity.start
|
|
10091
|
-
});
|
|
10092
|
-
return false;
|
|
10093
|
-
}
|
|
10094
|
-
if (entity.end > exact.length) {
|
|
10095
|
-
logger2.warn("Filtered entity: end exceeds text length", {
|
|
10096
|
-
text: entity.exact,
|
|
10097
|
-
end: entity.end,
|
|
10098
|
-
textLength: exact.length
|
|
10099
|
-
});
|
|
10100
|
-
return false;
|
|
10101
|
-
}
|
|
10102
|
-
const extractedText = exact.substring(entity.start, entity.end);
|
|
10103
|
-
if (extractedText !== entity.exact) {
|
|
10104
|
-
logger2.warn("Filtered entity: offset mismatch", {
|
|
10105
|
-
expected: entity.exact,
|
|
10106
|
-
got: extractedText,
|
|
10107
|
-
offsets: `[${entity.start}:${entity.end}]`
|
|
10108
|
-
});
|
|
10109
|
-
return false;
|
|
10110
|
-
}
|
|
10111
|
-
logger2.debug("Accepted entity", {
|
|
10112
|
-
text: entity.exact,
|
|
10113
|
-
offsets: `[${entity.start}:${entity.end}]`
|
|
10114
|
-
});
|
|
10115
|
-
return true;
|
|
10116
|
-
});
|
|
9975
|
+
return entities.filter((e) => {
|
|
9976
|
+
const ok = e && typeof e === "object" && typeof e.exact === "string" && typeof e.entityType === "string";
|
|
9977
|
+
if (!ok) {
|
|
9978
|
+
logger2.debug("Dropped malformed LLM entity", { entity: e });
|
|
9979
|
+
}
|
|
9980
|
+
return ok;
|
|
9981
|
+
}).map((entity) => ({
|
|
9982
|
+
exact: entity.exact,
|
|
9983
|
+
entityType: entity.entityType,
|
|
9984
|
+
...typeof entity.prefix === "string" ? { prefix: entity.prefix } : {},
|
|
9985
|
+
...typeof entity.suffix === "string" ? { suffix: entity.suffix } : {}
|
|
9986
|
+
}));
|
|
10117
9987
|
} catch (error) {
|
|
10118
9988
|
logger2.error("Failed to parse entity extraction response", {
|
|
10119
9989
|
error: error instanceof Error ? error.message : String(error)
|
|
@@ -10249,7 +10119,59 @@ Requirements:
|
|
|
10249
10119
|
});
|
|
10250
10120
|
return result;
|
|
10251
10121
|
}
|
|
10252
|
-
function
|
|
10122
|
+
function toMatch(r) {
|
|
10123
|
+
return {
|
|
10124
|
+
exact: r.exact,
|
|
10125
|
+
start: r.start,
|
|
10126
|
+
end: r.end,
|
|
10127
|
+
...r.prefix !== void 0 ? { prefix: r.prefix } : {},
|
|
10128
|
+
...r.suffix !== void 0 ? { suffix: r.suffix } : {}
|
|
10129
|
+
};
|
|
10130
|
+
}
|
|
10131
|
+
function annotationDedupeKey(ann) {
|
|
10132
|
+
const target = ann.target;
|
|
10133
|
+
const selectors = Array.isArray(target?.selector) ? target.selector : [];
|
|
10134
|
+
const pos = selectors.find((s) => s.type === "TextPositionSelector");
|
|
10135
|
+
return [
|
|
10136
|
+
ann.motivation,
|
|
10137
|
+
pos?.start ?? "?",
|
|
10138
|
+
pos?.end ?? "?",
|
|
10139
|
+
JSON.stringify(ann.body ?? null)
|
|
10140
|
+
].join("|");
|
|
10141
|
+
}
|
|
10142
|
+
function dedupeAnnotations(annotations) {
|
|
10143
|
+
const seen = /* @__PURE__ */ new Set();
|
|
10144
|
+
const out = [];
|
|
10145
|
+
for (const ann of annotations) {
|
|
10146
|
+
const key = annotationDedupeKey(ann);
|
|
10147
|
+
if (seen.has(key)) continue;
|
|
10148
|
+
seen.add(key);
|
|
10149
|
+
out.push(ann);
|
|
10150
|
+
}
|
|
10151
|
+
return out;
|
|
10152
|
+
}
|
|
10153
|
+
function buildTextAnnotation(content, resourceId, userId, generator, motivation, match, body) {
|
|
10154
|
+
if (content.substring(match.start, match.end) !== match.exact) {
|
|
10155
|
+
throw new Error(
|
|
10156
|
+
`buildTextAnnotation invariant: content.substring(${match.start}, ${match.end}) !== exact for resource ${resourceId}, motivation ${motivation}`
|
|
10157
|
+
);
|
|
10158
|
+
}
|
|
10159
|
+
if (match.prefix !== void 0) {
|
|
10160
|
+
const actualPrefix = content.substring(Math.max(0, match.start - match.prefix.length), match.start);
|
|
10161
|
+
if (actualPrefix !== match.prefix) {
|
|
10162
|
+
throw new Error(
|
|
10163
|
+
`buildTextAnnotation invariant: content prefix-slice !== prefix for resource ${resourceId}, motivation ${motivation}`
|
|
10164
|
+
);
|
|
10165
|
+
}
|
|
10166
|
+
}
|
|
10167
|
+
if (match.suffix !== void 0) {
|
|
10168
|
+
const actualSuffix = content.substring(match.end, Math.min(content.length, match.end + match.suffix.length));
|
|
10169
|
+
if (actualSuffix !== match.suffix) {
|
|
10170
|
+
throw new Error(
|
|
10171
|
+
`buildTextAnnotation invariant: content suffix-slice !== suffix for resource ${resourceId}, motivation ${motivation}`
|
|
10172
|
+
);
|
|
10173
|
+
}
|
|
10174
|
+
}
|
|
10253
10175
|
const creator = didToAgent(userId);
|
|
10254
10176
|
const wasAttributedTo = creator["@id"] === generator["@id"] ? [generator] : [creator, generator];
|
|
10255
10177
|
return {
|
|
@@ -10288,9 +10210,9 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
|
|
|
10288
10210
|
params.sourceLanguage
|
|
10289
10211
|
);
|
|
10290
10212
|
onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
|
|
10291
|
-
const annotations = highlights.map(
|
|
10292
|
-
(h) => buildTextAnnotation(params.resourceId, userId, generator, "highlighting", h)
|
|
10293
|
-
);
|
|
10213
|
+
const annotations = dedupeAnnotations(highlights.map(
|
|
10214
|
+
(h) => buildTextAnnotation(content, params.resourceId, userId, generator, "highlighting", h)
|
|
10215
|
+
));
|
|
10294
10216
|
onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
|
|
10295
10217
|
return {
|
|
10296
10218
|
annotations,
|
|
@@ -10311,16 +10233,16 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
|
|
|
10311
10233
|
);
|
|
10312
10234
|
onProgress(60, `Creating ${comments.length} annotations...`, "creating");
|
|
10313
10235
|
const bodyLanguage = params.language ?? "en";
|
|
10314
|
-
const annotations = comments.map(
|
|
10236
|
+
const annotations = dedupeAnnotations(comments.map(
|
|
10315
10237
|
(c) => (
|
|
10316
10238
|
// Match the pre-#651 CommentAnnotationWorker: include format and
|
|
10317
10239
|
// language on the body TextualBody. Optional in the schema, but
|
|
10318
10240
|
// consumers that do language-aware rendering rely on them.
|
|
10319
|
-
buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
|
|
10241
|
+
buildTextAnnotation(content, params.resourceId, userId, generator, "commenting", c, [
|
|
10320
10242
|
{ type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
|
|
10321
10243
|
])
|
|
10322
10244
|
)
|
|
10323
|
-
);
|
|
10245
|
+
));
|
|
10324
10246
|
onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
|
|
10325
10247
|
return {
|
|
10326
10248
|
annotations,
|
|
@@ -10341,7 +10263,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
10341
10263
|
);
|
|
10342
10264
|
onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
|
|
10343
10265
|
const bodyLanguage = params.language ?? "en";
|
|
10344
|
-
const annotations = assessments.map(
|
|
10266
|
+
const annotations = dedupeAnnotations(assessments.map(
|
|
10345
10267
|
(a) => (
|
|
10346
10268
|
// Single-object body with purpose aligned to motivation, matching the
|
|
10347
10269
|
// pre-#651 AssessmentAnnotationWorker's shape and the majority of
|
|
@@ -10349,7 +10271,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
10349
10271
|
// purpose='describing' — that loses the "this is an assessment, not
|
|
10350
10272
|
// a description" signal and breaks existing readers that access
|
|
10351
10273
|
// `body.value` directly on the object.
|
|
10352
|
-
buildTextAnnotation(params.resourceId, userId, generator, "assessing", a, {
|
|
10274
|
+
buildTextAnnotation(content, params.resourceId, userId, generator, "assessing", a, {
|
|
10353
10275
|
type: "TextualBody",
|
|
10354
10276
|
value: a.assessment,
|
|
10355
10277
|
purpose: "assessing",
|
|
@@ -10357,7 +10279,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
|
|
|
10357
10279
|
language: bodyLanguage
|
|
10358
10280
|
})
|
|
10359
10281
|
)
|
|
10360
|
-
);
|
|
10282
|
+
));
|
|
10361
10283
|
onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
|
|
10362
10284
|
return {
|
|
10363
10285
|
annotations,
|
|
@@ -10401,27 +10323,44 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
|
|
|
10401
10323
|
{ type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
|
|
10402
10324
|
];
|
|
10403
10325
|
for (const entity of extractedEntities) {
|
|
10404
|
-
|
|
10405
|
-
|
|
10406
|
-
|
|
10407
|
-
|
|
10408
|
-
|
|
10409
|
-
|
|
10410
|
-
|
|
10411
|
-
|
|
10412
|
-
|
|
10413
|
-
);
|
|
10414
|
-
allAnnotations.push(ann);
|
|
10415
|
-
totalEmitted++;
|
|
10416
|
-
} catch {
|
|
10326
|
+
const reconciled = reconcileSelector(content, {
|
|
10327
|
+
exact: entity.exact,
|
|
10328
|
+
...entity.prefix !== void 0 ? { prefix: entity.prefix } : {},
|
|
10329
|
+
...entity.suffix !== void 0 ? { suffix: entity.suffix } : {}
|
|
10330
|
+
});
|
|
10331
|
+
if (!reconciled) {
|
|
10332
|
+
logger2.error("Entity dropped \u2014 text not found in source", {
|
|
10333
|
+
text: entity.exact,
|
|
10334
|
+
entityType: entity.entityType
|
|
10335
|
+
});
|
|
10417
10336
|
errors++;
|
|
10337
|
+
continue;
|
|
10338
|
+
}
|
|
10339
|
+
if (reconciled.anchorMethod === "first-of-many" || reconciled.anchorMethod === "fuzzy-match") {
|
|
10340
|
+
logger2.warn("Entity anchored via degraded method", {
|
|
10341
|
+
text: entity.exact,
|
|
10342
|
+
entityType: entity.entityType,
|
|
10343
|
+
anchorMethod: reconciled.anchorMethod
|
|
10344
|
+
});
|
|
10418
10345
|
}
|
|
10346
|
+
const ann = buildTextAnnotation(
|
|
10347
|
+
content,
|
|
10348
|
+
params.resourceId,
|
|
10349
|
+
userId,
|
|
10350
|
+
generator,
|
|
10351
|
+
"linking",
|
|
10352
|
+
toMatch(reconciled),
|
|
10353
|
+
unresolvedBody
|
|
10354
|
+
);
|
|
10355
|
+
allAnnotations.push(ann);
|
|
10356
|
+
totalEmitted++;
|
|
10419
10357
|
}
|
|
10420
10358
|
}
|
|
10421
|
-
|
|
10359
|
+
const annotations = dedupeAnnotations(allAnnotations);
|
|
10360
|
+
onProgress(100, `Complete! Created ${annotations.length} references`, "creating");
|
|
10422
10361
|
return {
|
|
10423
|
-
annotations
|
|
10424
|
-
result: { totalFound, totalEmitted, errors }
|
|
10362
|
+
annotations,
|
|
10363
|
+
result: { totalFound, totalEmitted: annotations.length, errors }
|
|
10425
10364
|
};
|
|
10426
10365
|
}
|
|
10427
10366
|
async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
|
|
@@ -10441,15 +10380,19 @@ async function processTagJob(content, inferenceClient, params, userId, generator
|
|
|
10441
10380
|
const tags = allTags;
|
|
10442
10381
|
onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
|
|
10443
10382
|
const bodyLanguage = params.language ?? "en";
|
|
10444
|
-
const
|
|
10445
|
-
const annotations = tags.map((t) => {
|
|
10383
|
+
const annotations = dedupeAnnotations(tags.map((t) => {
|
|
10446
10384
|
const category = t.category ?? "unknown";
|
|
10447
|
-
|
|
10448
|
-
return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
|
|
10385
|
+
return buildTextAnnotation(content, params.resourceId, userId, generator, "tagging", t, [
|
|
10449
10386
|
{ type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
|
|
10450
10387
|
{ type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
|
|
10451
10388
|
]);
|
|
10452
|
-
});
|
|
10389
|
+
}));
|
|
10390
|
+
const byCategory = {};
|
|
10391
|
+
for (const ann of annotations) {
|
|
10392
|
+
const body = ann.body;
|
|
10393
|
+
const category = Array.isArray(body) && typeof body[0]?.value === "string" ? body[0].value : "unknown";
|
|
10394
|
+
byCategory[category] = (byCategory[category] ?? 0) + 1;
|
|
10395
|
+
}
|
|
10453
10396
|
onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
|
|
10454
10397
|
return {
|
|
10455
10398
|
annotations,
|
|
@@ -10487,10 +10430,7 @@ async function processGenerationJob(inferenceClient, params, onProgress, logger2
|
|
|
10487
10430
|
|
|
10488
10431
|
// src/worker-process.ts
|
|
10489
10432
|
async function emitEvent(session, channel, payload) {
|
|
10490
|
-
|
|
10491
|
-
const rawScope = isBroadcast ? payload.resourceId : void 0;
|
|
10492
|
-
const resourceScope = rawScope ? resourceId(rawScope) : void 0;
|
|
10493
|
-
await session.client.transport.emit(channel, payload, resourceScope);
|
|
10433
|
+
await session.client.transport.emit(channel, payload);
|
|
10494
10434
|
}
|
|
10495
10435
|
function startWorkerProcess(config) {
|
|
10496
10436
|
const { session, logger: logger2 } = config;
|