@semiont/jobs 0.5.5 → 0.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- import { createTomlConfigLoader, softwareToAgent, baseUrl, RESOURCE_BROADCAST_TYPES, resourceId, validateAndCorrectOffsets, didToAgent, getLocaleEnglishName } from '@semiont/core';
2
1
  import { deriveStorageUri } from '@semiont/content';
3
2
  import { withSpan, SpanKind, recordJobOutcome } from '@semiont/observability';
3
+ import { createTomlConfigLoader, softwareToAgent, baseUrl, reconcileSelector, didToAgent, getLocaleEnglishName } from '@semiont/core';
4
4
  import { generateAnnotationId } from '@semiont/event-sourcing';
5
5
  import { createInferenceClient } from '@semiont/inference';
6
6
  import { createServer } from 'http';
@@ -9337,17 +9337,15 @@ ${content.substring(0, 8e3)}
9337
9337
 
9338
9338
  Return a JSON array of comments. Each comment must have:
9339
9339
  - "exact": the exact text passage being commented on (quoted verbatim from source)
9340
- - "start": character offset where the passage starts
9341
- - "end": character offset where the passage ends
9342
- - "prefix": up to 32 characters of text immediately before the passage
9343
- - "suffix": up to 32 characters of text immediately after the passage
9340
+ - "prefix": up to 64 characters of text immediately before the passage
9341
+ - "suffix": up to 64 characters of text immediately after the passage
9344
9342
  - "comment": your comment following the instructions above
9345
9343
 
9346
9344
  Respond with a valid JSON array.
9347
9345
 
9348
9346
  Example:
9349
9347
  [
9350
- {"exact": "the quarterly review meeting", "start": 142, "end": 169, "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
9348
+ {"exact": "the quarterly review meeting", "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
9351
9349
  ]`;
9352
9350
  } else {
9353
9351
  const toneGuidance = tone ? `
@@ -9373,17 +9371,15 @@ ${content.substring(0, 8e3)}
9373
9371
 
9374
9372
  Return a JSON array of comments. Each comment should have:
9375
9373
  - "exact": the exact text passage being commented on (quoted verbatim from source)
9376
- - "start": character offset where the passage starts
9377
- - "end": character offset where the passage ends
9378
- - "prefix": up to 32 characters of text immediately before the passage
9379
- - "suffix": up to 32 characters of text immediately after the passage
9374
+ - "prefix": up to 64 characters of text immediately before the passage
9375
+ - "suffix": up to 64 characters of text immediately after the passage
9380
9376
  - "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)
9381
9377
 
9382
9378
  Respond with a valid JSON array.
9383
9379
 
9384
9380
  Example format:
9385
9381
  [
9386
- {"exact": "Ouranos", "start": 52, "end": 59, "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
9382
+ {"exact": "Ouranos", "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
9387
9383
  ]`;
9388
9384
  }
9389
9385
  return prompt;
@@ -9414,16 +9410,14 @@ ${content.substring(0, 8e3)}
9414
9410
 
9415
9411
  Return a JSON array of highlights. Each highlight must have:
9416
9412
  - "exact": the exact text passage to highlight (quoted verbatim from source)
9417
- - "start": character offset where the passage starts
9418
- - "end": character offset where the passage ends
9419
- - "prefix": up to 32 characters of text immediately before the passage
9420
- - "suffix": up to 32 characters of text immediately after the passage
9413
+ - "prefix": up to 64 characters of text immediately before the passage
9414
+ - "suffix": up to 64 characters of text immediately after the passage
9421
9415
 
9422
9416
  Respond with a valid JSON array.
9423
9417
 
9424
9418
  Example:
9425
9419
  [
9426
- {"exact": "revenue grew 45% year-over-year", "start": 142, "end": 174, "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
9420
+ {"exact": "revenue grew 45% year-over-year", "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
9427
9421
  ]`;
9428
9422
  } else {
9429
9423
  const densityGuidance = density ? `
@@ -9447,16 +9441,14 @@ ${content.substring(0, 8e3)}
9447
9441
 
9448
9442
  Return a JSON array of highlights. Each highlight should have:
9449
9443
  - "exact": the exact text passage to highlight (quoted verbatim from source)
9450
- - "start": character offset where the passage starts
9451
- - "end": character offset where the passage ends
9452
- - "prefix": up to 32 characters of text immediately before the passage
9453
- - "suffix": up to 32 characters of text immediately after the passage
9444
+ - "prefix": up to 64 characters of text immediately before the passage
9445
+ - "suffix": up to 64 characters of text immediately after the passage
9454
9446
 
9455
9447
  Respond with a valid JSON array.
9456
9448
 
9457
9449
  Example format:
9458
9450
  [
9459
- {"exact": "we will discontinue support for legacy systems by March 2025", "start": 52, "end": 113, "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
9451
+ {"exact": "we will discontinue support for legacy systems by March 2025", "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
9460
9452
  ]`;
9461
9453
  }
9462
9454
  return prompt;
@@ -9490,17 +9482,15 @@ ${content.substring(0, 8e3)}
9490
9482
 
9491
9483
  Return a JSON array of assessments. Each assessment must have:
9492
9484
  - "exact": the exact text passage being assessed (quoted verbatim from source)
9493
- - "start": character offset where the passage starts
9494
- - "end": character offset where the passage ends
9495
- - "prefix": up to 32 characters of text immediately before the passage
9496
- - "suffix": up to 32 characters of text immediately after the passage
9485
+ - "prefix": up to 64 characters of text immediately before the passage
9486
+ - "suffix": up to 64 characters of text immediately after the passage
9497
9487
  - "assessment": your assessment following the instructions above
9498
9488
 
9499
9489
  Respond with a valid JSON array.
9500
9490
 
9501
9491
  Example:
9502
9492
  [
9503
- {"exact": "the quarterly revenue target", "start": 142, "end": 169, "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
9493
+ {"exact": "the quarterly revenue target", "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
9504
9494
  ]`;
9505
9495
  } else {
9506
9496
  const toneGuidance = tone ? `
@@ -9526,17 +9516,15 @@ ${content.substring(0, 8e3)}
9526
9516
 
9527
9517
  Return a JSON array of assessments. Each assessment should have:
9528
9518
  - "exact": the exact text passage being assessed (quoted verbatim from source)
9529
- - "start": character offset where the passage starts
9530
- - "end": character offset where the passage ends
9531
- - "prefix": up to 32 characters of text immediately before the passage
9532
- - "suffix": up to 32 characters of text immediately after the passage
9519
+ - "prefix": up to 64 characters of text immediately before the passage
9520
+ - "suffix": up to 64 characters of text immediately after the passage
9533
9521
  - "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)
9534
9522
 
9535
9523
  Respond with a valid JSON array.
9536
9524
 
9537
9525
  Example format:
9538
9526
  [
9539
- {"exact": "AI will replace most jobs by 2030", "start": 52, "end": 89, "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
9527
+ {"exact": "AI will replace most jobs by 2030", "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
9540
9528
  ]`;
9541
9529
  }
9542
9530
  return prompt;
@@ -9582,17 +9570,15 @@ ${content}
9582
9570
 
9583
9571
  Return a JSON array of tags. Each tag should have:
9584
9572
  - "exact": the exact text passage (quoted verbatim from source)
9585
- - "start": character offset where the passage starts
9586
- - "end": character offset where the passage ends
9587
- - "prefix": up to 32 characters of text immediately before the passage
9588
- - "suffix": up to 32 characters of text immediately after the passage
9573
+ - "prefix": up to 64 characters of text immediately before the passage
9574
+ - "suffix": up to 64 characters of text immediately after the passage
9589
9575
 
9590
9576
  Respond with a valid JSON array.
9591
9577
 
9592
9578
  Example format:
9593
9579
  [
9594
- {"exact": "What duty did the defendant owe?", "start": 142, "end": 175, "prefix": "The central question is: ", "suffix": " This question must be"},
9595
- {"exact": "In tort law, a duty of care is established when...", "start": 412, "end": 520, "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
9580
+ {"exact": "What duty did the defendant owe?", "prefix": "The central question is: ", "suffix": " This question must be"},
9581
+ {"exact": "In tort law, a duty of care is established when...", "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
9596
9582
  ]`;
9597
9583
  return prompt;
9598
9584
  }
@@ -9660,23 +9646,29 @@ var MotivationParsers = class {
9660
9646
  try {
9661
9647
  const parsed = extractObjectsFromArray(response);
9662
9648
  const valid = parsed.filter(
9663
- (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.start === "number" && typeof c.end === "number" && typeof c.comment === "string" && c.comment.trim().length > 0
9649
+ (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.comment === "string" && c.comment.trim().length > 0
9664
9650
  );
9665
9651
  console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
9666
9652
  const validatedComments = [];
9667
9653
  for (const comment of valid) {
9668
- try {
9669
- const validated = validateAndCorrectOffsets(content, comment.start, comment.end, comment.exact);
9670
- validatedComments.push({
9671
- ...comment,
9672
- start: validated.start,
9673
- end: validated.end,
9674
- prefix: validated.prefix,
9675
- suffix: validated.suffix
9676
- });
9677
- } catch (error) {
9678
- console.warn(`[MotivationParsers] Skipping invalid comment "${comment.exact}":`, error);
9679
- }
9654
+ const reconciled = reconcileSelector(content, {
9655
+ exact: comment.exact,
9656
+ ...typeof comment.prefix === "string" ? { prefix: comment.prefix } : {},
9657
+ ...typeof comment.suffix === "string" ? { suffix: comment.suffix } : {}
9658
+ });
9659
+ if (!reconciled) {
9660
+ console.warn(`[MotivationParsers] Dropped hallucinated comment "${comment.exact}"`);
9661
+ continue;
9662
+ }
9663
+ logAnchorMethod("comment", comment.exact, reconciled.anchorMethod);
9664
+ validatedComments.push({
9665
+ comment: comment.comment,
9666
+ exact: reconciled.exact,
9667
+ start: reconciled.start,
9668
+ end: reconciled.end,
9669
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9670
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9671
+ });
9680
9672
  }
9681
9673
  return validatedComments;
9682
9674
  } catch (error) {
@@ -9695,22 +9687,27 @@ var MotivationParsers = class {
9695
9687
  try {
9696
9688
  const parsed = extractObjectsFromArray(response);
9697
9689
  const highlights = parsed.filter(
9698
- (h) => !!h && typeof h === "object" && typeof h.exact === "string" && typeof h.start === "number" && typeof h.end === "number"
9690
+ (h) => !!h && typeof h === "object" && typeof h.exact === "string"
9699
9691
  );
9700
9692
  const validatedHighlights = [];
9701
9693
  for (const highlight of highlights) {
9702
- try {
9703
- const validated = validateAndCorrectOffsets(content, highlight.start, highlight.end, highlight.exact);
9704
- validatedHighlights.push({
9705
- ...highlight,
9706
- start: validated.start,
9707
- end: validated.end,
9708
- prefix: validated.prefix,
9709
- suffix: validated.suffix
9710
- });
9711
- } catch (error) {
9712
- console.warn(`[MotivationParsers] Skipping invalid highlight "${highlight.exact}":`, error);
9713
- }
9694
+ const reconciled = reconcileSelector(content, {
9695
+ exact: highlight.exact,
9696
+ ...typeof highlight.prefix === "string" ? { prefix: highlight.prefix } : {},
9697
+ ...typeof highlight.suffix === "string" ? { suffix: highlight.suffix } : {}
9698
+ });
9699
+ if (!reconciled) {
9700
+ console.warn(`[MotivationParsers] Dropped hallucinated highlight "${highlight.exact}"`);
9701
+ continue;
9702
+ }
9703
+ logAnchorMethod("highlight", highlight.exact, reconciled.anchorMethod);
9704
+ validatedHighlights.push({
9705
+ exact: reconciled.exact,
9706
+ start: reconciled.start,
9707
+ end: reconciled.end,
9708
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9709
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9710
+ });
9714
9711
  }
9715
9712
  return validatedHighlights;
9716
9713
  } catch (error) {
@@ -9730,22 +9727,28 @@ var MotivationParsers = class {
9730
9727
  try {
9731
9728
  const parsed = extractObjectsFromArray(response);
9732
9729
  const assessments = parsed.filter(
9733
- (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.start === "number" && typeof a.end === "number" && typeof a.assessment === "string"
9730
+ (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.assessment === "string"
9734
9731
  );
9735
9732
  const validatedAssessments = [];
9736
9733
  for (const assessment of assessments) {
9737
- try {
9738
- const validated = validateAndCorrectOffsets(content, assessment.start, assessment.end, assessment.exact);
9739
- validatedAssessments.push({
9740
- ...assessment,
9741
- start: validated.start,
9742
- end: validated.end,
9743
- prefix: validated.prefix,
9744
- suffix: validated.suffix
9745
- });
9746
- } catch (error) {
9747
- console.warn(`[MotivationParsers] Skipping invalid assessment "${assessment.exact}":`, error);
9748
- }
9734
+ const reconciled = reconcileSelector(content, {
9735
+ exact: assessment.exact,
9736
+ ...typeof assessment.prefix === "string" ? { prefix: assessment.prefix } : {},
9737
+ ...typeof assessment.suffix === "string" ? { suffix: assessment.suffix } : {}
9738
+ });
9739
+ if (!reconciled) {
9740
+ console.warn(`[MotivationParsers] Dropped hallucinated assessment "${assessment.exact}"`);
9741
+ continue;
9742
+ }
9743
+ logAnchorMethod("assessment", assessment.exact, reconciled.anchorMethod);
9744
+ validatedAssessments.push({
9745
+ assessment: assessment.assessment,
9746
+ exact: reconciled.exact,
9747
+ start: reconciled.start,
9748
+ end: reconciled.end,
9749
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9750
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9751
+ });
9749
9752
  }
9750
9753
  return validatedAssessments;
9751
9754
  } catch (error) {
@@ -9755,17 +9758,15 @@ var MotivationParsers = class {
9755
9758
  }
9756
9759
  }
9757
9760
  /**
9758
- * Parse and validate AI response for tag detection
9759
- * Note: Does NOT validate offsets - caller must do that with content
9760
- *
9761
- * @param response - Raw AI response string (may include markdown code fences)
9762
- * @returns Array of tag matches (offsets not yet validated)
9761
+ * Parse the LLM's tag response into raw, pre-reconciliation tag inputs.
9762
+ * Reconciliation happens in `validateTagOffsets`, which adds `start`/`end`
9763
+ * by anchoring `exact` against the source content.
9763
9764
  */
9764
9765
  static parseTags(response) {
9765
9766
  try {
9766
9767
  const parsed = extractObjectsFromArray(response);
9767
9768
  const valid = parsed.filter(
9768
- (t) => !!t && typeof t === "object" && typeof t.exact === "string" && typeof t.start === "number" && typeof t.end === "number" && t.exact.trim().length > 0
9769
+ (t) => !!t && typeof t === "object" && typeof t.exact === "string" && t.exact.trim().length > 0
9769
9770
  );
9770
9771
  console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
9771
9772
  return valid;
@@ -9775,34 +9776,38 @@ var MotivationParsers = class {
9775
9776
  }
9776
9777
  }
9777
9778
  /**
9778
- * Validate tag offsets against content and add category
9779
- * Helper for tag detection after initial parsing
9780
- *
9781
- * @param tags - Parsed tags without validated offsets
9782
- * @param content - Original content to validate against
9783
- * @param category - Category to assign to validated tags
9784
- * @returns Array of validated tag matches
9779
+ * Anchor raw tag inputs against source content and add category.
9785
9780
  */
9786
9781
  static validateTagOffsets(tags, content, category) {
9787
9782
  const validatedTags = [];
9788
9783
  for (const tag of tags) {
9789
- try {
9790
- const validated = validateAndCorrectOffsets(content, tag.start, tag.end, tag.exact);
9791
- validatedTags.push({
9792
- ...tag,
9793
- category,
9794
- start: validated.start,
9795
- end: validated.end,
9796
- prefix: validated.prefix,
9797
- suffix: validated.suffix
9798
- });
9799
- } catch (error) {
9800
- console.warn(`[MotivationParsers] Skipping invalid tag for category "${category}":`, error);
9801
- }
9784
+ const reconciled = reconcileSelector(content, {
9785
+ exact: tag.exact,
9786
+ ...typeof tag.prefix === "string" ? { prefix: tag.prefix } : {},
9787
+ ...typeof tag.suffix === "string" ? { suffix: tag.suffix } : {}
9788
+ });
9789
+ if (!reconciled) {
9790
+ console.warn(`[MotivationParsers] Dropped hallucinated tag "${tag.exact}" for category "${category}"`);
9791
+ continue;
9792
+ }
9793
+ logAnchorMethod("tag", tag.exact, reconciled.anchorMethod);
9794
+ validatedTags.push({
9795
+ category,
9796
+ exact: reconciled.exact,
9797
+ start: reconciled.start,
9798
+ end: reconciled.end,
9799
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9800
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9801
+ });
9802
9802
  }
9803
9803
  return validatedTags;
9804
9804
  }
9805
9805
  };
9806
+ function logAnchorMethod(motivation, exact, anchorMethod) {
9807
+ if (anchorMethod === "first-of-many" || anchorMethod === "fuzzy-match") {
9808
+ console.warn(`[MotivationParsers] ${motivation} anchored via ${anchorMethod}: "${exact}"`);
9809
+ }
9810
+ }
9806
9811
 
9807
9812
  // src/workers/annotation-detection.ts
9808
9813
  var AnnotationDetection = class {
@@ -9930,17 +9935,15 @@ ${exact}
9930
9935
  """
9931
9936
 
9932
9937
  Respond with a JSON array of entities found. Each entity should have:
9933
- - exact: the exact text span from the input
9938
+ - exact: the exact text span from the input (quoted verbatim \u2014 character-for-character)
9934
9939
  - entityType: one of the provided entity types
9935
- - startOffset: character position where the entity starts (0-indexed)
9936
- - endOffset: character position where the entity ends
9937
- - prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
9938
- - suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)
9940
+ - prefix: up to 64 characters of text immediately before the entity (used to disambiguate when the same text appears more than once)
9941
+ - suffix: up to 64 characters of text immediately after the entity (same purpose)
9939
9942
 
9940
9943
  If no entities are found, respond with an empty array [].
9941
9944
 
9942
9945
  Example output:
9943
- [{"exact":"Alice","entityType":"Person","startOffset":0,"endOffset":5,"prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","startOffset":20,"endOffset":25,"prefix":"went to ","suffix":" yesterday"}]`;
9946
+ [{"exact":"Alice","entityType":"Person","prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","prefix":"went to ","suffix":" yesterday"}]`;
9944
9947
  logger2.debug("Sending entity extraction request", { entityTypes: entityTypesDescription });
9945
9948
  const response = await client.generateTextWithMetadata(
9946
9949
  prompt,
@@ -9969,151 +9972,18 @@ Example output:
9969
9972
  logger2.error(errorMsg);
9970
9973
  throw new Error(errorMsg);
9971
9974
  }
9972
- return entities.map((entity, idx) => {
9973
- let start = entity.startOffset;
9974
- let end = entity.endOffset;
9975
- logger2.debug("Processing entity", {
9976
- index: idx + 1,
9977
- total: entities.length,
9978
- type: entity.entityType,
9979
- text: entity.exact,
9980
- offsetsFromAI: `[${start}:${end}]`
9981
- });
9982
- const extractedText = exact.substring(start, end);
9983
- let anchorMethod;
9984
- if (extractedText === entity.exact) {
9985
- anchorMethod = "llm-exact";
9986
- logger2.debug("Entity anchored", {
9987
- text: entity.exact,
9988
- entityType: entity.entityType,
9989
- anchorMethod
9990
- });
9991
- } else {
9992
- logger2.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
9993
- expected: entity.exact,
9994
- llmOffsets: `[${start}:${end}]`,
9995
- foundAtLlmOffsets: extractedText
9996
- });
9997
- let occurrenceCount = 0;
9998
- let firstOccurrence = -1;
9999
- let searchPos = 0;
10000
- while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
10001
- if (firstOccurrence === -1) firstOccurrence = searchPos;
10002
- occurrenceCount++;
10003
- searchPos++;
10004
- }
10005
- if (occurrenceCount === 0) {
10006
- anchorMethod = "dropped";
10007
- logger2.error("Entity text not found in resource \u2014 dropping", {
10008
- text: entity.exact,
10009
- entityType: entity.entityType,
10010
- llmOffsets: `[${start}:${end}]`,
10011
- anchorMethod,
10012
- resourceStart: exact.substring(0, 200)
10013
- });
10014
- return null;
10015
- }
10016
- let recoveredOffset = -1;
10017
- if (entity.prefix || entity.suffix) {
10018
- let p = 0;
10019
- while ((p = exact.indexOf(entity.exact, p)) !== -1) {
10020
- const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
10021
- const candidateSuffix = exact.substring(
10022
- p + entity.exact.length,
10023
- Math.min(exact.length, p + entity.exact.length + 32)
10024
- );
10025
- const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
10026
- const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
10027
- if (prefixMatch && suffixMatch) {
10028
- recoveredOffset = p;
10029
- break;
10030
- }
10031
- p++;
10032
- }
10033
- }
10034
- if (recoveredOffset !== -1) {
10035
- anchorMethod = "context-recovered";
10036
- start = recoveredOffset;
10037
- end = recoveredOffset + entity.exact.length;
10038
- logger2.debug("Entity anchored", {
10039
- text: entity.exact,
10040
- entityType: entity.entityType,
10041
- anchorMethod,
10042
- offsetDiff: recoveredOffset - entity.startOffset
10043
- });
10044
- } else if (occurrenceCount === 1) {
10045
- anchorMethod = "unique-match";
10046
- start = firstOccurrence;
10047
- end = firstOccurrence + entity.exact.length;
10048
- logger2.debug("Entity anchored", {
10049
- text: entity.exact,
10050
- entityType: entity.entityType,
10051
- anchorMethod,
10052
- offsetDiff: firstOccurrence - entity.startOffset
10053
- });
10054
- } else {
10055
- anchorMethod = "first-of-many";
10056
- start = firstOccurrence;
10057
- end = firstOccurrence + entity.exact.length;
10058
- logger2.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
10059
- text: entity.exact,
10060
- entityType: entity.entityType,
10061
- anchorMethod,
10062
- occurrenceCount,
10063
- chosenOffset: firstOccurrence,
10064
- llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
10065
- hasPrefix: !!entity.prefix,
10066
- hasSuffix: !!entity.suffix
10067
- });
10068
- }
10069
- }
10070
- return {
10071
- exact: entity.exact,
10072
- entityType: entity.entityType,
10073
- start,
10074
- end,
10075
- prefix: entity.prefix,
10076
- suffix: entity.suffix
10077
- };
10078
- }).filter((entity) => {
10079
- if (entity === null) {
10080
- logger2.debug("Filtered entity: null");
10081
- return false;
10082
- }
10083
- if (entity.start === void 0 || entity.end === void 0) {
10084
- logger2.warn("Filtered entity: missing offsets", { text: entity.exact });
10085
- return false;
10086
- }
10087
- if (entity.start < 0) {
10088
- logger2.warn("Filtered entity: negative start", {
10089
- text: entity.exact,
10090
- start: entity.start
10091
- });
10092
- return false;
10093
- }
10094
- if (entity.end > exact.length) {
10095
- logger2.warn("Filtered entity: end exceeds text length", {
10096
- text: entity.exact,
10097
- end: entity.end,
10098
- textLength: exact.length
10099
- });
10100
- return false;
10101
- }
10102
- const extractedText = exact.substring(entity.start, entity.end);
10103
- if (extractedText !== entity.exact) {
10104
- logger2.warn("Filtered entity: offset mismatch", {
10105
- expected: entity.exact,
10106
- got: extractedText,
10107
- offsets: `[${entity.start}:${entity.end}]`
10108
- });
10109
- return false;
10110
- }
10111
- logger2.debug("Accepted entity", {
10112
- text: entity.exact,
10113
- offsets: `[${entity.start}:${entity.end}]`
10114
- });
10115
- return true;
10116
- });
9975
+ return entities.filter((e) => {
9976
+ const ok = e && typeof e === "object" && typeof e.exact === "string" && typeof e.entityType === "string";
9977
+ if (!ok) {
9978
+ logger2.debug("Dropped malformed LLM entity", { entity: e });
9979
+ }
9980
+ return ok;
9981
+ }).map((entity) => ({
9982
+ exact: entity.exact,
9983
+ entityType: entity.entityType,
9984
+ ...typeof entity.prefix === "string" ? { prefix: entity.prefix } : {},
9985
+ ...typeof entity.suffix === "string" ? { suffix: entity.suffix } : {}
9986
+ }));
10117
9987
  } catch (error) {
10118
9988
  logger2.error("Failed to parse entity extraction response", {
10119
9989
  error: error instanceof Error ? error.message : String(error)
@@ -10249,7 +10119,59 @@ Requirements:
10249
10119
  });
10250
10120
  return result;
10251
10121
  }
10252
- function buildTextAnnotation(resourceId, userId, generator, motivation, match, body) {
10122
+ function toMatch(r) {
10123
+ return {
10124
+ exact: r.exact,
10125
+ start: r.start,
10126
+ end: r.end,
10127
+ ...r.prefix !== void 0 ? { prefix: r.prefix } : {},
10128
+ ...r.suffix !== void 0 ? { suffix: r.suffix } : {}
10129
+ };
10130
+ }
10131
+ function annotationDedupeKey(ann) {
10132
+ const target = ann.target;
10133
+ const selectors = Array.isArray(target?.selector) ? target.selector : [];
10134
+ const pos = selectors.find((s) => s.type === "TextPositionSelector");
10135
+ return [
10136
+ ann.motivation,
10137
+ pos?.start ?? "?",
10138
+ pos?.end ?? "?",
10139
+ JSON.stringify(ann.body ?? null)
10140
+ ].join("|");
10141
+ }
10142
+ function dedupeAnnotations(annotations) {
10143
+ const seen = /* @__PURE__ */ new Set();
10144
+ const out = [];
10145
+ for (const ann of annotations) {
10146
+ const key = annotationDedupeKey(ann);
10147
+ if (seen.has(key)) continue;
10148
+ seen.add(key);
10149
+ out.push(ann);
10150
+ }
10151
+ return out;
10152
+ }
10153
+ function buildTextAnnotation(content, resourceId, userId, generator, motivation, match, body) {
10154
+ if (content.substring(match.start, match.end) !== match.exact) {
10155
+ throw new Error(
10156
+ `buildTextAnnotation invariant: content.substring(${match.start}, ${match.end}) !== exact for resource ${resourceId}, motivation ${motivation}`
10157
+ );
10158
+ }
10159
+ if (match.prefix !== void 0) {
10160
+ const actualPrefix = content.substring(Math.max(0, match.start - match.prefix.length), match.start);
10161
+ if (actualPrefix !== match.prefix) {
10162
+ throw new Error(
10163
+ `buildTextAnnotation invariant: content prefix-slice !== prefix for resource ${resourceId}, motivation ${motivation}`
10164
+ );
10165
+ }
10166
+ }
10167
+ if (match.suffix !== void 0) {
10168
+ const actualSuffix = content.substring(match.end, Math.min(content.length, match.end + match.suffix.length));
10169
+ if (actualSuffix !== match.suffix) {
10170
+ throw new Error(
10171
+ `buildTextAnnotation invariant: content suffix-slice !== suffix for resource ${resourceId}, motivation ${motivation}`
10172
+ );
10173
+ }
10174
+ }
10253
10175
  const creator = didToAgent(userId);
10254
10176
  const wasAttributedTo = creator["@id"] === generator["@id"] ? [generator] : [creator, generator];
10255
10177
  return {
@@ -10288,9 +10210,9 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
10288
10210
  params.sourceLanguage
10289
10211
  );
10290
10212
  onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
10291
- const annotations = highlights.map(
10292
- (h) => buildTextAnnotation(params.resourceId, userId, generator, "highlighting", h)
10293
- );
10213
+ const annotations = dedupeAnnotations(highlights.map(
10214
+ (h) => buildTextAnnotation(content, params.resourceId, userId, generator, "highlighting", h)
10215
+ ));
10294
10216
  onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
10295
10217
  return {
10296
10218
  annotations,
@@ -10311,16 +10233,16 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
10311
10233
  );
10312
10234
  onProgress(60, `Creating ${comments.length} annotations...`, "creating");
10313
10235
  const bodyLanguage = params.language ?? "en";
10314
- const annotations = comments.map(
10236
+ const annotations = dedupeAnnotations(comments.map(
10315
10237
  (c) => (
10316
10238
  // Match the pre-#651 CommentAnnotationWorker: include format and
10317
10239
  // language on the body TextualBody. Optional in the schema, but
10318
10240
  // consumers that do language-aware rendering rely on them.
10319
- buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
10241
+ buildTextAnnotation(content, params.resourceId, userId, generator, "commenting", c, [
10320
10242
  { type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
10321
10243
  ])
10322
10244
  )
10323
- );
10245
+ ));
10324
10246
  onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
10325
10247
  return {
10326
10248
  annotations,
@@ -10341,7 +10263,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10341
10263
  );
10342
10264
  onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
10343
10265
  const bodyLanguage = params.language ?? "en";
10344
- const annotations = assessments.map(
10266
+ const annotations = dedupeAnnotations(assessments.map(
10345
10267
  (a) => (
10346
10268
  // Single-object body with purpose aligned to motivation, matching the
10347
10269
  // pre-#651 AssessmentAnnotationWorker's shape and the majority of
@@ -10349,7 +10271,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10349
10271
  // purpose='describing' — that loses the "this is an assessment, not
10350
10272
  // a description" signal and breaks existing readers that access
10351
10273
  // `body.value` directly on the object.
10352
- buildTextAnnotation(params.resourceId, userId, generator, "assessing", a, {
10274
+ buildTextAnnotation(content, params.resourceId, userId, generator, "assessing", a, {
10353
10275
  type: "TextualBody",
10354
10276
  value: a.assessment,
10355
10277
  purpose: "assessing",
@@ -10357,7 +10279,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10357
10279
  language: bodyLanguage
10358
10280
  })
10359
10281
  )
10360
- );
10282
+ ));
10361
10283
  onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
10362
10284
  return {
10363
10285
  annotations,
@@ -10401,27 +10323,44 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
10401
10323
  { type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
10402
10324
  ];
10403
10325
  for (const entity of extractedEntities) {
10404
- try {
10405
- const validated = validateAndCorrectOffsets(content, entity.start, entity.end, entity.exact);
10406
- const ann = buildTextAnnotation(
10407
- params.resourceId,
10408
- userId,
10409
- generator,
10410
- "linking",
10411
- validated,
10412
- unresolvedBody
10413
- );
10414
- allAnnotations.push(ann);
10415
- totalEmitted++;
10416
- } catch {
10326
+ const reconciled = reconcileSelector(content, {
10327
+ exact: entity.exact,
10328
+ ...entity.prefix !== void 0 ? { prefix: entity.prefix } : {},
10329
+ ...entity.suffix !== void 0 ? { suffix: entity.suffix } : {}
10330
+ });
10331
+ if (!reconciled) {
10332
+ logger2.error("Entity dropped \u2014 text not found in source", {
10333
+ text: entity.exact,
10334
+ entityType: entity.entityType
10335
+ });
10417
10336
  errors++;
10337
+ continue;
10338
+ }
10339
+ if (reconciled.anchorMethod === "first-of-many" || reconciled.anchorMethod === "fuzzy-match") {
10340
+ logger2.warn("Entity anchored via degraded method", {
10341
+ text: entity.exact,
10342
+ entityType: entity.entityType,
10343
+ anchorMethod: reconciled.anchorMethod
10344
+ });
10418
10345
  }
10346
+ const ann = buildTextAnnotation(
10347
+ content,
10348
+ params.resourceId,
10349
+ userId,
10350
+ generator,
10351
+ "linking",
10352
+ toMatch(reconciled),
10353
+ unresolvedBody
10354
+ );
10355
+ allAnnotations.push(ann);
10356
+ totalEmitted++;
10419
10357
  }
10420
10358
  }
10421
- onProgress(100, `Complete! Created ${totalEmitted} references`, "creating");
10359
+ const annotations = dedupeAnnotations(allAnnotations);
10360
+ onProgress(100, `Complete! Created ${annotations.length} references`, "creating");
10422
10361
  return {
10423
- annotations: allAnnotations,
10424
- result: { totalFound, totalEmitted, errors }
10362
+ annotations,
10363
+ result: { totalFound, totalEmitted: annotations.length, errors }
10425
10364
  };
10426
10365
  }
10427
10366
  async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
@@ -10441,15 +10380,19 @@ async function processTagJob(content, inferenceClient, params, userId, generator
10441
10380
  const tags = allTags;
10442
10381
  onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
10443
10382
  const bodyLanguage = params.language ?? "en";
10444
- const byCategory = {};
10445
- const annotations = tags.map((t) => {
10383
+ const annotations = dedupeAnnotations(tags.map((t) => {
10446
10384
  const category = t.category ?? "unknown";
10447
- byCategory[category] = (byCategory[category] ?? 0) + 1;
10448
- return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
10385
+ return buildTextAnnotation(content, params.resourceId, userId, generator, "tagging", t, [
10449
10386
  { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
10450
10387
  { type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
10451
10388
  ]);
10452
- });
10389
+ }));
10390
+ const byCategory = {};
10391
+ for (const ann of annotations) {
10392
+ const body = ann.body;
10393
+ const category = Array.isArray(body) && typeof body[0]?.value === "string" ? body[0].value : "unknown";
10394
+ byCategory[category] = (byCategory[category] ?? 0) + 1;
10395
+ }
10453
10396
  onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
10454
10397
  return {
10455
10398
  annotations,
@@ -10487,10 +10430,7 @@ async function processGenerationJob(inferenceClient, params, onProgress, logger2
10487
10430
 
10488
10431
  // src/worker-process.ts
10489
10432
  async function emitEvent(session, channel, payload) {
10490
- const isBroadcast = RESOURCE_BROADCAST_TYPES.includes(channel);
10491
- const rawScope = isBroadcast ? payload.resourceId : void 0;
10492
- const resourceScope = rawScope ? resourceId(rawScope) : void 0;
10493
- await session.client.transport.emit(channel, payload, resourceScope);
10433
+ await session.client.transport.emit(channel, payload);
10494
10434
  }
10495
10435
  function startWorkerProcess(config) {
10496
10436
  const { session, logger: logger2 } = config;