@semiont/jobs 0.5.5 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
- import { createTomlConfigLoader, softwareToAgent, baseUrl, RESOURCE_BROADCAST_TYPES, resourceId, validateAndCorrectOffsets, didToAgent, getLocaleEnglishName } from '@semiont/core';
1
+ import { createTomlConfigLoader, softwareToAgent, baseUrl, getPrimaryMediaType, textExtractionOf, reconcileSelector, didToAgent, getLocaleEnglishName } from '@semiont/core';
2
2
  import { deriveStorageUri } from '@semiont/content';
3
3
  import { withSpan, SpanKind, recordJobOutcome } from '@semiont/observability';
4
4
  import { generateAnnotationId } from '@semiont/event-sourcing';
@@ -8,7 +8,7 @@ import { existsSync, readFileSync } from 'fs';
8
8
  import { homedir, hostname } from 'os';
9
9
  import { join } from 'path';
10
10
  import { InMemorySessionStorage, setStoredSession, kbBackendUrl, SemiontClient, SemiontSession } from '@semiont/sdk';
11
- import { HttpTransport, HttpContentTransport } from '@semiont/api-client';
11
+ import { HttpTransport, HttpContentTransport } from '@semiont/http-transport';
12
12
  import { createProcessLogger } from '@semiont/observability/process-logger';
13
13
 
14
14
  var __create = Object.create;
@@ -9337,17 +9337,15 @@ ${content.substring(0, 8e3)}
9337
9337
 
9338
9338
  Return a JSON array of comments. Each comment must have:
9339
9339
  - "exact": the exact text passage being commented on (quoted verbatim from source)
9340
- - "start": character offset where the passage starts
9341
- - "end": character offset where the passage ends
9342
- - "prefix": up to 32 characters of text immediately before the passage
9343
- - "suffix": up to 32 characters of text immediately after the passage
9340
+ - "prefix": up to 64 characters of text immediately before the passage
9341
+ - "suffix": up to 64 characters of text immediately after the passage
9344
9342
  - "comment": your comment following the instructions above
9345
9343
 
9346
9344
  Respond with a valid JSON array.
9347
9345
 
9348
9346
  Example:
9349
9347
  [
9350
- {"exact": "the quarterly review meeting", "start": 142, "end": 169, "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
9348
+ {"exact": "the quarterly review meeting", "prefix": "We need to schedule ", "suffix": " for next month.", "comment": "Who will lead this? Should we invite the external auditors?"}
9351
9349
  ]`;
9352
9350
  } else {
9353
9351
  const toneGuidance = tone ? `
@@ -9373,17 +9371,15 @@ ${content.substring(0, 8e3)}
9373
9371
 
9374
9372
  Return a JSON array of comments. Each comment should have:
9375
9373
  - "exact": the exact text passage being commented on (quoted verbatim from source)
9376
- - "start": character offset where the passage starts
9377
- - "end": character offset where the passage ends
9378
- - "prefix": up to 32 characters of text immediately before the passage
9379
- - "suffix": up to 32 characters of text immediately after the passage
9374
+ - "prefix": up to 64 characters of text immediately before the passage
9375
+ - "suffix": up to 64 characters of text immediately after the passage
9380
9376
  - "comment": your explanatory comment (1-3 sentences, provide context/background/clarification)
9381
9377
 
9382
9378
  Respond with a valid JSON array.
9383
9379
 
9384
9380
  Example format:
9385
9381
  [
9386
- {"exact": "Ouranos", "start": 52, "end": 59, "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
9382
+ {"exact": "Ouranos", "prefix": "In the beginning, ", "suffix": " ruled the universe", "comment": "Ouranos (also spelled Uranus) is the primordial Greek deity personifying the sky. In Hesiod's Theogony, he is the son and husband of Gaia (Earth) and father of the Titans."}
9387
9383
  ]`;
9388
9384
  }
9389
9385
  return prompt;
@@ -9414,16 +9410,14 @@ ${content.substring(0, 8e3)}
9414
9410
 
9415
9411
  Return a JSON array of highlights. Each highlight must have:
9416
9412
  - "exact": the exact text passage to highlight (quoted verbatim from source)
9417
- - "start": character offset where the passage starts
9418
- - "end": character offset where the passage ends
9419
- - "prefix": up to 32 characters of text immediately before the passage
9420
- - "suffix": up to 32 characters of text immediately after the passage
9413
+ - "prefix": up to 64 characters of text immediately before the passage
9414
+ - "suffix": up to 64 characters of text immediately after the passage
9421
9415
 
9422
9416
  Respond with a valid JSON array.
9423
9417
 
9424
9418
  Example:
9425
9419
  [
9426
- {"exact": "revenue grew 45% year-over-year", "start": 142, "end": 174, "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
9420
+ {"exact": "revenue grew 45% year-over-year", "prefix": "In Q3 2024, ", "suffix": ", exceeding all forecasts."}
9427
9421
  ]`;
9428
9422
  } else {
9429
9423
  const densityGuidance = density ? `
@@ -9447,16 +9441,14 @@ ${content.substring(0, 8e3)}
9447
9441
 
9448
9442
  Return a JSON array of highlights. Each highlight should have:
9449
9443
  - "exact": the exact text passage to highlight (quoted verbatim from source)
9450
- - "start": character offset where the passage starts
9451
- - "end": character offset where the passage ends
9452
- - "prefix": up to 32 characters of text immediately before the passage
9453
- - "suffix": up to 32 characters of text immediately after the passage
9444
+ - "prefix": up to 64 characters of text immediately before the passage
9445
+ - "suffix": up to 64 characters of text immediately after the passage
9454
9446
 
9455
9447
  Respond with a valid JSON array.
9456
9448
 
9457
9449
  Example format:
9458
9450
  [
9459
- {"exact": "we will discontinue support for legacy systems by March 2025", "start": 52, "end": 113, "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
9451
+ {"exact": "we will discontinue support for legacy systems by March 2025", "prefix": "After careful consideration, ", "suffix": ". This decision affects"}
9460
9452
  ]`;
9461
9453
  }
9462
9454
  return prompt;
@@ -9490,17 +9482,15 @@ ${content.substring(0, 8e3)}
9490
9482
 
9491
9483
  Return a JSON array of assessments. Each assessment must have:
9492
9484
  - "exact": the exact text passage being assessed (quoted verbatim from source)
9493
- - "start": character offset where the passage starts
9494
- - "end": character offset where the passage ends
9495
- - "prefix": up to 32 characters of text immediately before the passage
9496
- - "suffix": up to 32 characters of text immediately after the passage
9485
+ - "prefix": up to 64 characters of text immediately before the passage
9486
+ - "suffix": up to 64 characters of text immediately after the passage
9497
9487
  - "assessment": your assessment following the instructions above
9498
9488
 
9499
9489
  Respond with a valid JSON array.
9500
9490
 
9501
9491
  Example:
9502
9492
  [
9503
- {"exact": "the quarterly revenue target", "start": 142, "end": 169, "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
9493
+ {"exact": "the quarterly revenue target", "prefix": "We established ", "suffix": " for Q4 2024.", "assessment": "This target seems ambitious given market conditions. Consider revising based on recent trends."}
9504
9494
  ]`;
9505
9495
  } else {
9506
9496
  const toneGuidance = tone ? `
@@ -9526,17 +9516,15 @@ ${content.substring(0, 8e3)}
9526
9516
 
9527
9517
  Return a JSON array of assessments. Each assessment should have:
9528
9518
  - "exact": the exact text passage being assessed (quoted verbatim from source)
9529
- - "start": character offset where the passage starts
9530
- - "end": character offset where the passage ends
9531
- - "prefix": up to 32 characters of text immediately before the passage
9532
- - "suffix": up to 32 characters of text immediately after the passage
9519
+ - "prefix": up to 64 characters of text immediately before the passage
9520
+ - "suffix": up to 64 characters of text immediately after the passage
9533
9521
  - "assessment": your analytical assessment (1-3 sentences, evaluate validity/strength/implications)
9534
9522
 
9535
9523
  Respond with a valid JSON array.
9536
9524
 
9537
9525
  Example format:
9538
9526
  [
9539
- {"exact": "AI will replace most jobs by 2030", "start": 52, "end": 89, "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
9527
+ {"exact": "AI will replace most jobs by 2030", "prefix": "Many experts predict that ", "suffix": ", fundamentally reshaping", "assessment": "This claim lacks nuance and supporting evidence. Employment patterns historically show job transformation rather than wholesale replacement. The timeline appears speculative without specific sector analysis."}
9540
9528
  ]`;
9541
9529
  }
9542
9530
  return prompt;
@@ -9582,17 +9570,15 @@ ${content}
9582
9570
 
9583
9571
  Return a JSON array of tags. Each tag should have:
9584
9572
  - "exact": the exact text passage (quoted verbatim from source)
9585
- - "start": character offset where the passage starts
9586
- - "end": character offset where the passage ends
9587
- - "prefix": up to 32 characters of text immediately before the passage
9588
- - "suffix": up to 32 characters of text immediately after the passage
9573
+ - "prefix": up to 64 characters of text immediately before the passage
9574
+ - "suffix": up to 64 characters of text immediately after the passage
9589
9575
 
9590
9576
  Respond with a valid JSON array.
9591
9577
 
9592
9578
  Example format:
9593
9579
  [
9594
- {"exact": "What duty did the defendant owe?", "start": 142, "end": 175, "prefix": "The central question is: ", "suffix": " This question must be"},
9595
- {"exact": "In tort law, a duty of care is established when...", "start": 412, "end": 520, "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
9580
+ {"exact": "What duty did the defendant owe?", "prefix": "The central question is: ", "suffix": " This question must be"},
9581
+ {"exact": "In tort law, a duty of care is established when...", "prefix": "Legal framework:\\n", "suffix": "\\n\\nApplying this standard"}
9596
9582
  ]`;
9597
9583
  return prompt;
9598
9584
  }
@@ -9660,23 +9646,29 @@ var MotivationParsers = class {
9660
9646
  try {
9661
9647
  const parsed = extractObjectsFromArray(response);
9662
9648
  const valid = parsed.filter(
9663
- (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.start === "number" && typeof c.end === "number" && typeof c.comment === "string" && c.comment.trim().length > 0
9649
+ (c) => !!c && typeof c === "object" && typeof c.exact === "string" && typeof c.comment === "string" && c.comment.trim().length > 0
9664
9650
  );
9665
9651
  console.log(`[MotivationParsers] Parsed ${valid.length} valid comments from ${parsed.length} total`);
9666
9652
  const validatedComments = [];
9667
9653
  for (const comment of valid) {
9668
- try {
9669
- const validated = validateAndCorrectOffsets(content, comment.start, comment.end, comment.exact);
9670
- validatedComments.push({
9671
- ...comment,
9672
- start: validated.start,
9673
- end: validated.end,
9674
- prefix: validated.prefix,
9675
- suffix: validated.suffix
9676
- });
9677
- } catch (error) {
9678
- console.warn(`[MotivationParsers] Skipping invalid comment "${comment.exact}":`, error);
9679
- }
9654
+ const reconciled = reconcileSelector(content, {
9655
+ exact: comment.exact,
9656
+ ...typeof comment.prefix === "string" ? { prefix: comment.prefix } : {},
9657
+ ...typeof comment.suffix === "string" ? { suffix: comment.suffix } : {}
9658
+ });
9659
+ if (!reconciled) {
9660
+ console.warn(`[MotivationParsers] Dropped hallucinated comment "${comment.exact}"`);
9661
+ continue;
9662
+ }
9663
+ logAnchorMethod("comment", comment.exact, reconciled.anchorMethod);
9664
+ validatedComments.push({
9665
+ comment: comment.comment,
9666
+ exact: reconciled.exact,
9667
+ start: reconciled.start,
9668
+ end: reconciled.end,
9669
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9670
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9671
+ });
9680
9672
  }
9681
9673
  return validatedComments;
9682
9674
  } catch (error) {
@@ -9695,22 +9687,27 @@ var MotivationParsers = class {
9695
9687
  try {
9696
9688
  const parsed = extractObjectsFromArray(response);
9697
9689
  const highlights = parsed.filter(
9698
- (h) => !!h && typeof h === "object" && typeof h.exact === "string" && typeof h.start === "number" && typeof h.end === "number"
9690
+ (h) => !!h && typeof h === "object" && typeof h.exact === "string"
9699
9691
  );
9700
9692
  const validatedHighlights = [];
9701
9693
  for (const highlight of highlights) {
9702
- try {
9703
- const validated = validateAndCorrectOffsets(content, highlight.start, highlight.end, highlight.exact);
9704
- validatedHighlights.push({
9705
- ...highlight,
9706
- start: validated.start,
9707
- end: validated.end,
9708
- prefix: validated.prefix,
9709
- suffix: validated.suffix
9710
- });
9711
- } catch (error) {
9712
- console.warn(`[MotivationParsers] Skipping invalid highlight "${highlight.exact}":`, error);
9713
- }
9694
+ const reconciled = reconcileSelector(content, {
9695
+ exact: highlight.exact,
9696
+ ...typeof highlight.prefix === "string" ? { prefix: highlight.prefix } : {},
9697
+ ...typeof highlight.suffix === "string" ? { suffix: highlight.suffix } : {}
9698
+ });
9699
+ if (!reconciled) {
9700
+ console.warn(`[MotivationParsers] Dropped hallucinated highlight "${highlight.exact}"`);
9701
+ continue;
9702
+ }
9703
+ logAnchorMethod("highlight", highlight.exact, reconciled.anchorMethod);
9704
+ validatedHighlights.push({
9705
+ exact: reconciled.exact,
9706
+ start: reconciled.start,
9707
+ end: reconciled.end,
9708
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9709
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9710
+ });
9714
9711
  }
9715
9712
  return validatedHighlights;
9716
9713
  } catch (error) {
@@ -9730,22 +9727,28 @@ var MotivationParsers = class {
9730
9727
  try {
9731
9728
  const parsed = extractObjectsFromArray(response);
9732
9729
  const assessments = parsed.filter(
9733
- (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.start === "number" && typeof a.end === "number" && typeof a.assessment === "string"
9730
+ (a) => !!a && typeof a === "object" && typeof a.exact === "string" && typeof a.assessment === "string"
9734
9731
  );
9735
9732
  const validatedAssessments = [];
9736
9733
  for (const assessment of assessments) {
9737
- try {
9738
- const validated = validateAndCorrectOffsets(content, assessment.start, assessment.end, assessment.exact);
9739
- validatedAssessments.push({
9740
- ...assessment,
9741
- start: validated.start,
9742
- end: validated.end,
9743
- prefix: validated.prefix,
9744
- suffix: validated.suffix
9745
- });
9746
- } catch (error) {
9747
- console.warn(`[MotivationParsers] Skipping invalid assessment "${assessment.exact}":`, error);
9748
- }
9734
+ const reconciled = reconcileSelector(content, {
9735
+ exact: assessment.exact,
9736
+ ...typeof assessment.prefix === "string" ? { prefix: assessment.prefix } : {},
9737
+ ...typeof assessment.suffix === "string" ? { suffix: assessment.suffix } : {}
9738
+ });
9739
+ if (!reconciled) {
9740
+ console.warn(`[MotivationParsers] Dropped hallucinated assessment "${assessment.exact}"`);
9741
+ continue;
9742
+ }
9743
+ logAnchorMethod("assessment", assessment.exact, reconciled.anchorMethod);
9744
+ validatedAssessments.push({
9745
+ assessment: assessment.assessment,
9746
+ exact: reconciled.exact,
9747
+ start: reconciled.start,
9748
+ end: reconciled.end,
9749
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9750
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9751
+ });
9749
9752
  }
9750
9753
  return validatedAssessments;
9751
9754
  } catch (error) {
@@ -9755,17 +9758,15 @@ var MotivationParsers = class {
9755
9758
  }
9756
9759
  }
9757
9760
  /**
9758
- * Parse and validate AI response for tag detection
9759
- * Note: Does NOT validate offsets - caller must do that with content
9760
- *
9761
- * @param response - Raw AI response string (may include markdown code fences)
9762
- * @returns Array of tag matches (offsets not yet validated)
9761
+ * Parse the LLM's tag response into raw, pre-reconciliation tag inputs.
9762
+ * Reconciliation happens in `validateTagOffsets`, which adds `start`/`end`
9763
+ * by anchoring `exact` against the source content.
9763
9764
  */
9764
9765
  static parseTags(response) {
9765
9766
  try {
9766
9767
  const parsed = extractObjectsFromArray(response);
9767
9768
  const valid = parsed.filter(
9768
- (t) => !!t && typeof t === "object" && typeof t.exact === "string" && typeof t.start === "number" && typeof t.end === "number" && t.exact.trim().length > 0
9769
+ (t) => !!t && typeof t === "object" && typeof t.exact === "string" && t.exact.trim().length > 0
9769
9770
  );
9770
9771
  console.log(`[MotivationParsers] Parsed ${valid.length} valid tags from ${parsed.length} total`);
9771
9772
  return valid;
@@ -9775,52 +9776,41 @@ var MotivationParsers = class {
9775
9776
  }
9776
9777
  }
9777
9778
  /**
9778
- * Validate tag offsets against content and add category
9779
- * Helper for tag detection after initial parsing
9780
- *
9781
- * @param tags - Parsed tags without validated offsets
9782
- * @param content - Original content to validate against
9783
- * @param category - Category to assign to validated tags
9784
- * @returns Array of validated tag matches
9779
+ * Anchor raw tag inputs against source content and add category.
9785
9780
  */
9786
9781
  static validateTagOffsets(tags, content, category) {
9787
9782
  const validatedTags = [];
9788
9783
  for (const tag of tags) {
9789
- try {
9790
- const validated = validateAndCorrectOffsets(content, tag.start, tag.end, tag.exact);
9791
- validatedTags.push({
9792
- ...tag,
9793
- category,
9794
- start: validated.start,
9795
- end: validated.end,
9796
- prefix: validated.prefix,
9797
- suffix: validated.suffix
9798
- });
9799
- } catch (error) {
9800
- console.warn(`[MotivationParsers] Skipping invalid tag for category "${category}":`, error);
9801
- }
9784
+ const reconciled = reconcileSelector(content, {
9785
+ exact: tag.exact,
9786
+ ...typeof tag.prefix === "string" ? { prefix: tag.prefix } : {},
9787
+ ...typeof tag.suffix === "string" ? { suffix: tag.suffix } : {}
9788
+ });
9789
+ if (!reconciled) {
9790
+ console.warn(`[MotivationParsers] Dropped hallucinated tag "${tag.exact}" for category "${category}"`);
9791
+ continue;
9792
+ }
9793
+ logAnchorMethod("tag", tag.exact, reconciled.anchorMethod);
9794
+ validatedTags.push({
9795
+ category,
9796
+ exact: reconciled.exact,
9797
+ start: reconciled.start,
9798
+ end: reconciled.end,
9799
+ ...reconciled.prefix !== void 0 ? { prefix: reconciled.prefix } : {},
9800
+ ...reconciled.suffix !== void 0 ? { suffix: reconciled.suffix } : {}
9801
+ });
9802
9802
  }
9803
9803
  return validatedTags;
9804
9804
  }
9805
9805
  };
9806
/**
 * Emit a warning when a selector was anchored by one of the methods this
 * module singles out for attention ("first-of-many" or "fuzzy-match").
 * Any other anchor method is silent.
 *
 * @param motivation - Label used in the log line (e.g. "comment", "tag")
 * @param exact - The passage text that was anchored
 * @param anchorMethod - Anchor method reported by the reconciler
 */
function logAnchorMethod(motivation, exact, anchorMethod) {
  const warrantsWarning = ["first-of-many", "fuzzy-match"].includes(anchorMethod);
  if (!warrantsWarning) {
    return;
  }
  console.warn(`[MotivationParsers] ${motivation} anchored via ${anchorMethod}: "${exact}"`);
}
9806
9811
 
9807
9812
  // src/workers/annotation-detection.ts
9808
9813
  var AnnotationDetection = class {
9809
- /**
9810
- * Fetch content from a ContentFetcher and read the stream to a string.
9811
- * Shared helper for all workers.
9812
- */
9813
- static async fetchContent(contentFetcher, resourceId) {
9814
- const stream = await contentFetcher(resourceId);
9815
- if (!stream) {
9816
- throw new Error(`Could not load content for resource ${resourceId}`);
9817
- }
9818
- const chunks = [];
9819
- for await (const chunk of stream) {
9820
- chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
9821
- }
9822
- return Buffer.concat(chunks).toString("utf-8");
9823
- }
9824
9814
  /**
9825
9815
  * Detect comments in content.
9826
9816
  *
@@ -9930,17 +9920,15 @@ ${exact}
9930
9920
  """
9931
9921
 
9932
9922
  Respond with a JSON array of entities found. Each entity should have:
9933
- - exact: the exact text span from the input
9923
+ - exact: the exact text span from the input (quoted verbatim \u2014 character-for-character)
9934
9924
  - entityType: one of the provided entity types
9935
- - startOffset: character position where the entity starts (0-indexed)
9936
- - endOffset: character position where the entity ends
9937
- - prefix: up to 32 characters of text immediately before the entity (helps identify correct occurrence)
9938
- - suffix: up to 32 characters of text immediately after the entity (helps identify correct occurrence)
9925
+ - prefix: up to 64 characters of text immediately before the entity (used to disambiguate when the same text appears more than once)
9926
+ - suffix: up to 64 characters of text immediately after the entity (same purpose)
9939
9927
 
9940
9928
  If no entities are found, respond with an empty array [].
9941
9929
 
9942
9930
  Example output:
9943
- [{"exact":"Alice","entityType":"Person","startOffset":0,"endOffset":5,"prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","startOffset":20,"endOffset":25,"prefix":"went to ","suffix":" yesterday"}]`;
9931
+ [{"exact":"Alice","entityType":"Person","prefix":"","suffix":" went to"},{"exact":"Paris","entityType":"Location","prefix":"went to ","suffix":" yesterday"}]`;
9944
9932
  logger2.debug("Sending entity extraction request", { entityTypes: entityTypesDescription });
9945
9933
  const response = await client.generateTextWithMetadata(
9946
9934
  prompt,
@@ -9969,151 +9957,18 @@ Example output:
9969
9957
  logger2.error(errorMsg);
9970
9958
  throw new Error(errorMsg);
9971
9959
  }
9972
- return entities.map((entity, idx) => {
9973
- let start = entity.startOffset;
9974
- let end = entity.endOffset;
9975
- logger2.debug("Processing entity", {
9976
- index: idx + 1,
9977
- total: entities.length,
9978
- type: entity.entityType,
9979
- text: entity.exact,
9980
- offsetsFromAI: `[${start}:${end}]`
9981
- });
9982
- const extractedText = exact.substring(start, end);
9983
- let anchorMethod;
9984
- if (extractedText === entity.exact) {
9985
- anchorMethod = "llm-exact";
9986
- logger2.debug("Entity anchored", {
9987
- text: entity.exact,
9988
- entityType: entity.entityType,
9989
- anchorMethod
9990
- });
9991
- } else {
9992
- logger2.debug("LLM offsets mismatch \u2014 attempting re-anchor", {
9993
- expected: entity.exact,
9994
- llmOffsets: `[${start}:${end}]`,
9995
- foundAtLlmOffsets: extractedText
9996
- });
9997
- let occurrenceCount = 0;
9998
- let firstOccurrence = -1;
9999
- let searchPos = 0;
10000
- while ((searchPos = exact.indexOf(entity.exact, searchPos)) !== -1) {
10001
- if (firstOccurrence === -1) firstOccurrence = searchPos;
10002
- occurrenceCount++;
10003
- searchPos++;
10004
- }
10005
- if (occurrenceCount === 0) {
10006
- anchorMethod = "dropped";
10007
- logger2.error("Entity text not found in resource \u2014 dropping", {
10008
- text: entity.exact,
10009
- entityType: entity.entityType,
10010
- llmOffsets: `[${start}:${end}]`,
10011
- anchorMethod,
10012
- resourceStart: exact.substring(0, 200)
10013
- });
10014
- return null;
10015
- }
10016
- let recoveredOffset = -1;
10017
- if (entity.prefix || entity.suffix) {
10018
- let p = 0;
10019
- while ((p = exact.indexOf(entity.exact, p)) !== -1) {
10020
- const candidatePrefix = exact.substring(Math.max(0, p - 32), p);
10021
- const candidateSuffix = exact.substring(
10022
- p + entity.exact.length,
10023
- Math.min(exact.length, p + entity.exact.length + 32)
10024
- );
10025
- const prefixMatch = !entity.prefix || candidatePrefix.endsWith(entity.prefix);
10026
- const suffixMatch = !entity.suffix || candidateSuffix.startsWith(entity.suffix);
10027
- if (prefixMatch && suffixMatch) {
10028
- recoveredOffset = p;
10029
- break;
10030
- }
10031
- p++;
10032
- }
10033
- }
10034
- if (recoveredOffset !== -1) {
10035
- anchorMethod = "context-recovered";
10036
- start = recoveredOffset;
10037
- end = recoveredOffset + entity.exact.length;
10038
- logger2.debug("Entity anchored", {
10039
- text: entity.exact,
10040
- entityType: entity.entityType,
10041
- anchorMethod,
10042
- offsetDiff: recoveredOffset - entity.startOffset
10043
- });
10044
- } else if (occurrenceCount === 1) {
10045
- anchorMethod = "unique-match";
10046
- start = firstOccurrence;
10047
- end = firstOccurrence + entity.exact.length;
10048
- logger2.debug("Entity anchored", {
10049
- text: entity.exact,
10050
- entityType: entity.entityType,
10051
- anchorMethod,
10052
- offsetDiff: firstOccurrence - entity.startOffset
10053
- });
10054
- } else {
10055
- anchorMethod = "first-of-many";
10056
- start = firstOccurrence;
10057
- end = firstOccurrence + entity.exact.length;
10058
- logger2.warn("Entity anchored at first of multiple occurrences \u2014 may be wrong", {
10059
- text: entity.exact,
10060
- entityType: entity.entityType,
10061
- anchorMethod,
10062
- occurrenceCount,
10063
- chosenOffset: firstOccurrence,
10064
- llmOffsets: `[${entity.startOffset}:${entity.endOffset}]`,
10065
- hasPrefix: !!entity.prefix,
10066
- hasSuffix: !!entity.suffix
10067
- });
10068
- }
10069
- }
10070
- return {
10071
- exact: entity.exact,
10072
- entityType: entity.entityType,
10073
- start,
10074
- end,
10075
- prefix: entity.prefix,
10076
- suffix: entity.suffix
10077
- };
10078
- }).filter((entity) => {
10079
- if (entity === null) {
10080
- logger2.debug("Filtered entity: null");
10081
- return false;
10082
- }
10083
- if (entity.start === void 0 || entity.end === void 0) {
10084
- logger2.warn("Filtered entity: missing offsets", { text: entity.exact });
10085
- return false;
10086
- }
10087
- if (entity.start < 0) {
10088
- logger2.warn("Filtered entity: negative start", {
10089
- text: entity.exact,
10090
- start: entity.start
10091
- });
10092
- return false;
10093
- }
10094
- if (entity.end > exact.length) {
10095
- logger2.warn("Filtered entity: end exceeds text length", {
10096
- text: entity.exact,
10097
- end: entity.end,
10098
- textLength: exact.length
10099
- });
10100
- return false;
10101
- }
10102
- const extractedText = exact.substring(entity.start, entity.end);
10103
- if (extractedText !== entity.exact) {
10104
- logger2.warn("Filtered entity: offset mismatch", {
10105
- expected: entity.exact,
10106
- got: extractedText,
10107
- offsets: `[${entity.start}:${entity.end}]`
10108
- });
10109
- return false;
10110
- }
10111
- logger2.debug("Accepted entity", {
10112
- text: entity.exact,
10113
- offsets: `[${entity.start}:${entity.end}]`
10114
- });
10115
- return true;
10116
- });
9960
+ return entities.filter((e) => {
9961
+ const ok = e && typeof e === "object" && typeof e.exact === "string" && typeof e.entityType === "string";
9962
+ if (!ok) {
9963
+ logger2.debug("Dropped malformed LLM entity", { entity: e });
9964
+ }
9965
+ return ok;
9966
+ }).map((entity) => ({
9967
+ exact: entity.exact,
9968
+ entityType: entity.entityType,
9969
+ ...typeof entity.prefix === "string" ? { prefix: entity.prefix } : {},
9970
+ ...typeof entity.suffix === "string" ? { suffix: entity.suffix } : {}
9971
+ }));
10117
9972
  } catch (error) {
10118
9973
  logger2.error("Failed to parse entity extraction response", {
10119
9974
  error: error instanceof Error ? error.message : String(error)
@@ -10249,7 +10104,59 @@ Requirements:
10249
10104
  });
10250
10105
  return result;
10251
10106
  }
10252
- function buildTextAnnotation(resourceId, userId, generator, motivation, match, body) {
10107
/**
 * Project a reconciled selector down to the plain match shape:
 * `exact`, `start`, `end`, plus `prefix` / `suffix` only when they are
 * not `undefined`. Any other properties on `r` are left out.
 *
 * @param r - Object carrying `exact`, `start`, `end` and optional context
 * @returns A new object with the match fields only
 */
function toMatch(r) {
  const match = {
    exact: r.exact,
    start: r.start,
    end: r.end
  };
  if (r.prefix !== void 0) {
    match.prefix = r.prefix;
  }
  if (r.suffix !== void 0) {
    match.suffix = r.suffix;
  }
  return match;
}
10116
/**
 * Build a string key identifying an annotation for de-duplication:
 * `motivation|start|end|JSON(body)`.
 *
 * `start` / `end` come from the target's `TextPositionSelector`. The
 * selector may be given either as an array of selectors or as a single
 * selector object; both forms are accepted so that annotations carrying a
 * lone selector do not all collapse onto the same `?|?` key. Nullish entries
 * in a selector array are ignored. When no position selector is present,
 * `"?"` is used for both offsets.
 *
 * @param ann - Annotation with `motivation`, `target` and optional `body`
 * @returns The de-duplication key
 */
function annotationDedupeKey(ann) {
  const rawSelector = ann.target?.selector;
  let selectors;
  if (Array.isArray(rawSelector)) {
    selectors = rawSelector;
  } else if (rawSelector && typeof rawSelector === "object") {
    // A single selector object is treated as a one-element list.
    selectors = [rawSelector];
  } else {
    selectors = [];
  }
  const pos = selectors.find((s) => s?.type === "TextPositionSelector");
  return [
    ann.motivation,
    pos?.start ?? "?",
    pos?.end ?? "?",
    // `undefined` body is normalised to null so the key is always a string.
    JSON.stringify(ann.body ?? null)
  ].join("|");
}
10127
/**
 * Remove duplicate annotations, keeping the first occurrence of each
 * `annotationDedupeKey` and preserving the input order.
 *
 * @param annotations - Iterable of annotations
 * @returns A new array containing the first annotation seen for each key
 */
function dedupeAnnotations(annotations) {
  // Map iteration follows insertion order, so first-seen order is retained.
  const firstByKey = /* @__PURE__ */ new Map();
  for (const annotation of annotations) {
    const dedupeKey = annotationDedupeKey(annotation);
    if (!firstByKey.has(dedupeKey)) {
      firstByKey.set(dedupeKey, annotation);
    }
  }
  return [...firstByKey.values()];
}
10138
+ function buildTextAnnotation(content, resourceId, userId, generator, motivation, match, body) {
10139
+ if (content.substring(match.start, match.end) !== match.exact) {
10140
+ throw new Error(
10141
+ `buildTextAnnotation invariant: content.substring(${match.start}, ${match.end}) !== exact for resource ${resourceId}, motivation ${motivation}`
10142
+ );
10143
+ }
10144
+ if (match.prefix !== void 0) {
10145
+ const actualPrefix = content.substring(Math.max(0, match.start - match.prefix.length), match.start);
10146
+ if (actualPrefix !== match.prefix) {
10147
+ throw new Error(
10148
+ `buildTextAnnotation invariant: content prefix-slice !== prefix for resource ${resourceId}, motivation ${motivation}`
10149
+ );
10150
+ }
10151
+ }
10152
+ if (match.suffix !== void 0) {
10153
+ const actualSuffix = content.substring(match.end, Math.min(content.length, match.end + match.suffix.length));
10154
+ if (actualSuffix !== match.suffix) {
10155
+ throw new Error(
10156
+ `buildTextAnnotation invariant: content suffix-slice !== suffix for resource ${resourceId}, motivation ${motivation}`
10157
+ );
10158
+ }
10159
+ }
10253
10160
  const creator = didToAgent(userId);
10254
10161
  const wasAttributedTo = creator["@id"] === generator["@id"] ? [generator] : [creator, generator];
10255
10162
  return {
@@ -10288,9 +10195,9 @@ async function processHighlightJob(content, inferenceClient, params, userId, gen
10288
10195
  params.sourceLanguage
10289
10196
  );
10290
10197
  onProgress(60, `Creating ${highlights.length} annotations...`, "creating");
10291
- const annotations = highlights.map(
10292
- (h) => buildTextAnnotation(params.resourceId, userId, generator, "highlighting", h)
10293
- );
10198
+ const annotations = dedupeAnnotations(highlights.map(
10199
+ (h) => buildTextAnnotation(content, params.resourceId, userId, generator, "highlighting", h)
10200
+ ));
10294
10201
  onProgress(100, `Complete! Created ${annotations.length} highlights`, "creating");
10295
10202
  return {
10296
10203
  annotations,
@@ -10311,16 +10218,16 @@ async function processCommentJob(content, inferenceClient, params, userId, gener
10311
10218
  );
10312
10219
  onProgress(60, `Creating ${comments.length} annotations...`, "creating");
10313
10220
  const bodyLanguage = params.language ?? "en";
10314
- const annotations = comments.map(
10221
+ const annotations = dedupeAnnotations(comments.map(
10315
10222
  (c) => (
10316
10223
  // Match the pre-#651 CommentAnnotationWorker: include format and
10317
10224
  // language on the body TextualBody. Optional in the schema, but
10318
10225
  // consumers that do language-aware rendering rely on them.
10319
- buildTextAnnotation(params.resourceId, userId, generator, "commenting", c, [
10226
+ buildTextAnnotation(content, params.resourceId, userId, generator, "commenting", c, [
10320
10227
  { type: "TextualBody", value: c.comment, purpose: "commenting", format: "text/plain", language: bodyLanguage }
10321
10228
  ])
10322
10229
  )
10323
- );
10230
+ ));
10324
10231
  onProgress(100, `Complete! Created ${annotations.length} comments`, "creating");
10325
10232
  return {
10326
10233
  annotations,
@@ -10341,7 +10248,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10341
10248
  );
10342
10249
  onProgress(60, `Creating ${assessments.length} annotations...`, "creating");
10343
10250
  const bodyLanguage = params.language ?? "en";
10344
- const annotations = assessments.map(
10251
+ const annotations = dedupeAnnotations(assessments.map(
10345
10252
  (a) => (
10346
10253
  // Single-object body with purpose aligned to motivation, matching the
10347
10254
  // pre-#651 AssessmentAnnotationWorker's shape and the majority of
@@ -10349,7 +10256,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10349
10256
  // purpose='describing' — that loses the "this is an assessment, not
10350
10257
  // a description" signal and breaks existing readers that access
10351
10258
  // `body.value` directly on the object.
10352
- buildTextAnnotation(params.resourceId, userId, generator, "assessing", a, {
10259
+ buildTextAnnotation(content, params.resourceId, userId, generator, "assessing", a, {
10353
10260
  type: "TextualBody",
10354
10261
  value: a.assessment,
10355
10262
  purpose: "assessing",
@@ -10357,7 +10264,7 @@ async function processAssessmentJob(content, inferenceClient, params, userId, ge
10357
10264
  language: bodyLanguage
10358
10265
  })
10359
10266
  )
10360
- );
10267
+ ));
10361
10268
  onProgress(100, `Complete! Created ${annotations.length} assessments`, "creating");
10362
10269
  return {
10363
10270
  annotations,
@@ -10401,27 +10308,44 @@ async function processReferenceJob(content, inferenceClient, params, userId, gen
10401
10308
  { type: "TextualBody", value: entityTypeName, purpose: "tagging", format: "text/plain", language: bodyLanguage }
10402
10309
  ];
10403
10310
  for (const entity of extractedEntities) {
10404
- try {
10405
- const validated = validateAndCorrectOffsets(content, entity.start, entity.end, entity.exact);
10406
- const ann = buildTextAnnotation(
10407
- params.resourceId,
10408
- userId,
10409
- generator,
10410
- "linking",
10411
- validated,
10412
- unresolvedBody
10413
- );
10414
- allAnnotations.push(ann);
10415
- totalEmitted++;
10416
- } catch {
10311
+ const reconciled = reconcileSelector(content, {
10312
+ exact: entity.exact,
10313
+ ...entity.prefix !== void 0 ? { prefix: entity.prefix } : {},
10314
+ ...entity.suffix !== void 0 ? { suffix: entity.suffix } : {}
10315
+ });
10316
+ if (!reconciled) {
10317
+ logger2.error("Entity dropped \u2014 text not found in source", {
10318
+ text: entity.exact,
10319
+ entityType: entity.entityType
10320
+ });
10417
10321
  errors++;
10322
+ continue;
10323
+ }
10324
+ if (reconciled.anchorMethod === "first-of-many" || reconciled.anchorMethod === "fuzzy-match") {
10325
+ logger2.warn("Entity anchored via degraded method", {
10326
+ text: entity.exact,
10327
+ entityType: entity.entityType,
10328
+ anchorMethod: reconciled.anchorMethod
10329
+ });
10418
10330
  }
10331
+ const ann = buildTextAnnotation(
10332
+ content,
10333
+ params.resourceId,
10334
+ userId,
10335
+ generator,
10336
+ "linking",
10337
+ toMatch(reconciled),
10338
+ unresolvedBody
10339
+ );
10340
+ allAnnotations.push(ann);
10341
+ totalEmitted++;
10419
10342
  }
10420
10343
  }
10421
- onProgress(100, `Complete! Created ${totalEmitted} references`, "creating");
10344
+ const annotations = dedupeAnnotations(allAnnotations);
10345
+ onProgress(100, `Complete! Created ${annotations.length} references`, "creating");
10422
10346
  return {
10423
- annotations: allAnnotations,
10424
- result: { totalFound, totalEmitted, errors }
10347
+ annotations,
10348
+ result: { totalFound, totalEmitted: annotations.length, errors }
10425
10349
  };
10426
10350
  }
10427
10351
  async function processTagJob(content, inferenceClient, params, userId, generator, onProgress) {
@@ -10441,15 +10365,19 @@ async function processTagJob(content, inferenceClient, params, userId, generator
10441
10365
  const tags = allTags;
10442
10366
  onProgress(60, `Creating ${tags.length} tag annotations...`, "creating");
10443
10367
  const bodyLanguage = params.language ?? "en";
10444
- const byCategory = {};
10445
- const annotations = tags.map((t) => {
10368
+ const annotations = dedupeAnnotations(tags.map((t) => {
10446
10369
  const category = t.category ?? "unknown";
10447
- byCategory[category] = (byCategory[category] ?? 0) + 1;
10448
- return buildTextAnnotation(params.resourceId, userId, generator, "tagging", t, [
10370
+ return buildTextAnnotation(content, params.resourceId, userId, generator, "tagging", t, [
10449
10371
  { type: "TextualBody", value: category, purpose: "tagging", format: "text/plain", language: bodyLanguage },
10450
10372
  { type: "TextualBody", value: params.schema.id, purpose: "classifying", format: "text/plain" }
10451
10373
  ]);
10452
- });
10374
+ }));
10375
+ const byCategory = {};
10376
+ for (const ann of annotations) {
10377
+ const body = ann.body;
10378
+ const category = Array.isArray(body) && typeof body[0]?.value === "string" ? body[0].value : "unknown";
10379
+ byCategory[category] = (byCategory[category] ?? 0) + 1;
10380
+ }
10453
10381
  onProgress(100, `Complete! Created ${annotations.length} tags`, "creating");
10454
10382
  return {
10455
10383
  annotations,
@@ -10487,10 +10415,7 @@ async function processGenerationJob(inferenceClient, params, onProgress, logger2
10487
10415
 
10488
10416
  // src/worker-process.ts
10489
10417
  async function emitEvent(session, channel, payload) {
10490
- const isBroadcast = RESOURCE_BROADCAST_TYPES.includes(channel);
10491
- const rawScope = isBroadcast ? payload.resourceId : void 0;
10492
- const resourceScope = rawScope ? resourceId(rawScope) : void 0;
10493
- await session.client.transport.emit(channel, payload, resourceScope);
10418
+ await session.client.transport.emit(channel, payload);
10494
10419
  }
10495
10420
  function startWorkerProcess(config) {
10496
10421
  const { session, logger: logger2 } = config;
@@ -10560,6 +10485,17 @@ async function handleJobInner(adapter, config, job) {
10560
10485
  adapter.failJob(jobId, `Worker not configured for job type: ${jobType}`);
10561
10486
  return;
10562
10487
  }
10488
+ if (jobType !== "generation") {
10489
+ const descriptor = await session.client.browse.resource(resourceId);
10490
+ const mediaType = getPrimaryMediaType(descriptor);
10491
+ const extraction = mediaType ? textExtractionOf(mediaType) : "none";
10492
+ if (extraction === "pdf-text-layer") {
10493
+ throw new Error(`Cannot run ${jobType} on resource ${resourceId}: PDF text-layer detection is not yet supported`);
10494
+ }
10495
+ if (extraction !== "decode") {
10496
+ throw new Error(`Cannot run ${jobType} on resource ${resourceId}: media type '${mediaType ?? "unknown"}' has no extractable text to analyze`);
10497
+ }
10498
+ }
10563
10499
  const onProgress = (percentage, message, stage, extra) => {
10564
10500
  emitEvent(session, "job:report-progress", {
10565
10501
  ...lifecycleBase,