@themoltnet/pi-extension 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +0 -1
- package/dist/index.js +352 -69
- package/package.json +22 -7
package/dist/index.d.ts
CHANGED
|
@@ -246,7 +246,6 @@ declare const Task: TObject< {
|
|
|
246
246
|
input: TRecord<TString, TUnknown>;
|
|
247
247
|
inputSchemaCid: TString;
|
|
248
248
|
inputCid: TString;
|
|
249
|
-
criteriaCid: TUnion<[TString, TNull]>;
|
|
250
249
|
references: TArray<TObject< {
|
|
251
250
|
taskId: TUnion<[TString, TNull]>;
|
|
252
251
|
outputCid: TString;
|
package/dist/index.js
CHANGED
|
@@ -8558,7 +8558,13 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8558
8558
|
/**
|
|
8559
8559
|
* How a judge must score a single criterion.
|
|
8560
8560
|
*
|
|
8561
|
-
* - `
|
|
8561
|
+
* - `llm_score`: 0..1 continuous, `rationale` required. Smooths failures
|
|
8562
|
+
* into the gradient — use `llm_checklist` instead for properties where
|
|
8563
|
+
* a single failure is a real failure (grounding, faithfulness).
|
|
8564
|
+
* - `llm_checklist`: judge enumerates per-claim assertions with
|
|
8565
|
+
* `{passed, evidence}`. The criterion's numeric `score` is derived:
|
|
8566
|
+
* `1` iff every assertion passes, else `0`. Per-claim evidence is the
|
|
8567
|
+
* dataset for cluster-analysis of failure modes. See #999.
|
|
8562
8568
|
* - `boolean`: 0 or 1, `rationale` optional.
|
|
8563
8569
|
* - `deterministic_signature_check`: judge runs a signature check;
|
|
8564
8570
|
* result is 0 or 1. No LLM discretion.
|
|
@@ -8566,11 +8572,31 @@ if (!FormatRegistry.Has("date-time")) FormatRegistry.Set("date-time", (v) => !Nu
|
|
|
8566
8572
|
* appears in the rendered output; 0 or 1.
|
|
8567
8573
|
*/
|
|
8568
8574
|
var RubricScoringMode = Type$1.Union([
|
|
8569
|
-
Type$1.Literal("
|
|
8575
|
+
Type$1.Literal("llm_score"),
|
|
8576
|
+
Type$1.Literal("llm_checklist"),
|
|
8570
8577
|
Type$1.Literal("boolean"),
|
|
8571
8578
|
Type$1.Literal("deterministic_signature_check"),
|
|
8572
8579
|
Type$1.Literal("deterministic_coverage_check")
|
|
8573
8580
|
], { $id: "RubricScoringMode" });
|
|
8581
|
+
/**
|
|
8582
|
+
* One binary check produced by an `llm_checklist`-mode criterion.
|
|
8583
|
+
*
|
|
8584
|
+
* `evidence` is REQUIRED for both PASS and FAIL — agentskills.io grading
|
|
8585
|
+
* principle: "Don't give the benefit of the doubt." A PASS without
|
|
8586
|
+
* concrete evidence (a quoted span, an entry id, a source location)
|
|
8587
|
+
* cannot be audited. A FAIL without evidence cannot be clustered into
|
|
8588
|
+
* structural fixes. The same shape is reused by `judge-eval-variant`
|
|
8589
|
+
* (#943) so tooling, dashboards, and analysis stay uniform.
|
|
8590
|
+
*/
|
|
8591
|
+
var AssertionResult = Type$1.Object({
|
|
8592
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8593
|
+
text: Type$1.String({ minLength: 1 }),
|
|
8594
|
+
passed: Type$1.Boolean(),
|
|
8595
|
+
evidence: Type$1.String({ minLength: 1 })
|
|
8596
|
+
}, {
|
|
8597
|
+
$id: "AssertionResult",
|
|
8598
|
+
additionalProperties: false
|
|
8599
|
+
});
|
|
8574
8600
|
var RubricCriterion = Type$1.Object({
|
|
8575
8601
|
id: Type$1.String({ minLength: 1 }),
|
|
8576
8602
|
description: Type$1.String({ minLength: 1 }),
|
|
@@ -8630,44 +8656,165 @@ unrelated subsystems and the test coverage on the auth path is
|
|
|
8630
8656
|
unchanged" is.
|
|
8631
8657
|
`.trim();
|
|
8632
8658
|
//#endregion
|
|
8659
|
+
//#region ../tasks/src/success-criteria.ts
|
|
8660
|
+
/**
|
|
8661
|
+
* SuccessCriteria — imposer-stated acceptance criteria, evaluated in two
|
|
8662
|
+
* complementary places.
|
|
8663
|
+
*
|
|
8664
|
+
* Before this envelope existed, criteria were scattered: a vestigial
|
|
8665
|
+
* `criteriaCid` column nobody resolved, an `acceptanceCriteria: string[]`
|
|
8666
|
+
* field on `fulfill_brief.input` that was "interpreted by the claiming
|
|
8667
|
+
* agent," and inline `rubric` / `criteria[]` fields on judgment-task
|
|
8668
|
+
* inputs. None of those were machine-verifiable end-to-end.
|
|
8669
|
+
*
|
|
8670
|
+
* This module defines a single, content-addressable envelope an imposer
|
|
8671
|
+
* attaches to any task type. It has four orthogonal sections — pick
|
|
8672
|
+
* whichever apply per task type:
|
|
8673
|
+
*
|
|
8674
|
+
* - `gates` Deterministic structural checks (CID/schema match)
|
|
8675
|
+
* - `assertions` Declarative claims about output JSON
|
|
8676
|
+
* - `rubric` Weighted-criteria scoring instrument, reused
|
|
8677
|
+
* verbatim from `./rubric.ts`.
|
|
8678
|
+
* - `sideEffects` Required process side-effects (e.g. diary entry)
|
|
8679
|
+
*
|
|
8680
|
+
* ## Two roles, two task types
|
|
8681
|
+
*
|
|
8682
|
+
* **Producer self-assessment** (fulfillment tasks: `fulfill_brief`,
|
|
8683
|
+
* `curate_pack`, `render_pack`). The producer **LLM** evaluates the
|
|
8684
|
+
* criteria against its own output and emits a `VerificationRecord`
|
|
8685
|
+
* inside `output.verification`. The daemon is pure passthrough — it
|
|
8686
|
+
* does not run `evaluateAssertions`, does not inspect the verification
|
|
8687
|
+
* record. The REST API is dumb storage; it never re-runs assertions and
|
|
8688
|
+
* never runs LLMs. The cross-field rule
|
|
8689
|
+
* `requireVerificationWhenCriteriaPresent` enforces "verification
|
|
8690
|
+
* required iff successCriteria present" at task-output validation time
|
|
8691
|
+
* (server-side schema check). Self-assessment is a truthful self-rating,
|
|
8692
|
+
* NOT enforcement — `verification.passed=false` does not block /complete
|
|
8693
|
+
* and does not affect `acceptedAttemptN`. See
|
|
8694
|
+
* `docs/agent-runtime.md` for the full producer/judge flow.
|
|
8695
|
+
*
|
|
8696
|
+
* **Binding evaluation** (judgment tasks: `assess_brief`, `judge_pack`).
|
|
8697
|
+
* A separate task whose job IS the application of `successCriteria` to
|
|
8698
|
+
* someone else's output. Different agent (enforced at claim time), same
|
|
8699
|
+
* envelope. The judge's verdict is binding: this is the *gate* in the
|
|
8700
|
+
* MoltNet model. The rubric inside `successCriteria.rubric` IS the job
|
|
8701
|
+
* spec for the judge.
|
|
8702
|
+
*
|
|
8703
|
+
* The clean chain: producer task with `successCriteria` → producer
|
|
8704
|
+
* self-assesses honestly → imposer (or automation) creates a downstream
|
|
8705
|
+
* judgment task that references the same `successCriteria` (or a
|
|
8706
|
+
* stricter rubric) → judgment task delivers the binding verdict.
|
|
8707
|
+
*
|
|
8708
|
+
* Storage: SuccessCriteria lives inline at `task.input.successCriteria`,
|
|
8709
|
+
* pinned via the task's `inputCid`. No separate column or hash. When
|
|
8710
|
+
* #881 lands, the `rubric` field can graduate to `{ rubricCid }` lookup
|
|
8711
|
+
* without changing this envelope, and producer + judge tasks can pin
|
|
8712
|
+
* the SAME rubric across the chain for end-to-end auditability.
|
|
8713
|
+
*/
|
|
8714
|
+
var SchemaCheckSpec = Type$1.Object({ schemaCid: Type$1.String({ minLength: 1 }) }, { additionalProperties: false });
|
|
8715
|
+
var CidEqualsSpec = Type$1.Object({
|
|
8716
|
+
path: Type$1.String({ minLength: 1 }),
|
|
8717
|
+
expected: Type$1.String({ minLength: 1 })
|
|
8718
|
+
}, { additionalProperties: false });
|
|
8719
|
+
var Gate = Type$1.Union([Type$1.Object({
|
|
8720
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8721
|
+
kind: Type$1.Literal("schema-check"),
|
|
8722
|
+
spec: SchemaCheckSpec,
|
|
8723
|
+
required: Type$1.Boolean()
|
|
8724
|
+
}, { additionalProperties: false }), Type$1.Object({
|
|
8725
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8726
|
+
kind: Type$1.Literal("cid-equals"),
|
|
8727
|
+
spec: CidEqualsSpec,
|
|
8728
|
+
required: Type$1.Boolean()
|
|
8729
|
+
}, { additionalProperties: false })], { $id: "Gate" });
|
|
8730
|
+
var AssertionOp = Type$1.Union([
|
|
8731
|
+
Type$1.Literal("exists"),
|
|
8732
|
+
Type$1.Literal("equals"),
|
|
8733
|
+
Type$1.Literal("matches"),
|
|
8734
|
+
Type$1.Literal("in-range"),
|
|
8735
|
+
Type$1.Literal("min-length")
|
|
8736
|
+
], { $id: "AssertionOp" });
|
|
8737
|
+
var Assertion = Type$1.Object({
|
|
8738
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8739
|
+
path: Type$1.String({ minLength: 1 }),
|
|
8740
|
+
op: AssertionOp,
|
|
8741
|
+
value: Type$1.Optional(Type$1.Unknown())
|
|
8742
|
+
}, {
|
|
8743
|
+
$id: "Assertion",
|
|
8744
|
+
additionalProperties: false
|
|
8745
|
+
});
|
|
8746
|
+
var SideEffectsSpec = Type$1.Object({
|
|
8747
|
+
diaryEntryRequired: Type$1.Optional(Type$1.Boolean()),
|
|
8748
|
+
diaryEntryTags: Type$1.Optional(Type$1.Array(Type$1.String({ minLength: 1 }))),
|
|
8749
|
+
referencedEntries: Type$1.Optional(Type$1.Integer({ minimum: 0 }))
|
|
8750
|
+
}, {
|
|
8751
|
+
$id: "SideEffectsSpec",
|
|
8752
|
+
additionalProperties: false
|
|
8753
|
+
});
|
|
8754
|
+
var SuccessCriteria = Type$1.Object({
|
|
8755
|
+
version: Type$1.Literal(1),
|
|
8756
|
+
gates: Type$1.Optional(Type$1.Array(Gate)),
|
|
8757
|
+
assertions: Type$1.Optional(Type$1.Array(Assertion)),
|
|
8758
|
+
rubric: Type$1.Optional(Rubric),
|
|
8759
|
+
minComposite: Type$1.Optional(Type$1.Number({
|
|
8760
|
+
minimum: 0,
|
|
8761
|
+
maximum: 1
|
|
8762
|
+
})),
|
|
8763
|
+
sideEffects: Type$1.Optional(SideEffectsSpec)
|
|
8764
|
+
}, {
|
|
8765
|
+
$id: "SuccessCriteria",
|
|
8766
|
+
additionalProperties: false
|
|
8767
|
+
});
|
|
8768
|
+
var VerificationResultStatus = Type$1.Union([
|
|
8769
|
+
Type$1.Literal("pass"),
|
|
8770
|
+
Type$1.Literal("fail"),
|
|
8771
|
+
Type$1.Literal("skip")
|
|
8772
|
+
], { $id: "VerificationResultStatus" });
|
|
8773
|
+
var VerificationResultKind = Type$1.Union([
|
|
8774
|
+
Type$1.Literal("gate"),
|
|
8775
|
+
Type$1.Literal("assertion"),
|
|
8776
|
+
Type$1.Literal("rubric"),
|
|
8777
|
+
Type$1.Literal("sideEffect")
|
|
8778
|
+
], { $id: "VerificationResultKind" });
|
|
8779
|
+
var VerificationResult = Type$1.Object({
|
|
8780
|
+
id: Type$1.String({ minLength: 1 }),
|
|
8781
|
+
kind: VerificationResultKind,
|
|
8782
|
+
status: VerificationResultStatus,
|
|
8783
|
+
detail: Type$1.Optional(Type$1.String())
|
|
8784
|
+
}, {
|
|
8785
|
+
$id: "VerificationResult",
|
|
8786
|
+
additionalProperties: false
|
|
8787
|
+
});
|
|
8788
|
+
var VerificationRecord = Type$1.Object({
|
|
8789
|
+
inputCid: Type$1.String({ minLength: 1 }),
|
|
8790
|
+
results: Type$1.Array(VerificationResult),
|
|
8791
|
+
passed: Type$1.Boolean()
|
|
8792
|
+
}, {
|
|
8793
|
+
$id: "VerificationRecord",
|
|
8794
|
+
additionalProperties: false
|
|
8795
|
+
});
|
|
8796
|
+
//#endregion
|
|
8633
8797
|
//#region ../tasks/src/task-types/assess-brief.ts
|
|
8634
8798
|
/**
|
|
8635
8799
|
* `assess_brief` — independently evaluate a fulfilled brief.
|
|
8636
8800
|
*
|
|
8637
8801
|
* output_kind: judgment
|
|
8638
|
-
* criteria: required (rubric
|
|
8639
|
-
*
|
|
8802
|
+
* criteria: required (`successCriteria.rubric` — same envelope as
|
|
8803
|
+
* `judge_pack`)
|
|
8640
8804
|
* references: required (must reference the target `fulfill_brief` task)
|
|
8641
8805
|
*
|
|
8642
8806
|
* The assessor is a different agent from the producer (enforced by the
|
|
8643
8807
|
* server / runtime at claim time — not in the wire schema).
|
|
8808
|
+
*
|
|
8809
|
+
* The rubric in `successCriteria` IS the job spec — the assessor applies
|
|
8810
|
+
* it to the target task's output and emits per-criterion scores. Other
|
|
8811
|
+
* sections (`assertions`, `gates`, `sideEffects`) MAY be present and are
|
|
8812
|
+
* evaluated against the *assessor's output*.
|
|
8644
8813
|
*/
|
|
8645
8814
|
var ASSESS_BRIEF_TYPE = "assess_brief";
|
|
8646
|
-
/**
|
|
8647
|
-
* One criterion lifted from the rubric. Denormalized into the input so the
|
|
8648
|
-
* assessor prompt can be built without a second fetch; the `criteria_cid`
|
|
8649
|
-
* on the Task row remains authoritative for verification.
|
|
8650
|
-
*/
|
|
8651
|
-
var AssessBriefCriterion = Type$1.Object({
|
|
8652
|
-
id: Type$1.String({ minLength: 1 }),
|
|
8653
|
-
description: Type$1.String({ minLength: 1 }),
|
|
8654
|
-
weight: Type$1.Number({
|
|
8655
|
-
minimum: 0,
|
|
8656
|
-
maximum: 1
|
|
8657
|
-
}),
|
|
8658
|
-
scoring: Type$1.Union([
|
|
8659
|
-
Type$1.Literal("llm_judged"),
|
|
8660
|
-
Type$1.Literal("boolean"),
|
|
8661
|
-
Type$1.Literal("deterministic_signature_check")
|
|
8662
|
-
])
|
|
8663
|
-
}, {
|
|
8664
|
-
$id: "AssessBriefCriterion",
|
|
8665
|
-
additionalProperties: false
|
|
8666
|
-
});
|
|
8667
8815
|
var AssessBriefInput = Type$1.Object({
|
|
8668
8816
|
targetTaskId: Type$1.String({ format: "uuid" }),
|
|
8669
|
-
|
|
8670
|
-
rubricPreamble: Type$1.Optional(Type$1.String())
|
|
8817
|
+
successCriteria: SuccessCriteria
|
|
8671
8818
|
}, {
|
|
8672
8819
|
$id: "AssessBriefInput",
|
|
8673
8820
|
additionalProperties: false
|
|
@@ -8736,7 +8883,8 @@ var CuratePackInput = Type$1.Object({
|
|
|
8736
8883
|
prefix: Type$1.Optional(Type$1.String())
|
|
8737
8884
|
}, { additionalProperties: false })),
|
|
8738
8885
|
tokenBudget: Type$1.Optional(Type$1.Number({ minimum: 500 })),
|
|
8739
|
-
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")]))
|
|
8886
|
+
recipe: Type$1.Optional(Type$1.Union([Type$1.Literal("topic-focused-v1"), Type$1.Literal("scope-inventory-v1")])),
|
|
8887
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
8740
8888
|
}, {
|
|
8741
8889
|
$id: "CuratePackInput",
|
|
8742
8890
|
additionalProperties: false
|
|
@@ -8761,7 +8909,8 @@ var CuratePackOutput = Type$1.Object({
|
|
|
8761
8909
|
droppedIds: Type$1.Optional(Type$1.Array(Type$1.String({ format: "uuid" }))),
|
|
8762
8910
|
notes: Type$1.String({ minLength: 1 })
|
|
8763
8911
|
}, { additionalProperties: false }))),
|
|
8764
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8912
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8913
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8765
8914
|
}, {
|
|
8766
8915
|
$id: "CuratePackOutput",
|
|
8767
8916
|
additionalProperties: false
|
|
@@ -8780,6 +8929,7 @@ var FulfillBriefInput = Type$1.Object({
|
|
|
8780
8929
|
brief: Type$1.String({ minLength: 1 }),
|
|
8781
8930
|
title: Type$1.Optional(Type$1.String()),
|
|
8782
8931
|
acceptanceCriteria: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
8932
|
+
successCriteria: Type$1.Optional(SuccessCriteria),
|
|
8783
8933
|
seedFiles: Type$1.Optional(Type$1.Array(Type$1.String())),
|
|
8784
8934
|
scopeHint: Type$1.Optional(Type$1.String())
|
|
8785
8935
|
}, {
|
|
@@ -8799,7 +8949,8 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
8799
8949
|
}, { additionalProperties: false })),
|
|
8800
8950
|
pullRequestUrl: Type$1.Union([Type$1.String(), Type$1.Null()]),
|
|
8801
8951
|
diaryEntryIds: Type$1.Array(Type$1.String({ format: "uuid" })),
|
|
8802
|
-
summary: Type$1.String({ minLength: 1 })
|
|
8952
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
8953
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8803
8954
|
}, {
|
|
8804
8955
|
$id: "FulfillBriefOutput",
|
|
8805
8956
|
additionalProperties: false
|
|
@@ -8810,19 +8961,18 @@ var FulfillBriefOutput = Type$1.Object({
|
|
|
8810
8961
|
* `judge_pack` — independently score a rendered pack against a rubric.
|
|
8811
8962
|
*
|
|
8812
8963
|
* output_kind: judgment
|
|
8813
|
-
* criteria: required (
|
|
8814
|
-
*
|
|
8964
|
+
* criteria: required (`successCriteria.rubric` — see #852 amendment and
|
|
8965
|
+
* Phase 2 issue #881)
|
|
8815
8966
|
* references: required (must reference the `render_pack` task it judges,
|
|
8816
8967
|
* role='judged_work')
|
|
8817
8968
|
*
|
|
8818
8969
|
* Step 3 of the three-session attribution loop (#875). Mirrors
|
|
8819
8970
|
* `assess_brief` in shape, but over a rendered context pack.
|
|
8820
8971
|
*
|
|
8821
|
-
* Phase 1 rubric storage: the rubric body
|
|
8822
|
-
*
|
|
8823
|
-
* replace the inline body with a `
|
|
8824
|
-
*
|
|
8825
|
-
* building without a fetch.
|
|
8972
|
+
* Phase 1 rubric storage: the rubric body lives at
|
|
8973
|
+
* `input.successCriteria.rubric` and is pinned via the task's `inputCid`.
|
|
8974
|
+
* Phase 2 (#881) will replace the inline body with a `rubricCid`
|
|
8975
|
+
* referencing a stored `rubrics` row; the envelope stays the same.
|
|
8826
8976
|
*
|
|
8827
8977
|
* The judge MUST be a different agent from the renderer. Enforced at
|
|
8828
8978
|
* claim time by the runtime, not in the wire schema.
|
|
@@ -8831,7 +8981,7 @@ var JUDGE_PACK_TYPE = "judge_pack";
|
|
|
8831
8981
|
var JudgePackInput = Type$1.Object({
|
|
8832
8982
|
renderedPackId: Type$1.String({ format: "uuid" }),
|
|
8833
8983
|
sourcePackId: Type$1.String({ format: "uuid" }),
|
|
8834
|
-
|
|
8984
|
+
successCriteria: SuccessCriteria
|
|
8835
8985
|
}, {
|
|
8836
8986
|
$id: "JudgePackInput",
|
|
8837
8987
|
additionalProperties: false
|
|
@@ -8844,6 +8994,7 @@ var JudgePackScore = Type$1.Object({
|
|
|
8844
8994
|
maximum: 1
|
|
8845
8995
|
}),
|
|
8846
8996
|
rationale: Type$1.Optional(Type$1.String()),
|
|
8997
|
+
assertions: Type$1.Optional(Type$1.Array(AssertionResult, { minItems: 1 })),
|
|
8847
8998
|
evidence: Type$1.Optional(Type$1.Record(Type$1.String(), Type$1.Unknown()))
|
|
8848
8999
|
}, {
|
|
8849
9000
|
$id: "JudgePackScore",
|
|
@@ -8862,6 +9013,39 @@ var JudgePackOutput = Type$1.Object({
|
|
|
8862
9013
|
$id: "JudgePackOutput",
|
|
8863
9014
|
additionalProperties: false
|
|
8864
9015
|
});
|
|
9016
|
+
/**
|
|
9017
|
+
* Cross-field validator for JudgePackOutput. Run after the TypeBox
|
|
9018
|
+
* schema check passes. Enforces invariants the schema can't express:
|
|
9019
|
+
*
|
|
9020
|
+
* 1. If a `JudgePackScore` carries an `assertions` array (i.e. the
|
|
9021
|
+
* judge ran the criterion in `llm_checklist` mode), its numeric
|
|
9022
|
+
* `score` MUST equal `1` if every `assertions[i].passed` is true,
|
|
9023
|
+
* else `0`. The prompt instructs the judge to derive `score` from
|
|
9024
|
+
* the array, but the LLM can drift — without this check, the
|
|
9025
|
+
* runtime accepts inconsistent payloads and propagates them into
|
|
9026
|
+
* composite scores and judge attestations (#999 P1).
|
|
9027
|
+
*
|
|
9028
|
+
* 2. If `score` is exactly `1` AND `assertions` is present, every
|
|
9029
|
+
* assertion must have `passed: true`. Catches the failure mode in
|
|
9030
|
+
* the issue: "score: 1 with a failing assertion accepted."
|
|
9031
|
+
*
|
|
9032
|
+
* Cross-rubric checks (e.g. "did the judge populate `assertions` for
|
|
9033
|
+
* every criterion the rubric marked `llm_checklist`?") require the
|
|
9034
|
+
* input rubric and live in a separate, runtime-side validator. This
|
|
9035
|
+
* one is rubric-agnostic on purpose — it catches within-score
|
|
9036
|
+
* inconsistency without needing the original task input.
|
|
9037
|
+
*/
|
|
9038
|
+
function validateJudgePackOutput(output) {
|
|
9039
|
+
const scores = output.scores;
|
|
9040
|
+
for (let i = 0; i < scores.length; i++) {
|
|
9041
|
+
const s = scores[i];
|
|
9042
|
+
if (!s.assertions) continue;
|
|
9043
|
+
const allPassed = s.assertions.every((a) => a.passed);
|
|
9044
|
+
const expected = allPassed ? 1 : 0;
|
|
9045
|
+
if (s.score !== expected) return `scores[${i}] (criterionId="${s.criterionId}"): assertions ${allPassed ? "all pass" : "have at least one fail"} but score=${s.score}. Score must be derived: 1 iff every assertion passes, else 0 (#999 llm_checklist rule).`;
|
|
9046
|
+
}
|
|
9047
|
+
return null;
|
|
9048
|
+
}
|
|
8865
9049
|
//#endregion
|
|
8866
9050
|
//#region ../tasks/src/task-types/render-pack.ts
|
|
8867
9051
|
/**
|
|
@@ -8883,7 +9067,8 @@ var RENDER_PACK_TYPE = "render_pack";
|
|
|
8883
9067
|
var RenderPackInput = Type$1.Object({
|
|
8884
9068
|
packId: Type$1.String({ format: "uuid" }),
|
|
8885
9069
|
persist: Type$1.Optional(Type$1.Boolean()),
|
|
8886
|
-
pinned: Type$1.Optional(Type$1.Boolean())
|
|
9070
|
+
pinned: Type$1.Optional(Type$1.Boolean()),
|
|
9071
|
+
successCriteria: Type$1.Optional(SuccessCriteria)
|
|
8887
9072
|
}, {
|
|
8888
9073
|
$id: "RenderPackInput",
|
|
8889
9074
|
additionalProperties: false
|
|
@@ -8894,7 +9079,8 @@ var RenderPackOutput = Type$1.Object({
|
|
|
8894
9079
|
renderMethod: Type$1.String({ minLength: 1 }),
|
|
8895
9080
|
byteSize: Type$1.Number({ minimum: 0 }),
|
|
8896
9081
|
entriesRendered: Type$1.Number({ minimum: 0 }),
|
|
8897
|
-
summary: Type$1.String({ minLength: 1 })
|
|
9082
|
+
summary: Type$1.String({ minLength: 1 }),
|
|
9083
|
+
verification: Type$1.Optional(VerificationRecord)
|
|
8898
9084
|
}, {
|
|
8899
9085
|
$id: "RenderPackOutput",
|
|
8900
9086
|
additionalProperties: false
|
|
@@ -8902,6 +9088,33 @@ var RenderPackOutput = Type$1.Object({
|
|
|
8902
9088
|
//#endregion
|
|
8903
9089
|
//#region ../tasks/src/task-types/index.ts
|
|
8904
9090
|
/**
|
|
9091
|
+
* Validate that a judgment-task input carries a rubric inside its
|
|
9092
|
+
* `successCriteria` envelope, and that the rubric's weights sum to 1.
|
|
9093
|
+
* Used for `assess_brief` and `judge_pack`.
|
|
9094
|
+
*/
|
|
9095
|
+
function validateJudgmentInput(input) {
|
|
9096
|
+
const sc = input.successCriteria;
|
|
9097
|
+
if (!sc) return "successCriteria is required for judgment tasks";
|
|
9098
|
+
if (!sc.rubric) return "successCriteria.rubric is required for judgment tasks";
|
|
9099
|
+
return validateRubricWeights(sc.rubric);
|
|
9100
|
+
}
|
|
9101
|
+
/**
|
|
9102
|
+
* Cross-field rule: when `input.successCriteria` is set, the producer's
|
|
9103
|
+
* output MUST carry a `verification` block (the LLM's self-assessment).
|
|
9104
|
+
* When it is unset, the output MUST NOT carry one (avoid garbage data).
|
|
9105
|
+
*
|
|
9106
|
+
* Used by all three fulfillment task types. Judgment task outputs do
|
|
9107
|
+
* NOT use this — their entire output IS a structured judgment, so a
|
|
9108
|
+
* separate self-assessment field would be circular.
|
|
9109
|
+
*/
|
|
9110
|
+
function requireVerificationWhenCriteriaPresent(output, input) {
|
|
9111
|
+
const hasCriteria = input !== void 0 && input !== null && input.successCriteria !== void 0;
|
|
9112
|
+
const hasVerification = output.verification !== void 0;
|
|
9113
|
+
if (hasCriteria && !hasVerification) return "output.verification is required because input.successCriteria is set; the producer LLM must self-assess against the criteria";
|
|
9114
|
+
if (!hasCriteria && hasVerification) return "output.verification was supplied but input.successCriteria is unset; omit verification when there are no criteria to assess against";
|
|
9115
|
+
return null;
|
|
9116
|
+
}
|
|
9117
|
+
/**
|
|
8905
9118
|
* Client-side task-type registry. Mirrors the server-owned DB registry
|
|
8906
9119
|
* (PR 2). PR 0 shipped the two brief types; this PR adds the three
|
|
8907
9120
|
* pack-pipeline types for the three-session attribution loop (#875).
|
|
@@ -8916,41 +9129,41 @@ var BUILT_IN_TASK_TYPES = {
|
|
|
8916
9129
|
inputSchema: FulfillBriefInput,
|
|
8917
9130
|
outputSchema: FulfillBriefOutput,
|
|
8918
9131
|
outputKind: "artifact",
|
|
8919
|
-
|
|
8920
|
-
|
|
9132
|
+
requiresReferences: false,
|
|
9133
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8921
9134
|
},
|
|
8922
9135
|
[ASSESS_BRIEF_TYPE]: {
|
|
8923
9136
|
name: ASSESS_BRIEF_TYPE,
|
|
8924
9137
|
inputSchema: AssessBriefInput,
|
|
8925
9138
|
outputSchema: AssessBriefOutput,
|
|
8926
9139
|
outputKind: "judgment",
|
|
8927
|
-
|
|
8928
|
-
|
|
9140
|
+
requiresReferences: true,
|
|
9141
|
+
validateInput: validateJudgmentInput
|
|
8929
9142
|
},
|
|
8930
9143
|
[CURATE_PACK_TYPE]: {
|
|
8931
9144
|
name: CURATE_PACK_TYPE,
|
|
8932
9145
|
inputSchema: CuratePackInput,
|
|
8933
9146
|
outputSchema: CuratePackOutput,
|
|
8934
9147
|
outputKind: "artifact",
|
|
8935
|
-
|
|
8936
|
-
|
|
9148
|
+
requiresReferences: false,
|
|
9149
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8937
9150
|
},
|
|
8938
9151
|
[RENDER_PACK_TYPE]: {
|
|
8939
9152
|
name: RENDER_PACK_TYPE,
|
|
8940
9153
|
inputSchema: RenderPackInput,
|
|
8941
9154
|
outputSchema: RenderPackOutput,
|
|
8942
9155
|
outputKind: "artifact",
|
|
8943
|
-
|
|
8944
|
-
|
|
9156
|
+
requiresReferences: false,
|
|
9157
|
+
validateOutput: requireVerificationWhenCriteriaPresent
|
|
8945
9158
|
},
|
|
8946
9159
|
[JUDGE_PACK_TYPE]: {
|
|
8947
9160
|
name: JUDGE_PACK_TYPE,
|
|
8948
9161
|
inputSchema: JudgePackInput,
|
|
8949
9162
|
outputSchema: JudgePackOutput,
|
|
8950
9163
|
outputKind: "judgment",
|
|
8951
|
-
requiresCriteria: false,
|
|
8952
9164
|
requiresReferences: true,
|
|
8953
|
-
validateInput:
|
|
9165
|
+
validateInput: validateJudgmentInput,
|
|
9166
|
+
validateOutput: validateJudgePackOutput
|
|
8954
9167
|
}
|
|
8955
9168
|
};
|
|
8956
9169
|
//#endregion
|
|
@@ -8980,13 +9193,22 @@ function schemaErrors(prefix, schema, value) {
|
|
|
8980
9193
|
message: error.message
|
|
8981
9194
|
}));
|
|
8982
9195
|
}
|
|
8983
|
-
function validateTaskOutput(taskType, output) {
|
|
9196
|
+
function validateTaskOutput(taskType, output, input) {
|
|
8984
9197
|
const entry = getTaskTypeEntry(taskType);
|
|
8985
9198
|
if (!entry) return [{
|
|
8986
9199
|
field: "taskType",
|
|
8987
9200
|
message: `Unknown task type: ${taskType}`
|
|
8988
9201
|
}];
|
|
8989
|
-
|
|
9202
|
+
const errors = schemaErrors("output", entry.outputSchema, output);
|
|
9203
|
+
if (errors.length > 0) return errors;
|
|
9204
|
+
if (entry.validateOutput) {
|
|
9205
|
+
const validationError = entry.validateOutput(output, input);
|
|
9206
|
+
if (validationError) return [{
|
|
9207
|
+
field: "output",
|
|
9208
|
+
message: validationError
|
|
9209
|
+
}];
|
|
9210
|
+
}
|
|
9211
|
+
return [];
|
|
8990
9212
|
}
|
|
8991
9213
|
/**
|
|
8992
9214
|
* Resolve the TypeBox output schema registered for `taskType`. Returns
|
|
@@ -9126,7 +9348,6 @@ Type$1.Object({
|
|
|
9126
9348
|
input: Type$1.Record(Type$1.String(), Type$1.Unknown()),
|
|
9127
9349
|
inputSchemaCid: Cid,
|
|
9128
9350
|
inputCid: Cid,
|
|
9129
|
-
criteriaCid: Type$1.Union([Cid, Type$1.Null()]),
|
|
9130
9351
|
references: Type$1.Array(TaskRef),
|
|
9131
9352
|
correlationId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
9132
9353
|
imposedByAgentId: Type$1.Union([Uuid, Type$1.Null()]),
|
|
@@ -9340,11 +9561,12 @@ function buildFinalOutputBlock(opts) {
|
|
|
9340
9561
|
* anything) work without any code path here.
|
|
9341
9562
|
*/
|
|
9342
9563
|
function buildAssessBriefPrompt(input, ctx) {
|
|
9343
|
-
const
|
|
9344
|
-
const
|
|
9564
|
+
const rubric = input.successCriteria.rubric;
|
|
9565
|
+
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9566
|
+
const preambleSection = rubric.preamble ? [
|
|
9345
9567
|
"### Rubric preamble",
|
|
9346
9568
|
"",
|
|
9347
|
-
|
|
9569
|
+
rubric.preamble,
|
|
9348
9570
|
""
|
|
9349
9571
|
].join("\n") : "";
|
|
9350
9572
|
return [
|
|
@@ -9394,7 +9616,7 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9394
9616
|
"",
|
|
9395
9617
|
"### Scoring rules",
|
|
9396
9618
|
"",
|
|
9397
|
-
"- `
|
|
9619
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4 sentences).",
|
|
9398
9620
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9399
9621
|
"- `deterministic_signature_check`: run `moltnet entry verify` on every diary entry returned by step 3 above AND `git verify-commit` on every commit. Score 1 iff ALL signatures are valid; otherwise 0. Populate `evidence.commitsVerified`, `evidence.commitsTotal`, `evidence.signatureFailures`.",
|
|
9400
9622
|
"",
|
|
@@ -9418,6 +9640,39 @@ function buildAssessBriefPrompt(input, ctx) {
|
|
|
9418
9640
|
].filter(Boolean).join("\n");
|
|
9419
9641
|
}
|
|
9420
9642
|
//#endregion
|
|
9643
|
+
//#region ../agent-runtime/src/prompts/self-verification.ts
|
|
9644
|
+
function buildSelfVerificationBlock(taskId) {
|
|
9645
|
+
return [
|
|
9646
|
+
"## Self-verification",
|
|
9647
|
+
"",
|
|
9648
|
+
`Call \`moltnet_get_task\` with task id \`${taskId}\` and read \`input.successCriteria\`.`,
|
|
9649
|
+
"",
|
|
9650
|
+
"- If `input.successCriteria` is **absent**, omit `verification` from your",
|
|
9651
|
+
" final output entirely.",
|
|
9652
|
+
"- If `input.successCriteria` is **present**, you MUST include a",
|
|
9653
|
+
" `verification` block in your final output. Evaluate every applicable",
|
|
9654
|
+
" item — `gates`, `assertions`, `rubric` criteria, `sideEffects` — against",
|
|
9655
|
+
" your produced work and emit one result per id. Be honest: a `fail` with",
|
|
9656
|
+
" a one-line reason is more useful than a false `pass`. Use `skip` (with a",
|
|
9657
|
+
" `detail`) when you genuinely could not determine a result. Compute",
|
|
9658
|
+
" `passed = results.every(r => r.status !== 'fail')`.",
|
|
9659
|
+
"",
|
|
9660
|
+
"Verification shape:",
|
|
9661
|
+
"",
|
|
9662
|
+
"```json",
|
|
9663
|
+
"{",
|
|
9664
|
+
" \"inputCid\": \"<the inputCid you saw on the task>\",",
|
|
9665
|
+
" \"results\": [",
|
|
9666
|
+
" { \"id\": \"<criterion id>\", \"kind\": \"assertion|gate|rubric|sideEffect\",",
|
|
9667
|
+
" \"status\": \"pass|fail|skip\", \"detail\": \"<optional one-liner>\" }",
|
|
9668
|
+
" ],",
|
|
9669
|
+
" \"passed\": <boolean>",
|
|
9670
|
+
"}",
|
|
9671
|
+
"```",
|
|
9672
|
+
""
|
|
9673
|
+
].join("\n");
|
|
9674
|
+
}
|
|
9675
|
+
//#endregion
|
|
9421
9676
|
//#region ../agent-runtime/src/prompts/curate-pack.ts
|
|
9422
9677
|
/**
|
|
9423
9678
|
* Build the system prompt for a `curate_pack` task.
|
|
@@ -9552,6 +9807,7 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9552
9807
|
" output, not in the diary.",
|
|
9553
9808
|
"- Respect hard include/exclude filters literally.",
|
|
9554
9809
|
"",
|
|
9810
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9555
9811
|
buildFinalOutputBlock({
|
|
9556
9812
|
taskType: "curate_pack",
|
|
9557
9813
|
outputSchemaName: "CuratePackOutput",
|
|
@@ -9566,7 +9822,8 @@ function buildCuratePackPrompt(input, ctx) {
|
|
|
9566
9822
|
" \"checkpoints\": [",
|
|
9567
9823
|
" { \"phase\": \"recon\", \"candidateIds\": [...], \"droppedIds\": [...], \"notes\": \"...\" }",
|
|
9568
9824
|
" ],",
|
|
9569
|
-
" \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\"",
|
|
9825
|
+
" \"summary\": \"<2-4 sentences: what you looked for, how you narrowed, what defines the final set>\",",
|
|
9826
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9570
9827
|
"}"
|
|
9571
9828
|
].join("\n")
|
|
9572
9829
|
})
|
|
@@ -9627,6 +9884,7 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9627
9884
|
" `MoltNet-Diary: <id>` (per the runtime instructor).",
|
|
9628
9885
|
"6. Push the branch and open a PR.",
|
|
9629
9886
|
"",
|
|
9887
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9630
9888
|
buildFinalOutputBlock({
|
|
9631
9889
|
taskType: "fulfill_brief",
|
|
9632
9890
|
outputSchemaName: "FulfillBriefOutput",
|
|
@@ -9636,7 +9894,8 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9636
9894
|
" \"commits\": [{ \"sha\": \"...\", \"message\": \"...\", \"diaryEntryId\": \"...\" }],",
|
|
9637
9895
|
" \"pullRequestUrl\": \"<url-or-null>\",",
|
|
9638
9896
|
" \"diaryEntryIds\": [\"...\"],",
|
|
9639
|
-
" \"summary\": \"<1-3 sentence recap>\"",
|
|
9897
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
9898
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9640
9899
|
"}"
|
|
9641
9900
|
].join("\n")
|
|
9642
9901
|
})
|
|
@@ -9645,7 +9904,8 @@ function buildFulfillBriefPrompt(input, ctx) {
|
|
|
9645
9904
|
//#endregion
|
|
9646
9905
|
//#region ../agent-runtime/src/prompts/judge-pack.ts
|
|
9647
9906
|
function buildJudgePackPrompt(input, ctx) {
|
|
9648
|
-
const { renderedPackId, sourcePackId, rubric } = input;
|
|
9907
|
+
const { renderedPackId, sourcePackId, successCriteria } = input;
|
|
9908
|
+
const rubric = successCriteria.rubric;
|
|
9649
9909
|
const criteriaList = rubric.criteria.map((c, i) => `${i + 1}. **${c.id}** (weight ${c.weight}, scoring: \`${c.scoring}\`) — ${c.description}`).join("\n");
|
|
9650
9910
|
const preambleSection = rubric.preamble ? [
|
|
9651
9911
|
"### Rubric preamble",
|
|
@@ -9675,7 +9935,7 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9675
9935
|
"",
|
|
9676
9936
|
"1. Call `moltnet_rendered_pack_get` for the rendered pack. Keep the",
|
|
9677
9937
|
" `content` string — you will score it.",
|
|
9678
|
-
"2. Call `moltnet_pack_get` with `
|
|
9938
|
+
"2. Call `moltnet_pack_get` with `expandEntries: true` for the source",
|
|
9679
9939
|
" pack. Keep the source entries for grounding / coverage checks.",
|
|
9680
9940
|
"3. For each criterion, score according to its `scoring` mode (see",
|
|
9681
9941
|
" Scoring rules below). Produce rationales where required.",
|
|
@@ -9688,9 +9948,23 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9688
9948
|
"",
|
|
9689
9949
|
"### Scoring rules",
|
|
9690
9950
|
"",
|
|
9691
|
-
"- `
|
|
9951
|
+
"- `llm_score`: score 0..1 continuous. `rationale` REQUIRED (2–4",
|
|
9692
9952
|
" sentences pointing at specific evidence in the rendered content or",
|
|
9693
|
-
" the source entries).",
|
|
9953
|
+
" the source entries). NOTE: this mode smooths individual failures",
|
|
9954
|
+
" into the gradient. Prefer `llm_checklist` for grounding,",
|
|
9955
|
+
" faithfulness, or any property where one failure is a real failure.",
|
|
9956
|
+
"- `llm_checklist`: enumerate per-claim binary assertions instead of",
|
|
9957
|
+
" picking a continuous score. For each assertion, return",
|
|
9958
|
+
" `{ id, text, passed: bool, evidence: string }`. `evidence` is",
|
|
9959
|
+
" REQUIRED for both PASS and FAIL — for PASS, quote the supporting",
|
|
9960
|
+
" span (rendered or source) or cite the source entry id; for FAIL,",
|
|
9961
|
+
" quote the offending claim verbatim and explain why it fails.",
|
|
9962
|
+
" Don't give the benefit of the doubt: if a claim looks supported but",
|
|
9963
|
+
" you cannot point at the supporting source span, mark it FAIL with",
|
|
9964
|
+
" evidence = \"no supporting span found\". Set the criterion `score`",
|
|
9965
|
+
" to `1` iff every assertion passes, else `0` — the runtime checks",
|
|
9966
|
+
" this matches the assertions array. Populate `assertions` on the",
|
|
9967
|
+
" score object; leave `evidence` (the structured record) empty.",
|
|
9694
9968
|
"- `boolean`: score exactly 0 or 1. `rationale` optional.",
|
|
9695
9969
|
"- `deterministic_signature_check`: batch-fetch ALL referenced source",
|
|
9696
9970
|
" entries in a single call — `moltnet_list_entries` with `entryIds` set",
|
|
@@ -9730,7 +10004,14 @@ function buildJudgePackPrompt(input, ctx) {
|
|
|
9730
10004
|
shapeSketch: [
|
|
9731
10005
|
"{",
|
|
9732
10006
|
" \"scores\": [",
|
|
9733
|
-
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} }",
|
|
10007
|
+
" { \"criterionId\": \"...\", \"score\": 0.0, \"rationale\": \"...\", \"evidence\": {} },",
|
|
10008
|
+
" {",
|
|
10009
|
+
" \"criterionId\": \"<llm_checklist criterion>\",",
|
|
10010
|
+
" \"score\": 0, // 1 iff every assertion passed",
|
|
10011
|
+
" \"assertions\": [",
|
|
10012
|
+
" { \"id\": \"claim-1\", \"text\": \"...\", \"passed\": false, \"evidence\": \"...\" }",
|
|
10013
|
+
" ]",
|
|
10014
|
+
" }",
|
|
9734
10015
|
" ],",
|
|
9735
10016
|
" \"composite\": <sum-of-weighted-scores>,",
|
|
9736
10017
|
" \"verdict\": \"<1-3 sentence overall>\",",
|
|
@@ -9772,7 +10053,7 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9772
10053
|
"",
|
|
9773
10054
|
"## Workflow",
|
|
9774
10055
|
"",
|
|
9775
|
-
"1. Call `moltnet_pack_get` with `
|
|
10056
|
+
"1. Call `moltnet_pack_get` with `expandEntries: true` to inspect the",
|
|
9776
10057
|
" source entries. Read it — you need the entry count for your output.",
|
|
9777
10058
|
"2. Call `moltnet_pack_render` with:",
|
|
9778
10059
|
` - \`packId\`: \`${packId}\``,
|
|
@@ -9787,6 +10068,7 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9787
10068
|
"- Do NOT write diary entries unless a genuine incident occurs",
|
|
9788
10069
|
" (rendering failure, invariant violation).",
|
|
9789
10070
|
"",
|
|
10071
|
+
buildSelfVerificationBlock(ctx.taskId),
|
|
9790
10072
|
buildFinalOutputBlock({
|
|
9791
10073
|
taskType: "render_pack",
|
|
9792
10074
|
outputSchemaName: "RenderPackOutput",
|
|
@@ -9797,7 +10079,8 @@ function buildRenderPackPrompt(input, ctx) {
|
|
|
9797
10079
|
" \"renderMethod\": \"<label>\",",
|
|
9798
10080
|
" \"byteSize\": <int>,",
|
|
9799
10081
|
" \"entriesRendered\": <int>,",
|
|
9800
|
-
" \"summary\": \"<1-3 sentence recap>\"",
|
|
10082
|
+
" \"summary\": \"<1-3 sentence recap>\",",
|
|
10083
|
+
" \"verification\": <required iff input.successCriteria; see Self-verification>",
|
|
9801
10084
|
"}"
|
|
9802
10085
|
].join("\n")
|
|
9803
10086
|
})
|
|
@@ -13567,9 +13850,9 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
13567
13850
|
description: contract.description,
|
|
13568
13851
|
parameters: schema,
|
|
13569
13852
|
async execute(_id, params) {
|
|
13570
|
-
const errors =
|
|
13853
|
+
const errors = validateTaskOutput(taskType, params);
|
|
13571
13854
|
if (errors.length > 0) {
|
|
13572
|
-
const detailMsg = errors.slice(0, 3).map((err) => `${err.
|
|
13855
|
+
const detailMsg = errors.slice(0, 3).map((err) => `${err.field}: ${err.message}`).join("; ");
|
|
13573
13856
|
const details = {
|
|
13574
13857
|
captured: false,
|
|
13575
13858
|
callCount,
|
|
@@ -13583,7 +13866,7 @@ function createSubmitOutputTool(taskType, opts = {}) {
|
|
|
13583
13866
|
return {
|
|
13584
13867
|
content: [{
|
|
13585
13868
|
type: "text",
|
|
13586
|
-
text: `Output failed
|
|
13869
|
+
text: `Output failed validation: ${detailMsg}. Re-call this tool with a corrected output.`
|
|
13587
13870
|
}],
|
|
13588
13871
|
details,
|
|
13589
13872
|
isError: true
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@themoltnet/pi-extension",
|
|
3
|
-
"version": "0.11.0",
|
|
3
|
+
"version": "0.12.0",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "MoltNet pi extension — sandboxed tool execution in Gondolin VMs with MoltNet identity and persistent memory",
|
|
6
6
|
"license": "MIT",
|
|
@@ -31,8 +31,8 @@
|
|
|
31
31
|
"@earendil-works/gondolin": "^0.7.0",
|
|
32
32
|
"@opentelemetry/api": "^1.9.0",
|
|
33
33
|
"@sinclair/typebox": "^0.34.0",
|
|
34
|
-
"@themoltnet/
|
|
35
|
-
"@themoltnet/
|
|
34
|
+
"@themoltnet/agent-runtime": "0.9.0",
|
|
35
|
+
"@themoltnet/sdk": "0.98.0"
|
|
36
36
|
},
|
|
37
37
|
"peerDependencies": {
|
|
38
38
|
"@mariozechner/pi-coding-agent": ">=0.73.0",
|
|
@@ -61,10 +61,25 @@
|
|
|
61
61
|
"engines": {
|
|
62
62
|
"node": ">=22"
|
|
63
63
|
},
|
|
64
|
+
"nx": {
|
|
65
|
+
"tags": [
|
|
66
|
+
"type:runtime",
|
|
67
|
+
"scope:agent",
|
|
68
|
+
"platform:extension"
|
|
69
|
+
],
|
|
70
|
+
"targets": {
|
|
71
|
+
"test-ci": {
|
|
72
|
+
"executor": "nx:noop",
|
|
73
|
+
"dependsOn": [
|
|
74
|
+
"test"
|
|
75
|
+
],
|
|
76
|
+
"metadata": {
|
|
77
|
+
"description": "Alias for `test` on projects without atomization."
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
},
|
|
64
82
|
"scripts": {
|
|
65
|
-
"
|
|
66
|
-
"check:pack": "tsx ../../tools/src/check-pack.ts --package .",
|
|
67
|
-
"build": "vite build",
|
|
68
|
-
"test": "vitest run --passWithNoTests"
|
|
83
|
+
"check:pack": "tsx ../../tools/src/check-pack.ts --package ."
|
|
69
84
|
}
|
|
70
85
|
}
|