@badliveware/pi-tool-feedback 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/README.md +9 -2
- package/index.ts +8 -3
- package/package.json +4 -3
- package/src/core.ts +14 -16
- package/src/per-tool.ts +52 -0
- package/src/prompts.ts +16 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.2.0
|
|
4
|
+
|
|
5
|
+
- Add per-tool feedback responses so agents can report different experiences for different watched tools in the same turn.
|
|
6
|
+
- Record follow-up tool categories after watched calls so feedback logs can distinguish routine verification from compensatory follow-up work.
|
|
7
|
+
- Update package imports and peer dependencies for the `@earendil-works/*` Pi packages.
|
|
8
|
+
|
|
9
|
+
## 0.1.1
|
|
10
|
+
|
|
11
|
+
- Clarified dogfood feedback prompts so agents call `tool_feedback` silently and avoid acknowledging the feedback request to the user.
|
|
12
|
+
|
|
13
|
+
## 0.1.0
|
|
14
|
+
|
|
15
|
+
- Initial public package release for watched-tool feedback prompts and passive turn summaries.
|
|
16
|
+
- Added custom feedback fields so projects can ask for domain-specific ratings such as ranking quality or latency acceptability.
|
|
17
|
+
- Delivered active feedback requests as Pi custom messages instead of user messages.
|
|
18
|
+
- Added prompt wording and README guidance that frame agent self-feedback as noisy subjective signal rather than ground truth.
|
package/README.md
CHANGED
|
@@ -86,7 +86,7 @@ Other options:
|
|
|
86
86
|
|
|
87
87
|
### Extra feedback fields
|
|
88
88
|
|
|
89
|
-
The built-in feedback schema stays stable, but you can add project- or user-specific fields. The active prompt lists these fields and agents answer them inside `fieldResponses`.
|
|
89
|
+
The built-in feedback schema stays stable, but you can add project- or user-specific fields. The active prompt lists these fields and agents answer them inside `fieldResponses`. Set `required: true` on any field that agents must answer in every feedback response; invalid or missing required answers are logged in `fieldResponseErrors`.
|
|
90
90
|
|
|
91
91
|
```json
|
|
92
92
|
{
|
|
@@ -135,11 +135,12 @@ Read-only state/config inspection. Use it to see the loaded mode, watch rules, c
|
|
|
135
135
|
|
|
136
136
|
### `tool_feedback`
|
|
137
137
|
|
|
138
|
-
Records one structured feedback entry. Typical agent response after a feedback prompt:
|
|
138
|
+
Records one structured feedback entry. When several watched tools were used and the experience differed by tool, agents can set `primaryWatchedTool` and optional `perToolResponses` keyed by tool name. Typical agent response after a feedback prompt:
|
|
139
139
|
|
|
140
140
|
```json
|
|
141
141
|
{
|
|
142
142
|
"watchedTools": ["code_intel_impact_map"],
|
|
143
|
+
"primaryWatchedTool": "code_intel_impact_map",
|
|
143
144
|
"perceivedUsefulness": "medium",
|
|
144
145
|
"wouldUseAgainSameSituation": "yes",
|
|
145
146
|
"followupWasRoutine": "yes",
|
|
@@ -151,6 +152,12 @@ Records one structured feedback entry. Typical agent response after a feedback p
|
|
|
151
152
|
"improvement": "better_summary",
|
|
152
153
|
"fieldResponses": {
|
|
153
154
|
"rankingQuality": "mixed"
|
|
155
|
+
},
|
|
156
|
+
"perToolResponses": {
|
|
157
|
+
"code_intel_impact_map": {
|
|
158
|
+
"outputSeemedTooNoisy": "yes",
|
|
159
|
+
"fieldResponses": { "rankingQuality": "mixed" }
|
|
160
|
+
}
|
|
154
161
|
}
|
|
155
162
|
}
|
|
156
163
|
```
|
package/index.ts
CHANGED
|
@@ -1,12 +1,13 @@
|
|
|
1
|
-
import { Type } from "@
|
|
2
|
-
import type { ExtensionAPI, ExtensionContext } from "@
|
|
3
|
-
import { Text } from "@
|
|
1
|
+
import { Type } from "@earendil-works/pi-ai";
|
|
2
|
+
import type { ExtensionAPI, ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
3
|
+
import { Text } from "@earendil-works/pi-tui";
|
|
4
4
|
import {
|
|
5
5
|
appendLog,
|
|
6
6
|
categoryForTool,
|
|
7
7
|
feedbackLogPath,
|
|
8
8
|
feedbackPrompt,
|
|
9
9
|
feedbackRecord,
|
|
10
|
+
invocationIdFor,
|
|
10
11
|
isRecord,
|
|
11
12
|
loadToolFeedbackConfig,
|
|
12
13
|
logSafeFeedbackRecord,
|
|
@@ -17,6 +18,7 @@ import {
|
|
|
17
18
|
resultErrorKind,
|
|
18
19
|
resultOk,
|
|
19
20
|
resultTruncated,
|
|
21
|
+
sessionIdFromContext,
|
|
20
22
|
stringValue,
|
|
21
23
|
unique,
|
|
22
24
|
type AgentUsage,
|
|
@@ -131,6 +133,8 @@ export default function toolFeedback(pi: ExtensionAPI): void {
|
|
|
131
133
|
description: "Record concise structured feedback after using watched tools. This stores feedback only; it does not change the watched tool.",
|
|
132
134
|
parameters: Type.Object({
|
|
133
135
|
watchedTools: Type.Array(Type.String(), { description: "Watched tool names this feedback covers." }),
|
|
136
|
+
primaryWatchedTool: Type.Optional(Type.String({ description: "Primary watched tool this feedback mostly concerns when multiple tools were used." })),
|
|
137
|
+
perToolResponses: Type.Optional(Type.Record(Type.String(), Type.Unknown(), { description: "Optional per-tool structured responses when multiple watched tools differed materially. Keys are watched tool names; values may include base feedback fields and fieldResponses." })),
|
|
134
138
|
perceivedUsefulness: Type.Union([Type.Literal("high"), Type.Literal("medium"), Type.Literal("low"), Type.Literal("none"), Type.Literal("unknown")], { description: "How useful the tool felt for this task. Allowed values are: `high`, `medium`, `low`, `none`, `unknown`." }),
|
|
135
139
|
wouldUseAgainSameSituation: Type.Union([Type.Literal("yes"), Type.Literal("no"), Type.Literal("unsure"), Type.Literal("unknown")], { description: "Whether you would use the same tool again for a similar situation. Allowed values are: `yes`, `no`, `unsure`, `unknown`." }),
|
|
136
140
|
followupWasRoutine: Type.Optional(Type.Union([Type.Literal("yes"), Type.Literal("no"), Type.Literal("unknown")], { description: "Whether follow-up work felt routine rather than caused by tool insufficiency. Allowed values are: `yes`, `no`, `unknown`." })),
|
|
@@ -196,6 +200,7 @@ export default function toolFeedback(pi: ExtensionAPI): void {
|
|
|
196
200
|
const watched: WatchedToolCall & { startedAt: number } = {
|
|
197
201
|
toolName,
|
|
198
202
|
toolCallId,
|
|
203
|
+
invocationId: invocationIdFor(sessionIdFromContext(ctx), toolCallId),
|
|
199
204
|
category,
|
|
200
205
|
confirmReferences: inputRecord ? stringValue(inputRecord.confirmReferences) : undefined,
|
|
201
206
|
turnIndex: turn.turnIndex,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@badliveware/pi-tool-feedback",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Generic watched-tool feedback prompts and passive summaries for Pi.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"keywords": [
|
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
},
|
|
26
26
|
"files": [
|
|
27
27
|
"README.md",
|
|
28
|
+
"CHANGELOG.md",
|
|
28
29
|
"LICENSE",
|
|
29
30
|
"index.ts",
|
|
30
31
|
"src",
|
|
@@ -36,8 +37,8 @@
|
|
|
36
37
|
]
|
|
37
38
|
},
|
|
38
39
|
"peerDependencies": {
|
|
39
|
-
"@
|
|
40
|
-
"@
|
|
40
|
+
"@earendil-works/pi-ai": "*",
|
|
41
|
+
"@earendil-works/pi-coding-agent": "*"
|
|
41
42
|
},
|
|
42
43
|
"engines": {
|
|
43
44
|
"node": ">=20"
|
package/src/core.ts
CHANGED
|
@@ -2,7 +2,9 @@ import * as crypto from "node:crypto";
|
|
|
2
2
|
import * as fs from "node:fs";
|
|
3
3
|
import * as os from "node:os";
|
|
4
4
|
import * as path from "node:path";
|
|
5
|
-
import type { ExtensionContext } from "@
|
|
5
|
+
import type { ExtensionContext } from "@earendil-works/pi-coding-agent";
|
|
6
|
+
import { sanitizePerToolResponses } from "./per-tool.ts";
|
|
7
|
+
import { BASE_FIELD_PROMPT, DEFAULT_TASK_PROMPT } from "./prompts.ts";
|
|
6
8
|
|
|
7
9
|
export type FeedbackMode = "off" | "passive" | "ask-agent" | "both";
|
|
8
10
|
export type PerceivedUsefulness = "high" | "medium" | "low" | "none" | "unknown";
|
|
@@ -53,6 +55,7 @@ export interface LoadedConfig {
|
|
|
53
55
|
export interface WatchedToolCall {
|
|
54
56
|
toolName: string;
|
|
55
57
|
toolCallId: string;
|
|
58
|
+
invocationId: string;
|
|
56
59
|
category: string;
|
|
57
60
|
confirmReferences?: string;
|
|
58
61
|
turnIndex: number;
|
|
@@ -100,6 +103,7 @@ export interface TurnSummary {
|
|
|
100
103
|
confirmReferences: string[];
|
|
101
104
|
toolCategories: string[];
|
|
102
105
|
categoriesAfterFirstWatchedCall: string[];
|
|
106
|
+
watchedResults: Array<Record<string, unknown>>;
|
|
103
107
|
}
|
|
104
108
|
|
|
105
109
|
export interface FeedbackRecord {
|
|
@@ -109,6 +113,8 @@ export interface FeedbackRecord {
|
|
|
109
113
|
sessionId: string;
|
|
110
114
|
repoRoot: string;
|
|
111
115
|
watchedTools: string[];
|
|
116
|
+
primaryWatchedTool?: string;
|
|
117
|
+
perToolResponses?: Record<string, Record<string, unknown>>;
|
|
112
118
|
perceivedUsefulness: PerceivedUsefulness;
|
|
113
119
|
wouldUseAgainSameSituation: WouldUseAgain;
|
|
114
120
|
followupWasRoutine?: YesNoUnknown;
|
|
@@ -126,21 +132,6 @@ export interface FeedbackRecord {
|
|
|
126
132
|
}
|
|
127
133
|
|
|
128
134
|
const CONFIG_FILE_NAME = "tool-feedback.json";
|
|
129
|
-
const DEFAULT_TASK_PROMPT = [
|
|
130
|
-
"You used watched tools in the previous prompt. Please call `tool_feedback` once with concise structured feedback.",
|
|
131
|
-
"Focus on your own experience using the tool: whether it seemed useful, whether it felt incomplete or noisy, whether follow-up work was routine or compensatory, whether you would use it again in the same situation, and what one improvement would help most.",
|
|
132
|
-
"This is a dogfood feedback request, not new implementation work.",
|
|
133
|
-
].join("\n\n");
|
|
134
|
-
|
|
135
|
-
const BASE_FIELD_PROMPT = [
|
|
136
|
-
"Base `tool_feedback` field values:",
|
|
137
|
-
"- perceivedUsefulness: `high`, `medium`, `low`, `none`, or `unknown`",
|
|
138
|
-
"- wouldUseAgainSameSituation: `yes`, `no`, `unsure`, or `unknown`",
|
|
139
|
-
"- followupWasRoutine, followupNeededBecauseToolWasInsufficient, outputSeemedTooNoisy, outputSeemedIncomplete, missedImportantContext: `yes`, `no`, or `unknown`",
|
|
140
|
-
"- confidence: `high`, `medium`, or `low`",
|
|
141
|
-
"- improvement (optional): `better_ranking`, `higher_cap`, `better_summary`, `better_docs`, `less_noise`, `faster`, or `other`",
|
|
142
|
-
"Use `fieldResponses` only for configured extra fields. You do not need to inspect extension source to answer this prompt.",
|
|
143
|
-
].join("\n");
|
|
144
135
|
|
|
145
136
|
export const DEFAULT_CONFIG: ToolFeedbackConfig = {
|
|
146
137
|
mode: "passive",
|
|
@@ -320,6 +311,10 @@ function shortHash(value: string): string {
|
|
|
320
311
|
return crypto.createHash("sha256").update(value).digest("hex").slice(0, 16);
|
|
321
312
|
}
|
|
322
313
|
|
|
314
|
+
/**
 * Derives a stable identifier for a single watched-tool invocation.
 *
 * The session id and tool call id are joined with a NUL separator and passed
 * through `shortHash`, so the same (session, call) pair always maps to the
 * same id.
 *
 * @param sessionId - Identifier of the session the call belongs to.
 * @param toolCallId - Identifier of the tool call within that session.
 * @returns The short hash of the combined key.
 */
export function invocationIdFor(sessionId: string, toolCallId: string): string {
  const compositeKey = sessionId + "\0" + toolCallId;
  return shortHash(compositeKey);
}
|
|
317
|
+
|
|
323
318
|
/**
 * De-duplicates a list of optional strings.
 *
 * `undefined` entries and empty strings are dropped; the remaining values are
 * returned in first-seen order.
 *
 * @param items - Candidate values, possibly containing `undefined`.
 * @returns The distinct non-empty strings.
 */
export function unique(items: Array<string | undefined>): string[] {
  const seen = new Set<string>();
  for (const candidate of items) {
    if (typeof candidate !== "string") continue;
    if (candidate.length === 0) continue;
    seen.add(candidate);
  }
  return Array.from(seen);
}
|
|
@@ -391,6 +386,7 @@ export function makeTurnSummary(turn: TurnUsage, ctx: ExtensionContext): TurnSum
|
|
|
391
386
|
confirmReferences: unique(turn.watchedCalls.map((call) => call.confirmReferences)),
|
|
392
387
|
toolCategories: unique(turn.toolCalls.map((call) => call.category)),
|
|
393
388
|
categoriesAfterFirstWatchedCall: unique(turn.toolCalls.filter((call) => call.sequence > firstWatchedSequence).map((call) => call.category)),
|
|
389
|
+
watchedResults: turn.watchedResults.map((result) => ({ toolName: result.toolName, invocationId: result.invocationId, ok: result.ok, isError: result.isError, truncated: result.truncated, errorKind: result.errorKind, durationMs: result.durationMs })),
|
|
394
390
|
};
|
|
395
391
|
}
|
|
396
392
|
|
|
@@ -488,6 +484,8 @@ export function feedbackRecord(input: Record<string, unknown>, ctx: ExtensionCon
|
|
|
488
484
|
sessionId: sessionIdFromContext(ctx),
|
|
489
485
|
repoRoot: ctx.cwd,
|
|
490
486
|
watchedTools: normalizeStringArray(input.watchedTools),
|
|
487
|
+
primaryWatchedTool: stringValue(input.primaryWatchedTool),
|
|
488
|
+
perToolResponses: sanitizePerToolResponses(input.perToolResponses),
|
|
491
489
|
perceivedUsefulness: perceivedUsefulness(input.perceivedUsefulness),
|
|
492
490
|
wouldUseAgainSameSituation: wouldUseAgain(input.wouldUseAgainSameSituation),
|
|
493
491
|
followupWasRoutine: yesNoUnknown(input.followupWasRoutine),
|
package/src/per-tool.ts
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
type FeedbackFieldValue = string | number | boolean;
|
|
2
|
+
|
|
3
|
+
/**
 * Built-in feedback field names that may appear at the top level of a
 * per-tool response. Membership is checked by name only.
 */
const allowedBaseFields: Set<string> = new Set<string>([
  // Overall impression
  "perceivedUsefulness",
  "wouldUseAgainSameSituation",
  // Follow-up work
  "followupWasRoutine",
  "followupNeededBecauseToolWasInsufficient",
  // Output quality
  "outputSeemedTooNoisy",
  "outputSeemedIncomplete",
  "missedImportantContext",
  // Meta
  "confidence",
  "improvement",
]);
|
|
14
|
+
|
|
15
|
+
/**
 * Type guard for plain key/value containers.
 *
 * Arrays are rejected even though they are objects at runtime: iterating an
 * array with `Object.entries` yields index keys ("0", "1", …), which a caller
 * treating the value as a name-keyed record would mistake for real names.
 *
 * @param value - Arbitrary, untrusted input.
 * @returns `true` when `value` is a non-null, non-array object.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}
|
|
18
|
+
|
|
19
|
+
/**
 * Passes a value through only if it is a string, a boolean, or a finite number.
 *
 * @param value - Arbitrary, untrusted input.
 * @returns The value unchanged when it is an accepted scalar; `undefined` for
 *   everything else (objects, arrays, `null`, `NaN`, infinities, …).
 */
function scalar(value: unknown): FeedbackFieldValue | undefined {
  switch (typeof value) {
    case "string":
    case "boolean":
      return value;
    case "number":
      return Number.isFinite(value) ? value : undefined;
    default:
      return undefined;
  }
}
|
|
24
|
+
|
|
25
|
+
/**
 * Reduces an untrusted `fieldResponses` value to its scalar entries.
 *
 * Entries whose value `scalar` rejects are dropped; keys are kept as given.
 *
 * @param value - Arbitrary input expected to be a name → answer map.
 * @returns The surviving entries, or `undefined` when the input is not a
 *   record or nothing survives.
 */
function sanitizeFieldResponses(value: unknown): Record<string, FeedbackFieldValue> | undefined {
  if (!isRecord(value)) return undefined;

  const kept: Record<string, FeedbackFieldValue> = {};
  for (const key of Object.keys(value)) {
    const cleaned = scalar(value[key]);
    if (cleaned === undefined) continue;
    kept[key] = cleaned;
  }

  if (Object.keys(kept).length === 0) return undefined;
  return kept;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Sanitizes an untrusted `perToolResponses` payload.
 *
 * For each entry whose (trimmed) key is non-empty and whose value is a record,
 * keeps only:
 *   - fields named in `allowedBaseFields` whose value passes `scalar`, and
 *   - a `fieldResponses` map as cleaned by `sanitizeFieldResponses`.
 * Entries that end up empty are omitted. Kept entries are stored under the
 * trimmed key.
 *
 * @param input - Arbitrary input expected to be a tool name → response map.
 * @returns The sanitized map, or `undefined` when the input is not a record
 *   or no entry survives.
 */
export function sanitizePerToolResponses(input: unknown): Record<string, Record<string, unknown>> | undefined {
  if (!isRecord(input)) return undefined;

  const byTool: Record<string, Record<string, unknown>> = {};
  for (const [rawKey, candidate] of Object.entries(input)) {
    const key = rawKey.trim();
    if (key.length === 0) continue;
    if (!isRecord(candidate)) continue;

    // Copy across only the recognised base fields that hold scalar answers.
    const cleaned: Record<string, unknown> = {};
    for (const [field, rawValue] of Object.entries(candidate)) {
      if (!allowedBaseFields.has(field)) continue;
      const kept = scalar(rawValue);
      if (kept === undefined) continue;
      cleaned[field] = kept;
    }

    // Nested custom-field answers are sanitized on their own.
    const extras = sanitizeFieldResponses(candidate.fieldResponses);
    if (extras !== undefined) cleaned.fieldResponses = extras;

    if (Object.keys(cleaned).length === 0) continue;
    byTool[key] = cleaned;
  }

  if (Object.keys(byTool).length === 0) return undefined;
  return byTool;
}
|
package/src/prompts.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// Paragraphs of the default feedback request, in the order they are shown.
const DEFAULT_TASK_PROMPT_PARAGRAPHS: readonly string[] = [
  // What is being asked.
  "You used watched tools in the previous prompt. Please call `tool_feedback` once with concise structured feedback.",
  // What the feedback should cover, including the per-tool option.
  "Focus on your own experience using the tool: whether it seemed useful, whether it felt incomplete or noisy, whether follow-up work was routine or compensatory, whether you would use it again in the same situation, and what one improvement would help most. When multiple watched tools differed materially, include `primaryWatchedTool` and optional `perToolResponses` keyed by tool name.",
  // Scope of the request.
  "This is a dogfood feedback request, not new implementation work.",
  // Keep the request invisible to the user.
  "Do not acknowledge or summarize this request to the user. Call `tool_feedback` silently and continue or stop if you were done.",
];

/** Default prompt text asking for `tool_feedback`; paragraphs are separated by a blank line. */
export const DEFAULT_TASK_PROMPT = DEFAULT_TASK_PROMPT_PARAGRAPHS.join("\n\n");
|
|
7
|
+
|
|
8
|
+
// One line per built-in field (or group of fields sharing the same values).
const BASE_FIELD_PROMPT_LINES: readonly string[] = [
  "Base `tool_feedback` field values:",
  "- perceivedUsefulness: `high`, `medium`, `low`, `none`, or `unknown`",
  "- wouldUseAgainSameSituation: `yes`, `no`, `unsure`, or `unknown`",
  "- followupWasRoutine, followupNeededBecauseToolWasInsufficient, outputSeemedTooNoisy, outputSeemedIncomplete, missedImportantContext: `yes`, `no`, or `unknown`",
  "- confidence: `high`, `medium`, or `low`",
  "- improvement (optional): `better_ranking`, `higher_cap`, `better_summary`, `better_docs`, `less_noise`, `faster`, or `other`",
  // Closing guidance about configured extra fields.
  "Use `fieldResponses` only for configured extra fields. Include all required configured fields, using `unknown` when that is the most honest answer. You do not need to inspect extension source to answer this prompt.",
];

/** Reference text listing the allowed values for the built-in feedback fields, one entry per line. */
export const BASE_FIELD_PROMPT = BASE_FIELD_PROMPT_LINES.join("\n");
|