@wa008/ui-audit-mcp 2.2.2 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -5
- package/dist/src/evaluation/checklist.js +67 -67
- package/dist/src/evaluation/checklist.js.map +1 -1
- package/dist/src/evaluation/state-machine.d.ts +43 -0
- package/dist/src/evaluation/state-machine.d.ts.map +1 -0
- package/dist/src/evaluation/state-machine.js +79 -0
- package/dist/src/evaluation/state-machine.js.map +1 -0
- package/dist/src/index.d.ts +6 -6
- package/dist/src/index.js +12 -15
- package/dist/src/index.js.map +1 -1
- package/dist/src/logger/audit-log.d.ts.map +1 -1
- package/dist/src/logger/audit-log.js +2 -0
- package/dist/src/logger/audit-log.js.map +1 -1
- package/dist/src/tools/evaluate.d.ts +41 -0
- package/dist/src/tools/evaluate.d.ts.map +1 -0
- package/dist/src/tools/evaluate.js +87 -0
- package/dist/src/tools/evaluate.js.map +1 -0
- package/dist/src/tools/get-audit-status.js +19 -19
- package/dist/src/tools/get-audit-status.js.map +1 -1
- package/dist/src/tools/swipe.js +1 -1
- package/dist/src/tools/swipe.js.map +1 -1
- package/dist/src/tools/take-screenshot.js +1 -1
- package/dist/src/tools/take-screenshot.js.map +1 -1
- package/dist/src/tools/tap.js +1 -1
- package/dist/src/tools/tap.js.map +1 -1
- package/dist/src/types.d.ts +2 -0
- package/dist/src/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/dist/src/tools/get-evaluation-criteria.d.ts +0 -15
- package/dist/src/tools/get-evaluation-criteria.d.ts.map +0 -1
- package/dist/src/tools/get-evaluation-criteria.js +0 -29
- package/dist/src/tools/get-evaluation-criteria.js.map +0 -1
- package/dist/src/tools/submit-dimension-score.d.ts +0 -27
- package/dist/src/tools/submit-dimension-score.d.ts.map +0 -1
- package/dist/src/tools/submit-dimension-score.js +0 -35
- package/dist/src/tools/submit-dimension-score.js.map +0 -1
package/README.md
CHANGED
|
@@ -54,16 +54,15 @@ npm install
|
|
|
54
54
|
| `launch_app` | Launch app by Bundle ID |
|
|
55
55
|
| `take_screenshot` | Capture screen & track evaluation step |
|
|
56
56
|
| `tap` / `swipe` | Interact with UI (ratio coordinates), screenshot, and track step |
|
|
57
|
-
| `
|
|
58
|
-
| `submit_dimension_score` | Submit score (1-10) for a single dimension on a tracked step |
|
|
57
|
+
| `evaluate` | Unified evaluation: get dimension prompt (initial) or submit score + get next (submit mode) |
|
|
59
58
|
| `get_audit_status` | View missing evaluations dashboard or final markdown report |
|
|
60
59
|
|
|
61
60
|
## Typical Workflow
|
|
62
61
|
1. `launch_app("com.example.app")`
|
|
63
62
|
2. `take_screenshot("MyTestCase", 1, "Verify Initial Screen")` or `tap(0.5, 0.5, "MyTestCase", 2, "Click Login")`
|
|
64
|
-
3. `
|
|
65
|
-
4. Agent analyzes UI → `
|
|
66
|
-
5. Repeat
|
|
63
|
+
3. `evaluate("MyTestCase", 1)` → Returns the first dimension prompt + `evaluationToken`
|
|
64
|
+
4. Agent analyzes UI → `evaluate("MyTestCase", 1, token, 9, "No overlap")` → Records score, returns next dimension prompt + new token
|
|
65
|
+
5. Repeat step 4 until all 5 dimensions are evaluated
|
|
67
66
|
6. `get_audit_status(["MyTestCase"])` → Output the full markdown report
|
|
68
67
|
|
|
69
68
|
## Data
|
|
@@ -3,98 +3,98 @@ export const DIMENSIONS = [
|
|
|
3
3
|
{
|
|
4
4
|
id: "overlap",
|
|
5
5
|
name: "Element Overlap & Safe Areas",
|
|
6
|
-
description: "You are a user
|
|
7
|
-
"Evaluate the screen from the perspective of a real human user trying to read and tap efficiently.\n" +
|
|
6
|
+
description: "You are evaluating this screen as a real user trying to read content and tap buttons efficiently.\n" +
|
|
8
7
|
"Assume there ARE overlap issues until you have visually scanned the entire screen.\n\n" +
|
|
9
8
|
"CHECK FOR VISUAL INTERFERENCE & COGNITIVE NOISE:\n" +
|
|
10
|
-
"1. SYSTEM EDGE INTERFERENCE (CRITICAL): Scan the very top, bottom, and corners. Do any app elements (
|
|
11
|
-
"2. INTERACTIVE CROWDING: Are
|
|
12
|
-
"3.
|
|
13
|
-
"4. MODAL/POPUP OBSTRUCTION: If a popup or
|
|
14
|
-
"SCORING ATTITUDE: Focus on
|
|
15
|
-
"NOTE: Do NOT evaluate
|
|
16
|
-
scoringGuide: "0-2: Severe visual
|
|
17
|
-
"3-4: Elements don't
|
|
18
|
-
"5-6: All elements are separate, but touch targets
|
|
19
|
-
"7-8: Clean separation. No visual interference.
|
|
20
|
-
"9-10: Perfect
|
|
9
|
+
"1. SYSTEM EDGE INTERFERENCE (CRITICAL): Scan the very top, bottom, and corners of the screen. Do any app elements (buttons, icons, avatars, titles) visually touch, overlap with, or sit uncomfortably close to system indicators (time, battery, signal, notch/Dynamic Island, home indicator bar)? If a user might confuse an app element with a system element, or accidentally tap the wrong one, this is a CRITICAL BUG.\n" +
|
|
10
|
+
"2. INTERACTIVE CROWDING: Are any two tappable elements (buttons, links, cards) so close together that a user with a normal-sized finger might accidentally tap the wrong one? Are touch targets dangerously close to screen edges where system gestures occur?\n" +
|
|
11
|
+
"3. CONTENT COLLISION: Do any visible elements (text, images, floating buttons, badges) visually overlap each other, making any of them harder to read, recognize, or tap? Two elements occupying the same visual space is always a bug.\n" +
|
|
12
|
+
"4. MODAL/POPUP OBSTRUCTION: If a popup, sheet, or overlay is present, does it cover essential information that the user still needs to see, such as the content they are acting upon?\n\n" +
|
|
13
|
+
"SCORING ATTITUDE: Focus on what a real user would EXPERIENCE. If something 'feels' too close or visually confusing, it is an overlap failure. When in doubt, score LOW.\n" +
|
|
14
|
+
"NOTE: Do NOT evaluate spacing consistency (layout), color choices (style), or text meaning (info_clarity).",
|
|
15
|
+
scoringGuide: "0-2: Severe visual collision. Elements clearly overlap each other or invade system areas (e.g., a button sitting on top of the status bar), causing immediate confusion or making the interface unusable.\n" +
|
|
16
|
+
"3-4: Elements don't directly collide, but are awkwardly close to each other or to system edges, creating noticeable visual noise or high accidental-tap risk.\n" +
|
|
17
|
+
"5-6: All elements are visually separate, but some touch targets feel uncomfortably close. A careful user would be fine, but a hurried user might mis-tap.\n" +
|
|
18
|
+
"7-8: Clean separation throughout. No visual interference. The app clearly respects system boundaries and provides comfortable touch targets.\n" +
|
|
19
|
+
"9-10: Perfect separation — generous spacing between all interactive elements, completely unambiguous visual boundaries across the entire screen.",
|
|
21
20
|
},
|
|
22
21
|
{
|
|
23
22
|
id: "layout",
|
|
24
23
|
name: "Layout & Spatial Balance",
|
|
25
|
-
description: "You are
|
|
24
|
+
description: "You are evaluating this screen as a real user who expects a well-organized, visually balanced interface.\n" +
|
|
26
25
|
"Assume the layout has issues until you verify otherwise.\n\n" +
|
|
27
26
|
"CHECK EACH ASPECT:\n" +
|
|
28
|
-
"1. LETTERBOXING: Does the app appear to run inside a smaller box with black
|
|
29
|
-
"2. SCREEN UTILIZATION: Is there a large empty area that serves no purpose? Is content crammed into only
|
|
30
|
-
"3. ALIGNMENT: Pick any two similar elements (e.g., two cards, two labels). Are their
|
|
31
|
-
"4. SPACING: Are gaps between elements consistent?
|
|
32
|
-
"5. RESPONSIVE FIT: Does the layout feel
|
|
33
|
-
"
|
|
34
|
-
"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
"
|
|
27
|
+
"1. LETTERBOXING: Does the app appear to run inside a smaller box with black or empty bars on any side? This is score 0, no exceptions.\n" +
|
|
28
|
+
"2. SCREEN UTILIZATION: Is there a large empty area that serves no purpose? Is content crammed into only part of the screen while the rest is wasted? A user should feel the app was designed for this exact screen size.\n" +
|
|
29
|
+
"3. ALIGNMENT: Pick any two similar elements (e.g., two cards, two labels). Are their edges aligned? Are the gaps between them equal? Visible misalignment = deduction.\n" +
|
|
30
|
+
"4. SPACING CONSISTENCY: Are gaps between elements consistent? If the space between Card A and Card B looks visibly different from the space between Card B and Card C, that is a problem.\n" +
|
|
31
|
+
"5. RESPONSIVE FIT: Does the layout feel natural for this screen size, or does it look stretched, squished, or designed for a different device?\n" +
|
|
32
|
+
"6. CONTENT OVERFLOW: Does any content look like it has outgrown its container? Look for: text that wraps unexpectedly causing one card to be noticeably taller than its neighbor in a grid, content that appears cut off at a container edge, or scrollable areas that clip content in a way that confuses the user about whether more content exists.\n\n" +
|
|
33
|
+
"SCORING ATTITUDE: Do not be generous. A layout that 'looks fine at a glance' but has visible imperfections upon closer inspection is a 6, not an 8.\n" +
|
|
34
|
+
"NOTE: Do NOT evaluate element collision (that is overlap), color choices (that is style), or text meaning (that is info_clarity).",
|
|
35
|
+
scoringGuide: "0-2: Letterboxing detected, or layout is severely broken (elements piled on top of each other, massive unusable empty areas, content pushed off-screen).\n" +
|
|
36
|
+
"3-4: Major layout issues — clearly uneven spacing, significant misalignment, or large wasted areas that make the app feel unfinished.\n" +
|
|
37
|
+
"5-6: Layout is functional but has noticeable imperfections — some uneven spacing, minor alignment issues, or one area that feels out of balance.\n" +
|
|
38
38
|
"7-8: Well-structured layout with only very minor imperfections that require careful inspection to notice.\n" +
|
|
39
39
|
"9-10: Flawless — pixel-perfect alignment, perfectly consistent spacing, optimal screen utilization. Extremely rare.",
|
|
40
40
|
},
|
|
41
41
|
{
|
|
42
42
|
id: "info_clarity",
|
|
43
43
|
name: "Information Clarity & Readability",
|
|
44
|
-
description: "You are
|
|
45
|
-
"
|
|
44
|
+
description: "You are evaluating this screen as a FIRST-TIME user who has never seen this app before.\n" +
|
|
45
|
+
"Your goal is to determine how quickly and easily a new user could understand this screen.\n\n" +
|
|
46
46
|
"CHECK EACH ASPECT:\n" +
|
|
47
|
-
"1. FIRST IMPRESSION: Within 2 seconds of looking at this screen, can you tell what it does and what the main action is? If
|
|
48
|
-
"2. BUTTON
|
|
49
|
-
"3. HIERARCHY: Is the most important element (title,
|
|
50
|
-
"4. TEXT
|
|
51
|
-
"5. CONTRAST:
|
|
52
|
-
"6. ICON MEANING: Are there any icons without labels
|
|
53
|
-
"SCORING ATTITUDE: If any single element
|
|
54
|
-
"
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
"
|
|
58
|
-
"
|
|
59
|
-
"
|
|
60
|
-
"9-10: Crystal clear — perfect hierarchy, unambiguous labels, no truncation, excellent contrast. Every element self-explanatory.",
|
|
47
|
+
"1. FIRST IMPRESSION: Within 2 seconds of looking at this screen, can you tell what it does and what the main action is? If it takes effort to figure out the screen's purpose, that is a problem.\n" +
|
|
48
|
+
"2. BUTTON & LABEL CLARITY: Read every button, link, and label. Would a first-time user immediately understand what each one does? Vague labels like 'Go' or unlabeled icon-only buttons are problematic.\n" +
|
|
49
|
+
"3. VISUAL HIERARCHY: Is the most important element (page title, primary action button) visually the most prominent? Or do secondary elements compete for the user's attention?\n" +
|
|
50
|
+
"4. TEXT COMPLETENESS: Is any text cut off with '...' (truncation), overflowing its container, or so small that a user would struggle to read it? Even one truncated label that hides meaningful information = deduction.\n" +
|
|
51
|
+
"5. READABILITY & CONTRAST: Can all text be comfortably read against its background? Pay special attention to small text, light-colored text on light backgrounds, or text placed over images.\n" +
|
|
52
|
+
"6. ICON MEANING: Are there any icons without text labels whose meaning might be ambiguous to a new user? A trash can icon is universally understood; a custom abstract icon is not.\n\n" +
|
|
53
|
+
"SCORING ATTITUDE: If any single element would make a first-time user pause and wonder 'what does this do?', cap the score at 7. Multiple confusing elements = score 5 or below.\n" +
|
|
54
|
+
"NOTE: Do NOT evaluate spacing (layout), visual collision (overlap), or color harmony (style).",
|
|
55
|
+
scoringGuide: "0-2: The screen's purpose is unclear, the primary action is missing or hidden, or labels are severely misleading.\n" +
|
|
56
|
+
"3-4: The core purpose is guessable but requires effort. Multiple labels or icons are ambiguous.\n" +
|
|
57
|
+
"5-6: Generally understandable but 1-2 elements are confusing (ambiguous icon, truncated text hiding important info, weak visual hierarchy).\n" +
|
|
58
|
+
"7-8: Clear and well-organized with only one very minor issue. A first-time user would understand the screen immediately.\n" +
|
|
59
|
+
"9-10: Crystal clear — perfect hierarchy, every label self-explanatory, no truncation, excellent contrast. A user of any background would instantly understand this screen.",
|
|
61
60
|
},
|
|
62
61
|
{
|
|
63
62
|
id: "style",
|
|
64
63
|
name: "Visual Style & Consistency",
|
|
65
|
-
description: "You are
|
|
66
|
-
"
|
|
64
|
+
description: "You are evaluating this screen as a user who expects a polished, professionally designed app.\n" +
|
|
65
|
+
"Your job is to find visual inconsistencies and rendering defects that make the app look unfinished.\n\n" +
|
|
67
66
|
"CHECK EACH ASPECT:\n" +
|
|
68
|
-
"1. COLOR PALETTE: Count the number of distinct colors used. Do they feel intentional and harmonious, or random?\n" +
|
|
69
|
-
"2. TYPOGRAPHY: Are
|
|
70
|
-
"3. COMPONENT
|
|
71
|
-
"4. ICONOGRAPHY: Are all icons
|
|
72
|
-
"5. POLISH: Does
|
|
73
|
-
"
|
|
74
|
-
"
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
"
|
|
78
|
-
"
|
|
79
|
-
"
|
|
67
|
+
"1. COLOR PALETTE: Count the number of distinct colors used. Do they feel intentional and harmonious, or random and clashing?\n" +
|
|
68
|
+
"2. TYPOGRAPHY: Are font sizes used consistently? Are there mixed font weights that don't follow a clear hierarchy? More than 2-3 font sizes without a clear reason is a problem.\n" +
|
|
69
|
+
"3. COMPONENT CONSISTENCY: Compare all elements of the same type. Do all buttons have the same shape, size, and styling? Do all cards have the same shadows, borders, and corner radius? Any difference between similar components = deduction.\n" +
|
|
70
|
+
"4. ICONOGRAPHY: Are all icons visually consistent (all outline style OR all filled style, consistent stroke width and sizing)? Mixing styles within the same screen = deduction.\n" +
|
|
71
|
+
"5. POLISH & FINISH: Does the app look professionally designed, or does it feel like an unfinished prototype? Placeholder content (e.g., 'Lorem ipsum', default system icons used inappropriately) = major deduction.\n" +
|
|
72
|
+
"6. RENDERING INTEGRITY: Inspect each UI element individually for visual defects. Look for: a visible ring, halo, or unintended border of a different color around an element (e.g., a white circle visible behind a colored circular button — this means the element was not built cleanly), mismatched shapes within a single component, clipping artifacts that cut off part of an icon or image, or any visual glitch that makes a component look broken. Even 1-2 pixels of visible defect = deduction.\n\n" +
|
|
73
|
+
"SCORING ATTITUDE: Consistency is binary — if two buttons of the same type look different, they are inconsistent, period. A single rendering defect on any element caps the score at 7.\n" +
|
|
74
|
+
"NOTE: Do NOT evaluate text meaning (info_clarity), spatial arrangement (layout), or element collision (overlap).",
|
|
75
|
+
scoringGuide: "0-2: No design system — clashing colors, random fonts, inconsistent components everywhere, or multiple rendering defects.\n" +
|
|
76
|
+
"3-4: Some design intent visible but multiple clear inconsistencies (different button styles, mixed icon sets, visible rendering glitches).\n" +
|
|
77
|
+
"5-6: Basic consistency exists but with noticeable deviations or at least one obvious rendering defect.\n" +
|
|
78
|
+
"7-8: Strong consistency with only 1 minor deviation or subtle defect requiring careful inspection.\n" +
|
|
79
|
+
"9-10: Pixel-perfect design system adherence. Every element cohesive and cleanly rendered. Extremely rare.",
|
|
80
80
|
},
|
|
81
81
|
{
|
|
82
82
|
id: "action_result",
|
|
83
83
|
name: "Action Result Verification",
|
|
84
|
-
description: "You are
|
|
85
|
-
"The step's expectedOutcome describes what SHOULD have happened. Compare the screenshot against it.\n\n" +
|
|
84
|
+
description: "You are evaluating whether the action the user just performed actually produced the expected result.\n" +
|
|
85
|
+
"The step's expectedOutcome describes what SHOULD have happened. Compare the current screenshot against it.\n\n" +
|
|
86
86
|
"CHECK EACH ASPECT:\n" +
|
|
87
|
-
"1.
|
|
88
|
-
"2. STATE CHANGE: If the action should
|
|
89
|
-
"3. ERROR
|
|
90
|
-
"4. NO RESPONSE: Does the screen look identical to before the action? If
|
|
91
|
-
"SCORING ATTITUDE: If the expectedOutcome is not clearly achieved, score LOW. Partial success is still a problem.\n" +
|
|
92
|
-
"If this step is a pure observation
|
|
93
|
-
scoringGuide: "0-2: Action clearly failed — wrong screen, error dialog, crash, or no
|
|
94
|
-
"3-4: Action had some effect but the result does not match expectedOutcome (e.g., navigated to wrong tab).\n" +
|
|
95
|
-
"5-6: Action partially achieved the goal but with unexpected side effects
|
|
96
|
-
"7-8: Action succeeded — the expected screen
|
|
97
|
-
"9-10: Action fully succeeded — screenshot
|
|
87
|
+
"1. CORRECT DESTINATION: If the user tapped a navigation element, did the correct screen appear? Landing on the wrong screen = score 0.\n" +
|
|
88
|
+
"2. VISIBLE STATE CHANGE: If the action should have added, removed, toggled, or updated something, is that change clearly visible in the screenshot? The user should be able to confirm the action worked.\n" +
|
|
89
|
+
"3. ERROR INDICATORS: Are there any error dialogs, crash screens, blank white screens, or infinite loading spinners? These all indicate the action failed.\n" +
|
|
90
|
+
"4. NO VISIBLE RESPONSE: Does the screen look identical to before the action was performed? If nothing changed, the action likely failed or missed its target.\n\n" +
|
|
91
|
+
"SCORING ATTITUDE: If the expectedOutcome is not clearly achieved in the screenshot, score LOW. Partial success is still a problem — the user expects the action to fully work.\n" +
|
|
92
|
+
"If this step is a pure observation with no expectedOutcome specified, score 10.",
|
|
93
|
+
scoringGuide: "0-2: Action clearly failed — wrong screen, error dialog, crash, blank screen, or no visible response at all.\n" +
|
|
94
|
+
"3-4: Action had some effect but the result does not match the expectedOutcome (e.g., navigated to the wrong tab, wrong data displayed).\n" +
|
|
95
|
+
"5-6: Action partially achieved the goal but with unexpected side effects, missing expected elements, or incomplete state changes.\n" +
|
|
96
|
+
"7-8: Action succeeded — the expected screen or state is present but with minor discrepancies from the expectedOutcome.\n" +
|
|
97
|
+
"9-10: Action fully succeeded — the screenshot clearly and completely matches the expectedOutcome. Or this is a pure observation step with no expectedOutcome.",
|
|
98
98
|
},
|
|
99
99
|
];
|
|
100
100
|
export const REQUIRED_DIMS = DIMENSIONS.map(d => d.id);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"checklist.js","sourceRoot":"","sources":["../../../src/evaluation/checklist.ts"],"names":[],"mappings":"AAEA,+DAA+D;AAC/D,MAAM,CAAC,MAAM,UAAU,GAAoB;IACvC;QACI,EAAE,EAAE,SAAS;QACb,IAAI,EAAE,8BAA8B;QACpC,WAAW,EACP,
|
|
1
|
+
{"version":3,"file":"checklist.js","sourceRoot":"","sources":["../../../src/evaluation/checklist.ts"],"names":[],"mappings":"AAEA,+DAA+D;AAC/D,MAAM,CAAC,MAAM,UAAU,GAAoB;IACvC;QACI,EAAE,EAAE,SAAS;QACb,IAAI,EAAE,8BAA8B;QACpC,WAAW,EACP,qGAAqG;YACrG,wFAAwF;YACxF,oDAAoD;YACpD,iaAAia;YACja,kQAAkQ;YAClQ,2OAA2O;YAC3O,2LAA2L;YAC3L,2KAA2K;YAC3K,4GAA4G;QAChH,YAAY,EACR,6MAA6M;YAC7M,iKAAiK;YACjK,6JAA6J;YAC7J,gJAAgJ;YAChJ,kJAAkJ;KACzJ;IACD;QACI,EAAE,EAAE,QAAQ;QACZ,IAAI,EAAE,0BAA0B;QAChC,WAAW,EACP,4GAA4G;YAC5G,8DAA8D;YAC9D,sBAAsB;YACtB,0IAA0I;YAC1I,4NAA4N;YAC5N,0KAA0K;YAC1K,6LAA6L;YAC7L,kJAAkJ;YAClJ,4VAA4V;YAC5V,uJAAuJ;YACvJ,mIAAmI;QACvI,YAAY,EACR,4JAA4J;YAC5J,yIAAyI;YACzI,oJAAoJ;YACpJ,6GAA6G;YAC7G,qHAAqH;KAC5H;IACD;QACI,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,mCAAmC;QACzC,WAAW,EACP,2FAA2F;YAC3F,+FAA+F;YAC/F,sBAAsB;YACtB,qMAAqM;YACrM,4MAA4M;YAC5M,kLAAkL;YAClL,4NAA4N;YAC5N,iMAAiM;YACjM,yLAAyL;YACzL,mLAAmL;YACnL,+FAA+F;QACnG,YAAY,EACR,qHAAqH;YACrH,mGAAmG;YACnG,+IAA+I;YAC/I,4HAA4H;YAC5H,4KAA4K;KACnL;IACD;QACI,EAAE,EAAE,OAAO;QACX,IAAI,EAAE,4BAA4B;QAClC,WAAW,EACP,iGAAiG;YACjG,yGAAyG;YACzG,sBAAsB;YACtB,gIAAgI;YAChI,oLAAoL;YACpL,kPAAkP;YAClP,oLAAoL;YACpL,wNAAwN;YACxN,ifAAif;YACjf,0LAA0L;YAC1L,kHAAkH;QACtH,YAAY,EACR,6HAA6H;YAC7H,8IAA8I;YAC9I,0GAA0G;YAC1G,sGAAsG;YACtG,2GAA2G;KAClH;IACD;QACI,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,4BAA4B;QAClC,WAAW,EACP,wGAAwG;YACxG,gHAAgH;YAChH,sBAAsB;YACtB,0IAA0I;YAC1I,6MAA6M;YAC7M,6JAA6J;YAC7J,mKAAmK;YACnK,kLAAkL;YAClL,iFAAiF;QACrF,YAAY,EACR,gHAAgH;YAChH,2IAA2I;YAC3I,qIAAqI;YACrI,0HAA0H;YAC1H,+JAA+J;KACtK;CACJ,CAAC;AAEF,MAAM,CAAC,MAAM,aAAa,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;AACvD,MAAM,CAAC,MAAM,aAAa,GAAG,CAAC,CAAC"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
export declare function getPendingTaskState(caseName: string, stepIndex: number): {
|
|
2
|
+
completed: boolean;
|
|
3
|
+
message: string;
|
|
4
|
+
dimensionId?: undefined;
|
|
5
|
+
dimensionName?: undefined;
|
|
6
|
+
description?: undefined;
|
|
7
|
+
scoringGuide?: undefined;
|
|
8
|
+
token?: undefined;
|
|
9
|
+
stepIndex?: undefined;
|
|
10
|
+
} | {
|
|
11
|
+
completed: boolean;
|
|
12
|
+
dimensionId: string;
|
|
13
|
+
dimensionName: string;
|
|
14
|
+
description: string;
|
|
15
|
+
scoringGuide: string;
|
|
16
|
+
token: string;
|
|
17
|
+
stepIndex: number;
|
|
18
|
+
message?: undefined;
|
|
19
|
+
};
|
|
20
|
+
export declare function validateAndAdvanceState(caseName: string, stepIndex: number, token: string, score: number, reason: string): {
|
|
21
|
+
completedDimId: string;
|
|
22
|
+
score: number;
|
|
23
|
+
nextState: {
|
|
24
|
+
completed: boolean;
|
|
25
|
+
message: string;
|
|
26
|
+
dimensionId?: undefined;
|
|
27
|
+
dimensionName?: undefined;
|
|
28
|
+
description?: undefined;
|
|
29
|
+
scoringGuide?: undefined;
|
|
30
|
+
token?: undefined;
|
|
31
|
+
stepIndex?: undefined;
|
|
32
|
+
} | {
|
|
33
|
+
completed: boolean;
|
|
34
|
+
dimensionId: string;
|
|
35
|
+
dimensionName: string;
|
|
36
|
+
description: string;
|
|
37
|
+
scoringGuide: string;
|
|
38
|
+
token: string;
|
|
39
|
+
stepIndex: number;
|
|
40
|
+
message?: undefined;
|
|
41
|
+
};
|
|
42
|
+
};
|
|
43
|
+
//# sourceMappingURL=state-machine.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"state-machine.d.ts","sourceRoot":"","sources":["../../../src/evaluation/state-machine.ts"],"names":[],"mappings":"AAQA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM;;;;;;;;;;;;;;;;;;EAwCtE;AAED,wBAAgB,uBAAuB,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM;;;;;;;;;;;;;;;;;;;;;;EA4CxH"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import crypto from "crypto";
|
|
2
|
+
import { readLog, writeLog } from "../logger/audit-log.js";
|
|
3
|
+
import { DIMENSIONS, REQUIRED_DIMS } from "./checklist.js";
|
|
4
|
+
function generateToken() {
|
|
5
|
+
return "tok_" + crypto.randomBytes(4).toString("hex");
|
|
6
|
+
}
|
|
7
|
+
export function getPendingTaskState(caseName, stepIndex) {
|
|
8
|
+
const log = readLog(caseName);
|
|
9
|
+
if (!log) {
|
|
10
|
+
throw new Error(`Test case not found: ${caseName}`);
|
|
11
|
+
}
|
|
12
|
+
const step = log.steps[stepIndex];
|
|
13
|
+
if (!step) {
|
|
14
|
+
throw new Error(`Step ${stepIndex} not found in case ${caseName}`);
|
|
15
|
+
}
|
|
16
|
+
if (step.currentDimIndex >= REQUIRED_DIMS.length) {
|
|
17
|
+
return {
|
|
18
|
+
completed: true,
|
|
19
|
+
message: `All ${REQUIRED_DIMS.length} dimensions for Step ${stepIndex} are fully evaluated! You may now proceed to interact with the device further (e.g. tap, swipe) or call get_audit_status.`
|
|
20
|
+
};
|
|
21
|
+
}
|
|
22
|
+
// Generate a token if there isn't one
|
|
23
|
+
if (!step.evaluationToken) {
|
|
24
|
+
step.evaluationToken = generateToken();
|
|
25
|
+
writeLog(log);
|
|
26
|
+
}
|
|
27
|
+
const currentDimId = REQUIRED_DIMS[step.currentDimIndex];
|
|
28
|
+
const criteria = DIMENSIONS.find(d => d.id === currentDimId);
|
|
29
|
+
if (!criteria) {
|
|
30
|
+
throw new Error(`Dimension '${currentDimId}' not found.`);
|
|
31
|
+
}
|
|
32
|
+
return {
|
|
33
|
+
completed: false,
|
|
34
|
+
dimensionId: currentDimId,
|
|
35
|
+
dimensionName: criteria.name,
|
|
36
|
+
description: criteria.description,
|
|
37
|
+
scoringGuide: criteria.scoringGuide,
|
|
38
|
+
token: step.evaluationToken,
|
|
39
|
+
stepIndex: stepIndex
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
export function validateAndAdvanceState(caseName, stepIndex, token, score, reason) {
|
|
43
|
+
const log = readLog(caseName);
|
|
44
|
+
if (!log) {
|
|
45
|
+
throw new Error(`Test case not found: ${caseName}`);
|
|
46
|
+
}
|
|
47
|
+
const step = log.steps[stepIndex];
|
|
48
|
+
if (!step) {
|
|
49
|
+
throw new Error(`Step ${stepIndex} not found in case ${caseName}`);
|
|
50
|
+
}
|
|
51
|
+
if (step.currentDimIndex >= REQUIRED_DIMS.length) {
|
|
52
|
+
throw new Error(`Step ${stepIndex} is already fully evaluated.`);
|
|
53
|
+
}
|
|
54
|
+
if (!step.evaluationToken || step.evaluationToken !== token) {
|
|
55
|
+
throw new Error("Invalid token. Do not guess tokens or run multiple submissions at once.");
|
|
56
|
+
}
|
|
57
|
+
const currentDimId = REQUIRED_DIMS[step.currentDimIndex];
|
|
58
|
+
// Record the score
|
|
59
|
+
step.evaluations[currentDimId] = {
|
|
60
|
+
score,
|
|
61
|
+
reason,
|
|
62
|
+
};
|
|
63
|
+
// Advance the state
|
|
64
|
+
step.currentDimIndex += 1;
|
|
65
|
+
// Rotate the token: issue a fresh one while dimensions remain, otherwise clear it
|
|
66
|
+
if (step.currentDimIndex < REQUIRED_DIMS.length) {
|
|
67
|
+
step.evaluationToken = generateToken();
|
|
68
|
+
}
|
|
69
|
+
else {
|
|
70
|
+
step.evaluationToken = undefined;
|
|
71
|
+
}
|
|
72
|
+
writeLog(log);
|
|
73
|
+
return {
|
|
74
|
+
completedDimId: currentDimId,
|
|
75
|
+
score,
|
|
76
|
+
nextState: getPendingTaskState(caseName, stepIndex)
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
//# sourceMappingURL=state-machine.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"state-machine.js","sourceRoot":"","sources":["../../../src/evaluation/state-machine.ts"],"names":[],"mappings":"AAAA,OAAO,MAAM,MAAM,QAAQ,CAAC;AAC5B,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,wBAAwB,CAAC;AAC3D,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAE3D,SAAS,aAAa;IAClB,OAAO,MAAM,GAAG,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC;AAC1D,CAAC;AAED,MAAM,UAAU,mBAAmB,CAAC,QAAgB,EAAE,SAAiB;IACnE,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC9B,IAAI,CAAC,GAAG,EAAE,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,wBAAwB,QAAQ,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAClC,IAAI,CAAC,IAAI,EAAE,CAAC;QACR,MAAM,IAAI,KAAK,CAAC,QAAQ,SAAS,sBAAsB,QAAQ,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,IAAI,IAAI,CAAC,eAAe,IAAI,aAAa,CAAC,MAAM,EAAE,CAAC;QAC/C,OAAO;YACH,SAAS,EAAE,IAAI;YACf,OAAO,EAAE,OAAO,aAAa,CAAC,MAAM,wBAAwB,SAAS,2HAA2H;SACnM,CAAC;IACN,CAAC;IAED,sCAAsC;IACtC,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;QACxB,IAAI,CAAC,eAAe,GAAG,aAAa,EAAE,CAAC;QACvC,QAAQ,CAAC,GAAG,CAAC,CAAC;IAClB,CAAC;IAED,MAAM,YAAY,GAAG,aAAa,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IACzD,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,YAAY,CAAC,CAAC;IAE7D,IAAI,CAAC,QAAQ,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,cAAc,YAAY,cAAc,CAAC,CAAC;IAC9D,CAAC;IAED,OAAO;QACH,SAAS,EAAE,KAAK;QAChB,WAAW,EAAE,YAAY;QACzB,aAAa,EAAE,QAAQ,CAAC,IAAI;QAC5B,WAAW,EAAE,QAAQ,CAAC,WAAW;QACjC,YAAY,EAAE,QAAQ,CAAC,YAAY;QACnC,KAAK,EAAE,IAAI,CAAC,eAAe;QAC3B,SAAS,EAAE,SAAS;KACvB,CAAC;AACN,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,QAAgB,EAAE,SAAiB,EAAE,KAAa,EAAE,KAAa,EAAE,MAAc;IACrH,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC9B,IAAI,CAAC,GAAG,EAAE,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,wBAAwB,QAAQ,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAClC,IAAI,CAAC,IAAI,EAAE,CAAC;QACR,MAAM,IAAI,KAAK,CAAC,QAAQ,SAAS,sBAAsB,QAAQ,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,IAAI,IAAI,CAAC,eAAe,IAAI,aAAa,CAAC,MAAM,EAAE,CAAC;QAC/C,MAAM,IAAI,KAAK,CAAC,QAAQ,SAAS,8BAA8B,CAAC,CAAC;IACrE,CAAC;IAED,IAAI,CAAC,IAAI,CAAC,eAAe,I
AAI,IAAI,CAAC,eAAe,KAAK,KAAK,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CAAC,yEAAyE,CAAC,CAAC;IAC/F,CAAC;IAED,MAAM,YAAY,GAAG,aAAa,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;IAEzD,mBAAmB;IACnB,IAAI,CAAC,WAAW,CAAC,YAAY,CAAC,GAAG;QAC7B,KAAK;QACL,MAAM;KACT,CAAC;IAEF,oBAAoB;IACpB,IAAI,CAAC,eAAe,IAAI,CAAC,CAAC;IAE1B,8DAA8D;IAC9D,IAAI,IAAI,CAAC,eAAe,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;QAC9C,IAAI,CAAC,eAAe,GAAG,aAAa,EAAE,CAAC;IAC3C,CAAC;SAAM,CAAC;QACJ,IAAI,CAAC,eAAe,GAAG,SAAS,CAAC;IACrC,CAAC;IAED,QAAQ,CAAC,GAAG,CAAC,CAAC;IAEd,OAAO;QACH,cAAc,EAAE,YAAY;QAC5B,KAAK;QACL,SAAS,EAAE,mBAAmB,CAAC,QAAQ,EAAE,SAAS,CAAC;KACtD,CAAC;AACN,CAAC"}
|
package/dist/src/index.d.ts
CHANGED
|
@@ -7,12 +7,12 @@
|
|
|
7
7
|
* and provides a rule-based, trackable evaluation workflow.
|
|
8
8
|
*
|
|
9
9
|
* Evaluation workflow for each step:
|
|
10
|
-
* 1. Perform an action (tap / swipe / take_screenshot) —
|
|
11
|
-
* 2.
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
10
|
+
* 1. Perform an action (tap / swipe / take_screenshot) — auto-takes a screenshot and registers a step.
|
|
11
|
+
* 2. Call evaluate(caseName, stepIndex) — returns the first dimension prompt + token.
|
|
12
|
+
* 3. Visually inspect the screenshot based on the prompt.
|
|
13
|
+
* 4. Call evaluate(caseName, stepIndex, token, score, reason) — records the score, returns the next dimension prompt + new token.
|
|
14
|
+
* 5. Repeat steps 3-4 until all 5 dimensions are evaluated.
|
|
15
|
+
* 6. Call get_audit_status to get the final markdown report.
|
|
16
16
|
*/
|
|
17
17
|
export {};
|
|
18
18
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/src/index.js
CHANGED
|
@@ -7,12 +7,12 @@
|
|
|
7
7
|
* and provides a rule-based, trackable evaluation workflow.
|
|
8
8
|
*
|
|
9
9
|
* Evaluation workflow for each step:
|
|
10
|
-
* 1. Perform an action (tap / swipe / take_screenshot) —
|
|
11
|
-
* 2.
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
10
|
+
* 1. Perform an action (tap / swipe / take_screenshot) — auto-takes a screenshot and registers a step.
|
|
11
|
+
* 2. Call evaluate(caseName, stepIndex) — returns the first dimension prompt + token.
|
|
12
|
+
* 3. Visually inspect the screenshot based on the prompt.
|
|
13
|
+
* 4. Call evaluate(caseName, stepIndex, token, score, reason) — records the score, returns the next dimension prompt + new token.
|
|
14
|
+
* 5. Repeat steps 3-4 until all 5 dimensions are evaluated.
|
|
15
|
+
* 6. Call get_audit_status to get the final markdown report.
|
|
16
16
|
*/
|
|
17
17
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
18
18
|
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
@@ -20,12 +20,11 @@ import { launchAppSchema, launchApp } from "./tools/launch-app.js";
|
|
|
20
20
|
import { takeScreenshotSchema, takeScreenshot } from "./tools/take-screenshot.js";
|
|
21
21
|
import { tapSchema, tap } from "./tools/tap.js";
|
|
22
22
|
import { swipeSchema, swipe } from "./tools/swipe.js";
|
|
23
|
-
import {
|
|
24
|
-
import { submitDimensionScoreSchema, submitDimensionScore } from "./tools/submit-dimension-score.js";
|
|
23
|
+
import { evaluateSchema, evaluate } from "./tools/evaluate.js";
|
|
25
24
|
import { getAuditStatusSchema, getAuditStatus } from "./tools/get-audit-status.js";
|
|
26
25
|
const server = new McpServer({
|
|
27
26
|
name: "ui-audit-mcp",
|
|
28
|
-
version: "2.
|
|
27
|
+
version: "2.3.0",
|
|
29
28
|
});
|
|
30
29
|
// ─── Device operation tools ────────────────────────────────
|
|
31
30
|
server.tool("launch_app", "Launch an iOS app on the simulator by its bundle ID.", launchAppSchema.shape, async (args) => launchApp(args));
|
|
@@ -36,12 +35,10 @@ server.tool("tap", "Tap at a position on screen (ratio 0-1). Automatically captu
|
|
|
36
35
|
server.tool("swipe", "Swipe on screen (ratio 0-1). Automatically captures a screenshot and registers a tracked step. " +
|
|
37
36
|
"After this call, evaluate the screenshot across all required dimensions before moving to the next action.", swipeSchema.shape, async (args) => swipe(args));
|
|
38
37
|
// ─── Evaluation tools ──────────────────────────────────────
|
|
39
|
-
server.tool("
|
|
40
|
-
"Call
|
|
41
|
-
"
|
|
42
|
-
|
|
43
|
-
"Every step requires scores for all 5 dimensions: overlap, layout, info_clarity, style, action_result. " +
|
|
44
|
-
"After submitting, continue with the remaining dimensions, then call get_audit_status to confirm completion.", submitDimensionScoreSchema.shape, async (args) => submitDimensionScore(args));
|
|
38
|
+
server.tool("evaluate", "Unified evaluation tool. Two modes:\n" +
|
|
39
|
+
"1) INITIAL: Call with only caseName + stepIndex (omit score/reason/token) → returns the first pending dimension's prompt, scoring guide, and evaluationToken.\n" +
|
|
40
|
+
"2) SUBMIT: Call with caseName + stepIndex + evaluationToken + score + reason → records the score, advances to the next dimension, and returns its prompt + new token (or completion message).\n" +
|
|
41
|
+
"Required dimensions per step: overlap, layout, info_clarity, style, action_result.", evaluateSchema.shape, async (args) => evaluate(args));
|
|
45
42
|
server.tool("get_audit_status", "Get the current dashboard or test case report. " +
|
|
46
43
|
"Use this to see which dimensions are missing (Pending) or to get the final markdown report if all steps are fully evaluated.", getAuditStatusSchema.shape, async (args) => getAuditStatus(args));
|
|
47
44
|
import { preflight } from "./device/adapter.js";
|
package/dist/src/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AACpE,OAAO,EAAE,oBAAoB,EAAE,MAAM,2CAA2C,CAAC;AAEjF,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACnE,OAAO,EAAE,oBAAoB,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAClF,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";AAEA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,yCAAyC,CAAC;AACpE,OAAO,EAAE,oBAAoB,EAAE,MAAM,2CAA2C,CAAC;AAEjF,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACnE,OAAO,EAAE,oBAAoB,EAAE,cAAc,EAAE,MAAM,4BAA4B,CAAC;AAClF,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,MAAM,gBAAgB,CAAC;AAChD,OAAO,EAAE,WAAW,EAAE,KAAK,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,cAAc,EAAE,QAAQ,EAAE,MAAM,qBAAqB,CAAC;AAC/D,OAAO,EAAE,oBAAoB,EAAE,cAAc,EAAE,MAAM,6BAA6B,CAAC;AAEnF,MAAM,MAAM,GAAG,IAAI,SAAS,CAAC;IACzB,IAAI,EAAE,cAAc;IACpB,OAAO,EAAE,OAAO;CACnB,CAAC,CAAC;AAEH,8DAA8D;AAE9D,MAAM,CAAC,IAAI,CACP,YAAY,EACZ,sDAAsD,EACtD,eAAe,CAAC,KAAK,EACrB,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,SAAS,CAAC,IAAI,CAAC,CAClC,CAAC;AAEF,MAAM,CAAC,IAAI,CACP,iBAAiB,EACjB,qFAAqF;IACrF,2GAA2G,EAC3G,oBAAoB,CAAC,KAAK,EAC1B,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CACvC,CAAC;AAEF,MAAM,CAAC,IAAI,CACP,KAAK,EACL,6GAA6G;IAC7G,2GAA2G,EAC3G,SAAS,CAAC,KAAK,EACf,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,CAC5B,CAAC;AAEF,MAAM,CAAC,IAAI,CACP,OAAO,EACP,iGAAiG;IACjG,2GAA2G,EAC3G,WAAW,CAAC,KAAK,EACjB,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAC9B,CAAC;AAEF,8DAA8D;AAE9D,MAAM,CAAC,IAAI,CACP,UAAU,EACV,uCAAuC;IACvC,iKAAiK;IACjK,iMAAiM;IACjM,oFAAoF,EACpF,cAAc,CAAC,KAAK,EACpB,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,CACjC,CAAC;AAEF,MAAM,CAAC,IAAI,CACP,kBAAkB,EAClB,iDAAiD;IACjD,8HAA8H,EAC9H,oBAAoB,CAAC,KAAK,EAC1B,KAAK,EAAE,IAAI,EAAE,EAAE,CAAC,cAAc,CAAC,IAAI,CAAC,CACvC,CAAC;AAEF,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAEhD,8DAA8D;AAE9D,KAAK,UAAU,IAAI;IACf,0DAA0D;IAC1D,MAAM,KAAK,GAAG,MAAM,SAAS,EAAE,CAAC;IAChC,IAAI,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC;QACZ,OAAO,CAAC,KAAK,CAAC,0CAA0C,CAAC,CAAC;QAC1D,KAAK,MAAM,CAAC,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;YAC5B,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;QAC9B,CAAC;QACD,OAAO,CAAC,KAAK,CAAC,sDAAsD,CAAC,CAAC;IAC1E,CAAC;IAED,MAAM,SAAS,GAAG,IAAI,oBAAoB,EAAE,CAAC;IAC7C,MAAM,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAA
C;IAChC,OAAO,CAAC,KAAK,CAAC,mDAAmD,CAAC,CAAC;AACvE,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,EAAE,EAAE;IACjB,OAAO,CAAC,KAAK,CAAC,6BAA6B,EAAE,GAAG,CAAC,CAAC;IAClD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AACpB,CAAC,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audit-log.d.ts","sourceRoot":"","sources":["../../../src/logger/audit-log.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,QAAQ,EAA8B,MAAM,aAAa,CAAC;AAmBnE,wBAAgB,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,QAAQ,GAAG,IAAI,CAMzD;AAED,wBAAgB,QAAQ,CAAC,GAAG,EAAE,QAAQ,GAAG,IAAI,CAG5C;AAED,wBAAgB,WAAW,IAAI,QAAQ,EAAE,CAexC;AAED,wBAAgB,UAAU,CACtB,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,WAAW,EAAE,MAAM,EACnB,UAAU,EAAE,KAAK,GAAG,OAAO,GAAG,YAAY,EAC1C,cAAc,EAAE,MAAM,EACtB,WAAW,CAAC,EAAE;IAAE,CAAC,EAAE,MAAM,CAAC;IAAC,CAAC,EAAE,MAAM,CAAA;CAAE,EACtC,eAAe,CAAC,EAAE,MAAM,GACzB,IAAI,
|
|
1
|
+
{"version":3,"file":"audit-log.d.ts","sourceRoot":"","sources":["../../../src/logger/audit-log.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,QAAQ,EAA8B,MAAM,aAAa,CAAC;AAmBnE,wBAAgB,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,QAAQ,GAAG,IAAI,CAMzD;AAED,wBAAgB,QAAQ,CAAC,GAAG,EAAE,QAAQ,GAAG,IAAI,CAG5C;AAED,wBAAgB,WAAW,IAAI,QAAQ,EAAE,CAexC;AAED,wBAAgB,UAAU,CACtB,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,WAAW,EAAE,MAAM,EACnB,UAAU,EAAE,KAAK,GAAG,OAAO,GAAG,YAAY,EAC1C,cAAc,EAAE,MAAM,EACtB,WAAW,CAAC,EAAE;IAAE,CAAC,EAAE,MAAM,CAAC;IAAC,CAAC,EAAE,MAAM,CAAA;CAAE,EACtC,eAAe,CAAC,EAAE,MAAM,GACzB,IAAI,CAyBN;AAED,wBAAgB,WAAW,CACvB,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,MAAM,EACjB,SAAS,EAAE,MAAM,EACjB,KAAK,EAAE,MAAM,EACb,MAAM,EAAE,MAAM,GACf,IAAI,CAiBN"}
|
|
@@ -64,6 +64,8 @@ export function recordStep(caseName, stepIndex, description, actionType, screens
|
|
|
64
64
|
expectedOutcome,
|
|
65
65
|
timestamp: new Date().toISOString(),
|
|
66
66
|
evaluations: existingStep ? existingStep.evaluations : {},
|
|
67
|
+
currentDimIndex: existingStep && existingStep.currentDimIndex !== undefined ? existingStep.currentDimIndex : 0,
|
|
68
|
+
evaluationToken: existingStep ? existingStep.evaluationToken : undefined,
|
|
67
69
|
};
|
|
68
70
|
writeLog(log);
|
|
69
71
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"audit-log.js","sourceRoot":"","sources":["../../../src/logger/audit-log.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,IAAI,CAAC;AAGpB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,EAAE,eAAe,CAAC,CAAC;AAC1D,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;AAC5C,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;AAE1D,2BAA2B;AAC3B,SAAS,UAAU;IACf,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1E,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACxE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,cAAc,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;AAC1F,CAAC;AAED,UAAU,EAAE,CAAC;AAEb,SAAS,UAAU,CAAC,QAAgB;IAChC,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,QAAQ,OAAO,CAAC,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,OAAO,CAAC,QAAgB;IACpC,MAAM,OAAO,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IACrC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,OAAO,IAAI,CAAC;IAChB,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,GAAa;IAClC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACzC,EAAE,CAAC,aAAa,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACrE,CAAC;AAED,MAAM,UAAU,WAAW;IACvB,MAAM,IAAI,GAAe,EAAE,CAAC;IAC5B,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC;IACzC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IACtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YACnE,IAAI,CAAC;gBACD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;YACnC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACT,OAAO,CAAC,KAAK,CAAC,6BAA6B,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC;YAC1D,CAAC;QACL,CAAC;IACL,CAAC;IACD,OAAO,IAAI,CAAC;AAChB,C
AAC;AAED,MAAM,UAAU,UAAU,CACtB,QAAgB,EAChB,SAAiB,EACjB,WAAmB,EACnB,UAA0C,EAC1C,cAAsB,EACtB,WAAsC,EACtC,eAAwB;IAExB,IAAI,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,IAAI,CAAC,GAAG,EAAE,CAAC;QACP,GAAG,GAAG;YACF,QAAQ;YACR,KAAK,EAAE,EAAE;SACZ,CAAC;IACN,CAAC;IAED,MAAM,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAE1C,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG;QACnB,SAAS;QACT,WAAW;QACX,UAAU;QACV,cAAc;QACd,WAAW;QACX,eAAe;QACf,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,WAAW,EAAE,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE;
|
|
1
|
+
{"version":3,"file":"audit-log.js","sourceRoot":"","sources":["../../../src/logger/audit-log.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,IAAI,CAAC;AAGpB,MAAM,QAAQ,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,EAAE,eAAe,CAAC,CAAC;AAC1D,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;AAC5C,MAAM,cAAc,GAAG,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC;AAE1D,2BAA2B;AAC3B,SAAS,UAAU;IACf,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,QAAQ,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC1E,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,OAAO,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IACxE,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,cAAc,CAAC;QAAE,EAAE,CAAC,SAAS,CAAC,cAAc,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;AAC1F,CAAC;AAED,UAAU,EAAE,CAAC;AAEb,SAAS,UAAU,CAAC,QAAgB;IAChC,OAAO,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,GAAG,QAAQ,OAAO,CAAC,CAAC;AAClD,CAAC;AAED,MAAM,UAAU,OAAO,CAAC,QAAgB;IACpC,MAAM,OAAO,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC;IACrC,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE,CAAC;QAC1B,OAAO,IAAI,CAAC;IAChB,CAAC;IACD,OAAO,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,YAAY,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC;AACzD,CAAC;AAED,MAAM,UAAU,QAAQ,CAAC,GAAa;IAClC,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IACzC,EAAE,CAAC,aAAa,CAAC,OAAO,EAAE,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;AACrE,CAAC;AAED,MAAM,UAAU,WAAW;IACvB,MAAM,IAAI,GAAe,EAAE,CAAC;IAC5B,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC;IACzC,MAAM,KAAK,GAAG,EAAE,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IACtC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,IAAI,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YACzB,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC,CAAC;YACnE,IAAI,CAAC;gBACD,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;YACnC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACT,OAAO,CAAC,KAAK,CAAC,6BAA6B,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC;YAC1D,CAAC;QACL,CAAC;IACL,CAAC;IACD,OAAO,IAAI,CAAC;AAChB,C
AAC;AAED,MAAM,UAAU,UAAU,CACtB,QAAgB,EAChB,SAAiB,EACjB,WAAmB,EACnB,UAA0C,EAC1C,cAAsB,EACtB,WAAsC,EACtC,eAAwB;IAExB,IAAI,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC5B,IAAI,CAAC,GAAG,EAAE,CAAC;QACP,GAAG,GAAG;YACF,QAAQ;YACR,KAAK,EAAE,EAAE;SACZ,CAAC;IACN,CAAC;IAED,MAAM,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAE1C,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,GAAG;QACnB,SAAS;QACT,WAAW;QACX,UAAU;QACV,cAAc;QACd,WAAW;QACX,eAAe;QACf,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,WAAW,EAAE,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,WAAW,CAAC,CAAC,CAAC,EAAE;QACzD,eAAe,EAAE,YAAY,IAAI,YAAY,CAAC,eAAe,KAAK,SAAS,CAAC,CAAC,CAAC,YAAY,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QAC9G,eAAe,EAAE,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,eAAe,CAAC,CAAC,CAAC,SAAS;KAC3E,CAAC;IAEF,QAAQ,CAAC,GAAG,CAAC,CAAC;AAClB,CAAC;AAED,MAAM,UAAU,WAAW,CACvB,QAAgB,EAChB,SAAiB,EACjB,SAAiB,EACjB,KAAa,EACb,MAAc;IAEd,MAAM,GAAG,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;IAC9B,IAAI,CAAC,GAAG,EAAE,CAAC;QACP,MAAM,IAAI,KAAK,CAAC,wBAAwB,QAAQ,EAAE,CAAC,CAAC;IACxD,CAAC;IAED,MAAM,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IAClC,IAAI,CAAC,IAAI,EAAE,CAAC;QACR,MAAM,IAAI,KAAK,CAAC,QAAQ,SAAS,sBAAsB,QAAQ,EAAE,CAAC,CAAC;IACvE,CAAC;IAED,IAAI,CAAC,WAAW,CAAC,SAAS,CAAC,GAAG;QAC1B,KAAK;QACL,MAAM;KACT,CAAC;IAEF,QAAQ,CAAC,GAAG,CAAC,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool: evaluate
|
|
3
|
+
*
|
|
4
|
+
* Unified evaluation tool that combines get_pending_task and submit_evaluation.
|
|
5
|
+
*
|
|
6
|
+
* Two modes:
|
|
7
|
+
* 1. Initial mode (score/reason/token omitted):
|
|
8
|
+
* Returns the first pending dimension's prompt, scoring guide, and a token.
|
|
9
|
+
*
|
|
10
|
+
* 2. Submit-and-advance mode (score + reason + token all provided):
|
|
11
|
+
* Validates the token, records the score, advances to the next dimension.
|
|
12
|
+
* If more dimensions remain, returns the next prompt + new token.
|
|
13
|
+
* If all done, returns a completion message.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
export declare const evaluateSchema: z.ZodObject<{
|
|
17
|
+
caseName: z.ZodString;
|
|
18
|
+
stepIndex: z.ZodNumber;
|
|
19
|
+
evaluationToken: z.ZodOptional<z.ZodString>;
|
|
20
|
+
score: z.ZodOptional<z.ZodNumber>;
|
|
21
|
+
reason: z.ZodOptional<z.ZodString>;
|
|
22
|
+
}, "strip", z.ZodTypeAny, {
|
|
23
|
+
caseName: string;
|
|
24
|
+
stepIndex: number;
|
|
25
|
+
evaluationToken?: string | undefined;
|
|
26
|
+
score?: number | undefined;
|
|
27
|
+
reason?: string | undefined;
|
|
28
|
+
}, {
|
|
29
|
+
caseName: string;
|
|
30
|
+
stepIndex: number;
|
|
31
|
+
evaluationToken?: string | undefined;
|
|
32
|
+
score?: number | undefined;
|
|
33
|
+
reason?: string | undefined;
|
|
34
|
+
}>;
|
|
35
|
+
export declare function evaluate(input: z.infer<typeof evaluateSchema>): Promise<{
|
|
36
|
+
content: {
|
|
37
|
+
type: "text";
|
|
38
|
+
text: string;
|
|
39
|
+
}[];
|
|
40
|
+
}>;
|
|
41
|
+
//# sourceMappingURL=evaluate.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.d.ts","sourceRoot":"","sources":["../../../src/tools/evaluate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,eAAO,MAAM,cAAc;;;;;;;;;;;;;;;;;;EAazB,CAAC;AAqBH,wBAAsB,QAAQ,CAAC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,OAAO,cAAc,CAAC;;;;;GAmEnE"}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tool: evaluate
|
|
3
|
+
*
|
|
4
|
+
* Unified evaluation tool that combines get_pending_task and submit_evaluation.
|
|
5
|
+
*
|
|
6
|
+
* Two modes:
|
|
7
|
+
* 1. Initial mode (score/reason/token omitted):
|
|
8
|
+
* Returns the first pending dimension's prompt, scoring guide, and a token.
|
|
9
|
+
*
|
|
10
|
+
* 2. Submit-and-advance mode (score + reason + token all provided):
|
|
11
|
+
* Validates the token, records the score, advances to the next dimension.
|
|
12
|
+
* If more dimensions remain, returns the next prompt + new token.
|
|
13
|
+
* If all done, returns a completion message.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
import { getPendingTaskState, validateAndAdvanceState } from "../evaluation/state-machine.js";
|
|
17
|
+
export const evaluateSchema = z.object({
|
|
18
|
+
caseName: z.string().describe("Name of the test case"),
|
|
19
|
+
stepIndex: z.number().int().positive().describe("Step index to evaluate"),
|
|
20
|
+
// Optional: omit all three for initial query, provide all three to submit
|
|
21
|
+
evaluationToken: z.string().optional().describe("The exact token from the previous evaluate call. Omit on the first call to get the initial dimension."),
|
|
22
|
+
score: z.number().int().min(0).max(10).optional().describe("Score from 0 (worst) to 10 (perfect). Required when submitting an evaluation."),
|
|
23
|
+
reason: z.string().optional().describe("Analysis and reason for this score. Required when submitting an evaluation."),
|
|
24
|
+
});
|
|
25
|
+
function renderDimensionPrompt(state, prefix) {
|
|
26
|
+
return `${prefix}\n\n` +
|
|
27
|
+
`🔴 CURRENT TASK: ${state.dimensionName}\n\n` +
|
|
28
|
+
`PROMPT: ${state.description}\n\n` +
|
|
29
|
+
`SCORING GUIDE: \n${state.scoringGuide}\n\n` +
|
|
30
|
+
`⚠️ ACTION REQUIRED:\n` +
|
|
31
|
+
`Inspect the screenshot specifically for this dimension. Then call \`evaluate\` with:\n` +
|
|
32
|
+
` "evaluationToken": "${state.token}"\n` +
|
|
33
|
+
` "score": <your score 0-10>\n` +
|
|
34
|
+
` "reason": "<your analysis>"\n\n` +
|
|
35
|
+
`Do NOT evaluate other dimensions until you finish this one.`;
|
|
36
|
+
}
|
|
37
|
+
export async function evaluate(input) {
|
|
38
|
+
const hasScore = input.score !== undefined;
|
|
39
|
+
const hasReason = input.reason !== undefined;
|
|
40
|
+
const hasToken = input.evaluationToken !== undefined;
|
|
41
|
+
const submissionFieldCount = [hasScore, hasReason, hasToken].filter(Boolean).length;
|
|
42
|
+
// Validate: all three must be present or all absent
|
|
43
|
+
if (submissionFieldCount > 0 && submissionFieldCount < 3) {
|
|
44
|
+
return {
|
|
45
|
+
content: [{
|
|
46
|
+
type: "text",
|
|
47
|
+
text: "❌ Error: `evaluationToken`, `score`, and `reason` must ALL be provided together, or ALL be omitted.\n" +
|
|
48
|
+
"- Omit all three to get the first pending dimension.\n" +
|
|
49
|
+
"- Provide all three to submit an evaluation and advance."
|
|
50
|
+
}]
|
|
51
|
+
};
|
|
52
|
+
}
|
|
53
|
+
// ── Mode 1: Initial query (no score/reason/token) ──
|
|
54
|
+
if (submissionFieldCount === 0) {
|
|
55
|
+
const state = getPendingTaskState(input.caseName, input.stepIndex);
|
|
56
|
+
if (state.completed) {
|
|
57
|
+
return {
|
|
58
|
+
content: [{
|
|
59
|
+
type: "text",
|
|
60
|
+
text: state.message || "✅ All dimensions for this step are fully evaluated."
|
|
61
|
+
}]
|
|
62
|
+
};
|
|
63
|
+
}
|
|
64
|
+
const markdown = renderDimensionPrompt(state, `📋 Starting evaluation for Step ${input.stepIndex}:`);
|
|
65
|
+
return {
|
|
66
|
+
content: [{ type: "text", text: markdown }],
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
// ── Mode 2: Submit score and advance ──
|
|
70
|
+
const result = validateAndAdvanceState(input.caseName, input.stepIndex, input.evaluationToken, input.score, input.reason);
|
|
71
|
+
const nextState = result.nextState;
|
|
72
|
+
if (nextState.completed) {
|
|
73
|
+
return {
|
|
74
|
+
content: [{
|
|
75
|
+
type: "text",
|
|
76
|
+
text: `✅ Score ${result.score} recorded for [${result.completedDimId}].\n\n` +
|
|
77
|
+
`🎉 All dimensions for Step ${input.stepIndex} are fully evaluated!\n` +
|
|
78
|
+
`You may now proceed to interact with the device (tap, swipe) or call get_audit_status.`
|
|
79
|
+
}]
|
|
80
|
+
};
|
|
81
|
+
}
|
|
82
|
+
const markdown = renderDimensionPrompt(nextState, `✅ Score ${result.score} recorded for [${result.completedDimId}].\n\n➡️ Next dimension:`);
|
|
83
|
+
return {
|
|
84
|
+
content: [{ type: "text", text: markdown }],
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
//# sourceMappingURL=evaluate.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate.js","sourceRoot":"","sources":["../../../src/tools/evaluate.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,mBAAmB,EAAE,uBAAuB,EAAE,MAAM,gCAAgC,CAAC;AAE9F,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IACnC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,wBAAwB,CAAC;IACzE,0EAA0E;IAC1E,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAC3C,uGAAuG,CAC1G;IACD,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CACtD,+EAA+E,CAClF;IACD,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAClC,6EAA6E,CAChF;CACJ,CAAC,CAAC;AAEH,SAAS,qBAAqB,CAAC,KAM9B,EAAE,MAAc;IACb,OAAO,GAAG,MAAM,MAAM;QAClB,oBAAoB,KAAK,CAAC,aAAa,MAAM;QAC7C,WAAW,KAAK,CAAC,WAAW,MAAM;QAClC,oBAAoB,KAAK,CAAC,YAAY,MAAM;QAC5C,uBAAuB;QACvB,wFAAwF;QACxF,yBAAyB,KAAK,CAAC,KAAK,KAAK;QACzC,gCAAgC;QAChC,mCAAmC;QACnC,6DAA6D,CAAC;AACtE,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,KAAqC;IAChE,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,KAAK,SAAS,CAAC;IAC3C,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,KAAK,SAAS,CAAC;IAC7C,MAAM,QAAQ,GAAG,KAAK,CAAC,eAAe,KAAK,SAAS,CAAC;IAErD,MAAM,oBAAoB,GAAG,CAAC,QAAQ,EAAE,SAAS,EAAE,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;IAEpF,oDAAoD;IACpD,IAAI,oBAAoB,GAAG,CAAC,IAAI,oBAAoB,GAAG,CAAC,EAAE,CAAC;QACvD,OAAO;YACH,OAAO,EAAE,CAAC;oBACN,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,uGAAuG;wBACzG,wDAAwD;wBACxD,0DAA0D;iBACjE,CAAC;SACL,CAAC;IACN,CAAC;IAED,sDAAsD;IACtD,IAAI,oBAAoB,KAAK,CAAC,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAG,mBAAmB,CAAC,KAAK,CAAC,QAAQ,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;QAEnE,IAAI,KAAK,CAAC,SAAS,EAAE,CAAC;YAClB,OAAO;gBACH,OAAO,EAAE,CAAC;wBACN,IAAI,EAAE,MAAe;wBACrB,IAAI,EAAE,KAAK,CAAC,OAAO,IAAI,qDAAqD;qBAC/E,CAAC;aACL,CAAC;QACN,CAAC;QAED,MAAM,QAAQ,GAAG,qBAAqB,CAAC,KAAK,EAAE,mCAAmC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC;QACrG,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,QAAQ,EAAE,CA
AC;SACvD,CAAC;IACN,CAAC;IAED,yCAAyC;IACzC,MAAM,MAAM,GAAG,uBAAuB,CAClC,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,eAAgB,EACtB,KAAK,CAAC,KAAM,EACZ,KAAK,CAAC,MAAO,CAChB,CAAC;IAEF,MAAM,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;IAEnC,IAAI,SAAS,CAAC,SAAS,EAAE,CAAC;QACtB,OAAO;YACH,OAAO,EAAE,CAAC;oBACN,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,WAAW,MAAM,CAAC,KAAK,kBAAkB,MAAM,CAAC,cAAc,QAAQ;wBACxE,8BAA8B,KAAK,CAAC,SAAS,yBAAyB;wBACtE,wFAAwF;iBAC/F,CAAC;SACL,CAAC;IACN,CAAC;IAED,MAAM,QAAQ,GAAG,qBAAqB,CAAC,SAAS,EAC5C,WAAW,MAAM,CAAC,KAAK,kBAAkB,MAAM,CAAC,cAAc,0BAA0B,CAC3F,CAAC;IAEF,OAAO;QACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;KACvD,CAAC;AACN,CAAC"}
|
|
@@ -6,38 +6,38 @@ export const getAuditStatusSchema = z.object({
|
|
|
6
6
|
});
|
|
7
7
|
// Helper for rendering incomplete template
|
|
8
8
|
function renderIncompleteTemplate(log, missingItems) {
|
|
9
|
-
let md = `# 📊
|
|
10
|
-
md +=
|
|
9
|
+
let md = `# 📊 Audit Progress Report: ${log.caseName}\n`;
|
|
10
|
+
md += `**Status**: 🚧 Evaluation Incomplete\n\n`;
|
|
11
11
|
// Missing summary
|
|
12
|
-
md += `## 🔴
|
|
13
|
-
md += `Agent
|
|
12
|
+
md += `## 🔴 Pending Evaluations\n`;
|
|
13
|
+
md += `Attention Agent: You still need to complete the evaluations for the following dimensions to generate the final report:\n`;
|
|
14
14
|
for (const item of missingItems) {
|
|
15
|
-
md += `- Step ${item.stepIndex}:
|
|
15
|
+
md += `- Step ${item.stepIndex}: Missing [${item.missing.join(", ")}]\n`;
|
|
16
16
|
}
|
|
17
17
|
md += `\n`;
|
|
18
18
|
// Step details
|
|
19
|
-
md += `## 📝
|
|
19
|
+
md += `## 📝 Evaluated Records\n`;
|
|
20
20
|
md += renderStepDetails(log);
|
|
21
21
|
return md;
|
|
22
22
|
}
|
|
23
23
|
// Helper for rendering complete template
|
|
24
24
|
function renderCompleteTemplate(log, avg, passed, failedItems) {
|
|
25
|
-
let md = `# 🏆
|
|
26
|
-
md +=
|
|
27
|
-
md += `## 📈
|
|
28
|
-
md += `-
|
|
29
|
-
md += `-
|
|
25
|
+
let md = `# 🏆 Final Audit Report: ${log.caseName}\n`;
|
|
26
|
+
md += `**Status**: ✅ Evaluation Complete\n\n`;
|
|
27
|
+
md += `## 📈 Overall Scores\n`;
|
|
28
|
+
md += `- **Average Score**: ${avg.toFixed(1)} / 10\n`;
|
|
29
|
+
md += `- **Result**: ${passed ? "✅ Pass" : "❌ Fail"}\n`;
|
|
30
30
|
if (failedItems.length > 0) {
|
|
31
|
-
md += `-
|
|
31
|
+
md += `- **Failed Items**:\n`;
|
|
32
32
|
for (const f of failedItems) {
|
|
33
|
-
md += ` - Step ${f.stepIndex} [${f.dim}]: ${f.score}
|
|
33
|
+
md += ` - Step ${f.stepIndex} [${f.dim}]: Score ${f.score} - ${f.reason}\n`;
|
|
34
34
|
}
|
|
35
35
|
}
|
|
36
36
|
else {
|
|
37
|
-
md += `-
|
|
37
|
+
md += `- **Failed Items**: None\n`;
|
|
38
38
|
}
|
|
39
39
|
md += `\n`;
|
|
40
|
-
md += `## 📝
|
|
40
|
+
md += `## 📝 Detailed Steps\n`;
|
|
41
41
|
md += renderStepDetails(log);
|
|
42
42
|
return md;
|
|
43
43
|
}
|
|
@@ -46,18 +46,18 @@ function renderStepDetails(log) {
|
|
|
46
46
|
const steps = Object.values(log.steps).sort((a, b) => a.stepIndex - b.stepIndex);
|
|
47
47
|
for (const step of steps) {
|
|
48
48
|
md += `### Step ${step.stepIndex}: ${step.description}\n`;
|
|
49
|
-
md += `-
|
|
49
|
+
md += `- **Action**: ${step.actionType} ${step.coordinates ? `(x: ${step.coordinates.x}, y: ${step.coordinates.y})` : ""}\n`;
|
|
50
50
|
if (step.expectedOutcome) {
|
|
51
|
-
md += `-
|
|
51
|
+
md += `- **Expected Outcome**: ${step.expectedOutcome}\n`;
|
|
52
52
|
}
|
|
53
53
|
for (const dim of REQUIRED_DIMS) {
|
|
54
54
|
const ev = step.evaluations ? step.evaluations[dim] : undefined;
|
|
55
55
|
if (ev) {
|
|
56
56
|
const tag = ev.score >= PASSING_SCORE ? "Pass ✅" : "Fail ❌";
|
|
57
|
-
md += `- **${dim}**: ${ev.score}
|
|
57
|
+
md += `- **${dim}**: Score ${ev.score} ${tag} - ${ev.reason}\n`;
|
|
58
58
|
}
|
|
59
59
|
else {
|
|
60
|
-
md += `- **${dim}**: 🔴
|
|
60
|
+
md += `- **${dim}**: 🔴 Missing Evaluation\n`;
|
|
61
61
|
}
|
|
62
62
|
}
|
|
63
63
|
md += `\n`;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"get-audit-status.js","sourceRoot":"","sources":["../../../src/tools/get-audit-status.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAG1E,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gFAAgF,CAAC;CACvI,CAAC,CAAC;AAEH,2CAA2C;AAC3C,SAAS,wBAAwB,CAAC,GAAa,EAAE,YAAwD;IACrG,IAAI,EAAE,GAAG,
|
|
1
|
+
{"version":3,"file":"get-audit-status.js","sourceRoot":"","sources":["../../../src/tools/get-audit-status.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,OAAO,EAAE,WAAW,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAG1E,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gFAAgF,CAAC;CACvI,CAAC,CAAC;AAEH,2CAA2C;AAC3C,SAAS,wBAAwB,CAAC,GAAa,EAAE,YAAwD;IACrG,IAAI,EAAE,GAAG,+BAA+B,GAAG,CAAC,QAAQ,IAAI,CAAC;IACzD,EAAE,IAAI,0CAA0C,CAAC;IAEjD,kBAAkB;IAClB,EAAE,IAAI,6BAA6B,CAAC;IACpC,EAAE,IAAI,0HAA0H,CAAC;IACjI,KAAK,MAAM,IAAI,IAAI,YAAY,EAAE,CAAC;QAC9B,EAAE,IAAI,UAAU,IAAI,CAAC,SAAS,cAAc,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC;IAC7E,CAAC;IACD,EAAE,IAAI,IAAI,CAAC;IAEX,eAAe;IACf,EAAE,IAAI,2BAA2B,CAAC;IAClC,EAAE,IAAI,iBAAiB,CAAC,GAAG,CAAC,CAAC;IAE7B,OAAO,EAAE,CAAC;AACd,CAAC;AAED,yCAAyC;AACzC,SAAS,sBAAsB,CAAC,GAAa,EAAE,GAAW,EAAE,MAAe,EAAE,WAAkB;IAC3F,IAAI,EAAE,GAAG,4BAA4B,GAAG,CAAC,QAAQ,IAAI,CAAC;IACtD,EAAE,IAAI,uCAAuC,CAAC;IAE9C,EAAE,IAAI,wBAAwB,CAAC;IAC/B,EAAE,IAAI,wBAAwB,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS,CAAC;IACtD,EAAE,IAAI,iBAAiB,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC;IAExD,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,EAAE,IAAI,uBAAuB,CAAC;QAC9B,KAAK,MAAM,CAAC,IAAI,WAAW,EAAE,CAAC;YAC1B,EAAE,IAAI,YAAY,CAAC,CAAC,SAAS,KAAK,CAAC,CAAC,GAAG,YAAY,CAAC,CAAC,KAAK,MAAM,CAAC,CAAC,MAAM,IAAI,CAAC;QACjF,CAAC;IACL,CAAC;SAAM,CAAC;QACJ,EAAE,IAAI,4BAA4B,CAAC;IACvC,CAAC;IACD,EAAE,IAAI,IAAI,CAAC;IAEX,EAAE,IAAI,wBAAwB,CAAC;IAC/B,EAAE,IAAI,iBAAiB,CAAC,GAAG,CAAC,CAAC;IAE7B,OAAO,EAAE,CAAC;AACd,CAAC;AAED,SAAS,iBAAiB,CAAC,GAAa;IACpC,IAAI,EAAE,GAAG,EAAE,CAAC;IACZ,MAAM,KAAK,GAAG,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;IACjF,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACvB,EAAE,IAAI,YAAY,IAAI,CAAC,SAAS,KAAK,IAAI,CAAC,WAAW,IAAI,CAAC;QAC1D,EAAE,
IAAI,iBAAiB,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,WAAW,CAAC,CAAC,QAAQ,IAAI,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC;QAC7H,IAAI,IAAI,CAAC,eAAe,EAAE,CAAC;YACvB,EAAE,IAAI,2BAA2B,IAAI,CAAC,eAAe,IAAI,CAAC;QAC9D,CAAC;QAED,KAAK,MAAM,GAAG,IAAI,aAAa,EAAE,CAAC;YAC9B,MAAM,EAAE,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;YAChE,IAAI,EAAE,EAAE,CAAC;gBACL,MAAM,GAAG,GAAG,EAAE,CAAC,KAAK,IAAI,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC;gBAC5D,EAAE,IAAI,OAAO,GAAG,aAAa,EAAE,CAAC,KAAK,IAAI,GAAG,MAAM,EAAE,CAAC,MAAM,IAAI,CAAC;YACpE,CAAC;iBAAM,CAAC;gBACJ,EAAE,IAAI,OAAO,GAAG,6BAA6B,CAAC;YAClD,CAAC;QACL,CAAC;QACD,EAAE,IAAI,IAAI,CAAC;IACf,CAAC;IACD,OAAO,EAAE,CAAC;AACd,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,KAA2C;IAC5E,IAAI,IAAI,GAAe,EAAE,CAAC;IAC1B,IAAI,KAAK,CAAC,SAAS,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAChD,KAAK,MAAM,IAAI,IAAI,KAAK,CAAC,SAAS,EAAE,CAAC;YACjC,MAAM,GAAG,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1B,IAAI,GAAG;gBAAE,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5B,CAAC;IACL,CAAC;SAAM,CAAC;QACJ,IAAI,GAAG,WAAW,EAAE,CAAC;IACzB,CAAC;IAED,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACpB,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,gFAAgF,EAAE,CAAC;SAC/H,CAAC;IACN,CAAC;IAED,IAAI,cAAc,GAAG,EAAE,CAAC;IAExB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACrB,IAAI,WAAW,GAAG,IAAI,CAAC;QACvB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,WAAW,GAAU,EAAE,CAAC;QAC5B,IAAI,YAAY,GAA+C,EAAE,CAAC;QAElE,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC;YACtD,MAAM,OAAO,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;YAElE,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACrB,WAAW,GAAG,KAAK,CAAC;gBACpB,YAAY,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,CAAC,CAAC;YAC9D,CAAC;YAED,KAAK,MAAM,CAAC,GAAG,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,WAA
W,IAAI,EAAE,CAAC,EAAE,CAAC;gBAC7D,UAAU,IAAI,EAAE,CAAC,KAAK,CAAC;gBACvB,UAAU,IAAI,CAAC,CAAC;gBAChB,IAAI,EAAE,CAAC,KAAK,GAAG,aAAa,EAAE,CAAC;oBAC3B,WAAW,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,GAAG,EAAE,KAAK,EAAE,EAAE,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;gBAC7F,CAAC;YACL,CAAC;QACL,CAAC;QAED,IAAI,CAAC,WAAW,EAAE,CAAC;YACf,cAAc,IAAI,wBAAwB,CAAC,GAAG,EAAE,YAAY,CAAC,GAAG,SAAS,CAAC;QAC9E,CAAC;aAAM,CAAC;YACJ,MAAM,GAAG,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC;YACzD,MAAM,MAAM,GAAG,WAAW,CAAC,MAAM,KAAK,CAAC,CAAC;YACxC,cAAc,IAAI,sBAAsB,CAAC,GAAG,EAAE,GAAG,EAAE,MAAM,EAAE,WAAW,CAAC,GAAG,SAAS,CAAC;QACxF,CAAC;IACL,CAAC;IAED,OAAO;QACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,cAAc,CAAC,IAAI,EAAE,EAAE,CAAC;KACpE,CAAC;AACN,CAAC"}
|
package/dist/src/tools/swipe.js
CHANGED
|
@@ -32,7 +32,7 @@ export async function swipe(input) {
|
|
|
32
32
|
{
|
|
33
33
|
type: "text",
|
|
34
34
|
text: JSON.stringify({
|
|
35
|
-
message: `Step ${input.stepIndex} recorded. Expected: "${input.expectedOutcome}". Please
|
|
35
|
+
message: `Step ${input.stepIndex} recorded. Expected: "${input.expectedOutcome}". Please call evaluate to retrieve the first evaluation dimension and its token before proceeding.`,
|
|
36
36
|
caseName: input.caseName,
|
|
37
37
|
stepIndex: input.stepIndex,
|
|
38
38
|
expectedOutcome: input.expectedOutcome,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"swipe.js","sourceRoot":"","sources":["../../../src/tools/swipe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,8BAA8B,CAAC;IACzE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IACvE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IACrE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0BAA0B,CAAC;IACnE,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2CAA2C,CAAC;IAC7E,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,yFAAyF,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,KAAkC;IAC1D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;IACtF,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QAClB,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,oBAAoB,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC;SACjF,CAAC;IACN,CAAC;IAED,iDAAiD;IACjD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IAExD,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IACjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,OAAO,EACP,UAAU,CAAC,QAAQ,EACnB,EAAE,CAAC,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,KAAK,CAAC,MAAM,EAAE,EACpC,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,yBAAyB,KAAK,CAAC,eAAe,
|
|
1
|
+
{"version":3,"file":"swipe.js","sourceRoot":"","sources":["../../../src/tools/swipe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAChC,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,8BAA8B,CAAC;IACzE,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IACvE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,4BAA4B,CAAC;IACrE,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,0BAA0B,CAAC;IACnE,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2CAA2C,CAAC;IAC7E,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,yFAAyF,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,KAAkC;IAC1D,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,CAAC;IACtF,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QAClB,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,oBAAoB,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC;SACjF,CAAC;IACN,CAAC;IAED,iDAAiD;IACjD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IAExD,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IACjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,OAAO,EACP,UAAU,CAAC,QAAQ,EACnB,EAAE,CAAC,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,KAAK,CAAC,MAAM,EAAE,EACpC,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,yBAAyB,KAAK,CAAC,eAAe,qGAAqG;oBACnL,QAAQ,EAAE,KAAK,CAAC,QAAQ;o
BACxB,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,eAAe,EAAE,KAAK,CAAC,eAAe;oBACtC,WAAW,EAAE,UAAU,CAAC,KAAK;oBAC7B,YAAY,EAAE,UAAU,CAAC,MAAM;iBAClC,CAAC;aACL;SACJ;KACJ,CAAC;AACN,CAAC"}
|
|
@@ -20,7 +20,7 @@ export async function takeScreenshot(input) {
|
|
|
20
20
|
{
|
|
21
21
|
type: "text",
|
|
22
22
|
text: JSON.stringify({
|
|
23
|
-
message: `Step ${input.stepIndex} recorded. Please
|
|
23
|
+
message: `Step ${input.stepIndex} recorded. Please call evaluate to retrieve the first evaluation dimension and its token before proceeding.`,
|
|
24
24
|
caseName: input.caseName,
|
|
25
25
|
stepIndex: input.stepIndex,
|
|
26
26
|
expectedOutcome: input.expectedOutcome ?? "(pure observation)",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"take-screenshot.js","sourceRoot":"","sources":["../../../src/tools/take-screenshot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,mCAAmC,CAAC;IACrE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,8EAA8E,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,KAA2C;IAC5E,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IAEjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,YAAY,EACZ,UAAU,CAAC,QAAQ,EACnB,SAAS,EACT,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,
|
|
1
|
+
{"version":3,"file":"take-screenshot.js","sourceRoot":"","sources":["../../../src/tools/take-screenshot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IACzC,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,mCAAmC,CAAC;IACrE,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,8EAA8E,CAAC;CAClI,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,cAAc,CAAC,KAA2C;IAC5E,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IAEjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,YAAY,EACZ,UAAU,CAAC,QAAQ,EACnB,SAAS,EACT,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,6GAA6G;oBAC7I,QAAQ,EAAE,KAAK,CAAC,QAAQ;oBACxB,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,eAAe,EAAE,KAAK,CAAC,eAAe,IAAI,oBAAoB;oBAC9D,WAAW,EAAE,UAAU,CAAC,KAAK;oBAC7B,YAAY,EAAE,UAAU,CAAC,MAAM;iBAClC,CAAC;aACL;SACJ;KACJ,CAAC;AACN,CAAC"}
|
package/dist/src/tools/tap.js
CHANGED
|
@@ -30,7 +30,7 @@ export async function tap(input) {
|
|
|
30
30
|
{
|
|
31
31
|
type: "text",
|
|
32
32
|
text: JSON.stringify({
|
|
33
|
-
message: `Step ${input.stepIndex} recorded. Expected: "${input.expectedOutcome}". Please
|
|
33
|
+
message: `Step ${input.stepIndex} recorded. Expected: "${input.expectedOutcome}". Please call evaluate to retrieve the first evaluation dimension and its token before proceeding.`,
|
|
34
34
|
caseName: input.caseName,
|
|
35
35
|
stepIndex: input.stepIndex,
|
|
36
36
|
expectedOutcome: input.expectedOutcome,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tap.js","sourceRoot":"","sources":["../../../src/tools/tap.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9B,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,iDAAiD,CAAC;IACvF,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,+CAA+C,CAAC;IACrF,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2CAA2C,CAAC;IAC7E,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,4EAA4E,CAAC;CACrH,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,GAAG,CAAC,KAAgC;IACtD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC;IAClD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QAClB,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,kBAAkB,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC;SAC/E,CAAC;IACN,CAAC;IAED,iDAAiD;IACjD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IAExD,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IACjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,KAAK,EACL,UAAU,CAAC,QAAQ,EACnB,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC,EAAE,EAC1B,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,yBAAyB,KAAK,CAAC,eAAe,
|
|
1
|
+
{"version":3,"file":"tap.js","sourceRoot":"","sources":["../../../src/tools/tap.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,KAAK,MAAM,MAAM,sBAAsB,CAAC;AAC/C,OAAO,EAAE,UAAU,EAAE,MAAM,wBAAwB,CAAC;AAEpD,MAAM,CAAC,MAAM,SAAS,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9B,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,iDAAiD,CAAC;IACvF,CAAC,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,+CAA+C,CAAC;IACrF,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,gCAAgC,CAAC;IACjF,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2CAA2C,CAAC;IAC7E,eAAe,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,4EAA4E,CAAC;CACrH,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,GAAG,CAAC,KAAgC;IACtD,MAAM,MAAM,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC;IAClD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QAClB,OAAO;YACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,kBAAkB,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC;SAC/E,CAAC;IACN,CAAC;IAED,iDAAiD;IACjD,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC,CAAC;IAExD,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,cAAc,EAAE,CAAC;IACjD,UAAU,CACN,KAAK,CAAC,QAAQ,EACd,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,WAAW,EACjB,KAAK,EACL,UAAU,CAAC,QAAQ,EACnB,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,KAAK,CAAC,CAAC,EAAE,EAC1B,KAAK,CAAC,eAAe,CACxB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE;YACL;gBACI,IAAI,EAAE,OAAgB;gBACtB,IAAI,EAAE,UAAU,CAAC,WAAW;gBAC5B,QAAQ,EAAE,WAAoB;aACjC;YACD;gBACI,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;oBACjB,OAAO,EAAE,QAAQ,KAAK,CAAC,SAAS,yBAAyB,KAAK,CAAC,eAAe,qGAAqG;oBACnL,QAAQ,EAAE,KAAK,CAAC,QAAQ;oBACxB,SAAS,EAAE,KAAK,CAAC,SAAS;oBAC1B,eAAe,EAAE,KAAK,CAAC,eAAe;oBACtC,WAAW,EAAE,UAAU,CAAC,KAAK;oBAC7B,YAAY,EAAE,UAAU,CAAC,MAAM;iBAClC,CAAC;aACL;SACJ;KACJ,CAAC;AACN,CAAC"}
|
package/dist/src/types.d.ts
CHANGED
package/dist/src/types.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,aAAa;IAC1B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,cAAc;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,UAAU;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,KAAK,GAAG,OAAO,GAAG,YAAY,CAAC;IAC3C,WAAW,CAAC,EAAE;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACvC,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,aAAa;IAC1B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,MAAM,CAAC;IACb,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;CACxB;AAED,MAAM,WAAW,cAAc;IAC3B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,UAAU;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,KAAK,GAAG,OAAO,GAAG,YAAY,CAAC;IAC3C,WAAW,CAAC,EAAE;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACvC,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAAC;IAC5C,eAAe,EAAE,MAAM,CAAC;IACxB,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,QAAQ;IACrB,QAAQ,EAAE,MAAM,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;CACrC"}
|
package/package.json
CHANGED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
import { z } from "zod";
|
|
2
|
-
export declare const getEvaluationCriteriaSchema: z.ZodObject<{
|
|
3
|
-
dimension: z.ZodOptional<z.ZodString>;
|
|
4
|
-
}, "strip", z.ZodTypeAny, {
|
|
5
|
-
dimension?: string | undefined;
|
|
6
|
-
}, {
|
|
7
|
-
dimension?: string | undefined;
|
|
8
|
-
}>;
|
|
9
|
-
export declare function getEvaluationCriteria(input: z.infer<typeof getEvaluationCriteriaSchema>): Promise<{
|
|
10
|
-
content: {
|
|
11
|
-
type: "text";
|
|
12
|
-
text: string;
|
|
13
|
-
}[];
|
|
14
|
-
}>;
|
|
15
|
-
//# sourceMappingURL=get-evaluation-criteria.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"get-evaluation-criteria.d.ts","sourceRoot":"","sources":["../../../src/tools/get-evaluation-criteria.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAGxB,eAAO,MAAM,2BAA2B;;;;;;EAEtC,CAAC;AAEH,wBAAsB,qBAAqB,CAAC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,OAAO,2BAA2B,CAAC;;;;;GAyB7F"}
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
import { z } from "zod";
|
|
2
|
-
import { DIMENSIONS } from "../evaluation/checklist.js";
|
|
3
|
-
export const getEvaluationCriteriaSchema = z.object({
|
|
4
|
-
dimension: z.string().optional().describe("ID of the dimension (e.g., 'overlap'). Omit to list all available dimensions."),
|
|
5
|
-
});
|
|
6
|
-
export async function getEvaluationCriteria(input) {
|
|
7
|
-
if (!input.dimension) {
|
|
8
|
-
return {
|
|
9
|
-
content: [{
|
|
10
|
-
type: "text",
|
|
11
|
-
text: JSON.stringify({
|
|
12
|
-
availableDimensions: DIMENSIONS.map(d => ({ id: d.id, name: d.name })),
|
|
13
|
-
tip: "Call get_evaluation_criteria with a specific dimension ID to see its scoring rubric."
|
|
14
|
-
}, null, 2),
|
|
15
|
-
}],
|
|
16
|
-
};
|
|
17
|
-
}
|
|
18
|
-
const criteria = DIMENSIONS.find(d => d.id === input.dimension);
|
|
19
|
-
if (!criteria) {
|
|
20
|
-
throw new Error(`Dimension '${input.dimension}' not found. Available dimensions: ${DIMENSIONS.map(d => d.id).join(", ")}`);
|
|
21
|
-
}
|
|
22
|
-
const markdown = `# 审计准则:${criteria.name} [${criteria.id}]\n\n` +
|
|
23
|
-
`## 🧠 核心考察点 (Description)\n${criteria.description}\n\n` +
|
|
24
|
-
`## 📊 评分细则 (Scoring Guide)\n${criteria.scoringGuide}\n`;
|
|
25
|
-
return {
|
|
26
|
-
content: [{ type: "text", text: markdown }],
|
|
27
|
-
};
|
|
28
|
-
}
|
|
29
|
-
//# sourceMappingURL=get-evaluation-criteria.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"get-evaluation-criteria.js","sourceRoot":"","sources":["../../../src/tools/get-evaluation-criteria.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,UAAU,EAAE,MAAM,4BAA4B,CAAC;AAExD,MAAM,CAAC,MAAM,2BAA2B,GAAG,CAAC,CAAC,MAAM,CAAC;IAChD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,+EAA+E,CAAC;CAC7H,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,qBAAqB,CAAC,KAAkD;IAC1F,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;QACnB,OAAO;YACH,OAAO,EAAE,CAAC;oBACN,IAAI,EAAE,MAAe;oBACrB,IAAI,EAAE,IAAI,CAAC,SAAS,CAAC;wBACjB,mBAAmB,EAAE,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC;wBACtE,GAAG,EAAE,sFAAsF;qBAC9F,EAAE,IAAI,EAAE,CAAC,CAAC;iBACd,CAAC;SACL,CAAC;IACN,CAAC;IAED,MAAM,QAAQ,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,KAAK,KAAK,CAAC,SAAS,CAAC,CAAC;IAChE,IAAI,CAAC,QAAQ,EAAE,CAAC;QACZ,MAAM,IAAI,KAAK,CAAC,cAAc,KAAK,CAAC,SAAS,sCAAsC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC/H,CAAC;IAED,MAAM,QAAQ,GAAG,UAAU,QAAQ,CAAC,IAAI,KAAK,QAAQ,CAAC,EAAE,OAAO;QAC3D,8BAA8B,QAAQ,CAAC,WAAW,MAAM;QACxD,+BAA+B,QAAQ,CAAC,YAAY,IAAI,CAAC;IAE7D,OAAO;QACH,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,MAAe,EAAE,IAAI,EAAE,QAAQ,EAAE,CAAC;KACvD,CAAC;AACN,CAAC"}
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { z } from "zod";
|
|
2
|
-
export declare const submitDimensionScoreSchema: z.ZodObject<{
|
|
3
|
-
caseName: z.ZodString;
|
|
4
|
-
stepIndex: z.ZodNumber;
|
|
5
|
-
dimension: z.ZodString;
|
|
6
|
-
score: z.ZodNumber;
|
|
7
|
-
reason: z.ZodString;
|
|
8
|
-
}, "strip", z.ZodTypeAny, {
|
|
9
|
-
caseName: string;
|
|
10
|
-
stepIndex: number;
|
|
11
|
-
dimension: string;
|
|
12
|
-
score: number;
|
|
13
|
-
reason: string;
|
|
14
|
-
}, {
|
|
15
|
-
caseName: string;
|
|
16
|
-
stepIndex: number;
|
|
17
|
-
dimension: string;
|
|
18
|
-
score: number;
|
|
19
|
-
reason: string;
|
|
20
|
-
}>;
|
|
21
|
-
export declare function submitDimensionScore(input: z.infer<typeof submitDimensionScoreSchema>): Promise<{
|
|
22
|
-
content: {
|
|
23
|
-
type: "text";
|
|
24
|
-
text: string;
|
|
25
|
-
}[];
|
|
26
|
-
}>;
|
|
27
|
-
//# sourceMappingURL=submit-dimension-score.d.ts.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"submit-dimension-score.d.ts","sourceRoot":"","sources":["../../../src/tools/submit-dimension-score.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB,eAAO,MAAM,0BAA0B;;;;;;;;;;;;;;;;;;EAMrC,CAAC;AAEH,wBAAsB,oBAAoB,CAAC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,OAAO,0BAA0B,CAAC;;;;;GA0B3F"}
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
import { z } from "zod";
|
|
2
|
-
import { recordScore, readLog } from "../logger/audit-log.js";
|
|
3
|
-
import { REQUIRED_DIMS } from "../evaluation/checklist.js";
|
|
4
|
-
export const submitDimensionScoreSchema = z.object({
|
|
5
|
-
caseName: z.string().describe("Name of the test case"),
|
|
6
|
-
stepIndex: z.number().int().positive().describe("Step index this evaluation belongs to"),
|
|
7
|
-
dimension: z.string().describe(`The dimension ID (must be one of: ${REQUIRED_DIMS.join(", ")})`),
|
|
8
|
-
score: z.number().int().min(0).max(10).describe("Score from 0 (worst) to 10 (perfect)"),
|
|
9
|
-
reason: z.string().describe("Analysis and reason for this score"),
|
|
10
|
-
});
|
|
11
|
-
export async function submitDimensionScore(input) {
|
|
12
|
-
if (!REQUIRED_DIMS.includes(input.dimension)) {
|
|
13
|
-
throw new Error(`Invalid dimension '${input.dimension}'. Must be one of: ${REQUIRED_DIMS.join(", ")}`);
|
|
14
|
-
}
|
|
15
|
-
recordScore(input.caseName, input.stepIndex, input.dimension, input.score, input.reason);
|
|
16
|
-
// After writing, calculate remaining dimensions for this step to provide a helpful hint
|
|
17
|
-
const log = readLog(input.caseName);
|
|
18
|
-
const step = log?.steps[input.stepIndex];
|
|
19
|
-
const evaluated = step ? Object.keys(step.evaluations || {}) : [];
|
|
20
|
-
const remaining = REQUIRED_DIMS.filter(d => !evaluated.includes(d));
|
|
21
|
-
let hint;
|
|
22
|
-
if (remaining.length === 0) {
|
|
23
|
-
hint = `All 5 dimensions for Step ${input.stepIndex} are complete. You may call get_audit_status to review progress or proceed to the next action.`;
|
|
24
|
-
}
|
|
25
|
-
else {
|
|
26
|
-
hint = `${remaining.length} dimension(s) still needed for Step ${input.stepIndex}: ${remaining.join(", ")}. Please evaluate them before moving to the next action.`;
|
|
27
|
-
}
|
|
28
|
-
return {
|
|
29
|
-
content: [{
|
|
30
|
-
type: "text",
|
|
31
|
-
text: `Recorded [${input.dimension}] for ${input.caseName} Step ${input.stepIndex} — Score: ${input.score}. ${hint}`
|
|
32
|
-
}]
|
|
33
|
-
};
|
|
34
|
-
}
|
|
35
|
-
//# sourceMappingURL=submit-dimension-score.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"submit-dimension-score.js","sourceRoot":"","sources":["../../../src/tools/submit-dimension-score.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,WAAW,EAAE,OAAO,EAAE,MAAM,wBAAwB,CAAC;AAC9D,OAAO,EAAE,aAAa,EAAE,MAAM,4BAA4B,CAAC;AAE3D,MAAM,CAAC,MAAM,0BAA0B,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/C,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,uBAAuB,CAAC;IACtD,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,uCAAuC,CAAC;IACxF,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,qCAAqC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC;IAChG,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,QAAQ,CAAC,sCAAsC,CAAC;IACvF,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,oCAAoC,CAAC;CACpE,CAAC,CAAC;AAEH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CAAC,KAAiD;IACxF,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,KAAK,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3C,MAAM,IAAI,KAAK,CAAC,sBAAsB,KAAK,CAAC,SAAS,sBAAsB,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC3G,CAAC;IAED,WAAW,CAAC,KAAK,CAAC,QAAQ,EAAE,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,KAAK,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;IAEzF,wFAAwF;IACxF,MAAM,GAAG,GAAG,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;IACpC,MAAM,IAAI,GAAG,GAAG,EAAE,KAAK,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC;IACzC,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAClE,MAAM,SAAS,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC;IAEpE,IAAI,IAAY,CAAC;IACjB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACzB,IAAI,GAAG,6BAA6B,KAAK,CAAC,SAAS,gGAAgG,CAAC;IACxJ,CAAC;SAAM,CAAC;QACJ,IAAI,GAAG,GAAG,SAAS,CAAC,MAAM,uCAAuC,KAAK,CAAC,SAAS,KAAK,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,0DAA0D,CAAC;IACxK,CAAC;IAED,OAAO;QACH,OAAO,EAAE,CAAC;gBACN,IAAI,EAAE,MAAe;gBACrB,IAAI,EAAE,aAAa,KAAK,CAAC,SAAS,SAAS,KAAK,CAAC,QAAQ,SAAS,KAAK,CAAC,SAAS,aAAa,KAAK,CAAC,KAAK,KAAK,IAAI,EAAE;aACvH,CAAC;KACL,CAAC;AACN,CAAC"}
|