@rudderhq/agent-runtime-gemini-local 0.2.1 → 0.2.2-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/package.json +2 -2
  2. package/skills/conversation-to-skill/LICENSE.txt +202 -0
  3. package/skills/conversation-to-skill/SKILL.md +428 -0
  4. package/skills/conversation-to-skill/agents/analyzer.md +274 -0
  5. package/skills/conversation-to-skill/agents/comparator.md +202 -0
  6. package/skills/conversation-to-skill/agents/grader.md +223 -0
  7. package/skills/conversation-to-skill/assets/eval_review.html +146 -0
  8. package/skills/conversation-to-skill/eval-viewer/generate_review.py +471 -0
  9. package/skills/conversation-to-skill/eval-viewer/viewer.html +1325 -0
  10. package/skills/conversation-to-skill/references/compatibility.md +36 -0
  11. package/skills/conversation-to-skill/references/description-optimization.md +113 -0
  12. package/skills/conversation-to-skill/references/evaluation-suite.md +410 -0
  13. package/skills/conversation-to-skill/references/schemas.md +431 -0
  14. package/skills/conversation-to-skill/scripts/__init__.py +0 -0
  15. package/skills/conversation-to-skill/scripts/aggregate_benchmark.py +401 -0
  16. package/skills/conversation-to-skill/scripts/generate_report.py +335 -0
  17. package/skills/conversation-to-skill/scripts/improve_description.py +197 -0
  18. package/skills/conversation-to-skill/scripts/model_backends.py +115 -0
  19. package/skills/conversation-to-skill/scripts/package_skill.py +136 -0
  20. package/skills/conversation-to-skill/scripts/quick_validate.py +103 -0
  21. package/skills/conversation-to-skill/scripts/run_eval.py +363 -0
  22. package/skills/conversation-to-skill/scripts/run_loop.py +319 -0
  23. package/skills/conversation-to-skill/scripts/utils.py +223 -0
  24. package/skills/rudder/references/organization-skills.md +1 -1
  25. package/skills/skill-creator/SKILL.md +9 -0
  26. package/skills/skill-optimizer/CHANGELOG.md +29 -0
  27. package/skills/skill-optimizer/SKILL.md +205 -0
  28. package/skills/skill-optimizer/references/adapters/creative-brand-content.md +30 -0
  29. package/skills/skill-optimizer/references/adapters/customer-support-sales.md +30 -0
  30. package/skills/skill-optimizer/references/adapters/document-data-processing.md +31 -0
  31. package/skills/skill-optimizer/references/adapters/education-training.md +31 -0
  32. package/skills/skill-optimizer/references/adapters/finance-accounting.md +31 -0
  33. package/skills/skill-optimizer/references/adapters/healthcare-operations.md +30 -0
  34. package/skills/skill-optimizer/references/adapters/hr-people-ops.md +31 -0
  35. package/skills/skill-optimizer/references/adapters/legal-compliance.md +31 -0
  36. package/skills/skill-optimizer/references/adapters/operations-supply-chain.md +31 -0
  37. package/skills/skill-optimizer/references/adapters/personal-productivity.md +29 -0
  38. package/skills/skill-optimizer/references/adapters/research-knowledge.md +31 -0
  39. package/skills/skill-optimizer/references/adapters/software-ai.md +31 -0
  40. package/skills/skill-optimizer/references/domain-adapter-patterns.md +66 -0
  41. package/skills/skill-optimizer/references/eval-method.md +17 -0
  42. package/skills/skill-optimizer/references/universal-optimization-lens.md +73 -0
@@ -0,0 +1,431 @@
1
+ # JSON Schemas
2
+
3
+ This document defines the JSON schemas used by the bundled skill evaluation
4
+ toolchain for `conversation-to-skill`.
5
+
6
+ ---
7
+
8
+ ## evals.json
9
+
10
+ Defines the evals for a skill. Located at `evals/evals.json` within the skill directory.
11
+
12
+ ```json
13
+ {
14
+ "skill_name": "example-skill",
15
+ "evals": [
16
+ {
17
+ "id": 1,
18
+ "prompt": "User's example prompt",
19
+ "expected_output": "Description of expected result",
20
+ "files": ["evals/files/sample1.pdf"],
21
+ "expectations": [
22
+ "The output includes X",
23
+ "The skill used script Y"
24
+ ]
25
+ }
26
+ ]
27
+ }
28
+ ```
29
+
30
+ **Fields:**
31
+ - `skill_name`: Name matching the skill's frontmatter
32
+ - `evals[].id`: Unique integer identifier
33
+ - `evals[].prompt`: The task to execute
34
+ - `evals[].expected_output`: Human-readable description of success
35
+ - `evals[].files`: Optional list of input file paths (relative to skill root)
36
+ - `evals[].expectations`: List of verifiable statements
37
+
38
+ ---
39
+
40
+ ## history.json
41
+
42
+ Tracks version progression in Improve mode. Located at workspace root.
43
+
44
+ ```json
45
+ {
46
+ "started_at": "2026-01-15T10:30:00Z",
47
+ "skill_name": "pdf",
48
+ "current_best": "v2",
49
+ "iterations": [
50
+ {
51
+ "version": "v0",
52
+ "parent": null,
53
+ "expectation_pass_rate": 0.65,
54
+ "grading_result": "baseline",
55
+ "is_current_best": false
56
+ },
57
+ {
58
+ "version": "v1",
59
+ "parent": "v0",
60
+ "expectation_pass_rate": 0.75,
61
+ "grading_result": "won",
62
+ "is_current_best": false
63
+ },
64
+ {
65
+ "version": "v2",
66
+ "parent": "v1",
67
+ "expectation_pass_rate": 0.85,
68
+ "grading_result": "won",
69
+ "is_current_best": true
70
+ }
71
+ ]
72
+ }
73
+ ```
74
+
75
+ **Fields:**
76
+ - `started_at`: ISO 8601 timestamp (UTC) of when the improvement run started
77
+ - `skill_name`: Name of the skill being improved
78
+ - `current_best`: Version identifier of the best performer
79
+ - `iterations[].version`: Version identifier (v0, v1, ...)
80
+ - `iterations[].parent`: Parent version this was derived from
81
+ - `iterations[].expectation_pass_rate`: Pass rate from grading
82
+ - `iterations[].grading_result`: "baseline", "won", "lost", or "tie"
83
+ - `iterations[].is_current_best`: Whether this is the current best version
84
+
85
+ ---
86
+
87
+ ## grading.json
88
+
89
+ Output from the grader agent. Located at `<run-dir>/grading.json`.
90
+
91
+ ```json
92
+ {
93
+ "expectations": [
94
+ {
95
+ "text": "The output includes the name 'John Smith'",
96
+ "passed": true,
97
+ "evidence": "Found in transcript Step 3: 'Extracted names: John Smith, Sarah Johnson'"
98
+ },
99
+ {
100
+ "text": "The spreadsheet has a SUM formula in cell B10",
101
+ "passed": false,
102
+ "evidence": "No spreadsheet was created. The output was a text file."
103
+ }
104
+ ],
105
+ "summary": {
106
+ "passed": 2,
107
+ "failed": 1,
108
+ "total": 3,
109
+ "pass_rate": 0.67
110
+ },
111
+ "execution_metrics": {
112
+ "tool_calls": {
113
+ "Read": 5,
114
+ "Write": 2,
115
+ "Bash": 8
116
+ },
117
+ "total_tool_calls": 15,
118
+ "total_steps": 6,
119
+ "errors_encountered": 0,
120
+ "output_chars": 12450,
121
+ "transcript_chars": 3200
122
+ },
123
+ "timing": {
124
+ "executor_duration_seconds": 165.0,
125
+ "grader_duration_seconds": 26.0,
126
+ "total_duration_seconds": 191.0
127
+ },
128
+ "claims": [
129
+ {
130
+ "claim": "The form has 12 fillable fields",
131
+ "type": "factual",
132
+ "verified": true,
133
+ "evidence": "Counted 12 fields in field_info.json"
134
+ }
135
+ ],
136
+ "user_notes_summary": {
137
+ "uncertainties": ["Used 2023 data, may be stale"],
138
+ "needs_review": [],
139
+ "workarounds": ["Fell back to text overlay for non-fillable fields"]
140
+ },
141
+ "eval_feedback": {
142
+ "suggestions": [
143
+ {
144
+ "assertion": "The output includes the name 'John Smith'",
145
+ "reason": "A hallucinated document that mentions the name would also pass"
146
+ }
147
+ ],
148
+ "overall": "Assertions check presence but not correctness."
149
+ }
150
+ }
151
+ ```
152
+
153
+ **Fields:**
154
+ - `expectations[]`: Graded expectations with evidence
155
+ - `summary`: Aggregate pass/fail counts
156
+ - `execution_metrics`: Tool usage and output size (from executor's metrics.json)
157
+ - `timing`: Wall clock timing (from timing.json)
158
+ - `claims`: Extracted and verified claims from the output
159
+ - `user_notes_summary`: Issues flagged by the executor
160
+ - `eval_feedback`: (optional) Improvement suggestions for the evals, only present when the grader identifies issues worth raising
161
+
162
+ ---
163
+
164
+ ## metrics.json
165
+
166
+ Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
167
+
168
+ ```json
169
+ {
170
+ "tool_calls": {
171
+ "Read": 5,
172
+ "Write": 2,
173
+ "Bash": 8,
174
+ "Edit": 1,
175
+ "Glob": 2,
176
+ "Grep": 0
177
+ },
178
+ "total_tool_calls": 18,
179
+ "total_steps": 6,
180
+ "files_created": ["filled_form.pdf", "field_values.json"],
181
+ "errors_encountered": 0,
182
+ "output_chars": 12450,
183
+ "transcript_chars": 3200
184
+ }
185
+ ```
186
+
187
+ **Fields:**
188
+ - `tool_calls`: Count per tool type
189
+ - `total_tool_calls`: Sum of all tool calls
190
+ - `total_steps`: Number of major execution steps
191
+ - `files_created`: List of output files created
192
+ - `errors_encountered`: Number of errors during execution
193
+ - `output_chars`: Total character count of output files
194
+ - `transcript_chars`: Character count of transcript
195
+
196
+ ---
197
+
198
+ ## timing.json
199
+
200
+ Wall clock timing for a run. Located at `<run-dir>/timing.json`.
201
+
202
+ **How to capture:** When a subagent task completes, the task notification includes `total_tokens` and `duration_ms`. Save these immediately — they are not persisted anywhere else and cannot be recovered after the fact.
203
+
204
+ ```json
205
+ {
206
+ "total_tokens": 84852,
207
+ "duration_ms": 23332,
208
+ "total_duration_seconds": 23.3,
209
+ "executor_start": "2026-01-15T10:30:00Z",
210
+ "executor_end": "2026-01-15T10:32:45Z",
211
+ "executor_duration_seconds": 165.0,
212
+ "grader_start": "2026-01-15T10:32:46Z",
213
+ "grader_end": "2026-01-15T10:33:12Z",
214
+ "grader_duration_seconds": 26.0
215
+ }
216
+ ```
217
+
218
+ ---
219
+
220
+ ## benchmark.json
221
+
222
+ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
223
+
224
+ ```json
225
+ {
226
+ "metadata": {
227
+ "skill_name": "pdf",
228
+ "skill_path": "/path/to/pdf",
229
+ "executor_model": "claude-sonnet-4-20250514",
230
+ "analyzer_model": "most-capable-model",
231
+ "timestamp": "2026-01-15T10:30:00Z",
232
+ "evals_run": [1, 2, 3],
233
+ "runs_per_configuration": 3
234
+ },
235
+
236
+ "runs": [
237
+ {
238
+ "eval_id": 1,
239
+ "eval_name": "Ocean",
240
+ "configuration": "with_skill",
241
+ "run_number": 1,
242
+ "result": {
243
+ "pass_rate": 0.85,
244
+ "passed": 6,
245
+ "failed": 1,
246
+ "total": 7,
247
+ "time_seconds": 42.5,
248
+ "tokens": 3800,
249
+ "tool_calls": 18,
250
+ "errors": 0
251
+ },
252
+ "expectations": [
253
+ {"text": "...", "passed": true, "evidence": "..."}
254
+ ],
255
+ "notes": [
256
+ "Used 2023 data, may be stale",
257
+ "Fell back to text overlay for non-fillable fields"
258
+ ]
259
+ }
260
+ ],
261
+
262
+ "run_summary": {
263
+ "with_skill": {
264
+ "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
265
+ "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
266
+ "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
267
+ },
268
+ "without_skill": {
269
+ "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
270
+ "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
271
+ "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
272
+ },
273
+ "delta": {
274
+ "pass_rate": "+0.50",
275
+ "time_seconds": "+13.0",
276
+ "tokens": "+1700"
277
+ }
278
+ },
279
+
280
+ "notes": [
281
+ "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value",
282
+ "Eval 3 shows high variance (50% ± 40%) - may be flaky or model-dependent",
283
+ "Without-skill runs consistently fail on table extraction expectations",
284
+ "Skill adds 13s average execution time but improves pass rate by 50%"
285
+ ]
286
+ }
287
+ ```
288
+
289
+ **Fields:**
290
+ - `metadata`: Information about the benchmark run (the example also records `skill_path`, `executor_model`, and `analyzer_model`)
291
+ - `skill_name`: Name of the skill
292
+ - `timestamp`: When the benchmark was run
293
+ - `evals_run`: List of eval names or IDs
294
+ - `runs_per_configuration`: Number of runs per config (e.g. 3)
295
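+ - `runs[]`: Individual run results; besides the fields below, each run carries an `expectations` array of graded expectations and a `notes` array of freeform strings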
296
+ - `eval_id`: Numeric eval identifier
297
+ - `eval_name`: Human-readable eval name (used as section header in the viewer)
298
+ - `configuration`: Must be `"with_skill"` or `"without_skill"` (the viewer uses this exact string for grouping and color coding)
299
+ - `run_number`: Integer run number (1, 2, 3...)
300
+ - `result`: Nested object with `pass_rate`, `passed`, `failed`, `total`, `time_seconds`, `tokens`, `tool_calls`, `errors`
301
+ - `run_summary`: Statistical aggregates per configuration
302
+ - `with_skill` / `without_skill`: Each contains `pass_rate`, `time_seconds`, `tokens` objects with `mean`, `stddev`, `min`, and `max` fields
303
+ - `delta`: Difference strings like `"+0.50"`, `"+13.0"`, `"+1700"`
304
+ - `notes`: Freeform observations from the analyzer
305
+
306
+ **Important:** The viewer reads these field names exactly. Using `config` instead of `configuration`, or putting `pass_rate` at the top level of a run instead of nested under `result`, will cause the viewer to show empty/zero values. Always reference this schema when generating benchmark.json manually.
307
+
308
+ ---
309
+
310
+ ## comparison.json
311
+
312
+ Output from blind comparator. Located at `<grading-dir>/comparison-N.json`.
313
+
314
+ ```json
315
+ {
316
+ "winner": "A",
317
+ "reasoning": "Output A provides a complete solution with proper formatting and all required fields. Output B is missing the date field and has formatting inconsistencies.",
318
+ "rubric": {
319
+ "A": {
320
+ "content": {
321
+ "correctness": 5,
322
+ "completeness": 5,
323
+ "accuracy": 4
324
+ },
325
+ "structure": {
326
+ "organization": 4,
327
+ "formatting": 5,
328
+ "usability": 4
329
+ },
330
+ "content_score": 4.7,
331
+ "structure_score": 4.3,
332
+ "overall_score": 9.0
333
+ },
334
+ "B": {
335
+ "content": {
336
+ "correctness": 3,
337
+ "completeness": 2,
338
+ "accuracy": 3
339
+ },
340
+ "structure": {
341
+ "organization": 3,
342
+ "formatting": 2,
343
+ "usability": 3
344
+ },
345
+ "content_score": 2.7,
346
+ "structure_score": 2.7,
347
+ "overall_score": 5.4
348
+ }
349
+ },
350
+ "output_quality": {
351
+ "A": {
352
+ "score": 9,
353
+ "strengths": ["Complete solution", "Well-formatted", "All fields present"],
354
+ "weaknesses": ["Minor style inconsistency in header"]
355
+ },
356
+ "B": {
357
+ "score": 5,
358
+ "strengths": ["Readable output", "Correct basic structure"],
359
+ "weaknesses": ["Missing date field", "Formatting inconsistencies", "Partial data extraction"]
360
+ }
361
+ },
362
+ "expectation_results": {
363
+ "A": {
364
+ "passed": 4,
365
+ "total": 5,
366
+ "pass_rate": 0.80,
367
+ "details": [
368
+ {"text": "Output includes name", "passed": true}
369
+ ]
370
+ },
371
+ "B": {
372
+ "passed": 3,
373
+ "total": 5,
374
+ "pass_rate": 0.60,
375
+ "details": [
376
+ {"text": "Output includes name", "passed": true}
377
+ ]
378
+ }
379
+ }
380
+ }
381
+ ```
382
+
383
+ ---
384
+
385
+ ## analysis.json
386
+
387
+ Output from post-hoc analyzer. Located at `<grading-dir>/analysis.json`.
388
+
389
+ ```json
390
+ {
391
+ "comparison_summary": {
392
+ "winner": "A",
393
+ "winner_skill": "path/to/winner/skill",
394
+ "loser_skill": "path/to/loser/skill",
395
+ "comparator_reasoning": "Brief summary of why comparator chose winner"
396
+ },
397
+ "winner_strengths": [
398
+ "Clear step-by-step instructions for handling multi-page documents",
399
+ "Included validation script that caught formatting errors"
400
+ ],
401
+ "loser_weaknesses": [
402
+ "Vague instruction 'process the document appropriately' led to inconsistent behavior",
403
+ "No script for validation, agent had to improvise"
404
+ ],
405
+ "instruction_following": {
406
+ "winner": {
407
+ "score": 9,
408
+ "issues": ["Minor: skipped optional logging step"]
409
+ },
410
+ "loser": {
411
+ "score": 6,
412
+ "issues": [
413
+ "Did not use the skill's formatting template",
414
+ "Invented own approach instead of following step 3"
415
+ ]
416
+ }
417
+ },
418
+ "improvement_suggestions": [
419
+ {
420
+ "priority": "high",
421
+ "category": "instructions",
422
+ "suggestion": "Replace 'process the document appropriately' with explicit steps",
423
+ "expected_impact": "Would eliminate ambiguity that caused inconsistent behavior"
424
+ }
425
+ ],
426
+ "transcript_insights": {
427
+ "winner_execution_pattern": "Read skill -> Followed 5-step process -> Used validation script",
428
+ "loser_execution_pattern": "Read skill -> Unclear on approach -> Tried 3 different methods"
429
+ }
430
+ }
431
+ ```