@botlearn/code-gen 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +35 -0
- package/knowledge/anti-patterns.md +212 -0
- package/knowledge/best-practices.md +228 -0
- package/knowledge/domain.md +344 -0
- package/manifest.json +26 -0
- package/package.json +35 -0
- package/skill.md +43 -0
- package/strategies/main.md +114 -0
- package/tests/benchmark.json +536 -0
- package/tests/smoke.json +64 -0
|
@@ -0,0 +1,536 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.0.1",
|
|
3
|
+
"dimension": "code-generation",
|
|
4
|
+
"tasks": [
|
|
5
|
+
{
|
|
6
|
+
"id": "bench-easy-01",
|
|
7
|
+
"difficulty": "easy",
|
|
8
|
+
"description": "Generate a utility function with types and edge case handling",
|
|
9
|
+
"input": "Write a TypeScript utility function called `deepMerge` that recursively merges two objects. It should handle nested objects, arrays (concatenate by default, with an option to replace), null/undefined values, and circular reference detection. Include full TypeScript generics so the return type reflects the merged shape. Include unit tests.",
|
|
10
|
+
"rubric": [
|
|
11
|
+
{
|
|
12
|
+
"criterion": "Type Safety",
|
|
13
|
+
"weight": 0.3,
|
|
14
|
+
"scoring": {
|
|
15
|
+
"5": "Uses TypeScript generics with proper constraints; return type correctly reflects merged shape; handles `undefined` and `null` in the type system",
|
|
16
|
+
"3": "Has generics but return type is imprecise (e.g., `Record<string, unknown>`)",
|
|
17
|
+
"1": "No generics; uses `any` for input/output",
|
|
18
|
+
"0": "No type annotations"
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"criterion": "Edge Case Handling",
|
|
23
|
+
"weight": 0.3,
|
|
24
|
+
"scoring": {
|
|
25
|
+
"5": "Handles nested objects, arrays (concat/replace option), null, undefined, circular references with detection, and non-object inputs",
|
|
26
|
+
"3": "Handles nested objects and arrays but misses circular references or null handling",
|
|
27
|
+
"1": "Only handles flat objects; breaks on nested or null input",
|
|
28
|
+
"0": "Does not handle any edge cases"
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"criterion": "Test Quality",
|
|
33
|
+
"weight": 0.25,
|
|
34
|
+
"scoring": {
|
|
35
|
+
"5": "Tests cover: flat merge, deep nesting, array concat, array replace option, null values, circular reference detection, non-object input error",
|
|
36
|
+
"3": "Tests cover flat and nested merge but miss edge cases",
|
|
37
|
+
"1": "Single test with trivial input",
|
|
38
|
+
"0": "No tests"
|
|
39
|
+
}
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"criterion": "Code Quality",
|
|
43
|
+
"weight": 0.15,
|
|
44
|
+
"scoring": {
|
|
45
|
+
"5": "Clean recursive implementation; small focused functions; JSDoc; no mutation of input objects",
|
|
46
|
+
"3": "Working implementation but mutates inputs or has long functions",
|
|
47
|
+
"1": "Fragile implementation with obvious bugs",
|
|
48
|
+
"0": "Non-functional code"
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
],
|
|
52
|
+
"expectedScoreWithout": 20,
|
|
53
|
+
"expectedScoreWith": 70
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
"id": "bench-easy-02",
|
|
57
|
+
"difficulty": "easy",
|
|
58
|
+
"description": "Generate a typed API endpoint with full validation and error handling",
|
|
59
|
+
"input": "Write a Node.js/Express API endpoint `PATCH /api/products/:id` that partially updates a product. The updatable fields are: name (string, 1-500 chars), price (number, positive, max 2 decimal places), category (enum: electronics, clothing, food, books, other), and tags (array of strings, max 10 tags, each tag 1-50 chars). Use Zod for validation, return proper HTTP codes, and include tests.",
|
|
60
|
+
"rubric": [
|
|
61
|
+
{
|
|
62
|
+
"criterion": "Input Validation",
|
|
63
|
+
"weight": 0.3,
|
|
64
|
+
"scoring": {
|
|
65
|
+
"5": "Uses Zod schemas for all fields with precise constraints (char limits, decimal places, enum, array length); validates partial update (all fields optional but at least one required); validates the `:id` path param format (e.g., as a UUID)",
|
|
66
|
+
"3": "Validates most fields but misses some constraints (e.g., decimal places on price, tag length)",
|
|
67
|
+
"1": "Basic presence checks only",
|
|
68
|
+
"0": "No validation"
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"criterion": "Error Handling",
|
|
73
|
+
"weight": 0.25,
|
|
74
|
+
"scoring": {
|
|
75
|
+
"5": "Distinct handling for: validation error (400 with field-level details), not found (404), conflict (409), database error (500); proper error response schema",
|
|
76
|
+
"3": "Handles 400 and 404 but uses generic 500 for other cases",
|
|
77
|
+
"1": "Single catch-all error handler",
|
|
78
|
+
"0": "No error handling"
|
|
79
|
+
}
|
|
80
|
+
},
|
|
81
|
+
{
|
|
82
|
+
"criterion": "Type Safety",
|
|
83
|
+
"weight": 0.2,
|
|
84
|
+
"scoring": {
|
|
85
|
+
"5": "Types derived from Zod schema; typed request handler; typed repository interface; no `any`",
|
|
86
|
+
"3": "Has types but some derived manually (risk of schema/type mismatch)",
|
|
87
|
+
"1": "Minimal typing",
|
|
88
|
+
"0": "Untyped"
|
|
89
|
+
}
|
|
90
|
+
},
|
|
91
|
+
{
|
|
92
|
+
"criterion": "Test Coverage",
|
|
93
|
+
"weight": 0.25,
|
|
94
|
+
"scoring": {
|
|
95
|
+
"5": "Tests: valid partial update, empty body (400), invalid field values, product not found (404), concurrent update conflict, includes mock setup",
|
|
96
|
+
"3": "Tests happy path and one error case",
|
|
97
|
+
"1": "Single test",
|
|
98
|
+
"0": "No tests"
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
],
|
|
102
|
+
"expectedScoreWithout": 20,
|
|
103
|
+
"expectedScoreWith": 70
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"id": "bench-easy-03",
|
|
107
|
+
"difficulty": "easy",
|
|
108
|
+
"description": "Generate a data transformation pipeline with proper types",
|
|
109
|
+
"input": "Write a Python function that reads a CSV file, validates each row against a schema (name: non-empty string, age: int 0-150, email: valid format, salary: positive float), collects validation errors per row without stopping, transforms valid rows (normalize name to title case, round salary to 2 decimals), and returns a dataclass result with valid records, error records with row numbers and error details, and summary statistics (total, valid, invalid counts). Use type hints throughout. Include tests with a sample CSV.",
|
|
110
|
+
"rubric": [
|
|
111
|
+
{
|
|
112
|
+
"criterion": "Type Annotations",
|
|
113
|
+
"weight": 0.25,
|
|
114
|
+
"scoring": {
|
|
115
|
+
"5": "Full type hints including dataclasses for input/output, TypedDict or dataclass for rows, typed error records, generic where applicable",
|
|
116
|
+
"3": "Type hints on function signatures but internal variables untyped",
|
|
117
|
+
"1": "Sparse type hints",
|
|
118
|
+
"0": "No type hints"
|
|
119
|
+
}
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"criterion": "Validation & Error Collection",
|
|
123
|
+
"weight": 0.3,
|
|
124
|
+
"scoring": {
|
|
125
|
+
"5": "Validates all fields per schema; collects all errors per row (not just first); includes row number and field name in error detail; continues processing after errors",
|
|
126
|
+
"3": "Validates most fields but stops at first error per row or has vague error messages",
|
|
127
|
+
"1": "Minimal validation; raises on first error",
|
|
128
|
+
"0": "No validation"
|
|
129
|
+
}
|
|
130
|
+
},
|
|
131
|
+
{
|
|
132
|
+
"criterion": "Data Transformation",
|
|
133
|
+
"weight": 0.2,
|
|
134
|
+
"scoring": {
|
|
135
|
+
"5": "Correctly normalizes names, rounds salaries, handles edge cases (extra whitespace, case variations); transformation is a separate function from validation",
|
|
136
|
+
"3": "Applies transformations but mixed into validation logic",
|
|
137
|
+
"1": "Partial transformation",
|
|
138
|
+
"0": "No transformation"
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"criterion": "Test Quality",
|
|
143
|
+
"weight": 0.25,
|
|
144
|
+
"scoring": {
|
|
145
|
+
"5": "Tests with sample CSV: all valid rows, mixed valid/invalid, all invalid, empty file, malformed CSV; verifies error details and statistics",
|
|
146
|
+
"3": "Tests happy path and one error scenario",
|
|
147
|
+
"1": "Single trivial test",
|
|
148
|
+
"0": "No tests"
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
],
|
|
152
|
+
"expectedScoreWithout": 20,
|
|
153
|
+
"expectedScoreWith": 70
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
"id": "bench-med-01",
|
|
157
|
+
"difficulty": "medium",
|
|
158
|
+
"description": "Generate a finite state machine with typed states and transitions",
|
|
159
|
+
"input": "Implement a TypeScript state machine for an order lifecycle with states: draft, pending_payment, paid, processing, shipped, delivered, cancelled, refunded. Define valid transitions (e.g., draft->pending_payment, pending_payment->paid, pending_payment->cancelled, paid->processing, processing->shipped, shipped->delivered, delivered->refunded, plus a transition to cancelled from any state other than delivered or refunded). Each transition should have a guard function that receives the order context and returns whether the transition is allowed. The state machine should emit typed events on transitions, maintain a transition history log, and reject invalid transitions with descriptive errors. Include comprehensive tests for all transitions and guard conditions.",
|
|
160
|
+
"rubric": [
|
|
161
|
+
{
|
|
162
|
+
"criterion": "State Modeling",
|
|
163
|
+
"weight": 0.25,
|
|
164
|
+
"scoring": {
|
|
165
|
+
"5": "Uses discriminated unions for states; each state carries relevant typed data; impossible states are unrepresentable; transitions are typed as a map",
|
|
166
|
+
"3": "States modeled as string enum; transitions defined but data per state is loosely typed",
|
|
167
|
+
"1": "States as plain strings with no transition constraints",
|
|
168
|
+
"0": "No formal state model"
|
|
169
|
+
}
|
|
170
|
+
},
|
|
171
|
+
{
|
|
172
|
+
"criterion": "Transition Logic",
|
|
173
|
+
"weight": 0.25,
|
|
174
|
+
"scoring": {
|
|
175
|
+
"5": "All specified transitions implemented with typed guard functions; invalid transitions rejected with descriptive error (current state, attempted transition, valid transitions listed); transition history tracked with timestamps",
|
|
176
|
+
"3": "Valid transitions enforced but guards are missing or history is incomplete",
|
|
177
|
+
"1": "Some transitions work but invalid transitions are not properly rejected",
|
|
178
|
+
"0": "No transition enforcement"
|
|
179
|
+
}
|
|
180
|
+
},
|
|
181
|
+
{
|
|
182
|
+
"criterion": "Event System",
|
|
183
|
+
"weight": 0.2,
|
|
184
|
+
"scoring": {
|
|
185
|
+
"5": "Typed event emitter with discriminated event types per transition; supports subscribe/unsubscribe; events include before/after state and context",
|
|
186
|
+
"3": "Events emitted but untyped or missing context data",
|
|
187
|
+
"1": "Basic callback on transition but not a proper event system",
|
|
188
|
+
"0": "No events"
|
|
189
|
+
}
|
|
190
|
+
},
|
|
191
|
+
{
|
|
192
|
+
"criterion": "Test Completeness",
|
|
193
|
+
"weight": 0.3,
|
|
194
|
+
"scoring": {
|
|
195
|
+
"5": "Tests every valid transition, every invalid transition rejection, guard function behavior, event emission, and transition history; edge cases like double-transition and concurrent access",
|
|
196
|
+
"3": "Tests most valid transitions and some invalid ones",
|
|
197
|
+
"1": "Tests only happy path (draft -> delivered)",
|
|
198
|
+
"0": "No tests"
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
],
|
|
202
|
+
"expectedScoreWithout": 15,
|
|
203
|
+
"expectedScoreWith": 65
|
|
204
|
+
},
|
|
205
|
+
{
|
|
206
|
+
"id": "bench-med-02",
|
|
207
|
+
"difficulty": "medium",
|
|
208
|
+
"description": "Generate a CLI tool with argument parsing, validation, and error handling",
|
|
209
|
+
"input": "Create a Node.js CLI tool called `csvstat` that reads a CSV file and outputs summary statistics. Features: (1) Parse command-line arguments: --file (required, path to CSV), --columns (optional, comma-separated column names to analyze), --format (optional, 'table' or 'json', default 'table'), --output (optional, file path to write results). (2) For numeric columns: compute min, max, mean, median, std deviation, null count. For string columns: compute unique count, most frequent value, min/max length, null count. (3) Handle errors: file not found, invalid CSV, invalid column names, permission denied on output file. Use a proper argument parser (e.g., commander, yargs). Include types for all data structures and tests.",
|
|
210
|
+
"rubric": [
|
|
211
|
+
{
|
|
212
|
+
"criterion": "Argument Parsing & Validation",
|
|
213
|
+
"weight": 0.2,
|
|
214
|
+
"scoring": {
|
|
215
|
+
"5": "Uses a proper CLI framework with typed options; validates file existence, column names against CSV headers, format enum, output path writability; shows helpful usage on error",
|
|
216
|
+
"3": "Parses arguments but validation is incomplete (e.g., doesn't check file exists before processing)",
|
|
217
|
+
"1": "Manual argument parsing with minimal validation",
|
|
218
|
+
"0": "No argument handling"
|
|
219
|
+
}
|
|
220
|
+
},
|
|
221
|
+
{
|
|
222
|
+
"criterion": "Statistical Computation",
|
|
223
|
+
"weight": 0.25,
|
|
224
|
+
"scoring": {
|
|
225
|
+
"5": "Correctly computes all specified statistics for both numeric and string columns; handles mixed types, null/empty values, and single-row edge cases",
|
|
226
|
+
"3": "Computes most statistics correctly but misses median or std dev or has numeric precision issues",
|
|
227
|
+
"1": "Only basic stats (count, min, max)",
|
|
228
|
+
"0": "Incorrect or no computation"
|
|
229
|
+
}
|
|
230
|
+
},
|
|
231
|
+
{
|
|
232
|
+
"criterion": "Error Handling",
|
|
233
|
+
"weight": 0.25,
|
|
234
|
+
"scoring": {
|
|
235
|
+
"5": "Distinct error handling for: file not found, CSV parse error, invalid columns, permission denied, empty file; user-friendly error messages with exit codes",
|
|
236
|
+
"3": "Handles file not found and parse errors but misses other cases",
|
|
237
|
+
"1": "Generic error handling; stack traces shown to user",
|
|
238
|
+
"0": "Unhandled errors crash the program"
|
|
239
|
+
}
|
|
240
|
+
},
|
|
241
|
+
{
|
|
242
|
+
"criterion": "Type Safety & Tests",
|
|
243
|
+
"weight": 0.3,
|
|
244
|
+
"scoring": {
|
|
245
|
+
"5": "Full TypeScript types for CLI options, column statistics, output format; tests cover argument parsing, all stat computations, each error scenario, both output formats",
|
|
246
|
+
"3": "Types defined but tests only cover happy path",
|
|
247
|
+
"1": "Minimal types and tests",
|
|
248
|
+
"0": "No types or tests"
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
],
|
|
252
|
+
"expectedScoreWithout": 15,
|
|
253
|
+
"expectedScoreWith": 65
|
|
254
|
+
},
|
|
255
|
+
{
|
|
256
|
+
"id": "bench-med-03",
|
|
257
|
+
"difficulty": "medium",
|
|
258
|
+
"description": "Generate a rate limiter with multiple strategies and persistence",
|
|
259
|
+
"input": "Implement a TypeScript rate limiter that supports three strategies: (1) Fixed Window — N requests per time window, (2) Sliding Window Log — precise tracking with timestamps, (3) Token Bucket — smooth rate with burst capacity. The rate limiter should: accept a strategy via dependency injection, support both in-memory and Redis-backed storage (via a StorageAdapter interface), return typed results including remaining quota, reset time, and retry-after header value, and handle edge cases (clock skew, concurrent requests, storage failures). Include tests for each strategy with time mocking.",
|
|
260
|
+
"rubric": [
|
|
261
|
+
{
|
|
262
|
+
"criterion": "Architecture",
|
|
263
|
+
"weight": 0.25,
|
|
264
|
+
"scoring": {
|
|
265
|
+
"5": "Clean separation: RateLimiter interface, Strategy implementations, StorageAdapter interface; strategies are injectable; storage is injectable; follows open/closed principle",
|
|
266
|
+
"3": "Strategies exist but storage is hard-coded or architecture has some coupling",
|
|
267
|
+
"1": "Single monolithic implementation with strategy as a config flag",
|
|
268
|
+
"0": "No strategy separation"
|
|
269
|
+
}
|
|
270
|
+
},
|
|
271
|
+
{
|
|
272
|
+
"criterion": "Strategy Correctness",
|
|
273
|
+
"weight": 0.3,
|
|
274
|
+
"scoring": {
|
|
275
|
+
"5": "All three strategies correctly implemented with proper algorithms; fixed window handles boundary correctly; sliding window tracks individual timestamps; token bucket refills at correct rate with burst cap",
|
|
276
|
+
"3": "Two strategies correct; one has edge case bugs",
|
|
277
|
+
"1": "Only one strategy or major algorithmic errors",
|
|
278
|
+
"0": "Non-functional rate limiting"
|
|
279
|
+
}
|
|
280
|
+
},
|
|
281
|
+
{
|
|
282
|
+
"criterion": "Error Handling & Edge Cases",
|
|
283
|
+
"weight": 0.2,
|
|
284
|
+
"scoring": {
|
|
285
|
+
"5": "Handles storage failures (fallback to allow/deny with logging), clock skew tolerance, concurrent request atomicity considerations, and typed error responses with retry-after",
|
|
286
|
+
"3": "Handles storage failures but misses concurrency or clock skew",
|
|
287
|
+
"1": "Basic error handling; crashes on storage failure",
|
|
288
|
+
"0": "No error handling"
|
|
289
|
+
}
|
|
290
|
+
},
|
|
291
|
+
{
|
|
292
|
+
"criterion": "Test Quality",
|
|
293
|
+
"weight": 0.25,
|
|
294
|
+
"scoring": {
|
|
295
|
+
"5": "Tests each strategy independently with time mocking; tests window boundaries, burst behavior, storage adapter contract, storage failure fallback, and result types",
|
|
296
|
+
"3": "Tests basic allow/deny for each strategy but no time mocking or edge cases",
|
|
297
|
+
"1": "Minimal tests for one strategy",
|
|
298
|
+
"0": "No tests"
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
],
|
|
302
|
+
"expectedScoreWithout": 15,
|
|
303
|
+
"expectedScoreWith": 65
|
|
304
|
+
},
|
|
305
|
+
{
|
|
306
|
+
"id": "bench-med-04",
|
|
307
|
+
"difficulty": "medium",
|
|
308
|
+
"description": "Generate a type-safe event system with middleware support",
|
|
309
|
+
"input": "Create a TypeScript event emitter that is fully type-safe: events are defined as a type map (event name -> payload type), emit/on/off/once are all type-checked, and middleware can intercept events before handlers run. Middleware should be able to transform payloads, short-circuit events, or add logging. Support wildcard subscriptions ('*') that receive all events. Include proper memory leak prevention (max listeners warning, automatic cleanup for `once`), async handler support, and comprehensive tests.",
|
|
310
|
+
"rubric": [
|
|
311
|
+
{
|
|
312
|
+
"criterion": "Type Safety",
|
|
313
|
+
"weight": 0.3,
|
|
314
|
+
"scoring": {
|
|
315
|
+
"5": "Event map type parameter enforces correct payload types on emit, on, off, once; middleware is typed; wildcard handler receives discriminated union of all events; no 'any' usage",
|
|
316
|
+
"3": "Basic type parameter for event map but middleware or wildcard is loosely typed",
|
|
317
|
+
"1": "String-based events with 'any' payloads",
|
|
318
|
+
"0": "No type safety"
|
|
319
|
+
}
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"criterion": "Feature Completeness",
|
|
323
|
+
"weight": 0.25,
|
|
324
|
+
"scoring": {
|
|
325
|
+
"5": "Implements on, off, once, emit, middleware pipeline, wildcard subscription, async handler support, max listener warning, and automatic once cleanup",
|
|
326
|
+
"3": "Has on/off/emit and either middleware or wildcard but not both",
|
|
327
|
+
"1": "Basic on/emit only",
|
|
328
|
+
"0": "Incomplete implementation"
|
|
329
|
+
}
|
|
330
|
+
},
|
|
331
|
+
{
|
|
332
|
+
"criterion": "Middleware Pipeline",
|
|
333
|
+
"weight": 0.2,
|
|
334
|
+
"scoring": {
|
|
335
|
+
"5": "Middleware executes in order; can transform payload, short-circuit (prevent handlers), add cross-cutting concerns; typed middleware signature",
|
|
336
|
+
"3": "Middleware runs but cannot modify payload or short-circuit",
|
|
337
|
+
"1": "Basic pre-emit hook without composition",
|
|
338
|
+
"0": "No middleware"
|
|
339
|
+
}
|
|
340
|
+
},
|
|
341
|
+
{
|
|
342
|
+
"criterion": "Test Quality",
|
|
343
|
+
"weight": 0.25,
|
|
344
|
+
"scoring": {
|
|
345
|
+
"5": "Tests: type-safe emit/subscribe, once auto-cleanup, middleware transformation, middleware short-circuit, wildcard handler, async handlers, max listener warning, memory leak prevention",
|
|
346
|
+
"3": "Tests basic emit/subscribe and one advanced feature",
|
|
347
|
+
"1": "Single happy-path test",
|
|
348
|
+
"0": "No tests"
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
],
|
|
352
|
+
"expectedScoreWithout": 15,
|
|
353
|
+
"expectedScoreWith": 65
|
|
354
|
+
},
|
|
355
|
+
{
|
|
356
|
+
"id": "bench-hard-01",
|
|
357
|
+
"difficulty": "hard",
|
|
358
|
+
"description": "Generate a complete CRUD API with authentication, authorization, pagination, and error handling",
|
|
359
|
+
"input": "Build a complete TypeScript/Express REST API for a task management system with the following: (1) Authentication: JWT-based login/register with password hashing (bcrypt), token refresh, and secure token storage guidance. (2) Authorization: role-based access control (admin, manager, member) with per-endpoint permissions. (3) CRUD for tasks: create, read (with pagination, sorting, filtering by status/assignee/date range), update, delete (soft delete). Task schema: id, title, description, status (todo/in_progress/done), priority (low/medium/high/critical), assigneeId, createdBy, createdAt, updatedAt, deletedAt. (4) Full input validation with Zod, typed request/response schemas, proper error hierarchy, and request-level logging. (5) Include tests for auth flow, RBAC, pagination, and error cases.",
|
|
360
|
+
"rubric": [
|
|
361
|
+
{
|
|
362
|
+
"criterion": "Authentication & Security",
|
|
363
|
+
"weight": 0.2,
|
|
364
|
+
"scoring": {
|
|
365
|
+
"5": "Complete JWT flow: registration with password hashing, login returning access+refresh tokens, token refresh endpoint, middleware for protected routes, proper token expiry; no secrets in code",
|
|
366
|
+
"3": "Basic JWT login/register but missing refresh tokens or password hashing has issues",
|
|
367
|
+
"1": "Partial auth implementation with security gaps",
|
|
368
|
+
"0": "No authentication"
|
|
369
|
+
}
|
|
370
|
+
},
|
|
371
|
+
{
|
|
372
|
+
"criterion": "Authorization (RBAC)",
|
|
373
|
+
"weight": 0.15,
|
|
374
|
+
"scoring": {
|
|
375
|
+
"5": "Role hierarchy enforced via middleware; admin: full access, manager: CRUD on team tasks, member: own tasks only; role checked per endpoint; typed permission definitions",
|
|
376
|
+
"3": "Role checking exists but not granular (e.g., only admin vs non-admin)",
|
|
377
|
+
"1": "Basic role field but no enforcement",
|
|
378
|
+
"0": "No authorization"
|
|
379
|
+
}
|
|
380
|
+
},
|
|
381
|
+
{
|
|
382
|
+
"criterion": "CRUD & Query Features",
|
|
383
|
+
"weight": 0.2,
|
|
384
|
+
"scoring": {
|
|
385
|
+
"5": "Full CRUD with soft delete; pagination (cursor or offset) with typed page response; sorting by multiple fields; filtering by status, assignee, date range with validated query params",
|
|
386
|
+
"3": "CRUD works but pagination is basic (offset only) or filtering is limited",
|
|
387
|
+
"1": "Basic CRUD without pagination or filtering",
|
|
388
|
+
"0": "Incomplete CRUD"
|
|
389
|
+
}
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"criterion": "Type Safety & Validation",
|
|
393
|
+
"weight": 0.2,
|
|
394
|
+
"scoring": {
|
|
395
|
+
"5": "Zod schemas for all endpoints; types derived from schemas; typed middleware chain; typed error responses; no 'any'",
|
|
396
|
+
"3": "Validation on most endpoints but some types are manual or 'any' appears",
|
|
397
|
+
"1": "Partial validation; many untyped areas",
|
|
398
|
+
"0": "No validation or types"
|
|
399
|
+
}
|
|
400
|
+
},
|
|
401
|
+
{
|
|
402
|
+
"criterion": "Error Handling & Tests",
|
|
403
|
+
"weight": 0.25,
|
|
404
|
+
"scoring": {
|
|
405
|
+
"5": "Custom error hierarchy (AuthError, ForbiddenError, NotFoundError, ValidationError); global error handler; tests cover: auth flow (register/login/refresh/invalid token), RBAC (each role), CRUD operations, pagination boundaries, validation errors, concurrent operations",
|
|
406
|
+
"3": "Error hierarchy exists; tests cover auth and basic CRUD",
|
|
407
|
+
"1": "Generic error handling; minimal tests",
|
|
408
|
+
"0": "No error handling or tests"
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
],
|
|
412
|
+
"expectedScoreWithout": 15,
|
|
413
|
+
"expectedScoreWith": 60
|
|
414
|
+
},
|
|
415
|
+
{
|
|
416
|
+
"id": "bench-hard-02",
|
|
417
|
+
"difficulty": "hard",
|
|
418
|
+
"description": "Generate a resilient external service client with retry, circuit breaker, and caching",
|
|
419
|
+
"input": "Implement a TypeScript HTTP client wrapper for consuming an external payment gateway API. Requirements: (1) Typed request/response for endpoints: createCharge, getCharge, refundCharge, listCharges. (2) Retry logic with exponential backoff and jitter for transient failures (5xx, timeouts, network errors); do NOT retry 4xx errors. (3) Circuit breaker that opens after 5 failures in 60 seconds, stays open for 30 seconds, then enters half-open state allowing one probe request. (4) Response caching for GET requests with configurable TTL and cache invalidation on mutations. (5) Request/response logging with sensitive field redaction (card numbers, CVV). (6) Timeout per request with global default and per-method override. (7) Full TypeScript types, dependency injection for HTTP client and cache, and comprehensive tests including failure scenario simulation.",
|
|
420
|
+
"rubric": [
|
|
421
|
+
{
|
|
422
|
+
"criterion": "Resilience Patterns",
|
|
423
|
+
"weight": 0.3,
|
|
424
|
+
"scoring": {
|
|
425
|
+
"5": "Correctly implements retry with exponential backoff + jitter (only for 5xx/timeout/network); circuit breaker with closed/open/half-open states and configurable thresholds; patterns compose cleanly",
|
|
426
|
+
"3": "Retry and circuit breaker present but retry doesn't distinguish 4xx from 5xx, or circuit breaker state transitions have bugs",
|
|
427
|
+
"1": "Basic retry without backoff; no circuit breaker",
|
|
428
|
+
"0": "No resilience patterns"
|
|
429
|
+
}
|
|
430
|
+
},
|
|
431
|
+
{
|
|
432
|
+
"criterion": "Type Safety & API Design",
|
|
433
|
+
"weight": 0.2,
|
|
434
|
+
"scoring": {
|
|
435
|
+
"5": "Typed methods for each endpoint; request/response types per operation; typed error responses distinguishing client errors from server errors; generic cache types",
|
|
436
|
+
"3": "Types for main operations but error types or cache is loosely typed",
|
|
437
|
+
"1": "Minimal types; mostly `any` or `unknown`",
|
|
438
|
+
"0": "Untyped"
|
|
439
|
+
}
|
|
440
|
+
},
|
|
441
|
+
{
|
|
442
|
+
"criterion": "Caching & Invalidation",
|
|
443
|
+
"weight": 0.15,
|
|
444
|
+
"scoring": {
|
|
445
|
+
"5": "GET responses cached with configurable TTL; cache invalidated on POST/PUT/DELETE to related resource; cache adapter is injectable (in-memory default); stale-while-revalidate pattern considered",
|
|
446
|
+
"3": "Basic caching for GETs but invalidation is missing or incomplete",
|
|
447
|
+
"1": "Cache exists but no TTL or invalidation",
|
|
448
|
+
"0": "No caching"
|
|
449
|
+
}
|
|
450
|
+
},
|
|
451
|
+
{
|
|
452
|
+
"criterion": "Security & Logging",
|
|
453
|
+
"weight": 0.1,
|
|
454
|
+
"scoring": {
|
|
455
|
+
"5": "Request/response logged with sensitive fields redacted (card number masked except last 4, CVV removed); logging is configurable; no secrets in error messages",
|
|
456
|
+
"3": "Logging present but redaction is incomplete",
|
|
457
|
+
"1": "Basic logging without redaction",
|
|
458
|
+
"0": "No logging or logs contain sensitive data"
|
|
459
|
+
}
|
|
460
|
+
},
|
|
461
|
+
{
|
|
462
|
+
"criterion": "Test Comprehensiveness",
|
|
463
|
+
"weight": 0.25,
|
|
464
|
+
"scoring": {
|
|
465
|
+
"5": "Tests: successful operations, retry on 5xx (verifies backoff timing), no retry on 4xx, circuit breaker state transitions (closed->open->half-open->closed), cache hit/miss/invalidation, timeout handling, sensitive data redaction, composed resilience (retry + circuit breaker interaction)",
|
|
466
|
+
"3": "Tests cover basic operations and either retry or circuit breaker but not both",
|
|
467
|
+
"1": "Tests only happy path",
|
|
468
|
+
"0": "No tests"
|
|
469
|
+
}
|
|
470
|
+
}
|
|
471
|
+
],
|
|
472
|
+
"expectedScoreWithout": 10,
|
|
473
|
+
"expectedScoreWith": 60
|
|
474
|
+
},
|
|
475
|
+
{
|
|
476
|
+
"id": "bench-hard-03",
|
|
477
|
+
"difficulty": "hard",
|
|
478
|
+
"description": "Generate a plugin system with lifecycle hooks, dependency resolution, and type safety",
|
|
479
|
+
"input": "Design and implement a TypeScript plugin system for an application framework. Requirements: (1) Plugin interface with typed lifecycle hooks: onInit, onStart, onStop, onError, with async support. (2) Plugin dependency declaration and topological sort for initialization order; detect circular dependencies. (3) Plugin configuration with typed per-plugin config schemas (using Zod) and runtime validation. (4) Hook into application events via typed event bus (plugins can subscribe and emit typed events). (5) Hot-reload support: ability to unload and reload a plugin without restarting the system; proper cleanup on unload. (6) Health check: each plugin reports its health status; aggregate system health. (7) Include tests for: dependency resolution, circular dependency detection, lifecycle ordering, configuration validation, event communication between plugins, hot-reload with cleanup verification, and health aggregation.",
|
|
480
|
+
"rubric": [
|
|
481
|
+
{
|
|
482
|
+
"criterion": "Plugin Architecture",
|
|
483
|
+
"weight": 0.25,
|
|
484
|
+
"scoring": {
|
|
485
|
+
"5": "Clean Plugin interface with typed lifecycle hooks; PluginManager handles registration, dependency resolution, and lifecycle; plugins are isolated with defined communication channels (event bus only)",
|
|
486
|
+
"3": "Plugin interface exists but lifecycle management is incomplete or plugins have direct references to each other",
|
|
487
|
+
"1": "Basic plugin loading but no lifecycle or isolation",
|
|
488
|
+
"0": "No plugin architecture"
|
|
489
|
+
}
|
|
490
|
+
},
|
|
491
|
+
{
|
|
492
|
+
"criterion": "Dependency Resolution",
|
|
493
|
+
"weight": 0.2,
|
|
494
|
+
"scoring": {
|
|
495
|
+
"5": "Topological sort with cycle detection; clear error message naming the cycle; optional dependencies supported; initialization order respects dependency graph",
|
|
496
|
+
"3": "Topological sort works but cycle detection is missing or error messages are unclear",
|
|
497
|
+
"1": "Manual ordering required; no automatic resolution",
|
|
498
|
+
"0": "No dependency handling"
|
|
499
|
+
}
|
|
500
|
+
},
|
|
501
|
+
{
|
|
502
|
+
"criterion": "Type Safety",
|
|
503
|
+
"weight": 0.2,
|
|
504
|
+
"scoring": {
|
|
505
|
+
"5": "Plugin configs typed via Zod schemas with inference; event bus fully typed; lifecycle hooks have typed context; generics used for plugin-specific state; no 'any'",
|
|
506
|
+
"3": "Core types are strong but event bus or config is loosely typed",
|
|
507
|
+
"1": "Basic types; 'any' in multiple places",
|
|
508
|
+
"0": "Untyped"
|
|
509
|
+
}
|
|
510
|
+
},
|
|
511
|
+
{
|
|
512
|
+
"criterion": "Advanced Features",
|
|
513
|
+
"weight": 0.15,
|
|
514
|
+
"scoring": {
|
|
515
|
+
"5": "Hot-reload with proper cleanup (event unsubscribe, resource release); health check with per-plugin and aggregate status; graceful error isolation (one plugin failure doesn't crash others)",
|
|
516
|
+
"3": "Hot-reload or health check present but not both; cleanup is partial",
|
|
517
|
+
"1": "Neither hot-reload nor health check; only basic error isolation between plugins",
|
|
518
|
+
"0": "No advanced features"
|
|
519
|
+
}
|
|
520
|
+
},
|
|
521
|
+
{
|
|
522
|
+
"criterion": "Test Coverage",
|
|
523
|
+
"weight": 0.2,
|
|
524
|
+
"scoring": {
|
|
525
|
+
"5": "Tests: dependency ordering, circular dependency error, lifecycle hook execution order, config validation (valid/invalid), event communication between plugins, hot-reload with cleanup verification, health aggregation, error isolation",
|
|
526
|
+
"3": "Tests cover dependency resolution and basic lifecycle",
|
|
527
|
+
"1": "Minimal tests",
|
|
528
|
+
"0": "No tests"
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
],
|
|
532
|
+
"expectedScoreWithout": 10,
|
|
533
|
+
"expectedScoreWith": 60
|
|
534
|
+
}
|
|
535
|
+
]
|
|
536
|
+
}
|
package/tests/smoke.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
{
|
|
2
|
+
"version": "0.0.1",
|
|
3
|
+
"timeout": 60,
|
|
4
|
+
"tasks": [
|
|
5
|
+
{
|
|
6
|
+
"id": "smoke-01",
|
|
7
|
+
"description": "Generate a complete REST API endpoint with typed request/response, validation, error handling, and tests",
|
|
8
|
+
"input": "Write a TypeScript REST API endpoint for creating a new user. The endpoint should accept POST requests with a JSON body containing name (string, required, 1-200 chars), email (string, required, valid email format), and age (number, optional, 0-150). It should validate all inputs, return appropriate HTTP status codes (201 for success, 400 for validation errors, 409 for duplicate email, 500 for server errors), use proper TypeScript types for request and response, include error handling for database failures, and include unit tests.",
|
|
9
|
+
"rubric": [
|
|
10
|
+
{
|
|
11
|
+
"criterion": "Type Safety",
|
|
12
|
+
"weight": 0.2,
|
|
13
|
+
"scoring": {
|
|
14
|
+
"5": "Defines typed interfaces for request body, response body, and error response; uses discriminated unions or specific error types; no use of 'any'",
|
|
15
|
+
"3": "Has basic types for request and response but some fields are loosely typed or 'any' is used in error handling",
|
|
16
|
+
"1": "Minimal typing; most parameters are 'any' or untyped",
|
|
17
|
+
"0": "No type definitions; plain JavaScript without types"
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"criterion": "Input Validation",
|
|
22
|
+
"weight": 0.2,
|
|
23
|
+
"scoring": {
|
|
24
|
+
"5": "Validates all fields (name length, email format, age range) with descriptive error messages per field; uses schema validation (e.g., Zod); rejects extra fields",
|
|
25
|
+
"3": "Validates some fields but misses constraints (e.g., name length) or has generic error messages",
|
|
26
|
+
"1": "Only checks if fields are present, no format or range validation",
|
|
27
|
+
"0": "No input validation; passes raw input to database"
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"criterion": "Error Handling",
|
|
32
|
+
"weight": 0.25,
|
|
33
|
+
"scoring": {
|
|
34
|
+
"5": "Handles validation errors (400), duplicate email (409), database failures (500) with specific error types, contextual messages, and correct HTTP status codes; no silent catches",
|
|
35
|
+
"3": "Handles some error cases but uses generic catch-all or returns incorrect status codes",
|
|
36
|
+
"1": "Basic try/catch with generic 500 response for all errors",
|
|
37
|
+
"0": "No error handling; unhandled promise rejections"
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
"criterion": "Code Completeness",
|
|
42
|
+
"weight": 0.15,
|
|
43
|
+
"scoring": {
|
|
44
|
+
"5": "Complete, runnable implementation with imports, types, handler function, validation, database interaction (abstracted via repository), and response formatting",
|
|
45
|
+
"3": "Mostly complete but has TODO placeholders or missing imports",
|
|
46
|
+
"1": "Skeleton code with significant gaps",
|
|
47
|
+
"0": "Only function signature or pseudocode"
|
|
48
|
+
}
|
|
49
|
+
},
|
|
50
|
+
{
|
|
51
|
+
"criterion": "Test Coverage",
|
|
52
|
+
"weight": 0.2,
|
|
53
|
+
"scoring": {
|
|
54
|
+
"5": "Includes tests for: valid creation (201), missing fields (400), invalid email (400), duplicate email (409), database failure (500); uses mocks for repository",
|
|
55
|
+
"3": "Includes 2-3 tests covering happy path and one error case",
|
|
56
|
+
"1": "Single test or test with no assertions",
|
|
57
|
+
"0": "No tests generated"
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
],
|
|
61
|
+
"passThreshold": 60
|
|
62
|
+
}
|
|
63
|
+
]
|
|
64
|
+
}
|