aitelier 0.1.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/dist/commands/add.d.ts +3 -0
  2. package/dist/commands/add.d.ts.map +1 -0
  3. package/dist/commands/add.js +141 -0
  4. package/dist/commands/add.js.map +1 -0
  5. package/dist/commands/add.test.d.ts +2 -0
  6. package/dist/commands/add.test.d.ts.map +1 -0
  7. package/dist/commands/add.test.js +203 -0
  8. package/dist/commands/add.test.js.map +1 -0
  9. package/dist/commands/eval.d.ts +3 -0
  10. package/dist/commands/eval.d.ts.map +1 -0
  11. package/dist/commands/eval.js +505 -0
  12. package/dist/commands/eval.js.map +1 -0
  13. package/dist/commands/eval.test.d.ts +2 -0
  14. package/dist/commands/eval.test.d.ts.map +1 -0
  15. package/dist/commands/eval.test.js +804 -0
  16. package/dist/commands/eval.test.js.map +1 -0
  17. package/dist/commands/format.d.ts +3 -0
  18. package/dist/commands/format.d.ts.map +1 -0
  19. package/dist/commands/format.js +198 -0
  20. package/dist/commands/format.js.map +1 -0
  21. package/dist/commands/format.test.d.ts +2 -0
  22. package/dist/commands/format.test.d.ts.map +1 -0
  23. package/dist/commands/format.test.js +568 -0
  24. package/dist/commands/format.test.js.map +1 -0
  25. package/dist/commands/init.d.ts +3 -0
  26. package/dist/commands/init.d.ts.map +1 -0
  27. package/dist/commands/init.js +94 -0
  28. package/dist/commands/init.js.map +1 -0
  29. package/dist/commands/init.test.d.ts +2 -0
  30. package/dist/commands/init.test.d.ts.map +1 -0
  31. package/dist/commands/init.test.js +123 -0
  32. package/dist/commands/init.test.js.map +1 -0
  33. package/dist/commands/rate.d.ts +3 -0
  34. package/dist/commands/rate.d.ts.map +1 -0
  35. package/dist/commands/rate.js +209 -0
  36. package/dist/commands/rate.js.map +1 -0
  37. package/dist/commands/rate.test.d.ts +2 -0
  38. package/dist/commands/rate.test.d.ts.map +1 -0
  39. package/dist/commands/rate.test.js +393 -0
  40. package/dist/commands/rate.test.js.map +1 -0
  41. package/dist/commands/split.d.ts +3 -0
  42. package/dist/commands/split.d.ts.map +1 -0
  43. package/dist/commands/split.js +210 -0
  44. package/dist/commands/split.js.map +1 -0
  45. package/dist/commands/split.test.d.ts +2 -0
  46. package/dist/commands/split.test.d.ts.map +1 -0
  47. package/dist/commands/split.test.js +503 -0
  48. package/dist/commands/split.test.js.map +1 -0
  49. package/dist/commands/stats.d.ts +3 -0
  50. package/dist/commands/stats.d.ts.map +1 -0
  51. package/dist/commands/stats.js +209 -0
  52. package/dist/commands/stats.js.map +1 -0
  53. package/dist/commands/stats.test.d.ts +2 -0
  54. package/dist/commands/stats.test.d.ts.map +1 -0
  55. package/dist/commands/stats.test.js +476 -0
  56. package/dist/commands/stats.test.js.map +1 -0
  57. package/dist/commands/status.d.ts +3 -0
  58. package/dist/commands/status.d.ts.map +1 -0
  59. package/dist/commands/status.js +132 -0
  60. package/dist/commands/status.js.map +1 -0
  61. package/dist/commands/status.test.d.ts +2 -0
  62. package/dist/commands/status.test.d.ts.map +1 -0
  63. package/dist/commands/status.test.js +443 -0
  64. package/dist/commands/status.test.js.map +1 -0
  65. package/dist/commands/train.d.ts +3 -0
  66. package/dist/commands/train.d.ts.map +1 -0
  67. package/dist/commands/train.js +139 -0
  68. package/dist/commands/train.js.map +1 -0
  69. package/dist/commands/train.test.d.ts +2 -0
  70. package/dist/commands/train.test.d.ts.map +1 -0
  71. package/dist/commands/train.test.js +288 -0
  72. package/dist/commands/train.test.js.map +1 -0
  73. package/dist/index.d.ts +3 -0
  74. package/dist/index.d.ts.map +1 -0
  75. package/dist/index.js +27 -0
  76. package/dist/index.js.map +1 -0
  77. package/dist/providers/openai.d.ts +9 -0
  78. package/dist/providers/openai.d.ts.map +1 -0
  79. package/dist/providers/openai.js +20 -0
  80. package/dist/providers/openai.js.map +1 -0
  81. package/dist/providers/together.d.ts +11 -0
  82. package/dist/providers/together.d.ts.map +1 -0
  83. package/dist/providers/together.js +118 -0
  84. package/dist/providers/together.js.map +1 -0
  85. package/dist/providers/types.d.ts +28 -0
  86. package/dist/providers/types.d.ts.map +1 -0
  87. package/dist/providers/types.js +2 -0
  88. package/dist/providers/types.js.map +1 -0
  89. package/dist/storage/config.d.ts +24 -0
  90. package/dist/storage/config.d.ts.map +1 -0
  91. package/dist/storage/config.js +11 -0
  92. package/dist/storage/config.js.map +1 -0
  93. package/dist/storage/dataset.d.ts +14 -0
  94. package/dist/storage/dataset.d.ts.map +1 -0
  95. package/dist/storage/dataset.js +18 -0
  96. package/dist/storage/dataset.js.map +1 -0
  97. package/package.json +58 -0
@@ -0,0 +1,804 @@
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
import { writeFile, readFile, mkdtemp, rm, mkdir, readdir } from 'node:fs/promises';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { Command } from 'commander';
import { registerEval } from './eval.js';
import inquirer from 'inquirer';

// ---------------------------------------------------------------------------
// Shared fixtures
//
// Every test runs inside a fresh temporary working directory (see beforeEach),
// so the relative paths used below ('.ftpipeline.json', 'data/...') always
// resolve inside that sandbox.
// ---------------------------------------------------------------------------

const BASE_MODEL = 'meta-llama/Llama-3.3-70B-Instruct-Turbo';
const FINE_TUNED_MODEL = 'model-abc-123';
const EVALS_DIR = 'data/evals';

/** A fine-tuning run that finished and therefore carries a `modelId`. */
const COMPLETED_RUN = Object.freeze({
  jobId: 'ft-job-123',
  modelId: FINE_TUNED_MODEL,
  provider: 'together',
  startedAt: '2025-01-01T00:00:00.000Z',
  status: 'completed',
});

/** A fine-tuning run that is still in progress (no `modelId` yet). */
const RUNNING_RUN = Object.freeze({
  jobId: 'ft-job-123',
  provider: 'together',
  startedAt: '2025-01-01T00:00:00.000Z',
  status: 'running',
});

/** Validation examples; tests write the first one or both to data/val.jsonl. */
const VAL_EXAMPLES = Object.freeze([
  {
    id: 1,
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'What is 2+2?' },
      { role: 'assistant', content: '2+2 equals 4.' },
    ],
    rating: 9,
    createdAt: '2025-01-01T00:00:00.000Z',
    version: 1,
    split: 'val',
  },
  {
    id: 2,
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'What is the capital of France?' },
      { role: 'assistant', content: 'The capital of France is Paris.' },
    ],
    rating: 10,
    createdAt: '2025-01-01T00:00:00.000Z',
    version: 1,
    split: 'val',
  },
]);

/**
 * Writes `.ftpipeline.json` into the current working directory.
 *
 * @param {object[]} [runs] - Entries for the config's `runs` array.
 * @returns {Promise<void>}
 */
async function writeProjectConfig(runs = [COMPLETED_RUN]) {
  const config = {
    name: 'test-project',
    provider: 'together',
    model: BASE_MODEL,
    qualityThreshold: 8,
    runs,
  };
  await writeFile('.ftpipeline.json', JSON.stringify(config, null, 2));
}

/**
 * Creates `data/` and writes the first `count` validation examples to
 * `data/val.jsonl`, one JSON document per line with a trailing newline.
 *
 * @param {number} count - How many of VAL_EXAMPLES to write (1 or 2).
 * @returns {Promise<void>}
 */
async function writeValidationData(count) {
  await mkdir('data', { recursive: true });
  const lines = VAL_EXAMPLES.slice(0, count).map((example) => JSON.stringify(example));
  await writeFile('data/val.jsonl', `${lines.join('\n')}\n`);
}

/**
 * Replaces `globalThis.fetch` with a mock that answers like a successful
 * chat-completions call. (The original fetch is put back in afterEach.)
 *
 * @param {() => string} nextContent - Called once per request; returns the
 *   assistant message content for that response.
 * @param {{ strictUrl?: boolean }} [options] - With `strictUrl`, any request
 *   whose URL does not contain '/chat/completions' throws instead of answering.
 */
function mockChatCompletions(nextContent, { strictUrl = false } = {}) {
  globalThis.fetch = vi.fn(async (url) => {
    if (strictUrl) {
      const urlString = typeof url === 'string' ? url : url.toString();
      if (!urlString.includes('/chat/completions')) {
        throw new Error(`Unexpected fetch call: ${urlString}`);
      }
    }
    const content = nextContent();
    return {
      ok: true,
      json: async () => ({
        choices: [{ message: { content } }],
      }),
      text: async () => '',
    };
  });
}

/**
 * Replaces `globalThis.fetch` with a mock that always reports a failed
 * request whose body text is 'Rate limit exceeded'.
 */
function mockInferenceFailure() {
  globalThis.fetch = vi.fn(async () => ({
    ok: false,
    text: async () => 'Rate limit exceeded',
  }));
}

/**
 * Builds a content generator that yields 'Base response' on odd-numbered
 * calls and 'Fine-tuned response' on even-numbered calls.
 *
 * @returns {() => string}
 */
function alternatingResponses() {
  let callCount = 0;
  return () => {
    callCount += 1;
    return callCount % 2 === 1 ? 'Base response' : 'Fine-tuned response';
  };
}

/**
 * Queues one resolved value per expected `inquirer.prompt` call, in order.
 *
 * @param {...object} answers - Answer objects, e.g. `{ action: 'score' }`.
 */
function queuePromptAnswers(...answers) {
  const promptSpy = vi.spyOn(inquirer, 'prompt');
  for (const answer of answers) {
    promptSpy.mockResolvedValueOnce(answer);
  }
}

/**
 * Runs `eval` through a fresh commander program with console.log silenced.
 *
 * @param {...string} flags - Extra CLI arguments, e.g. '--compare'.
 * @returns {Promise<string>} Everything passed to console.log, one call per
 *   line, with each call's arguments joined by single spaces.
 */
async function runEval(...flags) {
  const consoleLogSpy = vi.spyOn(console, 'log').mockImplementation(() => { });
  try {
    const program = new Command();
    registerEval(program);
    await program.parseAsync(['node', 'test', 'eval', ...flags]);
    return consoleLogSpy.mock.calls.map((call) => call.join(' ')).join('\n');
  } finally {
    // Un-silence the console even when the command rejects.
    consoleLogSpy.mockRestore();
  }
}

/**
 * Runs `eval` expecting it to call process.exit after printing an error.
 * process.exit is stubbed to throw so the command stops where it would exit;
 * both spies are undone by vi.restoreAllMocks() in afterEach.
 *
 * @param {string} expectedMessage - Substring expected in a console.error call.
 * @returns {Promise<void>}
 */
async function expectEvalToExitWith(expectedMessage) {
  vi.spyOn(process, 'exit').mockImplementation(() => {
    throw new Error('process.exit called');
  });
  const consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => { });
  const program = new Command();
  registerEval(program);
  await expect(program.parseAsync(['node', 'test', 'eval'])).rejects.toThrow('process.exit called');
  expect(consoleErrorSpy).toHaveBeenCalledWith(expect.stringContaining(expectedMessage));
}

/**
 * Lists the files in data/evals. A missing directory counts as "no files";
 * any other filesystem error is rethrown rather than hidden.
 *
 * @returns {Promise<string[]>}
 */
async function listEvalFiles() {
  try {
    return await readdir(EVALS_DIR);
  } catch (err) {
    if (err.code === 'ENOENT') {
      return [];
    }
    throw err;
  }
}

/**
 * Asserts that at least one results file exists in data/evals and parses
 * the first one.
 *
 * @returns {Promise<{ fileName: string, results: any }>}
 */
async function readFirstEvalResult() {
  const evalFiles = await listEvalFiles();
  // Fail with a clear assertion instead of trying to read 'data/evals/undefined'.
  expect(evalFiles.length).toBeGreaterThan(0);
  const [fileName] = evalFiles;
  const resultFile = await readFile(`${EVALS_DIR}/${fileName}`, 'utf-8');
  return { fileName, results: JSON.parse(resultFile) };
}

describe('ft eval', () => {
  let testDir;
  let originalCwd;
  let originalFetch;
  let originalApiKey;

  beforeEach(async () => {
    // Create a temporary directory for each test and run inside it
    testDir = await mkdtemp(join(tmpdir(), 'ft-eval-test-'));
    originalCwd = process.cwd();
    process.chdir(testDir);
    // Provide a fake API key, remembering whatever the environment had
    originalApiKey = process.env.TOGETHER_API_KEY;
    process.env.TOGETHER_API_KEY = 'test-api-key';
    // Remember the real fetch so tests can freely replace it
    originalFetch = globalThis.fetch;
  });

  afterEach(async () => {
    // Leave the temp directory before deleting it
    process.chdir(originalCwd);
    // Undo global state first so a failing cleanup cannot leak it into later tests
    vi.restoreAllMocks();
    globalThis.fetch = originalFetch;
    if (originalApiKey === undefined) {
      delete process.env.TOGETHER_API_KEY;
    } else {
      process.env.TOGETHER_API_KEY = originalApiKey;
    }
    await rm(testDir, { recursive: true, force: true });
  });

  it('should evaluate validation examples with completed model', async () => {
    await writeProjectConfig();
    await writeValidationData(2);
    mockChatCompletions(() => 'Model response', { strictUrl: true });
    queuePromptAnswers(
      { action: 'score' }, // First example: action
      { score: '4' }, // First example: score
      { action: 'score' }, // Second example: action
      { score: '5' }, // Second example: score
    );

    const logOutput = await runEval();

    // Verify inference API was called twice
    expect(globalThis.fetch).toHaveBeenCalledTimes(2);
    // Verify output shows evaluation progress
    expect(logOutput).toContain('Model Evaluation');
    expect(logOutput).toContain('model-abc-123');
    expect(logOutput).toContain('Example 1 of 2');
    expect(logOutput).toContain('Example 2 of 2');
    expect(logOutput).toContain('Evaluation Complete');
    // Verify eval results file was created
    const evalFiles = await listEvalFiles();
    expect(evalFiles.length).toBeGreaterThan(0);
    expect(evalFiles[0]).toMatch(/^eval-model-abc-123-\d{4}-\d{2}-\d{2}\.json$/);
  });

  it('should handle skip action', async () => {
    await writeProjectConfig();
    await writeValidationData(1);
    mockChatCompletions(() => 'Model response');
    queuePromptAnswers({ action: 'skip' });

    const logOutput = await runEval();

    // Verify output shows no examples evaluated
    expect(logOutput).toContain('No examples evaluated');
  });

  it('should handle quit action', async () => {
    await writeProjectConfig();
    await writeValidationData(1);
    mockChatCompletions(() => 'Model response');
    queuePromptAnswers({ action: 'quit' });

    const logOutput = await runEval();

    // Verify output shows quitting message
    expect(logOutput).toContain('Quitting');
  });

  it('should handle no completed models', async () => {
    await writeProjectConfig([RUNNING_RUN]);

    const logOutput = await runEval();

    // Verify output shows no completed models message
    expect(logOutput).toContain('No Completed Models Found');
  });

  it('should handle no validation data', async () => {
    // Completed model, but data/val.jsonl is deliberately not created
    await writeProjectConfig();

    const logOutput = await runEval();

    // Verify output shows no validation data message
    expect(logOutput).toContain('No Validation Data Found');
  });

  it('should handle inference API errors gracefully', async () => {
    await writeProjectConfig();
    await writeValidationData(1);
    mockInferenceFailure();

    const logOutput = await runEval();

    // Verify output shows error and skipped message
    expect(logOutput).toContain('Error running inference');
    expect(logOutput).toContain('No examples evaluated');
  });

  it('should fail if project not initialized', async () => {
    // No .ftpipeline.json is written for this test
    await expectEvalToExitWith('Project not initialized');
  });

  it('should fail if API key not set', async () => {
    delete process.env.TOGETHER_API_KEY;
    await writeProjectConfig();
    await writeValidationData(1);

    await expectEvalToExitWith('TOGETHER_API_KEY environment variable is required');
  });

  describe('--compare mode', () => {
    it('should compare base model vs fine-tuned model with blind A/B test', async () => {
      await writeProjectConfig();
      await writeValidationData(2);
      // Return different responses on alternating calls
      mockChatCompletions(alternatingResponses(), { strictUrl: true });
      queuePromptAnswers(
        { action: 'score' }, // First example: action
        { score: '4' }, // First example: score A
        { score: '5' }, // First example: score B
        { action: 'score' }, // Second example: action
        { score: '3' }, // Second example: score A
        { score: '4' }, // Second example: score B
      );

      const logOutput = await runEval('--compare');

      // Verify inference API was called 4 times (2 models x 2 examples)
      expect(globalThis.fetch).toHaveBeenCalledTimes(4);
      // Verify output shows comparison mode
      expect(logOutput).toContain('Model Comparison (Blind A/B Test)');
      expect(logOutput).toContain('Base model: meta-llama/Llama-3.3-70B-Instruct-Turbo');
      expect(logOutput).toContain('Fine-tuned model: model-abc-123');
      expect(logOutput).toContain('Model A Output:');
      expect(logOutput).toContain('Model B Output:');
      expect(logOutput).toContain('Comparison Results - The Reveal!');
      expect(logOutput).toContain('Average Scores:');
      expect(logOutput).toContain('Head-to-Head:');
      // Verify comparison results file was created
      const { fileName, results } = await readFirstEvalResult();
      expect(fileName).toMatch(/^compare-model-abc-123-\d{4}-\d{2}-\d{2}\.json$/);
      // Verify file contents
      expect(results).toHaveProperty('baseModelId', 'meta-llama/Llama-3.3-70B-Instruct-Turbo');
      expect(results).toHaveProperty('fineTunedModelId', 'model-abc-123');
      expect(results).toHaveProperty('totalExamples', 2);
      expect(results).toHaveProperty('baseModelAvgScore');
      expect(results).toHaveProperty('fineTunedModelAvgScore');
      expect(results).toHaveProperty('baseModelWins');
      expect(results).toHaveProperty('fineTunedModelWins');
      expect(results).toHaveProperty('ties');
      expect(results.results).toHaveLength(2);
    });

    it('should handle skip action in compare mode', async () => {
      await writeProjectConfig();
      await writeValidationData(1);
      mockChatCompletions(() => 'Response');
      queuePromptAnswers({ action: 'skip' });

      const logOutput = await runEval('--compare');

      // Verify output shows no examples evaluated
      expect(logOutput).toContain('No examples evaluated');
    });

    it('should handle quit action in compare mode', async () => {
      await writeProjectConfig();
      await writeValidationData(1);
      mockChatCompletions(() => 'Response');
      queuePromptAnswers({ action: 'quit' });

      const logOutput = await runEval('--compare');

      // Verify output shows quitting message
      expect(logOutput).toContain('Quitting comparison');
    });

    it('should handle inference errors in compare mode', async () => {
      await writeProjectConfig();
      await writeValidationData(1);
      mockInferenceFailure();

      const logOutput = await runEval('--compare');

      // Verify output shows error and skipped message
      expect(logOutput).toContain('Error running inference');
      expect(logOutput).toContain('No examples evaluated');
    });

    it('should randomize model order (blind test)', async () => {
      await writeProjectConfig();
      await writeValidationData(1);
      mockChatCompletions(alternatingResponses());
      queuePromptAnswers({ action: 'score' }, { score: '4' }, { score: '5' });

      await runEval('--compare');

      // Verify results file contains randomization info
      const { results } = await readFirstEvalResult();
      // Check that each result has modelAIsBase flag
      expect(results.results[0]).toHaveProperty('modelAIsBase');
      expect(results.results[0]).toHaveProperty('modelAOutput');
      expect(results.results[0]).toHaveProperty('modelBOutput');
      expect(results.results[0]).toHaveProperty('baseOutput');
      expect(results.results[0]).toHaveProperty('fineTunedOutput');
    });

    it('should calculate statistics correctly in compare mode', async () => {
      await writeProjectConfig();
      await writeValidationData(1);
      mockChatCompletions(() => 'Response');
      // Pin Math.random so the A/B assignment is deterministic. This test relies
      // on a value < 0.5 making Model A the base model. A spy (rather than direct
      // assignment) is restored by vi.restoreAllMocks() even if an assertion fails.
      vi.spyOn(Math, 'random').mockReturnValue(0.1);
      queuePromptAnswers(
        { action: 'score' },
        { score: '3' }, // Base model (A) gets 3
        { score: '5' }, // Fine-tuned model (B) gets 5
      );

      await runEval('--compare');

      // Verify results
      const { results } = await readFirstEvalResult();
      expect(results.baseModelAvgScore).toBe(3);
      expect(results.fineTunedModelAvgScore).toBe(5);
      expect(results.fineTunedModelWins).toBe(1);
      expect(results.baseModelWins).toBe(0);
      expect(results.ties).toBe(0);
    });
  });
});
//# sourceMappingURL=eval.test.js.map