@aeye/models 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/ReplicateScrape.md +54 -0
  2. package/dist/scripts/codegen.d.ts +21 -0
  3. package/dist/scripts/codegen.d.ts.map +1 -0
  4. package/dist/scripts/codegen.js +102 -0
  5. package/dist/scripts/codegen.js.map +1 -0
  6. package/dist/scripts/scrape.d.ts +19 -0
  7. package/dist/scripts/scrape.d.ts.map +1 -0
  8. package/dist/scripts/scrape.js +146 -0
  9. package/dist/scripts/scrape.js.map +1 -0
  10. package/dist/scripts/scrapers/__tests__/aws.test.d.ts +8 -0
  11. package/dist/scripts/scrapers/__tests__/aws.test.d.ts.map +1 -0
  12. package/dist/scripts/scrapers/__tests__/aws.test.js +73 -0
  13. package/dist/scripts/scrapers/__tests__/aws.test.js.map +1 -0
  14. package/dist/scripts/scrapers/aws.d.ts +12 -0
  15. package/dist/scripts/scrapers/aws.d.ts.map +1 -0
  16. package/dist/scripts/scrapers/aws.js +314 -0
  17. package/dist/scripts/scrapers/aws.js.map +1 -0
  18. package/dist/scripts/scrapers/openai.d.ts +12 -0
  19. package/dist/scripts/scrapers/openai.d.ts.map +1 -0
  20. package/dist/scripts/scrapers/openai.js +490 -0
  21. package/dist/scripts/scrapers/openai.js.map +1 -0
  22. package/dist/scripts/scrapers/openrouter.d.ts +13 -0
  23. package/dist/scripts/scrapers/openrouter.d.ts.map +1 -0
  24. package/dist/scripts/scrapers/openrouter.js +156 -0
  25. package/dist/scripts/scrapers/openrouter.js.map +1 -0
  26. package/dist/scripts/scrapers/replicate.d.ts +12 -0
  27. package/dist/scripts/scrapers/replicate.d.ts.map +1 -0
  28. package/dist/scripts/scrapers/replicate.js +305 -0
  29. package/dist/scripts/scrapers/replicate.js.map +1 -0
  30. package/dist/src/index.d.ts +11 -0
  31. package/dist/src/index.d.ts.map +1 -0
  32. package/dist/src/index.js +11 -0
  33. package/dist/src/index.js.map +1 -0
  34. package/dist/src/models/aws.d.ts +11 -0
  35. package/dist/src/models/aws.d.ts.map +1 -0
  36. package/dist/src/models/aws.js +2632 -0
  37. package/dist/src/models/aws.js.map +1 -0
  38. package/dist/src/models/index.d.ts +15 -0
  39. package/dist/src/models/index.d.ts.map +1 -0
  40. package/dist/src/models/index.js +18 -0
  41. package/dist/src/models/index.js.map +1 -0
  42. package/dist/src/models/openai.d.ts +11 -0
  43. package/dist/src/models/openai.d.ts.map +1 -0
  44. package/dist/src/models/openai.js +2207 -0
  45. package/dist/src/models/openai.js.map +1 -0
  46. package/dist/src/models/openrouter.d.ts +11 -0
  47. package/dist/src/models/openrouter.d.ts.map +1 -0
  48. package/dist/src/models/openrouter.js +9786 -0
  49. package/dist/src/models/openrouter.js.map +1 -0
  50. package/dist/src/models/replicate.d.ts +11 -0
  51. package/dist/src/models/replicate.d.ts.map +1 -0
  52. package/dist/src/models/replicate.js +4106 -0
  53. package/dist/src/models/replicate.js.map +1 -0
  54. package/dist/src/transformers/index.d.ts +23 -0
  55. package/dist/src/transformers/index.d.ts.map +1 -0
  56. package/dist/src/transformers/index.js +24 -0
  57. package/dist/src/transformers/index.js.map +1 -0
  58. package/package.json +50 -0
  59. package/scripts/codegen.ts +117 -0
  60. package/scripts/scrape.ts +182 -0
  61. package/scripts/scrapers/__tests__/aws.test.ts +86 -0
  62. package/scripts/scrapers/aws.ts +370 -0
  63. package/scripts/scrapers/openai.ts +619 -0
  64. package/scripts/scrapers/openrouter.ts +214 -0
  65. package/scripts/scrapers/replicate.ts +448 -0
  66. package/scripts/tsconfig.json +24 -0
  67. package/src/index.ts +11 -0
  68. package/src/models/aws.ts +2634 -0
  69. package/src/models/index.ts +21 -0
  70. package/src/models/openai.ts +2209 -0
  71. package/src/models/openrouter.ts +9788 -0
  72. package/src/models/replicate.ts +4108 -0
  73. package/src/transformers/index.ts +26 -0
  74. package/tsconfig.json +14 -0
@@ -0,0 +1,619 @@
1
+ /**
2
+ * OpenAI Model Scraper
3
+ *
4
+ * Scrapes model information from OpenAI documentation using Puppeteer
5
+ */
6
+
7
+ import * as fs from 'fs/promises';
8
+ import * as path from 'path';
9
+ import * as puppeteer from 'puppeteer';
10
+ import type { ModelInfo, ModelCapability, ModelTier, ModelParameter } from '@aeye/ai';
11
+ import { writeModelTS } from '../codegen';
12
+
13
+
14
+ const PerformanceScoreMap: Record<string, number> = {
15
+ 'highest': 1.0,
16
+ 'higher': 0.8,
17
+ 'high': 0.6,
18
+ 'average': 0.4,
19
+ 'low': 0.2,
20
+ 'lowest': 0.1,
21
+ };
22
+ const IntelligenceScoreMap: Record<string, number> = {
23
+ 'highest': 1.0,
24
+ 'higher': 0.8,
25
+ 'high': 0.6,
26
+ 'average': 0.4,
27
+ 'low': 0.2,
28
+ 'lowest': 0.1,
29
+ };
30
+ const ReasoningScoreMap: Record<string, number> = {
31
+ 'highest': 1.0,
32
+ 'higher': 0.8,
33
+ 'high': 0.6,
34
+ 'average': 0.4,
35
+ 'low': 0.2,
36
+ 'lowest': 0.1,
37
+ };
38
+ const SpeedScoreMap: Record<string, number> = {
39
+ 'fastest': 120,
40
+ 'very fast': 100,
41
+ 'fast': 80,
42
+ 'medium': 60,
43
+ 'slow': 40,
44
+ 'slowest': 20,
45
+ };
46
+
47
/**
 * Raw model data as scraped from an OpenAI documentation page, before
 * conversion into the shared `ModelInfo` shape (see `convertOpenAIModel`).
 */
interface OpenAIModelData {
  /** OpenAI model id, e.g. as returned by the /v1/models endpoint. */
  id: string;
  /** Display name; the scraper currently sets this to the model id. */
  name: string;
  performance?: string; // Highest|Higher|High|Average|Low|Lowest
  intelligence?: string; // Highest|Higher|High|Average|Low|Lowest
  reasoning?: string; // Highest|Higher|High|Average|Low|Lowest
  speed?: string; // Fastest|Very fast|Fast|Medium|Slow|Slowest
  /** Capabilities inferred from pricing sections and the Modalities table. */
  capabilities: Set<ModelCapability>;
  /** Parameters inferred from which endpoint panels appear on the page. */
  supportedParameters?: Set<ModelParameter>;
  /** Context window size in tokens, when shown in the page header. */
  contextWindow?: number;
  /** Maximum output tokens, when shown in the page header. */
  maxOutputTokens?: number;
  /** Knowledge cutoff date as lower-cased page text, e.g. "oct 1, 2023". */
  knowledgeCutoff?: string;
  /** True when the page advertises "Reasoning token support". */
  reasoningTokenSupport?: boolean;
  /** Prices as scraped; units follow the docs page (per-1M tokens presumably — not verified here). */
  pricing: {
    textTokens?: {
      input?: number;
      output?: number;
      cached?: number;
    };
    audioTokens?: {
      input?: number;
      output?: number;
    };
    imageTokens?: {
      input?: number;
      // Image *generation* pricing: one entry per quality level, each with
      // per-resolution costs.
      output?: {
        quality: string; // e.g., low, medium, high
        sizes: {
          width: number;
          height: number;
          cost: number;
        }[]
      }[];
    };
    embeddings?: {
      cost?: number;
    };
  };
}
86
+
87
+ /**
88
+ * Scrape OpenAI models list page
89
+ */
90
+ async function scrapeModelsListPage(): Promise<string[]> {
91
+ console.log('Scraping OpenAI models list...');
92
+
93
+ const models = await fetch('https://api.openai.com/v1/models', {
94
+ headers: {
95
+ 'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
96
+ },
97
+ });
98
+
99
+ const data = await models.json() as { data: { id: string }[] };
100
+
101
+ if (!data.data || !Array.isArray(data.data)) {
102
+ throw new Error('Invalid response from OpenAI API');
103
+ }
104
+
105
+ return data.data.map((model: any) => model.id);
106
+ }
107
+
108
/**
 * Scrape a single model's documentation page for details.
 *
 * Opens a new Puppeteer page, loads the model's docs page, and extracts
 * header stats, pricing, modalities, capability flags, and supported
 * parameters by running regexes over the page's flattened body text/HTML.
 *
 * @param modelId - OpenAI model id (possibly a dated snapshot id).
 * @param browser - Shared Puppeteer browser instance; this function opens
 *   and always closes its own page.
 * @returns The model id plus the scraped data, or `data: null` when the
 *   page is missing (404) or yields no recognizable content.
 */
async function scrapeModelDetails(modelId: string, browser: puppeteer.Browser): Promise<{
  modelId: string;
  data: OpenAIModelData | null;
}> {
  const page = await browser.newPage();

  try {
    // Use a desktop Chrome UA; presumably to avoid bot-detection variants of
    // the docs page — TODO confirm this is still required.
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );

    // Dated snapshot ids (…-2024 or …-2024-08-06) live under the base model's
    // page with a ?snapshot= query; strip the date suffix to get the slug.
    // NOTE: "-NNNN" (5 chars incl. dash) is tested before the full
    // "-YYYY-MM-DD" (11 chars) form, so a 4-digit suffix always wins.
    const modelSlug = /-\d{4}$/.test(modelId)
      ? modelId.slice(0, -5)
      : /-\d{4}-\d{2}-\d{2}$/.test(modelId)
        ? modelId.slice(0, -11)
        : modelId;

    let url = `https://platform.openai.com/docs/models/${modelSlug}`;
    if (modelSlug !== modelId) {
      url += '?snapshot=' + modelId;
    }

    const response = await page.goto(url, {
      waitUntil: ['domcontentloaded', 'networkidle0'],
      timeout: 30000,
    });

    // Unknown model page: report "no data" rather than throwing.
    if (!response || response.status() === 404) {
      return { modelId, data: null };
    }

    // Wait for main content to load
    try {
      // await page.waitForSelector('.docs-scroll-container', { timeout: 30000 });
    } catch (e) {}

    const modelData: OpenAIModelData = {
      id: modelId,
      name: modelId,
      pricing: {},
      capabilities: new Set<ModelCapability>(),
    };

    // Flattened page text drives all regex extraction below; the raw HTML is
    // only used for endpoint-panel detection and debug dumps.
    const bodyText = await page.$eval('body', el => el.textContent || '');
    const bodyHtml = await page.$eval('body', el => el.innerHTML || '');

    // Extract the first capture group, lower-cased, or undefined.
    const grab = (regex: RegExp, fromText: string = bodyText): string | undefined => {
      const match = fromText.match(regex);
      return match ? match[1].toLowerCase() : undefined;
    };
    // Extract the first capture group as a number (commas stripped), or undefined.
    const grabNumber = (regex: RegExp, fromText: string = bodyText): number | undefined => {
      const match = fromText.match(regex);
      return match ? parseFloat(match[1].replace(/,/g, '')) : undefined;
    };

    // ===== HEADER =====
    // NOTE(review): the "Performance" label feeds `intelligence` and the
    // "Intelligence" label feeds `performance` — looks swapped; confirm
    // against the live page layout before changing.
    modelData.intelligence = grab(/Performance\s*(Highest|Higher|High|Average|Low|Lowest)/i);
    modelData.performance = grab(/Intelligence\s*(Highest|Higher|High|Average|Low|Lowest)/i);
    modelData.reasoning = grab(/Reasoning\s*(Highest|Higher|High|Average|Low|Lowest)/i);
    modelData.speed = grab(/Speed\s*(Fastest|Very fast|Fast|Medium|Slow|Slowest)/i);
    modelData.contextWindow = grabNumber(/([0-9,]+)\s+context window/i);
    modelData.maxOutputTokens = grabNumber(/([0-9,]+)\s+max output tokens/i);
    modelData.knowledgeCutoff = grab(/((Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Oct|October|Sep|Sept|September|Nov|November|Dec|December)\s+\d{1,2},\s+\d{4})\s+knowledge\s+cutoff/i);
    modelData.reasoningTokenSupport = /Reasoning token support/i.test(bodyText);

    // Track whether anything from the header block was found; used later to
    // decide if the page produced usable data at all.
    const headerKeys: (keyof OpenAIModelData)[] = [
      'intelligence',
      'performance',
      'reasoning',
      'speed',
      'contextWindow',
      'maxOutputTokens',
      'knowledgeCutoff',
    ];
    const hasHeaderData = headerKeys.some((key) => modelData[key] !== undefined);
    let hasPricingData = false;

    // ===== PRICING =====
    // The flattened text contains "PricingPricing" where the section heading
    // appears; the pricing block ends where the "Modalities" table begins.
    // Split on known sub-section headings; split() interleaves headings and
    // their following content, so odd indices are headings, even are bodies.
    const pricingStart = bodyText.indexOf('PricingPricing');
    const pricingEnd = bodyText.indexOf('Modalities', pricingStart);
    const pricingSection = bodyText.slice(pricingStart, pricingEnd);
    const pricingSections = pricingSection.split(/(Text tokens|Image tokens|Image generation|Audio tokens|Speech generation|Embeddings|Modalities)/i);
    const modalitiesStart = pricingSections.indexOf('Modalities');
    const pricingSectionsTrimmed = modalitiesStart >= 0
      ? pricingSections.slice(0, modalitiesStart)
      : pricingSections;
    const pricingSectionsGrouped: Record<string, string[]> = {};

    // Group section bodies by (lower-cased) heading; a heading can repeat.
    for (let i = 1; i < pricingSectionsTrimmed.length; i += 2) {
      const key = pricingSectionsTrimmed[i].toLowerCase();
      const value = pricingSectionsTrimmed[i + 1] || '';
      pricingSectionsGrouped[key] = pricingSectionsGrouped[key] || [];
      pricingSectionsGrouped[key].push(value);
    }

    // --- Text token pricing (input / cached input / output) ---
    const textTokens = pricingSectionsGrouped['text tokens'] || [];
    if (textTokens.length > 0) {
      for (const section of textTokens) {
        // Note: each repeated section overwrites the previous one; the last
        // "Text tokens" block on the page wins.
        modelData.pricing.textTokens = {};
        modelData.pricing.textTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);
        modelData.pricing.textTokens.cached = grabNumber(/Cached input\$(\d+(\.\d+)?)/s, section);
        modelData.pricing.textTokens.output = grabNumber(/Output\$(\d+(\.\d+)?)/s, section);

        hasPricingData ||= modelData.pricing.textTokens.input !== undefined;
        hasPricingData ||= modelData.pricing.textTokens.output !== undefined;
        hasPricingData ||= modelData.pricing.textTokens.cached !== undefined;
      }
    }

    // --- Image *input* token pricing → implies vision capability ---
    const imageTokens = pricingSectionsGrouped['image tokens'] || [];
    if (imageTokens.length > 0) {
      for (const section of imageTokens) {
        modelData.pricing.imageTokens = {};
        modelData.pricing.imageTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);

        if (modelData.pricing.imageTokens.input !== undefined) {
          modelData.capabilities.add('vision');

          hasPricingData = true;
        }
      }
    }

    // --- Image *generation* pricing: per-quality, per-resolution costs ---
    const imageGeneration = pricingSectionsGrouped['image generation'] || [];
    if (imageGeneration.length > 0) {
      for (const section of imageGeneration) {
        const quality = /Quality\s*([^\d]+)/i.exec(section);
        if (!quality) continue;

        // Splitting on a capturing WxH regex yields repeating triples of
        // [width, height, text-containing-cost] after the leading chunk.
        const size = /(1024|1536|256|512|1792)x(1024|1536|256|512|1792)/;
        const sizeSections = section.split(size).slice(1);

        for (let i = 0; i < sizeSections.length; i += 3) {
          const width = parseInt(sizeSections[i], 10);
          const height = parseInt(sizeSections[i + 1], 10);
          const cost = parseFloat(sizeSections[i + 2].replace(/[\$\,]+/g, ''));

          if (!isFinite(cost) || !isFinite(width) || !isFinite(height)) continue;

          modelData.pricing.imageTokens = modelData.pricing.imageTokens || {};
          modelData.pricing.imageTokens.output = modelData.pricing.imageTokens.output || [];

          // Accumulate sizes under one entry per quality level.
          let qualityEntry = modelData.pricing.imageTokens.output.find((q) => q.quality === quality[1].trim().toLowerCase());
          if (!qualityEntry) {
            qualityEntry = { quality: quality[1].trim().toLowerCase(), sizes: [] };
            modelData.pricing.imageTokens.output.push(qualityEntry);
          }

          qualityEntry.sizes.push({ width, height, cost });
        }
      }

      if (modelData.pricing.imageTokens?.output?.length) {
        modelData.capabilities.add('image');

        hasPricingData = true;
      } else {
        // Drop an empty output array so downstream consumers don't see it.
        delete modelData.pricing.imageTokens?.output;
      }
    }

    // --- Audio token pricing → hearing (input) / audio (output) ---
    const audioTokens = pricingSectionsGrouped['audio tokens'] || [];
    if (audioTokens.length > 0) {
      for (const section of audioTokens) {
        modelData.pricing.audioTokens = {};
        modelData.pricing.audioTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);
        modelData.pricing.audioTokens.output = grabNumber(/Output\$(\d+(\.\d+)?)/s, section);
        if (modelData.pricing.audioTokens.input !== undefined) {
          modelData.capabilities.add('hearing');
          hasPricingData = true;
        }
        if (modelData.pricing.audioTokens.output !== undefined) {
          modelData.capabilities.add('audio');
          hasPricingData = true;
        }
      }
    }

    // --- Speech generation (TTS) pricing → audio output capability ---
    const speechGeneration = pricingSectionsGrouped['speech generation'] || [];
    if (speechGeneration.length > 0) {
      for (const section of speechGeneration) {
        modelData.pricing.audioTokens = modelData.pricing.audioTokens || {};
        modelData.pricing.audioTokens.output = grabNumber(/Cost\$(\d+(\.\d+)?)/s, section);
        if (modelData.pricing.audioTokens.output !== undefined) {
          modelData.capabilities.add('audio');
          hasPricingData = true;
        }
      }
    }

    // --- Embeddings pricing → embedding capability ---
    const embeddings = pricingSectionsGrouped['embeddings'] || [];
    if (embeddings.length > 0) {
      for (const section of embeddings) {
        modelData.pricing.embeddings = {};
        modelData.pricing.embeddings.cost = grabNumber(/Cost\$(\d+(\.\d+)?)/s, section);
        if (modelData.pricing.embeddings.cost !== undefined) {
          modelData.capabilities.add('embedding');
          hasPricingData = true;
        }
      }
    }

    // --- Transcription models use a different pricing table layout ---
    if (/Pricing.*Use caseTranscription/i.test(bodyText)) {
      modelData.pricing.audioTokens = modelData.pricing.audioTokens || {};
      modelData.pricing.audioTokens.input = grabNumber(/Pricing.*?Use caseTranscriptionCost\$(\d+(\.\d+)?)/s);

      if (modelData.pricing.audioTokens.input !== undefined) {
        modelData.capabilities.add('hearing');
      }

      hasPricingData ||= modelData.pricing.audioTokens.input !== undefined;
    }

    // ===== MODALITIES TABLE =====
    // Each modality row reads e.g. "TextInput and output" in flattened text.
    const textSupport = grab(/Text(Output only|Input only|Input and output|Not supported)/i) || '';
    const imageSupport = grab(/Image(Output only|Input only|Input and output|Not supported)/i) || '';
    const audioSupport = grab(/Audio(Output only|Input only|Input and output|Not supported)/i) || '';
    // const videoSupport = grab(/Video(Output only|Input only|Input and output|Not supported)/i) || '';

    // "input and output" contains both substrings, so it sets both flags.
    if (textSupport.includes('input')) {
      modelData.capabilities.add('chat');
    }
    if (imageSupport.includes('input')) {
      modelData.capabilities.add('vision');
    }
    if (imageSupport.includes('output')) {
      modelData.capabilities.add('image');
    }
    if (audioSupport.includes('input')) {
      modelData.capabilities.add('hearing');
    }
    if (audioSupport.includes('output')) {
      modelData.capabilities.add('audio');
    }
    if (/Streaming\s*Supported/i.test(bodyText)) {
      modelData.capabilities.add('streaming');
    }
    if (/(Function|Tool) calling\s*Supported/i.test(bodyText)) {
      modelData.capabilities.add('tools');
    }
    if (/Structured outputs?\s*Supported/i.test(bodyText)) {
      modelData.capabilities.add('structured');
      modelData.capabilities.add('json');
    }
    if (modelData.reasoningTokenSupport) {
      modelData.capabilities.add('reasoning');
    }

    // Supported parameters
    // Endpoint panels are detected via exact HTML markers; brittle against
    // docs-site markup changes by design (cheap to update).
    const chatEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Chat Completions</div>');
    const imageEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Image generation</div>');
    const embeddingEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Embeddings</div>');
    const transcriptionEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Transcription</div>');
    const speechEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Speech generation</div>');

    if (chatEndpoint) {
      modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
      modelData.supportedParameters.add('maxTokens');
      modelData.supportedParameters.add('temperature');
      modelData.supportedParameters.add('topP');
      modelData.supportedParameters.add('frequencyPenalty');
      modelData.supportedParameters.add('presencePenalty');
      modelData.supportedParameters.add('logitBias');
      modelData.supportedParameters.add('logProbabilities');
      if (modelData.reasoningTokenSupport) {
        modelData.supportedParameters.add('reason');
      }
      if (modelData.capabilities.has('tools')) {
        modelData.supportedParameters.add('tools');
        modelData.supportedParameters.add('toolChoice');
      }
      if (modelData.capabilities.has('structured')) {
        modelData.supportedParameters.add('responseFormat');
        modelData.supportedParameters.add('structuredOutput');
      }
      if (modelData.capabilities.has('json')) {
        modelData.supportedParameters.add('responseFormat');
      }
      // o3/o4 reasoning families don't accept `stop` — NOTE(review):
      // substring match also hits ids merely containing "o3"/"o4"; confirm.
      if (!modelId.includes('o3') && !modelId.includes('o4')) {
        modelData.supportedParameters.add('stop');
      }
    }
    if (imageEndpoint) {
      modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
      if (!modelId.startsWith('dall-e-3')) {
        modelData.supportedParameters.add('imageStyle');
        modelData.supportedParameters.add('imageMultiple');
      }
      if (modelId.startsWith('gpt-image-1')) {
        modelData.supportedParameters.add('imageBackground');
        modelData.supportedParameters.add('imageStream');
        modelData.supportedParameters.add('imageFormat');
      }
    }
    if (embeddingEndpoint) {
      modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
      if (modelId.startsWith('text-embedding-3')) {
        modelData.supportedParameters.add('embeddingDimensions');
      }
    }
    if (transcriptionEndpoint) {
      modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
      if (!modelId.startsWith('gpt-4o-transcribe-diarize')) {
        modelData.supportedParameters.add('transcribePrompt');
      }
      if (!modelId.startsWith('whisper-1')) {
        modelData.supportedParameters.add('transcribeStream');
      }
    }
    if (speechEndpoint) {
      modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
      if (!modelId.startsWith('tts-1')) {
        modelData.supportedParameters.add('speechInstructions');
      }
    }

    const hasCapabilitiesData = modelData.capabilities.size > 0;
    const populatedOverall = hasHeaderData || hasPricingData || hasCapabilitiesData;

    // Debug aid: dump the page HTML when no parameters were detected, so the
    // endpoint-panel markers can be re-derived. Assumes ./data/pages exists.
    if (!modelData.supportedParameters || modelData.supportedParameters.size === 0) {
      await fs.writeFile(`./data/pages/openai-${modelId.replace(/[^a-z]/gi, '')}.html`, bodyHtml);
    }

    if (!populatedOverall) {
      console.log(`⚠ No model data found on page ${modelId}`);

      await fs.writeFile(`./data/pages/openai-${modelId.replace(/[^a-z]/gi, '')}.txt`, bodyText);

      return { modelId, data: null };
    }

    return {
      modelId,
      data: modelData,
    };
  } catch (error) {
    // Per-model failures are logged and reported as "no data", not rethrown,
    // so one bad page doesn't abort the whole batch.
    console.log(` ✗ Error scraping ${modelId}:`, error instanceof Error ? error.message : error);
    return { modelId, data: null };
  } finally {
    // Always release the page; ignore close errors (browser may be gone).
    try {
      await page.close();
    } catch (e) {}
  }
}
453
+
454
+ /**
455
+ * Scrape model details in parallel with concurrency control
456
+ */
457
+ async function scrapeModelsParallel(
458
+ modelIds: string[],
459
+ concurrency: number = 5
460
+ ): Promise<OpenAIModelData[]> {
461
+ console.log(`\nScraping model details (concurrency: ${concurrency})...`);
462
+
463
+ const browser = await puppeteer.launch({ headless: true });
464
+ const results: OpenAIModelData[] = [];
465
+
466
+ try {
467
+ // Process in batches with concurrency control
468
+ for (let i = 0; i < modelIds.length; i += concurrency) {
469
+ const batch = modelIds.slice(i, i + concurrency);
470
+ console.log(` Processing batch ${Math.floor(i / concurrency) + 1}/${Math.ceil(modelIds.length / concurrency)} (${batch.length} models)...`);
471
+
472
+ const batchResults = await Promise.all(
473
+ batch.map((modelId) => scrapeModelDetails(modelId, browser))
474
+ );
475
+
476
+ // Store results
477
+ for (const { data } of batchResults) {
478
+ if (data) {
479
+ results.push(data);
480
+ }
481
+ }
482
+
483
+ console.log(` ✓ Scraped ${batchResults.filter((r) => r.data).length}/${batch.length} models`);
484
+ }
485
+
486
+ console.log(`✓ Scraped ${results.length}/${modelIds.length} models\n`);
487
+ } finally {
488
+ await browser.close();
489
+ }
490
+
491
+ return results;
492
+ }
493
+
494
+ /**
495
+ * Convert OpenAI model data to ModelInfo
496
+ */
497
+ function convertOpenAIModel(data: OpenAIModelData): ModelInfo {
498
+ const intelligenceScore = data.intelligence ? IntelligenceScoreMap[data.intelligence] || 0 : 0;
499
+ const performanceScore = data.performance ? PerformanceScoreMap[data.performance] || 0 : 0;
500
+ const reasoningScore = data.reasoning ? ReasoningScoreMap[data.reasoning] || 0 : 0;
501
+ const nonZeros = (intelligenceScore ? 1 : 0) + (performanceScore ? 1 : 0) + (reasoningScore ? 1 : 0);
502
+ const overall = nonZeros > 0 ? (intelligenceScore + performanceScore + reasoningScore) / nonZeros : 0;
503
+ const speedScore = data.speed ? SpeedScoreMap[data.speed] || 0 : 0;
504
+
505
+ const tier: ModelTier =
506
+ overall >= 0.8
507
+ ? 'flagship'
508
+ : speedScore >= 80
509
+ ? 'efficient'
510
+ : 'legacy';
511
+
512
+ return {
513
+ provider: 'openai',
514
+ id: data.id,
515
+ name: data.name,
516
+ capabilities: data.capabilities,
517
+ supportedParameters: data.supportedParameters,
518
+ tier,
519
+ contextWindow: data.contextWindow || 0,
520
+ maxOutputTokens: data.maxOutputTokens,
521
+ pricing: {
522
+ text: data.pricing.textTokens ? {
523
+ input: data.pricing.textTokens?.input,
524
+ output: data.pricing.textTokens?.output,
525
+ cached: data.pricing.textTokens?.cached,
526
+ } : undefined,
527
+ audio: data.pricing.audioTokens ? {
528
+ input: data.pricing.audioTokens?.input,
529
+ output: data.pricing.audioTokens?.output,
530
+ } : undefined,
531
+ image: data.pricing.imageTokens ? {
532
+ input: data.pricing.imageTokens?.input,
533
+ output: data.pricing.imageTokens?.output,
534
+ } : undefined,
535
+ embeddings: data.pricing.embeddings ? {
536
+ cost: data.pricing.embeddings?.cost,
537
+ } : undefined,
538
+ },
539
+ metadata: {
540
+ knowledgeCutoff: data.knowledgeCutoff,
541
+ intelligence: data.intelligence,
542
+ performance: data.performance,
543
+ reasoning: data.reasoning,
544
+ speed: data.speed,
545
+ },
546
+ };
547
+ }
548
+
549
+ /**
550
+ * Main scraper function
551
+ */
552
+ export async function scrapeOpenAI(
553
+ outputDir: string,
554
+ options: { concurrency?: number } = {}
555
+ ): Promise<void> {
556
+ const { concurrency = 5 } = options;
557
+
558
+ console.log('\n=== OpenAI Scraper ===\n');
559
+
560
+ // Scrape models list
561
+ const modelIds = await scrapeModelsListPage();
562
+
563
+ // Scrape details for each model in parallel
564
+ const models = await scrapeModelsParallel(modelIds, concurrency);
565
+
566
+ console.log(`✓ Scraped ${models.length} OpenAI models\n`);
567
+
568
+ // Save raw data
569
+ await fs.mkdir(outputDir, { recursive: true });
570
+
571
+ await fs.writeFile(
572
+ path.join(outputDir, 'openai-models.json'),
573
+ JSON.stringify({ data: models }, (key, value) => {
574
+ if (value instanceof Set) {
575
+ return Array.from(value);
576
+ }
577
+ return value;
578
+ }, 2)
579
+ );
580
+ console.log(`✓ Saved raw OpenAI models to openai-models.json`);
581
+
582
+ // Convert to ModelInfo format
583
+ const modelInfos = models.map(convertOpenAIModel);
584
+
585
+ // Save JSON for reference
586
+ await fs.writeFile(
587
+ path.join(outputDir, 'openai-modelinfo.json'),
588
+ JSON.stringify(modelInfos, (key, value) => {
589
+ if (value instanceof Set) {
590
+ return Array.from(value);
591
+ }
592
+ return value;
593
+ }, 2)
594
+ );
595
+ console.log(`✓ Saved ${modelInfos.length} models to JSON`);
596
+
597
+ // Generate TypeScript file
598
+ const srcDir = path.join(__dirname, '../../src/models');
599
+ await writeModelTS(modelInfos, 'openaiModels', path.join(srcDir, 'openai.ts'));
600
+ console.log(`✓ Generated TypeScript file: src/models/openai.ts`);
601
+
602
+ console.log('\n✓ OpenAI scraping complete\n');
603
+ }
604
+
605
+ // CLI execution
606
+ if (process.argv[1].endsWith('openai.ts')) {
607
+ const args = process.argv.slice(2);
608
+ const outputDir = args.find((arg) => !arg.startsWith('--')) || path.join(__dirname, '../../data');
609
+
610
+ const concurrencyArg = args.find((arg) => arg.startsWith('--concurrency='));
611
+ const concurrency = concurrencyArg
612
+ ? parseInt(concurrencyArg.split('=')[1], 10)
613
+ : 5;
614
+
615
+ scrapeOpenAI(outputDir, { concurrency }).catch((error) => {
616
+ console.error('✗ OpenAI scraping failed:', error);
617
+ process.exit(1);
618
+ });
619
+ }