@aeye/models 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ReplicateScrape.md +54 -0
- package/dist/scripts/codegen.d.ts +21 -0
- package/dist/scripts/codegen.d.ts.map +1 -0
- package/dist/scripts/codegen.js +102 -0
- package/dist/scripts/codegen.js.map +1 -0
- package/dist/scripts/scrape.d.ts +19 -0
- package/dist/scripts/scrape.d.ts.map +1 -0
- package/dist/scripts/scrape.js +146 -0
- package/dist/scripts/scrape.js.map +1 -0
- package/dist/scripts/scrapers/__tests__/aws.test.d.ts +8 -0
- package/dist/scripts/scrapers/__tests__/aws.test.d.ts.map +1 -0
- package/dist/scripts/scrapers/__tests__/aws.test.js +73 -0
- package/dist/scripts/scrapers/__tests__/aws.test.js.map +1 -0
- package/dist/scripts/scrapers/aws.d.ts +12 -0
- package/dist/scripts/scrapers/aws.d.ts.map +1 -0
- package/dist/scripts/scrapers/aws.js +314 -0
- package/dist/scripts/scrapers/aws.js.map +1 -0
- package/dist/scripts/scrapers/openai.d.ts +12 -0
- package/dist/scripts/scrapers/openai.d.ts.map +1 -0
- package/dist/scripts/scrapers/openai.js +490 -0
- package/dist/scripts/scrapers/openai.js.map +1 -0
- package/dist/scripts/scrapers/openrouter.d.ts +13 -0
- package/dist/scripts/scrapers/openrouter.d.ts.map +1 -0
- package/dist/scripts/scrapers/openrouter.js +156 -0
- package/dist/scripts/scrapers/openrouter.js.map +1 -0
- package/dist/scripts/scrapers/replicate.d.ts +12 -0
- package/dist/scripts/scrapers/replicate.d.ts.map +1 -0
- package/dist/scripts/scrapers/replicate.js +305 -0
- package/dist/scripts/scrapers/replicate.js.map +1 -0
- package/dist/src/index.d.ts +11 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +11 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/models/aws.d.ts +11 -0
- package/dist/src/models/aws.d.ts.map +1 -0
- package/dist/src/models/aws.js +2632 -0
- package/dist/src/models/aws.js.map +1 -0
- package/dist/src/models/index.d.ts +15 -0
- package/dist/src/models/index.d.ts.map +1 -0
- package/dist/src/models/index.js +18 -0
- package/dist/src/models/index.js.map +1 -0
- package/dist/src/models/openai.d.ts +11 -0
- package/dist/src/models/openai.d.ts.map +1 -0
- package/dist/src/models/openai.js +2207 -0
- package/dist/src/models/openai.js.map +1 -0
- package/dist/src/models/openrouter.d.ts +11 -0
- package/dist/src/models/openrouter.d.ts.map +1 -0
- package/dist/src/models/openrouter.js +9786 -0
- package/dist/src/models/openrouter.js.map +1 -0
- package/dist/src/models/replicate.d.ts +11 -0
- package/dist/src/models/replicate.d.ts.map +1 -0
- package/dist/src/models/replicate.js +4106 -0
- package/dist/src/models/replicate.js.map +1 -0
- package/dist/src/transformers/index.d.ts +23 -0
- package/dist/src/transformers/index.d.ts.map +1 -0
- package/dist/src/transformers/index.js +24 -0
- package/dist/src/transformers/index.js.map +1 -0
- package/package.json +50 -0
- package/scripts/codegen.ts +117 -0
- package/scripts/scrape.ts +182 -0
- package/scripts/scrapers/__tests__/aws.test.ts +86 -0
- package/scripts/scrapers/aws.ts +370 -0
- package/scripts/scrapers/openai.ts +619 -0
- package/scripts/scrapers/openrouter.ts +214 -0
- package/scripts/scrapers/replicate.ts +448 -0
- package/scripts/tsconfig.json +24 -0
- package/src/index.ts +11 -0
- package/src/models/aws.ts +2634 -0
- package/src/models/index.ts +21 -0
- package/src/models/openai.ts +2209 -0
- package/src/models/openrouter.ts +9788 -0
- package/src/models/replicate.ts +4108 -0
- package/src/transformers/index.ts +26 -0
- package/tsconfig.json +14 -0
|
@@ -0,0 +1,619 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Model Scraper
|
|
3
|
+
*
|
|
4
|
+
* Scrapes model information from OpenAI documentation using Puppeteer
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import * as fs from 'fs/promises';
|
|
8
|
+
import * as path from 'path';
|
|
9
|
+
import * as puppeteer from 'puppeteer';
|
|
10
|
+
import type { ModelInfo, ModelCapability, ModelTier, ModelParameter } from '@aeye/ai';
|
|
11
|
+
import { writeModelTS } from '../codegen';
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
const PerformanceScoreMap: Record<string, number> = {
|
|
15
|
+
'highest': 1.0,
|
|
16
|
+
'higher': 0.8,
|
|
17
|
+
'high': 0.6,
|
|
18
|
+
'average': 0.4,
|
|
19
|
+
'low': 0.2,
|
|
20
|
+
'lowest': 0.1,
|
|
21
|
+
};
|
|
22
|
+
const IntelligenceScoreMap: Record<string, number> = {
|
|
23
|
+
'highest': 1.0,
|
|
24
|
+
'higher': 0.8,
|
|
25
|
+
'high': 0.6,
|
|
26
|
+
'average': 0.4,
|
|
27
|
+
'low': 0.2,
|
|
28
|
+
'lowest': 0.1,
|
|
29
|
+
};
|
|
30
|
+
const ReasoningScoreMap: Record<string, number> = {
|
|
31
|
+
'highest': 1.0,
|
|
32
|
+
'higher': 0.8,
|
|
33
|
+
'high': 0.6,
|
|
34
|
+
'average': 0.4,
|
|
35
|
+
'low': 0.2,
|
|
36
|
+
'lowest': 0.1,
|
|
37
|
+
};
|
|
38
|
+
const SpeedScoreMap: Record<string, number> = {
|
|
39
|
+
'fastest': 120,
|
|
40
|
+
'very fast': 100,
|
|
41
|
+
'fast': 80,
|
|
42
|
+
'medium': 60,
|
|
43
|
+
'slow': 40,
|
|
44
|
+
'slowest': 20,
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
/**
 * Raw model data as extracted from an OpenAI documentation page, before
 * conversion into the shared `ModelInfo` shape.
 *
 * Rating fields hold the lowercased label scraped from the page header;
 * they are later translated to numbers via the score maps above.
 */
interface OpenAIModelData {
  // Model id as returned by the /v1/models API (also used as the name).
  id: string;
  name: string;
  performance?: string; // Highest|Higher|High|Average|Low|Lowest
  intelligence?: string; // Highest|Higher|High|Average|Low|Lowest
  reasoning?: string; // Highest|Higher|High|Average|Low|Lowest
  speed?: string; // Fastest|Very fast|Fast|Medium|Slow|Slowest
  // Capabilities inferred from pricing sections and the Modalities table.
  capabilities: Set<ModelCapability>;
  // Only populated when at least one endpoint card is detected on the page.
  supportedParameters?: Set<ModelParameter>;
  contextWindow?: number;
  maxOutputTokens?: number;
  // Scraped as a human-readable date string, e.g. "october 1, 2023".
  knowledgeCutoff?: string;
  // True when the page advertises "Reasoning token support".
  reasoningTokenSupport?: boolean;
  // All prices are taken verbatim from the docs (USD; units as shown there).
  pricing: {
    textTokens?: {
      input?: number;
      output?: number;
      cached?: number;
    };
    audioTokens?: {
      input?: number;
      output?: number;
    };
    imageTokens?: {
      input?: number;
      // Image *generation* pricing: per-quality tables of size/cost entries.
      output?: {
        quality: string; // e.g., low, medium, high
        sizes: {
          width: number;
          height: number;
          cost: number;
        }[]
      }[];
    };
    embeddings?: {
      cost?: number;
    };
  };
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Scrape OpenAI models list page
|
|
89
|
+
*/
|
|
90
|
+
async function scrapeModelsListPage(): Promise<string[]> {
|
|
91
|
+
console.log('Scraping OpenAI models list...');
|
|
92
|
+
|
|
93
|
+
const models = await fetch('https://api.openai.com/v1/models', {
|
|
94
|
+
headers: {
|
|
95
|
+
'Authorization': `Bearer ${process.env.OPENAI_API_KEY}`,
|
|
96
|
+
},
|
|
97
|
+
});
|
|
98
|
+
|
|
99
|
+
const data = await models.json() as { data: { id: string }[] };
|
|
100
|
+
|
|
101
|
+
if (!data.data || !Array.isArray(data.data)) {
|
|
102
|
+
throw new Error('Invalid response from OpenAI API');
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
return data.data.map((model: any) => model.id);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
* Scrape individual model page for details
|
|
110
|
+
*/
|
|
111
|
+
async function scrapeModelDetails(modelId: string, browser: puppeteer.Browser): Promise<{
|
|
112
|
+
modelId: string;
|
|
113
|
+
data: OpenAIModelData | null;
|
|
114
|
+
}> {
|
|
115
|
+
const page = await browser.newPage();
|
|
116
|
+
|
|
117
|
+
try {
|
|
118
|
+
await page.setUserAgent(
|
|
119
|
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
|
120
|
+
);
|
|
121
|
+
|
|
122
|
+
const modelSlug = /-\d{4}$/.test(modelId)
|
|
123
|
+
? modelId.slice(0, -5)
|
|
124
|
+
: /-\d{4}-\d{2}-\d{2}$/.test(modelId)
|
|
125
|
+
? modelId.slice(0, -11)
|
|
126
|
+
: modelId;
|
|
127
|
+
|
|
128
|
+
let url = `https://platform.openai.com/docs/models/${modelSlug}`;
|
|
129
|
+
if (modelSlug !== modelId) {
|
|
130
|
+
url += '?snapshot=' + modelId;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
const response = await page.goto(url, {
|
|
134
|
+
waitUntil: ['domcontentloaded', 'networkidle0'],
|
|
135
|
+
timeout: 30000,
|
|
136
|
+
});
|
|
137
|
+
|
|
138
|
+
if (!response || response.status() === 404) {
|
|
139
|
+
return { modelId, data: null };
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Wait for main content to load
|
|
143
|
+
try {
|
|
144
|
+
// await page.waitForSelector('.docs-scroll-container', { timeout: 30000 });
|
|
145
|
+
} catch (e) {}
|
|
146
|
+
|
|
147
|
+
const modelData: OpenAIModelData = {
|
|
148
|
+
id: modelId,
|
|
149
|
+
name: modelId,
|
|
150
|
+
pricing: {},
|
|
151
|
+
capabilities: new Set<ModelCapability>(),
|
|
152
|
+
};
|
|
153
|
+
|
|
154
|
+
const bodyText = await page.$eval('body', el => el.textContent || '');
|
|
155
|
+
const bodyHtml = await page.$eval('body', el => el.innerHTML || '');
|
|
156
|
+
|
|
157
|
+
const grab = (regex: RegExp, fromText: string = bodyText): string | undefined => {
|
|
158
|
+
const match = fromText.match(regex);
|
|
159
|
+
return match ? match[1].toLowerCase() : undefined;
|
|
160
|
+
};
|
|
161
|
+
const grabNumber = (regex: RegExp, fromText: string = bodyText): number | undefined => {
|
|
162
|
+
const match = fromText.match(regex);
|
|
163
|
+
return match ? parseFloat(match[1].replace(/,/g, '')) : undefined;
|
|
164
|
+
};
|
|
165
|
+
|
|
166
|
+
// ===== HEADER =====
|
|
167
|
+
modelData.intelligence = grab(/Performance\s*(Highest|Higher|High|Average|Low|Lowest)/i);
|
|
168
|
+
modelData.performance = grab(/Intelligence\s*(Highest|Higher|High|Average|Low|Lowest)/i);
|
|
169
|
+
modelData.reasoning = grab(/Reasoning\s*(Highest|Higher|High|Average|Low|Lowest)/i);
|
|
170
|
+
modelData.speed = grab(/Speed\s*(Fastest|Very fast|Fast|Medium|Slow|Slowest)/i);
|
|
171
|
+
modelData.contextWindow = grabNumber(/([0-9,]+)\s+context window/i);
|
|
172
|
+
modelData.maxOutputTokens = grabNumber(/([0-9,]+)\s+max output tokens/i);
|
|
173
|
+
modelData.knowledgeCutoff = grab(/((Jan|January|Feb|February|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|August|Oct|October|Sep|Sept|September|Nov|November|Dec|December)\s+\d{1,2},\s+\d{4})\s+knowledge\s+cutoff/i);
|
|
174
|
+
modelData.reasoningTokenSupport = /Reasoning token support/i.test(bodyText);
|
|
175
|
+
|
|
176
|
+
const headerKeys: (keyof OpenAIModelData)[] = [
|
|
177
|
+
'intelligence',
|
|
178
|
+
'performance',
|
|
179
|
+
'reasoning',
|
|
180
|
+
'speed',
|
|
181
|
+
'contextWindow',
|
|
182
|
+
'maxOutputTokens',
|
|
183
|
+
'knowledgeCutoff',
|
|
184
|
+
];
|
|
185
|
+
const hasHeaderData = headerKeys.some((key) => modelData[key] !== undefined);
|
|
186
|
+
let hasPricingData = false;
|
|
187
|
+
|
|
188
|
+
const pricingStart = bodyText.indexOf('PricingPricing');
|
|
189
|
+
const pricingEnd = bodyText.indexOf('Modalities', pricingStart);
|
|
190
|
+
const pricingSection = bodyText.slice(pricingStart, pricingEnd);
|
|
191
|
+
const pricingSections = pricingSection.split(/(Text tokens|Image tokens|Image generation|Audio tokens|Speech generation|Embeddings|Modalities)/i);
|
|
192
|
+
const modalitiesStart = pricingSections.indexOf('Modalities');
|
|
193
|
+
const pricingSectionsTrimmed = modalitiesStart >= 0
|
|
194
|
+
? pricingSections.slice(0, modalitiesStart)
|
|
195
|
+
: pricingSections;
|
|
196
|
+
const pricingSectionsGrouped: Record<string, string[]> = {};
|
|
197
|
+
|
|
198
|
+
for (let i = 1; i < pricingSectionsTrimmed.length; i += 2) {
|
|
199
|
+
const key = pricingSectionsTrimmed[i].toLowerCase();
|
|
200
|
+
const value = pricingSectionsTrimmed[i + 1] || '';
|
|
201
|
+
pricingSectionsGrouped[key] = pricingSectionsGrouped[key] || [];
|
|
202
|
+
pricingSectionsGrouped[key].push(value);
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const textTokens = pricingSectionsGrouped['text tokens'] || [];
|
|
206
|
+
if (textTokens.length > 0) {
|
|
207
|
+
for (const section of textTokens) {
|
|
208
|
+
modelData.pricing.textTokens = {};
|
|
209
|
+
modelData.pricing.textTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);
|
|
210
|
+
modelData.pricing.textTokens.cached = grabNumber(/Cached input\$(\d+(\.\d+)?)/s, section);
|
|
211
|
+
modelData.pricing.textTokens.output = grabNumber(/Output\$(\d+(\.\d+)?)/s, section);
|
|
212
|
+
|
|
213
|
+
hasPricingData ||= modelData.pricing.textTokens.input !== undefined;
|
|
214
|
+
hasPricingData ||= modelData.pricing.textTokens.output !== undefined;
|
|
215
|
+
hasPricingData ||= modelData.pricing.textTokens.cached !== undefined;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
const imageTokens = pricingSectionsGrouped['image tokens'] || [];
|
|
220
|
+
if (imageTokens.length > 0) {
|
|
221
|
+
for (const section of imageTokens) {
|
|
222
|
+
modelData.pricing.imageTokens = {};
|
|
223
|
+
modelData.pricing.imageTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);
|
|
224
|
+
|
|
225
|
+
if (modelData.pricing.imageTokens.input !== undefined) {
|
|
226
|
+
modelData.capabilities.add('vision');
|
|
227
|
+
|
|
228
|
+
hasPricingData = true;
|
|
229
|
+
}
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
const imageGeneration = pricingSectionsGrouped['image generation'] || [];
|
|
234
|
+
if (imageGeneration.length > 0) {
|
|
235
|
+
for (const section of imageGeneration) {
|
|
236
|
+
const quality = /Quality\s*([^\d]+)/i.exec(section);
|
|
237
|
+
if (!quality) continue;
|
|
238
|
+
|
|
239
|
+
const size = /(1024|1536|256|512|1792)x(1024|1536|256|512|1792)/;
|
|
240
|
+
const sizeSections = section.split(size).slice(1);
|
|
241
|
+
|
|
242
|
+
for (let i = 0; i < sizeSections.length; i += 3) {
|
|
243
|
+
const width = parseInt(sizeSections[i], 10);
|
|
244
|
+
const height = parseInt(sizeSections[i + 1], 10);
|
|
245
|
+
const cost = parseFloat(sizeSections[i + 2].replace(/[\$\,]+/g, ''));
|
|
246
|
+
|
|
247
|
+
if (!isFinite(cost) || !isFinite(width) || !isFinite(height)) continue;
|
|
248
|
+
|
|
249
|
+
modelData.pricing.imageTokens = modelData.pricing.imageTokens || {};
|
|
250
|
+
modelData.pricing.imageTokens.output = modelData.pricing.imageTokens.output || [];
|
|
251
|
+
|
|
252
|
+
let qualityEntry = modelData.pricing.imageTokens.output.find((q) => q.quality === quality[1].trim().toLowerCase());
|
|
253
|
+
if (!qualityEntry) {
|
|
254
|
+
qualityEntry = { quality: quality[1].trim().toLowerCase(), sizes: [] };
|
|
255
|
+
modelData.pricing.imageTokens.output.push(qualityEntry);
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
qualityEntry.sizes.push({ width, height, cost });
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
if (modelData.pricing.imageTokens?.output?.length) {
|
|
263
|
+
modelData.capabilities.add('image');
|
|
264
|
+
|
|
265
|
+
hasPricingData = true;
|
|
266
|
+
} else {
|
|
267
|
+
delete modelData.pricing.imageTokens?.output;
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
const audioTokens = pricingSectionsGrouped['audio tokens'] || [];
|
|
272
|
+
if (audioTokens.length > 0) {
|
|
273
|
+
for (const section of audioTokens) {
|
|
274
|
+
modelData.pricing.audioTokens = {};
|
|
275
|
+
modelData.pricing.audioTokens.input = grabNumber(/Input\$(\d+(\.\d+)?)/s, section);
|
|
276
|
+
modelData.pricing.audioTokens.output = grabNumber(/Output\$(\d+(\.\d+)?)/s, section);
|
|
277
|
+
if (modelData.pricing.audioTokens.input !== undefined) {
|
|
278
|
+
modelData.capabilities.add('hearing');
|
|
279
|
+
hasPricingData = true;
|
|
280
|
+
}
|
|
281
|
+
if (modelData.pricing.audioTokens.output !== undefined) {
|
|
282
|
+
modelData.capabilities.add('audio');
|
|
283
|
+
hasPricingData = true;
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
const speechGeneration = pricingSectionsGrouped['speech generation'] || [];
|
|
289
|
+
if (speechGeneration.length > 0) {
|
|
290
|
+
for (const section of speechGeneration) {
|
|
291
|
+
modelData.pricing.audioTokens = modelData.pricing.audioTokens || {};
|
|
292
|
+
modelData.pricing.audioTokens.output = grabNumber(/Cost\$(\d+(\.\d+)?)/s, section);
|
|
293
|
+
if (modelData.pricing.audioTokens.output !== undefined) {
|
|
294
|
+
modelData.capabilities.add('audio');
|
|
295
|
+
hasPricingData = true;
|
|
296
|
+
}
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
const embeddings = pricingSectionsGrouped['embeddings'] || [];
|
|
301
|
+
if (embeddings.length > 0) {
|
|
302
|
+
for (const section of embeddings) {
|
|
303
|
+
modelData.pricing.embeddings = {};
|
|
304
|
+
modelData.pricing.embeddings.cost = grabNumber(/Cost\$(\d+(\.\d+)?)/s, section);
|
|
305
|
+
if (modelData.pricing.embeddings.cost !== undefined) {
|
|
306
|
+
modelData.capabilities.add('embedding');
|
|
307
|
+
hasPricingData = true;
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
if (/Pricing.*Use caseTranscription/i.test(bodyText)) {
|
|
313
|
+
modelData.pricing.audioTokens = modelData.pricing.audioTokens || {};
|
|
314
|
+
modelData.pricing.audioTokens.input = grabNumber(/Pricing.*?Use caseTranscriptionCost\$(\d+(\.\d+)?)/s);
|
|
315
|
+
|
|
316
|
+
if (modelData.pricing.audioTokens.input !== undefined) {
|
|
317
|
+
modelData.capabilities.add('hearing');
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
hasPricingData ||= modelData.pricing.audioTokens.input !== undefined;
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
const textSupport = grab(/Text(Output only|Input only|Input and output|Not supported)/i) || '';
|
|
324
|
+
const imageSupport = grab(/Image(Output only|Input only|Input and output|Not supported)/i) || '';
|
|
325
|
+
const audioSupport = grab(/Audio(Output only|Input only|Input and output|Not supported)/i) || '';
|
|
326
|
+
// const videoSupport = grab(/Video(Output only|Input only|Input and output|Not supported)/i) || '';
|
|
327
|
+
|
|
328
|
+
if (textSupport.includes('input')) {
|
|
329
|
+
modelData.capabilities.add('chat');
|
|
330
|
+
}
|
|
331
|
+
if (imageSupport.includes('input')) {
|
|
332
|
+
modelData.capabilities.add('vision');
|
|
333
|
+
}
|
|
334
|
+
if (imageSupport.includes('output')) {
|
|
335
|
+
modelData.capabilities.add('image');
|
|
336
|
+
}
|
|
337
|
+
if (audioSupport.includes('input')) {
|
|
338
|
+
modelData.capabilities.add('hearing');
|
|
339
|
+
}
|
|
340
|
+
if (audioSupport.includes('output')) {
|
|
341
|
+
modelData.capabilities.add('audio');
|
|
342
|
+
}
|
|
343
|
+
if (/Streaming\s*Supported/i.test(bodyText)) {
|
|
344
|
+
modelData.capabilities.add('streaming');
|
|
345
|
+
}
|
|
346
|
+
if (/(Function|Tool) calling\s*Supported/i.test(bodyText)) {
|
|
347
|
+
modelData.capabilities.add('tools');
|
|
348
|
+
}
|
|
349
|
+
if (/Structured outputs?\s*Supported/i.test(bodyText)) {
|
|
350
|
+
modelData.capabilities.add('structured');
|
|
351
|
+
modelData.capabilities.add('json');
|
|
352
|
+
}
|
|
353
|
+
if (modelData.reasoningTokenSupport) {
|
|
354
|
+
modelData.capabilities.add('reasoning');
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Supported parameters
|
|
358
|
+
const chatEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Chat Completions</div>');
|
|
359
|
+
const imageEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Image generation</div>');
|
|
360
|
+
const embeddingEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Embeddings</div>');
|
|
361
|
+
const transcriptionEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Transcription</div>');
|
|
362
|
+
const speechEndpoint = bodyHtml.includes('<div class="text-sm font-semibold">Speech generation</div>');
|
|
363
|
+
|
|
364
|
+
if (chatEndpoint) {
|
|
365
|
+
modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
|
|
366
|
+
modelData.supportedParameters.add('maxTokens');
|
|
367
|
+
modelData.supportedParameters.add('temperature');
|
|
368
|
+
modelData.supportedParameters.add('topP');
|
|
369
|
+
modelData.supportedParameters.add('frequencyPenalty');
|
|
370
|
+
modelData.supportedParameters.add('presencePenalty');
|
|
371
|
+
modelData.supportedParameters.add('logitBias');
|
|
372
|
+
modelData.supportedParameters.add('logProbabilities');
|
|
373
|
+
if (modelData.reasoningTokenSupport) {
|
|
374
|
+
modelData.supportedParameters.add('reason');
|
|
375
|
+
}
|
|
376
|
+
if (modelData.capabilities.has('tools')) {
|
|
377
|
+
modelData.supportedParameters.add('tools');
|
|
378
|
+
modelData.supportedParameters.add('toolChoice');
|
|
379
|
+
}
|
|
380
|
+
if (modelData.capabilities.has('structured')) {
|
|
381
|
+
modelData.supportedParameters.add('responseFormat');
|
|
382
|
+
modelData.supportedParameters.add('structuredOutput');
|
|
383
|
+
}
|
|
384
|
+
if (modelData.capabilities.has('json')) {
|
|
385
|
+
modelData.supportedParameters.add('responseFormat');
|
|
386
|
+
}
|
|
387
|
+
if (!modelId.includes('o3') && !modelId.includes('o4')) {
|
|
388
|
+
modelData.supportedParameters.add('stop');
|
|
389
|
+
}
|
|
390
|
+
}
|
|
391
|
+
if (imageEndpoint) {
|
|
392
|
+
modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
|
|
393
|
+
if (!modelId.startsWith('dall-e-3')) {
|
|
394
|
+
modelData.supportedParameters.add('imageStyle');
|
|
395
|
+
modelData.supportedParameters.add('imageMultiple');
|
|
396
|
+
}
|
|
397
|
+
if (modelId.startsWith('gpt-image-1')) {
|
|
398
|
+
modelData.supportedParameters.add('imageBackground');
|
|
399
|
+
modelData.supportedParameters.add('imageStream');
|
|
400
|
+
modelData.supportedParameters.add('imageFormat');
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
if (embeddingEndpoint) {
|
|
404
|
+
modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
|
|
405
|
+
if (modelId.startsWith('text-embedding-3')) {
|
|
406
|
+
modelData.supportedParameters.add('embeddingDimensions');
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
if (transcriptionEndpoint) {
|
|
410
|
+
modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
|
|
411
|
+
if (!modelId.startsWith('gpt-4o-transcribe-diarize')) {
|
|
412
|
+
modelData.supportedParameters.add('transcribePrompt');
|
|
413
|
+
}
|
|
414
|
+
if (!modelId.startsWith('whisper-1')) {
|
|
415
|
+
modelData.supportedParameters.add('transcribeStream');
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
if (speechEndpoint) {
|
|
419
|
+
modelData.supportedParameters = modelData.supportedParameters || new Set<ModelParameter>();
|
|
420
|
+
if (!modelId.startsWith('tts-1')) {
|
|
421
|
+
modelData.supportedParameters.add('speechInstructions');
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
const hasCapabilitiesData = modelData.capabilities.size > 0;
|
|
426
|
+
const populatedOverall = hasHeaderData || hasPricingData || hasCapabilitiesData;
|
|
427
|
+
|
|
428
|
+
if (!modelData.supportedParameters || modelData.supportedParameters.size === 0) {
|
|
429
|
+
await fs.writeFile(`./data/pages/openai-${modelId.replace(/[^a-z]/gi, '')}.html`, bodyHtml);
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
if (!populatedOverall) {
|
|
433
|
+
console.log(`⚠ No model data found on page ${modelId}`);
|
|
434
|
+
|
|
435
|
+
await fs.writeFile(`./data/pages/openai-${modelId.replace(/[^a-z]/gi, '')}.txt`, bodyText);
|
|
436
|
+
|
|
437
|
+
return { modelId, data: null };
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
return {
|
|
441
|
+
modelId,
|
|
442
|
+
data: modelData,
|
|
443
|
+
};
|
|
444
|
+
} catch (error) {
|
|
445
|
+
console.log(` ✗ Error scraping ${modelId}:`, error instanceof Error ? error.message : error);
|
|
446
|
+
return { modelId, data: null };
|
|
447
|
+
} finally {
|
|
448
|
+
try {
|
|
449
|
+
await page.close();
|
|
450
|
+
} catch (e) {}
|
|
451
|
+
}
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
/**
|
|
455
|
+
* Scrape model details in parallel with concurrency control
|
|
456
|
+
*/
|
|
457
|
+
async function scrapeModelsParallel(
|
|
458
|
+
modelIds: string[],
|
|
459
|
+
concurrency: number = 5
|
|
460
|
+
): Promise<OpenAIModelData[]> {
|
|
461
|
+
console.log(`\nScraping model details (concurrency: ${concurrency})...`);
|
|
462
|
+
|
|
463
|
+
const browser = await puppeteer.launch({ headless: true });
|
|
464
|
+
const results: OpenAIModelData[] = [];
|
|
465
|
+
|
|
466
|
+
try {
|
|
467
|
+
// Process in batches with concurrency control
|
|
468
|
+
for (let i = 0; i < modelIds.length; i += concurrency) {
|
|
469
|
+
const batch = modelIds.slice(i, i + concurrency);
|
|
470
|
+
console.log(` Processing batch ${Math.floor(i / concurrency) + 1}/${Math.ceil(modelIds.length / concurrency)} (${batch.length} models)...`);
|
|
471
|
+
|
|
472
|
+
const batchResults = await Promise.all(
|
|
473
|
+
batch.map((modelId) => scrapeModelDetails(modelId, browser))
|
|
474
|
+
);
|
|
475
|
+
|
|
476
|
+
// Store results
|
|
477
|
+
for (const { data } of batchResults) {
|
|
478
|
+
if (data) {
|
|
479
|
+
results.push(data);
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
console.log(` ✓ Scraped ${batchResults.filter((r) => r.data).length}/${batch.length} models`);
|
|
484
|
+
}
|
|
485
|
+
|
|
486
|
+
console.log(`✓ Scraped ${results.length}/${modelIds.length} models\n`);
|
|
487
|
+
} finally {
|
|
488
|
+
await browser.close();
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
return results;
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
/**
|
|
495
|
+
* Convert OpenAI model data to ModelInfo
|
|
496
|
+
*/
|
|
497
|
+
function convertOpenAIModel(data: OpenAIModelData): ModelInfo {
|
|
498
|
+
const intelligenceScore = data.intelligence ? IntelligenceScoreMap[data.intelligence] || 0 : 0;
|
|
499
|
+
const performanceScore = data.performance ? PerformanceScoreMap[data.performance] || 0 : 0;
|
|
500
|
+
const reasoningScore = data.reasoning ? ReasoningScoreMap[data.reasoning] || 0 : 0;
|
|
501
|
+
const nonZeros = (intelligenceScore ? 1 : 0) + (performanceScore ? 1 : 0) + (reasoningScore ? 1 : 0);
|
|
502
|
+
const overall = nonZeros > 0 ? (intelligenceScore + performanceScore + reasoningScore) / nonZeros : 0;
|
|
503
|
+
const speedScore = data.speed ? SpeedScoreMap[data.speed] || 0 : 0;
|
|
504
|
+
|
|
505
|
+
const tier: ModelTier =
|
|
506
|
+
overall >= 0.8
|
|
507
|
+
? 'flagship'
|
|
508
|
+
: speedScore >= 80
|
|
509
|
+
? 'efficient'
|
|
510
|
+
: 'legacy';
|
|
511
|
+
|
|
512
|
+
return {
|
|
513
|
+
provider: 'openai',
|
|
514
|
+
id: data.id,
|
|
515
|
+
name: data.name,
|
|
516
|
+
capabilities: data.capabilities,
|
|
517
|
+
supportedParameters: data.supportedParameters,
|
|
518
|
+
tier,
|
|
519
|
+
contextWindow: data.contextWindow || 0,
|
|
520
|
+
maxOutputTokens: data.maxOutputTokens,
|
|
521
|
+
pricing: {
|
|
522
|
+
text: data.pricing.textTokens ? {
|
|
523
|
+
input: data.pricing.textTokens?.input,
|
|
524
|
+
output: data.pricing.textTokens?.output,
|
|
525
|
+
cached: data.pricing.textTokens?.cached,
|
|
526
|
+
} : undefined,
|
|
527
|
+
audio: data.pricing.audioTokens ? {
|
|
528
|
+
input: data.pricing.audioTokens?.input,
|
|
529
|
+
output: data.pricing.audioTokens?.output,
|
|
530
|
+
} : undefined,
|
|
531
|
+
image: data.pricing.imageTokens ? {
|
|
532
|
+
input: data.pricing.imageTokens?.input,
|
|
533
|
+
output: data.pricing.imageTokens?.output,
|
|
534
|
+
} : undefined,
|
|
535
|
+
embeddings: data.pricing.embeddings ? {
|
|
536
|
+
cost: data.pricing.embeddings?.cost,
|
|
537
|
+
} : undefined,
|
|
538
|
+
},
|
|
539
|
+
metadata: {
|
|
540
|
+
knowledgeCutoff: data.knowledgeCutoff,
|
|
541
|
+
intelligence: data.intelligence,
|
|
542
|
+
performance: data.performance,
|
|
543
|
+
reasoning: data.reasoning,
|
|
544
|
+
speed: data.speed,
|
|
545
|
+
},
|
|
546
|
+
};
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
/**
|
|
550
|
+
* Main scraper function
|
|
551
|
+
*/
|
|
552
|
+
export async function scrapeOpenAI(
|
|
553
|
+
outputDir: string,
|
|
554
|
+
options: { concurrency?: number } = {}
|
|
555
|
+
): Promise<void> {
|
|
556
|
+
const { concurrency = 5 } = options;
|
|
557
|
+
|
|
558
|
+
console.log('\n=== OpenAI Scraper ===\n');
|
|
559
|
+
|
|
560
|
+
// Scrape models list
|
|
561
|
+
const modelIds = await scrapeModelsListPage();
|
|
562
|
+
|
|
563
|
+
// Scrape details for each model in parallel
|
|
564
|
+
const models = await scrapeModelsParallel(modelIds, concurrency);
|
|
565
|
+
|
|
566
|
+
console.log(`✓ Scraped ${models.length} OpenAI models\n`);
|
|
567
|
+
|
|
568
|
+
// Save raw data
|
|
569
|
+
await fs.mkdir(outputDir, { recursive: true });
|
|
570
|
+
|
|
571
|
+
await fs.writeFile(
|
|
572
|
+
path.join(outputDir, 'openai-models.json'),
|
|
573
|
+
JSON.stringify({ data: models }, (key, value) => {
|
|
574
|
+
if (value instanceof Set) {
|
|
575
|
+
return Array.from(value);
|
|
576
|
+
}
|
|
577
|
+
return value;
|
|
578
|
+
}, 2)
|
|
579
|
+
);
|
|
580
|
+
console.log(`✓ Saved raw OpenAI models to openai-models.json`);
|
|
581
|
+
|
|
582
|
+
// Convert to ModelInfo format
|
|
583
|
+
const modelInfos = models.map(convertOpenAIModel);
|
|
584
|
+
|
|
585
|
+
// Save JSON for reference
|
|
586
|
+
await fs.writeFile(
|
|
587
|
+
path.join(outputDir, 'openai-modelinfo.json'),
|
|
588
|
+
JSON.stringify(modelInfos, (key, value) => {
|
|
589
|
+
if (value instanceof Set) {
|
|
590
|
+
return Array.from(value);
|
|
591
|
+
}
|
|
592
|
+
return value;
|
|
593
|
+
}, 2)
|
|
594
|
+
);
|
|
595
|
+
console.log(`✓ Saved ${modelInfos.length} models to JSON`);
|
|
596
|
+
|
|
597
|
+
// Generate TypeScript file
|
|
598
|
+
const srcDir = path.join(__dirname, '../../src/models');
|
|
599
|
+
await writeModelTS(modelInfos, 'openaiModels', path.join(srcDir, 'openai.ts'));
|
|
600
|
+
console.log(`✓ Generated TypeScript file: src/models/openai.ts`);
|
|
601
|
+
|
|
602
|
+
console.log('\n✓ OpenAI scraping complete\n');
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
// CLI execution
|
|
606
|
+
if (process.argv[1].endsWith('openai.ts')) {
|
|
607
|
+
const args = process.argv.slice(2);
|
|
608
|
+
const outputDir = args.find((arg) => !arg.startsWith('--')) || path.join(__dirname, '../../data');
|
|
609
|
+
|
|
610
|
+
const concurrencyArg = args.find((arg) => arg.startsWith('--concurrency='));
|
|
611
|
+
const concurrency = concurrencyArg
|
|
612
|
+
? parseInt(concurrencyArg.split('=')[1], 10)
|
|
613
|
+
: 5;
|
|
614
|
+
|
|
615
|
+
scrapeOpenAI(outputDir, { concurrency }).catch((error) => {
|
|
616
|
+
console.error('✗ OpenAI scraping failed:', error);
|
|
617
|
+
process.exit(1);
|
|
618
|
+
});
|
|
619
|
+
}
|