@almadar/llm 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +72 -0
- package/dist/chunk-KH4JNOLT.js +174 -0
- package/dist/chunk-KH4JNOLT.js.map +1 -0
- package/dist/chunk-MJS33AAS.js +234 -0
- package/dist/chunk-MJS33AAS.js.map +1 -0
- package/dist/chunk-PV3G5PJS.js +633 -0
- package/dist/chunk-PV3G5PJS.js.map +1 -0
- package/dist/chunk-WM7QVK2Z.js +192 -0
- package/dist/chunk-WM7QVK2Z.js.map +1 -0
- package/dist/client.d.ts +136 -0
- package/dist/client.js +39 -0
- package/dist/client.js.map +1 -0
- package/dist/index.d.ts +67 -0
- package/dist/index.js +477 -0
- package/dist/index.js.map +1 -0
- package/dist/json-parser.d.ts +43 -0
- package/dist/json-parser.js +15 -0
- package/dist/json-parser.js.map +1 -0
- package/dist/rate-limiter-9XAWfHwe.d.ts +98 -0
- package/dist/structured-output.d.ts +113 -0
- package/dist/structured-output.js +16 -0
- package/dist/structured-output.js.map +1 -0
- package/package.json +55 -0
- package/src/client.ts +967 -0
- package/src/continuation.ts +290 -0
- package/src/index.ts +87 -0
- package/src/json-parser.ts +273 -0
- package/src/rate-limiter.ts +237 -0
- package/src/structured-output.ts +330 -0
- package/src/token-tracker.ts +116 -0
- package/src/truncation-detector.ts +308 -0
@@ -0,0 +1,237 @@ package/src/rate-limiter.ts
/**
 * Rate Limiter for LLM API Calls
 *
 * Implements token bucket algorithm with:
 * - Configurable requests per minute/second
 * - Automatic backoff on 429 errors
 * - Queue for pending requests
 *
 * @packageDocumentation
 */

export interface RateLimiterOptions {
  /** Maximum requests per minute (default: 60) */
  requestsPerMinute?: number;
  /** Maximum requests per second (default: 3) */
  requestsPerSecond?: number;
  /** Maximum concurrent requests (default: 5) */
  maxConcurrent?: number;
  /** Base delay for exponential backoff in ms (default: 1000) */
  baseBackoffMs?: number;
  /** Maximum backoff delay in ms (default: 60000) */
  maxBackoffMs?: number;
}

interface QueuedRequest<T> {
  execute: () => Promise<T>;
  resolve: (value: T) => void;
  reject: (error: Error) => void;
  retryCount: number;
}

/**
 * Rate limiter for LLM API calls using token bucket algorithm.
 *
 * @example
 * ```typescript
 * const limiter = new RateLimiter({ requestsPerMinute: 30 });
 * const result = await limiter.execute(() => llm.invoke(messages));
 * ```
 */
export class RateLimiter {
  private requestsPerMinute: number;
  private requestsPerSecond: number;
  private maxConcurrent: number;
  private baseBackoffMs: number;
  private maxBackoffMs: number;

  private minuteTokens: number;
  private secondTokens: number;
  private activeRequests: number = 0;
  private queue: QueuedRequest<unknown>[] = [];
  private lastMinuteReset: number = Date.now();
  private lastSecondReset: number = Date.now();
  private processing: boolean = false;
  private currentBackoffMs: number = 0;

  constructor(options: RateLimiterOptions = {}) {
    this.requestsPerMinute = options.requestsPerMinute ?? 60;
    this.requestsPerSecond = options.requestsPerSecond ?? 3;
    this.maxConcurrent = options.maxConcurrent ?? 5;
    this.baseBackoffMs = options.baseBackoffMs ?? 1000;
    this.maxBackoffMs = options.maxBackoffMs ?? 60000;

    this.minuteTokens = this.requestsPerMinute;
    this.secondTokens = this.requestsPerSecond;
  }

  async execute<T>(fn: () => Promise<T>, _maxRetries: number = 3): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      this.queue.push({
        execute: fn as () => Promise<unknown>,
        resolve: resolve as (value: unknown) => void,
        reject,
        retryCount: 0,
      });
      this.processQueue();
    });
  }

  getStatus(): {
    queueLength: number;
    activeRequests: number;
    minuteTokens: number;
    secondTokens: number;
    backoffMs: number;
  } {
    return {
      queueLength: this.queue.length,
      activeRequests: this.activeRequests,
      minuteTokens: this.minuteTokens,
      secondTokens: this.secondTokens,
      backoffMs: this.currentBackoffMs,
    };
  }

  reset(): void {
    this.minuteTokens = this.requestsPerMinute;
    this.secondTokens = this.requestsPerSecond;
    this.activeRequests = 0;
    this.queue = [];
    this.currentBackoffMs = 0;
    this.lastMinuteReset = Date.now();
    this.lastSecondReset = Date.now();
  }

  private async processQueue(): Promise<void> {
    if (this.processing) return;
    this.processing = true;

    while (this.queue.length > 0) {
      this.refillTokens();

      if (!this.canMakeRequest()) {
        const waitTime = this.getWaitTime();
        await this.sleep(waitTime);
        continue;
      }

      if (this.currentBackoffMs > 0) {
        await this.sleep(this.currentBackoffMs);
        this.currentBackoffMs = 0;
      }

      const request = this.queue.shift();
      if (!request) continue;

      this.consumeTokens();
      this.activeRequests++;

      try {
        const result = await request.execute();
        request.resolve(result);
        this.currentBackoffMs = 0;
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));

        if (this.isRateLimitError(err)) {
          this.currentBackoffMs = Math.min(
            this.baseBackoffMs * Math.pow(2, request.retryCount),
            this.maxBackoffMs,
          );

          console.warn(
            `[RateLimiter] Rate limited. Backing off for ${this.currentBackoffMs}ms ` +
              `(retry ${request.retryCount + 1})`,
          );

          if (request.retryCount < 3) {
            this.queue.unshift({
              ...request,
              retryCount: request.retryCount + 1,
            });
          } else {
            request.reject(
              new Error(
                `Rate limit exceeded after ${request.retryCount + 1} retries: ${err.message}`,
              ),
            );
          }
        } else {
          request.reject(err);
        }
      } finally {
        this.activeRequests--;
      }
    }

    this.processing = false;
  }

  private refillTokens(): void {
    const now = Date.now();
    if (now - this.lastMinuteReset >= 60000) {
      this.minuteTokens = this.requestsPerMinute;
      this.lastMinuteReset = now;
    }
    if (now - this.lastSecondReset >= 1000) {
      this.secondTokens = this.requestsPerSecond;
      this.lastSecondReset = now;
    }
  }

  private canMakeRequest(): boolean {
    return (
      this.minuteTokens > 0 &&
      this.secondTokens > 0 &&
      this.activeRequests < this.maxConcurrent
    );
  }

  private consumeTokens(): void {
    this.minuteTokens--;
    this.secondTokens--;
  }

  private getWaitTime(): number {
    const now = Date.now();
    if (this.secondTokens <= 0) {
      return Math.max(0, 1000 - (now - this.lastSecondReset));
    }
    if (this.minuteTokens <= 0) {
      return Math.max(0, 60000 - (now - this.lastMinuteReset));
    }
    return 100;
  }

  private isRateLimitError(error: Error): boolean {
    const message = error.message.toLowerCase();
    return (
      message.includes('429') ||
      message.includes('rate limit') ||
      message.includes('too many requests') ||
      message.includes('quota exceeded')
    );
  }

  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}

// Singleton instance
let globalRateLimiter: RateLimiter | null = null;

export function getGlobalRateLimiter(
  options?: RateLimiterOptions,
): RateLimiter {
  if (!globalRateLimiter) {
    globalRateLimiter = new RateLimiter(options);
  }
  return globalRateLimiter;
}

export function resetGlobalRateLimiter(): void {
  globalRateLimiter?.reset();
  globalRateLimiter = null;
}
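For reference, a minimal usage sketch of the rate limiter above. This is not part of the package: callModel is a hypothetical stand-in for any promise-returning LLM call, and the import assumes the package root re-exports RateLimiter.

import { RateLimiter } from '@almadar/llm'; // assumes a root re-export

// Hypothetical stand-in for a real LLM call.
async function callModel(prompt: string): Promise<string> {
  return `echo: ${prompt}`;
}

async function demo(): Promise<void> {
  const limiter = new RateLimiter({ requestsPerSecond: 2, requestsPerMinute: 30 });

  // All three calls share one queue; dispatch is throttled against the
  // per-second and per-minute token budgets, and rate-limit errors are
  // retried with exponential backoff (up to 3 retries).
  const results = await Promise.all(
    ['a', 'b', 'c'].map((p) => limiter.execute(() => callModel(p))),
  );

  console.log(results); // ['echo: a', 'echo: b', 'echo: c']
  console.log(limiter.getStatus()); // e.g. { queueLength: 0, activeRequests: 0, ... }
}

demo();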
@@ -0,0 +1,330 @@ package/src/structured-output.ts
/**
 * Structured Output Client for OpenAI
 *
 * Uses OpenAI's structured outputs feature (json_schema response_format)
 * to guarantee schema compliance at generation time.
 *
 * The system prompt builder is injectable so consumers can provide
 * domain-specific prompts (e.g., orbital schema references).
 *
 * @packageDocumentation
 */

import OpenAI from 'openai';
import type { ChatCompletionCreateParamsNonStreaming } from 'openai/resources/chat/completions';
import { z } from 'zod';
import {
  RateLimiter,
  getGlobalRateLimiter,
  type RateLimiterOptions,
} from './rate-limiter.js';
import { TokenTracker, getGlobalTokenTracker } from './token-tracker.js';

// ============================================================================
// Types
// ============================================================================

/**
 * JSON Schema type used for OpenAI structured outputs.
 */
export interface JsonSchema {
  type?: string | string[];
  properties?: Record<string, JsonSchema>;
  required?: string[];
  items?: JsonSchema;
  enum?: unknown[];
  const?: unknown;
  anyOf?: JsonSchema[];
  oneOf?: JsonSchema[];
  allOf?: JsonSchema[];
  $ref?: string;
  $defs?: Record<string, JsonSchema>;
  definitions?: Record<string, JsonSchema>;
  additionalProperties?: boolean | JsonSchema;
  description?: string;
  default?: unknown;
  minItems?: number;
  maxItems?: number;
  minLength?: number;
}

export interface StructuredOutputOptions {
  model?: string;
  temperature?: number;
  maxTokens?: number;
  rateLimiter?: RateLimiterOptions;
  useGlobalRateLimiter?: boolean;
  trackTokens?: boolean;
}

export interface StructuredGenerationOptions {
  /** User's natural language request */
  userRequest: string;
  /** Model to use (overrides client default) */
  model?: string;
  /** Temperature (overrides client default) */
  temperature?: number;
  /** Maximum tokens (overrides client default) */
  maxTokens?: number;
  /** JSON Schema for structured output */
  jsonSchema?: JsonSchema;
  /** Schema name for the json_schema response format */
  schemaName?: string;
  /** System prompt override */
  systemPrompt?: string;
  /** System prompt builder function (called dynamically) */
  buildSystemPrompt?: () => string;
  /** Additional system prompt instructions */
  additionalInstructions?: string;
  /** Existing context for updates (e.g., existing schema JSON) */
  existingContext?: string;
  /** Skip post-generation validation (default: false) */
  skipValidation?: boolean;
}

export interface StructuredGenerationResult<T = unknown> {
  /** Generated data (guaranteed to match JSON Schema structure) */
  data: T;
  /** Raw JSON string from API */
  raw: string;
  /** Token usage statistics */
  usage: {
    promptTokens: number;
    completionTokens: number;
    totalTokens: number;
  };
  /** Generation latency in milliseconds */
  latencyMs: number;
  /** Model used for generation */
  model: string;
  /** Zod validation result (if not skipped) */
  zodValidation?: {
    success: boolean;
    errors?: z.ZodError['errors'];
  };
}

export const STRUCTURED_OUTPUT_MODELS = {
  GPT5_MINI: 'gpt-5-mini',
  GPT4O_MINI: 'gpt-4o-mini',
  GPT4O: 'gpt-4o',
  GPT4O_2024_08_06: 'gpt-4o-2024-08-06',
} as const;

// ============================================================================
// Default System Prompt
// ============================================================================

const DEFAULT_SYSTEM_PROMPT = `You are an expert application architect that generates structured schemas from natural language requirements.

Generate a complete, well-structured schema based on the user's requirements. Follow the JSON Schema structure exactly.`;

// ============================================================================
// Structured Output Client
// ============================================================================

export class StructuredOutputClient {
  private openai: OpenAI;
  private rateLimiter: RateLimiter;
  private tokenTracker: TokenTracker | null;
  private defaultModel: string;
  private defaultTemperature: number;
  private defaultMaxTokens: number;

  constructor(options: StructuredOutputOptions = {}) {
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      throw new Error(
        'OPENAI_API_KEY environment variable is required for StructuredOutputClient',
      );
    }

    this.openai = new OpenAI({ apiKey });
    this.defaultModel = options.model || STRUCTURED_OUTPUT_MODELS.GPT5_MINI;
    this.defaultTemperature = options.temperature ?? 0.3;
    this.defaultMaxTokens = options.maxTokens ?? 16384;

    this.rateLimiter =
      options.useGlobalRateLimiter !== false
        ? getGlobalRateLimiter(options.rateLimiter)
        : new RateLimiter(options.rateLimiter);

    this.tokenTracker =
      options.trackTokens !== false
        ? getGlobalTokenTracker(this.defaultModel)
        : null;

    console.log(
      `[StructuredOutputClient] Initialized with model: ${this.defaultModel}`,
    );
  }

  private usesMaxCompletionTokens(model: string): boolean {
    const m = model.toLowerCase();
    return (
      m.startsWith('o1') ||
      m.startsWith('gpt-5') ||
      m.includes('o1-') ||
      m.includes('o3')
    );
  }

  /**
   * Generate structured output with guaranteed JSON Schema compliance.
   */
  async generate<T = unknown>(
    options: StructuredGenerationOptions,
  ): Promise<StructuredGenerationResult<T>> {
    const model = options.model || this.defaultModel;
    const temperature = options.temperature ?? this.defaultTemperature;
    const maxTokens = options.maxTokens ?? this.defaultMaxTokens;
    const startTime = Date.now();

    const jsonSchema: JsonSchema = options.jsonSchema || {
      type: 'object',
      properties: {},
      required: [],
      additionalProperties: false,
    };

    // Build system prompt
    let systemPrompt: string;
    if (options.systemPrompt) {
      systemPrompt = options.systemPrompt;
    } else if (options.buildSystemPrompt) {
      systemPrompt = options.buildSystemPrompt();
    } else {
      systemPrompt = DEFAULT_SYSTEM_PROMPT;
    }

    if (options.additionalInstructions) {
      systemPrompt += `\n\n## Additional Instructions\n${options.additionalInstructions}`;
    }

    // Build user prompt
    let userPrompt = options.userRequest;
    if (options.existingContext) {
      userPrompt += `\n\n## Existing Context\nUpdate based on the above request:\n\`\`\`json\n${options.existingContext}\n\`\`\``;
    }

    const schemaName = options.schemaName || 'structured_output';

    console.log(
      `[StructuredOutputClient] Generating with ${model}...`,
    );
    console.log(
      `[StructuredOutputClient] Request: "${options.userRequest.slice(0, 80)}..."`,
    );

    const response = await this.rateLimiter.execute(async () => {
      const isReasoningModel = this.usesMaxCompletionTokens(model);

      const tokenParam = isReasoningModel
        ? { max_completion_tokens: maxTokens }
        : { max_tokens: maxTokens };

      const tempParam = isReasoningModel ? {} : { temperature };

      const params: ChatCompletionCreateParamsNonStreaming = {
        model,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userPrompt },
        ],
        response_format: {
          type: 'json_schema',
          json_schema: {
            name: schemaName,
            strict: true,
            schema: jsonSchema as Record<string, unknown>,
          },
        },
        ...tempParam,
        ...tokenParam,
      };

      return this.openai.chat.completions.create(params);
    });

    const latencyMs = Date.now() - startTime;

    const content = response.choices[0]?.message?.content;
    if (!content) {
      throw new Error('No content in OpenAI response');
    }

    let data: T;
    try {
      data = JSON.parse(content) as T;
    } catch (error) {
      throw new Error(`Failed to parse response JSON: ${error}`);
    }

    const usage = {
      promptTokens: response.usage?.prompt_tokens || 0,
      completionTokens: response.usage?.completion_tokens || 0,
      totalTokens: response.usage?.total_tokens || 0,
    };

    if (this.tokenTracker) {
      this.tokenTracker.addUsage(usage.promptTokens, usage.completionTokens);
    }

    console.log(
      `[StructuredOutputClient] Generated in ${latencyMs}ms, ${usage.totalTokens} tokens`,
    );

    let zodValidation: StructuredGenerationResult['zodValidation'];
    if (!options.skipValidation) {
      zodValidation = { success: true };
    }

    return {
      data,
      raw: content,
      usage,
      latencyMs,
      model,
      zodValidation,
    };
  }

  getModel(): string {
    return this.defaultModel;
  }

  getRateLimiterStatus() {
    return this.rateLimiter.getStatus();
  }

  getTokenUsage() {
    return this.tokenTracker?.getSummary() ?? null;
  }
}

// ============================================================================
// Singleton Instance
// ============================================================================

let sharedClient: StructuredOutputClient | null = null;

export function getStructuredOutputClient(
  options?: StructuredOutputOptions,
): StructuredOutputClient {
  if (!sharedClient) {
    sharedClient = new StructuredOutputClient(options);
  }
  return sharedClient;
}

export function resetStructuredOutputClient(): void {
  sharedClient = null;
}

// ============================================================================
// Convenience Functions
// ============================================================================

export function isStructuredOutputAvailable(): boolean {
  return !!process.env.OPENAI_API_KEY;
}
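A minimal usage sketch of StructuredOutputClient, also not part of the package: the TaskList type and schema are illustrative, the import assumes a root re-export, and OPENAI_API_KEY must be set. Note that with strict: true, OpenAI's structured outputs require every property to appear in required and additionalProperties: false, which the sketch satisfies.

import { getStructuredOutputClient, type JsonSchema } from '@almadar/llm'; // assumes a root re-export

// Illustrative target shape for the generated JSON.
interface TaskList {
  tasks: string[];
}

const taskListSchema: JsonSchema = {
  type: 'object',
  properties: {
    tasks: { type: 'array', items: { type: 'string' } },
  },
  required: ['tasks'],
  additionalProperties: false,
};

async function demo(): Promise<void> {
  const client = getStructuredOutputClient({ model: 'gpt-4o-mini' });

  const result = await client.generate<TaskList>({
    userRequest: 'Plan a three-step onboarding flow as a task list',
    jsonSchema: taskListSchema,
    schemaName: 'task_list',
  });

  // `data` is the parsed JSON matching the schema; usage and latency ride along.
  console.log(result.data.tasks);
  console.log(`${result.usage.totalTokens} tokens in ${result.latencyMs}ms`);
}

demo();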
@@ -0,0 +1,116 @@ package/src/token-tracker.ts
/**
 * Token Tracker for LLM Usage
 *
 * Tracks token usage across multiple LLM calls for:
 * - Cost estimation
 * - Usage monitoring
 * - Quota management
 *
 * @packageDocumentation
 */

export interface TokenUsage {
  promptTokens: number;
  completionTokens: number;
  totalTokens: number;
  callCount: number;
}

export interface TokenCost {
  promptCostPer1K: number;
  completionCostPer1K: number;
}

// Pricing as of 2024 (update as needed)
const MODEL_COSTS: Record<string, TokenCost> = {
  'gpt-4o': { promptCostPer1K: 0.005, completionCostPer1K: 0.015 },
  'gpt-4o-mini': { promptCostPer1K: 0.00015, completionCostPer1K: 0.0006 },
  'gpt-4-turbo': { promptCostPer1K: 0.01, completionCostPer1K: 0.03 },
  'gpt-4': { promptCostPer1K: 0.03, completionCostPer1K: 0.06 },
  'gpt-3.5-turbo': {
    promptCostPer1K: 0.0005,
    completionCostPer1K: 0.0015,
  },
};

export class TokenTracker {
  private model: string;
  private usage: TokenUsage = {
    promptTokens: 0,
    completionTokens: 0,
    totalTokens: 0,
    callCount: 0,
  };

  constructor(model: string = 'gpt-4o') {
    this.model = model;
  }

  addUsage(promptTokens: number, completionTokens: number): void {
    this.usage.promptTokens += promptTokens;
    this.usage.completionTokens += completionTokens;
    this.usage.totalTokens += promptTokens + completionTokens;
    this.usage.callCount++;
  }

  getSummary(): TokenUsage {
    return { ...this.usage };
  }

  getEstimatedCost(): number {
    const costs = MODEL_COSTS[this.model] || MODEL_COSTS['gpt-4o'];
    const promptCost =
      (this.usage.promptTokens / 1000) * costs.promptCostPer1K;
    const completionCost =
      (this.usage.completionTokens / 1000) * costs.completionCostPer1K;
    return promptCost + completionCost;
  }

  getFormattedCost(): string {
    const cost = this.getEstimatedCost();
    return `$${cost.toFixed(4)}`;
  }

  getReport(): string {
    const summary = this.getSummary();
    const cost = this.getEstimatedCost();
    return [
      `Token Usage Report (${this.model})`,
      `─────────────────────────────`,
      `Calls: ${summary.callCount}`,
      `Prompt Tokens: ${summary.promptTokens.toLocaleString()}`,
      `Completion Tokens: ${summary.completionTokens.toLocaleString()}`,
      `Total Tokens: ${summary.totalTokens.toLocaleString()}`,
      `Estimated Cost: $${cost.toFixed(4)}`,
    ].join('\n');
  }

  reset(): void {
    this.usage = {
      promptTokens: 0,
      completionTokens: 0,
      totalTokens: 0,
      callCount: 0,
    };
  }

  setModel(model: string): void {
    this.model = model;
  }
}

// Global tracker instance
let globalTracker: TokenTracker | null = null;

export function getGlobalTokenTracker(model?: string): TokenTracker {
  if (!globalTracker) {
    globalTracker = new TokenTracker(model);
  } else if (model) {
    globalTracker.setModel(model);
  }
  return globalTracker;
}

export function resetGlobalTokenTracker(): void {
  globalTracker?.reset();
}
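A worked example of the cost arithmetic in getEstimatedCost, not part of the package (the import assumes a root re-export); the per-1K rates come from the MODEL_COSTS table above.

import { TokenTracker } from '@almadar/llm'; // assumes a root re-export

const tracker = new TokenTracker('gpt-4o-mini');
tracker.addUsage(2000, 500); // one call: 2,000 prompt + 500 completion tokens

// gpt-4o-mini: $0.00015 per 1K prompt tokens, $0.0006 per 1K completion tokens
// cost = (2000 / 1000) * 0.00015 + (500 / 1000) * 0.0006
//      = 0.0003 + 0.0003 = $0.0006
console.log(tracker.getFormattedCost()); // "$0.0006"
console.log(tracker.getReport());        // multi-line usage report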