@almadar/llm 1.0.0

package/src/client.ts ADDED
@@ -0,0 +1,967 @@
+ /**
+  * Shared LLM Client
+  *
+  * Multi-provider LLM client with:
+  * - OpenAI, DeepSeek, Anthropic, and Kimi support
+  * - Anthropic prompt caching (CachingChatAnthropic)
+  * - Rate limiting and retry logic
+  * - Token tracking
+  * - Structured output parsing with Zod
+  *
+  * @packageDocumentation
+  */
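+
+ // Minimal usage sketch (illustrative only; it assumes OPENAI_API_KEY is set
+ // and that the package's root export re-exports LLMClient). `call` parses and
+ // validates the model output against the supplied Zod schema:
+ //
+ //   import { LLMClient } from '@almadar/llm';
+ //   import { z } from 'zod';
+ //
+ //   const client = new LLMClient({ provider: 'openai' });
+ //   const Answer = z.object({ summary: z.string() });
+ //   const answer = await client.call({
+ //     systemPrompt: 'You are a concise summarizer. Respond with JSON.',
+ //     userPrompt: 'Summarize: TypeScript adds static types to JavaScript.',
+ //     schema: Answer,
+ //   });
+ //   console.log(answer.summary);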
+
+ import { ChatOpenAI } from '@langchain/openai';
+ import {
+   ChatAnthropic,
+   type ChatAnthropicCallOptions,
+ } from '@langchain/anthropic';
+ import Anthropic from '@anthropic-ai/sdk';
+ import { z } from 'zod';
+ import {
+   RateLimiter,
+   getGlobalRateLimiter,
+   type RateLimiterOptions,
+ } from './rate-limiter.js';
+ import { TokenTracker, getGlobalTokenTracker } from './token-tracker.js';
+ import { parseJsonResponse } from './json-parser.js';
+
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ type MessageLike = any;
+
+ // ============================================================================
+ // Caching Chat Anthropic Wrapper
+ // ============================================================================
+
+ /**
+  * ChatAnthropic subclass that marks system prompts for Anthropic's ephemeral
+  * prompt caching. A string system message is converted into a single text
+  * block with `cache_control`; a block-array system message gets
+  * `cache_control` on its last text block, unless the caller already set
+  * cache_control anywhere.
+  */
+ class CachingChatAnthropic extends ChatAnthropic {
+   async invoke(
+     input: MessageLike[] | string,
+     options?: Partial<ChatAnthropicCallOptions>,
+   ): Promise<MessageLike> {
+     let messages: MessageLike[];
+     if (typeof input === 'string') {
+       messages = [{ role: 'user', content: input }];
+     } else {
+       messages = input;
+     }
+
+     const transformedMessages = messages.map((msg: MessageLike) => {
+       const msgType = msg._getType?.() || msg.role || 'unknown';
+       const isSystem = msgType === 'system';
+
+       if (!isSystem) return msg;
+
+       if (typeof msg.content === 'string') {
+         return {
+           ...msg,
+           content: [
+             {
+               type: 'text',
+               text: msg.content,
+               cache_control: { type: 'ephemeral' },
+             },
+           ],
+         };
+       }
+
+       if (Array.isArray(msg.content)) {
+         const blocks = msg.content as Array<{
+           type?: string;
+           text?: string;
+           cache_control?: unknown;
+         }>;
+         const hasAnyCacheControl = blocks.some((b) => b.cache_control);
+
+         if (!hasAnyCacheControl) {
+           // A cache breakpoint on the last text block caches the whole prefix.
+           const transformedBlocks = blocks.map((block, idx) => {
+             if (block.type === 'text' && idx === blocks.length - 1) {
+               return {
+                 ...block,
+                 cache_control: { type: 'ephemeral' },
+               };
+             }
+             return block;
+           });
+
+           return { ...msg, content: transformedBlocks };
+         }
+       }
+
+       return msg;
+     });
+
+     return super.invoke(transformedMessages, options);
+   }
+ }
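+
+ // For reference, a plain string system message such as
+ //   { role: 'system', content: 'You are helpful.' }
+ // is rewritten by CachingChatAnthropic.invoke into
+ //   {
+ //     role: 'system',
+ //     content: [
+ //       {
+ //         type: 'text',
+ //         text: 'You are helpful.',
+ //         cache_control: { type: 'ephemeral' },
+ //       },
+ //     ],
+ //   }
+ // so Anthropic can reuse the cached system prompt on subsequent calls.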
+
+ type ChatModel = ChatOpenAI | CachingChatAnthropic;
+
+ // ============================================================================
+ // Types
+ // ============================================================================
+
+ export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi';
+
+ export interface ProviderConfig {
+   apiKey: string;
+   baseUrl?: string;
+   defaultModel: string;
+ }
+
+ export interface LLMClientOptions {
+   provider?: LLMProvider;
+   model?: string;
+   temperature?: number;
+   streaming?: boolean;
+   rateLimiter?: RateLimiterOptions;
+   useGlobalRateLimiter?: boolean;
+   trackTokens?: boolean;
+ }
+
+ export interface LLMCallOptions<T = unknown> {
+   systemPrompt: string;
+   userPrompt: string;
+   schema?: z.ZodSchema<T>;
+   maxRetries?: number;
+   retryWithContext?: boolean;
+   maxTokens?: number;
+   skipSchemaValidation?: boolean;
+   temperature?: number;
+ }
+
+ export interface CacheableBlock {
+   type: 'text';
+   text: string;
+   cache_control?: { type: 'ephemeral' };
+ }
+
+ export interface CacheAwareLLMCallOptions<T = unknown>
+   extends LLMCallOptions<T> {
+   systemBlocks?: CacheableBlock[];
+   userBlocks?: CacheableBlock[];
+   rawText?: boolean;
+ }
+
+ export interface LLMUsage {
+   promptTokens: number;
+   completionTokens: number;
+   totalTokens: number;
+ }
+
+ export type LLMFinishReason =
+   | 'stop'
+   | 'length'
+   | 'content_filter'
+   | 'tool_calls'
+   | null;
+
+ export interface LLMResponse<T> {
+   data: T;
+   raw: string;
+   finishReason: LLMFinishReason;
+   usage: LLMUsage | null;
+ }
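+
+ // Illustrative sketch of consuming an LLMResponse via callWithMetadata
+ // (assumes a configured client as in the example at the top of this file):
+ //
+ //   const res = await client.callWithMetadata({
+ //     systemPrompt: 'Respond with JSON.',
+ //     userPrompt: 'List three colors as {"colors": ["..."]}.',
+ //     schema: z.object({ colors: z.array(z.string()) }),
+ //   });
+ //   if (res.finishReason === 'length') {
+ //     // The response was truncated; consider retrying with a larger maxTokens.
+ //   }
+ //   console.log(res.usage?.totalTokens, res.data.colors);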
+
+ // ============================================================================
+ // Provider Configuration
+ // ============================================================================
+
+ const PROVIDER_CONFIGS: Record<LLMProvider, () => ProviderConfig> = {
+   openai: () => {
+     const apiKey = process.env.OPENAI_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'OPENAI_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return { apiKey, baseUrl: undefined, defaultModel: 'gpt-4o' };
+   },
+   deepseek: () => {
+     const apiKey = process.env.DEEPSEEK_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'DEEPSEEK_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: 'https://api.deepseek.com/v1',
+       defaultModel: 'deepseek-chat',
+     };
+   },
+   anthropic: () => {
+     const apiKey = process.env.ANTHROPIC_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'ANTHROPIC_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: undefined,
+       defaultModel: 'claude-sonnet-4-5-20250929',
+     };
+   },
+   kimi: () => {
+     const apiKey = process.env.KIMI_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'KIMI_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: 'https://api.moonshot.cn/v1',
+       defaultModel: 'kimi-k2.5',
+     };
+   },
+ };
+
+ export const DEEPSEEK_MODELS = {
+   CHAT: 'deepseek-chat',
+   CODER: 'deepseek-coder',
+   REASONER: 'deepseek-reasoner',
+ } as const;
+
+ export const OPENAI_MODELS = {
+   GPT4O: 'gpt-4o',
+   GPT4O_MINI: 'gpt-4o-mini',
+   GPT4_TURBO: 'gpt-4-turbo',
+   GPT35_TURBO: 'gpt-3.5-turbo',
+   GPT_5_1: 'gpt-5.1',
+ } as const;
+
+ export const ANTHROPIC_MODELS = {
+   CLAUDE_SONNET_4_5: 'claude-sonnet-4-5-20250929',
+   CLAUDE_SONNET_4: 'claude-sonnet-4-20250514',
+   CLAUDE_OPUS_4_5: 'claude-opus-4-5-20250929',
+   CLAUDE_3_5_HAIKU: 'claude-3-5-haiku-20241022',
+ } as const;
+
+ export const KIMI_MODELS = {
+   K2_5: 'kimi-k2.5',
+ } as const;
+
+ const DEFAULT_TEMPERATURE = 0.3;
+
+ // ============================================================================
+ // LLM Client
+ // ============================================================================
+
+ export class LLMClient {
+   private model: ChatModel;
+   private rateLimiter: RateLimiter;
+   private tokenTracker: TokenTracker | null;
+   private modelName: string;
+   private provider: LLMProvider;
+   private providerConfig: ProviderConfig;
+   private temperature: number;
+   private streaming: boolean;
+
+   constructor(options: LLMClientOptions = {}) {
+     this.provider = options.provider || 'openai';
+     this.temperature = options.temperature ?? DEFAULT_TEMPERATURE;
+     this.streaming = options.streaming ?? false;
+
+     this.providerConfig = PROVIDER_CONFIGS[this.provider]();
+     this.modelName = options.model || this.providerConfig.defaultModel;
+
+     // Log only the last four characters of the API key.
+     const keyPreview = this.providerConfig.apiKey.slice(-4);
+     console.log(
+       `[LLMClient] Provider: ${this.provider}, Model: ${this.modelName}, Key: ****${keyPreview}`,
+     );
+     if (this.providerConfig.baseUrl) {
+       console.log(
+         `[LLMClient] Using custom base URL: ${this.providerConfig.baseUrl}`,
+       );
+     }
+
+     this.model = this.createModel();
+
+     this.rateLimiter =
+       options.useGlobalRateLimiter !== false
+         ? getGlobalRateLimiter(options.rateLimiter)
+         : new RateLimiter(options.rateLimiter);
+
+     this.tokenTracker =
+       options.trackTokens !== false
+         ? getGlobalTokenTracker(this.modelName)
+         : null;
+   }
+
+   /**
+    * OpenAI reasoning-style models (o1/o3 and the gpt-5 family) take
+    * `max_completion_tokens` instead of `max_tokens` and do not accept a
+    * custom temperature, so createModel branches on this check.
+    */
+   private usesMaxCompletionTokens(): boolean {
+     const model = this.modelName.toLowerCase();
+     return (
+       model.startsWith('o1') ||
+       model.startsWith('gpt-5') ||
+       model.includes('o1-') ||
+       model.includes('o3')
+     );
+   }
+
+   private createModel(options?: {
+     maxTokens?: number;
+     temperature?: number;
+   }): ChatModel {
+     const maxTokens = options?.maxTokens;
+     const temperature = options?.temperature ?? this.temperature;
+
+     if (this.provider === 'anthropic') {
+       return new CachingChatAnthropic({
+         anthropicApiKey: this.providerConfig.apiKey,
+         modelName: this.modelName,
+         temperature,
+         streaming: this.streaming,
+         maxTokens: maxTokens || 8192,
+         callbacks: [
+           {
+             // Log Anthropic cache activity after every completion.
+             handleLLMEnd: (output) => {
+               const generation = output.generations?.[0]?.[0];
+               const usage = (
+                 generation as unknown as {
+                   message?: {
+                     usage_metadata?: {
+                       cache_creation_input_tokens?: number;
+                       cache_read_input_tokens?: number;
+                       input_tokens?: number;
+                       output_tokens?: number;
+                     };
+                   };
+                 }
+               )?.message?.usage_metadata;
+
+               if (usage) {
+                 const cacheCreated = usage.cache_creation_input_tokens ?? 0;
+                 const cacheRead = usage.cache_read_input_tokens ?? 0;
+                 const inputTokens = usage.input_tokens ?? 0;
+                 const outputTokens = usage.output_tokens ?? 0;
+
+                 if (cacheCreated > 0) {
+                   console.log(
+                     `[LLMClient:Anthropic] Cache WRITE: ${cacheCreated} tokens cached`,
+                   );
+                 }
+                 if (cacheRead > 0) {
+                   const savingsPercent = Math.round(
+                     (cacheRead / (cacheRead + inputTokens)) * 100,
+                   );
+                   console.log(
+                     `[LLMClient:Anthropic] Cache HIT: ${cacheRead} tokens (~${savingsPercent}% of prompt)`,
+                   );
+                 }
+                 if (cacheCreated === 0 && cacheRead === 0 && inputTokens > 0) {
+                   if (inputTokens < 500) {
+                     console.log(
+                       `[LLMClient:Anthropic] ${inputTokens} input, ${outputTokens} output tokens (likely cached)`,
+                     );
+                   } else {
+                     console.log(
+                       `[LLMClient:Anthropic] ${inputTokens} input, ${outputTokens} output tokens`,
+                     );
+                   }
+                 }
+               }
+             },
+           },
+         ],
+       });
+     }
+
+     const useCompletionTokens = this.usesMaxCompletionTokens();
+
+     const tokenConfig = maxTokens
+       ? useCompletionTokens
+         ? { modelKwargs: { max_completion_tokens: maxTokens } }
+         : { maxTokens }
+       : {};
+
+     // 600000 ms = 10-minute timeout for DeepSeek requests.
+     const timeout = this.provider === 'deepseek' ? 600000 : undefined;
+
+     return new ChatOpenAI({
+       openAIApiKey: this.providerConfig.apiKey,
+       modelName: this.modelName,
+       temperature: useCompletionTokens ? undefined : temperature,
+       streaming: this.streaming,
+       timeout,
+       ...tokenConfig,
+       configuration: {
+         apiKey: this.providerConfig.apiKey,
+         ...(this.providerConfig.baseUrl
+           ? { baseURL: this.providerConfig.baseUrl }
+           : {}),
+       },
+     });
+   }
+
+   private getModelWithOptions(options: {
+     maxTokens?: number;
+     temperature?: number;
+   }): ChatModel {
+     return this.createModel(options);
+   }
+
+   getProvider(): LLMProvider {
+     return this.provider;
+   }
+
+   getModelName(): string {
+     return this.modelName;
+   }
+
+   getModel(): ChatModel {
+     return this.model;
+   }
+
+   getRateLimiterStatus() {
+     return this.rateLimiter.getStatus();
+   }
+
+   getTokenUsage() {
+     return this.tokenTracker?.getSummary() ?? null;
+   }
+
+   /** Convenience wrapper around callWithMetadata that returns only the parsed data. */
+   async call<T>(options: LLMCallOptions<T>): Promise<T> {
+     const response = await this.callWithMetadata(options);
+     return response.data;
+   }
+
+   /**
+    * Invokes the model and returns the parsed data plus the raw text, finish
+    * reason, and token usage. On failure the call is retried up to maxRetries
+    * times; with retryWithContext, the previous error message is appended to
+    * the user prompt so the model can correct its output.
+    */
+   async callWithMetadata<T>(
+     options: LLMCallOptions<T>,
+   ): Promise<LLMResponse<T>> {
+     const {
+       systemPrompt,
+       userPrompt,
+       schema,
+       maxRetries = 2,
+       retryWithContext = true,
+       maxTokens,
+       skipSchemaValidation = false,
+       temperature,
+     } = options;
+
+     let currentPrompt = userPrompt;
+     let lastError: Error | null = null;
+
+     console.log(
+       `[LLMClient:call] Starting call to ${this.provider}/${this.modelName}`,
+     );
+     console.log(`[LLMClient:call] Prompt length: ${userPrompt.length} chars`);
+     if (maxTokens) {
+       console.log(`[LLMClient:call] Max tokens: ${maxTokens}`);
+     }
+
+     for (let attempt = 0; attempt <= maxRetries; attempt++) {
+       try {
+         console.log(
+           `[LLMClient:call] Attempt ${attempt + 1}/${maxRetries + 1}...`,
+         );
+         const attemptStartTime = Date.now();
+
+         const result = await this.rateLimiter.execute(async () => {
+           console.log(`[LLMClient:call] Invoking model...`);
+           const invokeStartTime = Date.now();
+
+           const modelToUse =
+             maxTokens || temperature !== undefined
+               ? this.getModelWithOptions({ maxTokens, temperature })
+               : this.model;
+
+           const response = await modelToUse.invoke([
+             { role: 'system', content: systemPrompt },
+             { role: 'user', content: currentPrompt },
+           ]);
+
+           console.log(
+             `[LLMClient:call] Model responded in ${Date.now() - invokeStartTime}ms`,
+           );
+
+           let usage: LLMUsage | null = null;
+           if (response.usage_metadata) {
+             const usageMeta = response.usage_metadata as {
+               input_tokens?: number;
+               output_tokens?: number;
+             };
+             usage = {
+               promptTokens: usageMeta.input_tokens || 0,
+               completionTokens: usageMeta.output_tokens || 0,
+               totalTokens:
+                 (usageMeta.input_tokens || 0) +
+                 (usageMeta.output_tokens || 0),
+             };
+             console.log(
+               `[LLMClient:call] Tokens used: ${usage.promptTokens} in, ${usage.completionTokens} out`,
+             );
+
+             if (this.tokenTracker) {
+               this.tokenTracker.addUsage(
+                 usage.promptTokens,
+                 usage.completionTokens,
+               );
+             }
+           }
+
+           const finishReason = this.extractFinishReason(response);
+           if (finishReason === 'length') {
+             console.warn(
+               `[LLMClient:call] Response truncated (finish_reason=length)`,
+             );
+           }
+
+           const content =
+             typeof response.content === 'string'
+               ? response.content
+               : JSON.stringify(response.content);
+
+           console.log(
+             `[LLMClient:call] Response length: ${content.length} chars, finish_reason: ${finishReason}`,
+           );
+
+           return { content, finishReason, usage };
+         });
+
+         console.log(
+           `[LLMClient:call] Attempt ${attempt + 1} completed in ${Date.now() - attemptStartTime}ms, parsing response...`,
+         );
+
+         const parsed = skipSchemaValidation
+           ? (parseJsonResponse(result.content, undefined) as T)
+           : parseJsonResponse(result.content, schema);
+         console.log(
+           `[LLMClient:call] Response parsed successfully${skipSchemaValidation ? ' (schema validation skipped)' : ''}`,
+         );
+
+         return {
+           data: parsed,
+           raw: result.content,
+           finishReason: result.finishReason,
+           usage: result.usage,
+         };
+       } catch (error) {
+         lastError = error instanceof Error ? error : new Error(String(error));
+         console.error(
+           `[LLMClient:call] Attempt ${attempt + 1} failed:`,
+           lastError.message,
+         );
+
+         if (this.isRateLimitError(lastError)) {
+           console.error(`[LLMClient:call] Rate limit error, not retrying`);
+           throw lastError;
+         }
+
+         if (attempt < maxRetries && retryWithContext) {
+           console.log(`[LLMClient:call] Will retry with error context`);
+           currentPrompt =
+             `${userPrompt}\n\n` +
+             `[Previous attempt failed with: ${lastError.message}]\n` +
+             `Please output valid JSON that matches the expected schema.`;
+         }
+       }
+     }
+
+     console.error(`[LLMClient:call] All attempts exhausted, throwing error`);
+     throw lastError;
+   }
+
+   private extractFinishReason(
+     response: Awaited<ReturnType<ChatOpenAI['invoke']>>,
+   ): LLMFinishReason {
+     const metadata = response.response_metadata as
+       | Record<string, unknown>
+       | undefined;
+     if (metadata?.finish_reason) {
+       const reason = metadata.finish_reason as string;
+       if (
+         reason === 'stop' ||
+         reason === 'length' ||
+         reason === 'content_filter' ||
+         reason === 'tool_calls'
+       ) {
+         return reason;
+       }
+     }
+     return null;
+   }
+
+   async callRaw(options: {
+     systemPrompt: string;
+     userPrompt: string;
+     maxTokens?: number;
+   }): Promise<string> {
+     const response = await this.callRawWithMetadata(options);
+     return response.raw;
+   }
+
+   async callRawWithMetadata(options: {
+     systemPrompt: string;
+     userPrompt: string;
+     maxTokens?: number;
+   }): Promise<Omit<LLMResponse<string>, 'data'> & { raw: string }> {
+     const { systemPrompt, userPrompt, maxTokens } = options;
+
+     return this.rateLimiter.execute(async () => {
+       const modelToUse = maxTokens
+         ? this.getModelWithOptions({ maxTokens })
+         : this.model;
+
+       const response = await modelToUse.invoke([
+         { role: 'system', content: systemPrompt },
+         { role: 'user', content: userPrompt },
+       ]);
+
+       let usage: LLMUsage | null = null;
+       if (response.usage_metadata) {
+         const usageMeta = response.usage_metadata as {
+           input_tokens?: number;
+           output_tokens?: number;
+         };
+         usage = {
+           promptTokens: usageMeta.input_tokens || 0,
+           completionTokens: usageMeta.output_tokens || 0,
+           totalTokens:
+             (usageMeta.input_tokens || 0) + (usageMeta.output_tokens || 0),
+         };
+
+         if (this.tokenTracker) {
+           this.tokenTracker.addUsage(
+             usage.promptTokens,
+             usage.completionTokens,
+           );
+         }
+       }
+
+       const finishReason = this.extractFinishReason(response);
+       const content =
+         typeof response.content === 'string'
+           ? response.content
+           : JSON.stringify(response.content);
+
+       return { raw: content, finishReason, usage };
+     });
+   }
+
+   private isRateLimitError(error: Error): boolean {
+     const message = error.message.toLowerCase();
+     return (
+       message.includes('rate limit') ||
+       message.includes('429') ||
+       message.includes('quota exceeded')
+     );
+   }
+
+   // ==========================================================================
+   // Anthropic Cache Control Support
+   // ==========================================================================
+
+   /**
+    * Cache-aware variant of callWithMetadata that talks to the Anthropic SDK
+    * directly so callers can mark individual system/user blocks with
+    * cache_control. Falls back to callWithMetadata for other providers.
+    */
+   async callWithCache<T>(
+     options: CacheAwareLLMCallOptions<T>,
+   ): Promise<LLMResponse<T>> {
+     const {
+       systemPrompt,
+       userPrompt,
+       systemBlocks,
+       userBlocks,
+       schema,
+       maxRetries = 2,
+       maxTokens,
+       skipSchemaValidation = false,
+       temperature,
+       rawText = false,
+     } = options;
+
+     if (this.provider !== 'anthropic') {
+       console.log(
+         `[LLMClient:callWithCache] Provider ${this.provider} doesn't support caching, using regular call`,
+       );
+       return this.callWithMetadata(options);
+     }
+
+     const cacheableCount =
+       (systemBlocks || []).filter((b) => b.cache_control).length +
+       (userBlocks || []).filter((b) => b.cache_control).length;
+     console.log(
+       `[LLMClient:callWithCache] ${cacheableCount} cacheable block(s)`,
+     );
+
+     let lastError: Error | null = null;
+
+     for (let attempt = 0; attempt <= maxRetries; attempt++) {
+       try {
+         console.log(
+           `[LLMClient:callWithCache] Attempt ${attempt + 1}/${maxRetries + 1}...`,
+         );
+
+         const result = await this.rateLimiter.execute(async () => {
+           // Use the key resolved at construction time rather than relying on
+           // the SDK reading ANTHROPIC_API_KEY from the environment again.
+           const anthropic = new Anthropic({
+             apiKey: this.providerConfig.apiKey,
+           });
+
+           const systemContent =
+             systemBlocks && systemBlocks.length > 0
+               ? systemBlocks.map((b) => ({
+                   type: 'text' as const,
+                   text: b.text,
+                   ...(b.cache_control
+                     ? { cache_control: b.cache_control }
+                     : {}),
+                 }))
+               : systemPrompt
+                 ? [{ type: 'text' as const, text: systemPrompt }]
+                 : [];
+
+           const userContent =
+             userBlocks && userBlocks.length > 0
+               ? userBlocks.map((b) => ({
+                   type: 'text' as const,
+                   text: b.text,
+                   ...(b.cache_control
+                     ? { cache_control: b.cache_control }
+                     : {}),
+                 }))
+               : userPrompt
+                 ? [{ type: 'text' as const, text: userPrompt }]
+                 : [];
+
+           const response = await anthropic.messages.create({
+             model: this.modelName,
+             max_tokens: maxTokens || 8192,
+             temperature: temperature ?? 0,
+             system: systemContent,
+             messages: [{ role: 'user', content: userContent }],
+           });
+
+           const textContent = response.content.find((c) => c.type === 'text');
+           const content =
+             textContent && 'text' in textContent ? textContent.text : '';
+
+           const apiUsage = response.usage as {
+             input_tokens: number;
+             output_tokens: number;
+             cache_creation_input_tokens?: number;
+             cache_read_input_tokens?: number;
+           };
+
+           const cacheRead = apiUsage.cache_read_input_tokens || 0;
+           const cacheCreation = apiUsage.cache_creation_input_tokens || 0;
+
+           if (cacheCreation > 0) {
+             console.log(
+               `[LLMClient:callWithCache] Cache WRITE: ${cacheCreation} tokens`,
+             );
+           }
+           if (cacheRead > 0) {
+             const savingsPercent = Math.round(
+               (cacheRead / (cacheRead + apiUsage.input_tokens)) * 100,
+             );
+             console.log(
+               `[LLMClient:callWithCache] Cache HIT: ${cacheRead} tokens (~${savingsPercent}% of prompt)`,
+             );
+           }
+           if (cacheCreation === 0 && cacheRead === 0) {
+             console.log(
+               `[LLMClient:callWithCache] No caching: ${apiUsage.input_tokens} input tokens`,
+             );
+           }
+
+           const usage: LLMUsage = {
+             promptTokens: apiUsage.input_tokens,
+             completionTokens: apiUsage.output_tokens,
+             totalTokens: apiUsage.input_tokens + apiUsage.output_tokens,
+           };
+
+           if (this.tokenTracker) {
+             this.tokenTracker.addUsage(
+               usage.promptTokens,
+               usage.completionTokens,
+             );
+           }
+
+           // Map Anthropic stop reasons onto the shared LLMFinishReason union.
+           const finishReason: LLMFinishReason =
+             response.stop_reason === 'end_turn' ||
+             response.stop_reason === 'stop_sequence'
+               ? 'stop'
+               : response.stop_reason === 'max_tokens'
+                 ? 'length'
+                 : response.stop_reason === 'tool_use'
+                   ? 'tool_calls'
+                   : null;
+
+           return { content, finishReason, usage };
+         });
+
+         let parsed: T;
+         if (rawText) {
+           parsed = result.content as unknown as T;
+         } else if (skipSchemaValidation) {
+           parsed = parseJsonResponse(result.content, undefined) as T;
+         } else {
+           parsed = parseJsonResponse(result.content, schema);
+         }
+
+         return {
+           data: parsed,
+           raw: result.content,
+           finishReason: result.finishReason,
+           usage: result.usage,
+         };
+       } catch (error) {
+         lastError = error instanceof Error ? error : new Error(String(error));
+         console.error(
+           `[LLMClient:callWithCache] Attempt ${attempt + 1} failed:`,
+           lastError.message,
+         );
+
+         if (this.isRateLimitError(lastError)) {
+           throw lastError;
+         }
+       }
+     }
+
+     throw lastError;
+   }
+
+   /** Builds a text block, optionally marked for ephemeral caching. */
+   static cacheableBlock(text: string, cache = true): CacheableBlock {
+     return cache
+       ? { type: 'text', text, cache_control: { type: 'ephemeral' } }
+       : { type: 'text', text };
+   }
+ }
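+
+ // Illustrative cache-aware call (a sketch; `sharedContext` and `question` are
+ // hypothetical placeholders, and ANTHROPIC_API_KEY must be set):
+ //
+ //   const client = new LLMClient({ provider: 'anthropic' });
+ //   const res = await client.callWithCache<string>({
+ //     systemPrompt: '',
+ //     userPrompt: '',
+ //     systemBlocks: [LLMClient.cacheableBlock(sharedContext)], // cached
+ //     userBlocks: [LLMClient.cacheableBlock(question, false)], // not cached
+ //     rawText: true, // return the text as-is instead of parsing JSON
+ //   });
+ //   console.log(res.data);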
+
+ // ============================================================================
+ // Singleton Instances
+ // ============================================================================
+
+ const sharedClients: Partial<Record<LLMProvider, LLMClient>> = {};
+
+ export function getSharedLLMClient(options?: LLMClientOptions): LLMClient {
+   const provider = options?.provider || 'openai';
+   if (!sharedClients[provider]) {
+     sharedClients[provider] = new LLMClient(options);
+   }
+   return sharedClients[provider]!;
+ }
+
+ export function resetSharedLLMClient(provider?: LLMProvider): void {
+   if (provider) {
+     delete sharedClients[provider];
+   } else {
+     for (const key of Object.keys(sharedClients) as LLMProvider[]) {
+       delete sharedClients[key];
+     }
+   }
+ }
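+
+ // getSharedLLMClient keeps one lazily created client per provider, so callers
+ // share its rate limiter and token tracker. Note that options only take
+ // effect on the first creation for a given provider:
+ //
+ //   const a = getSharedLLMClient({ provider: 'deepseek' });
+ //   const b = getSharedLLMClient({ provider: 'deepseek', temperature: 0.9 });
+ //   // a === b, and the temperature override above is ignored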
+
+ // ============================================================================
+ // Provider Detection
+ // ============================================================================
+
+ export function getAvailableProvider(): LLMProvider {
+   if (process.env.ANTHROPIC_API_KEY) return 'anthropic';
+   if (process.env.DEEPSEEK_API_KEY) return 'deepseek';
+   if (process.env.KIMI_API_KEY) return 'kimi';
+   if (process.env.OPENAI_API_KEY) return 'openai';
+   throw new Error(
+     'No LLM API key found. Please set ANTHROPIC_API_KEY, OPENAI_API_KEY, DEEPSEEK_API_KEY, or KIMI_API_KEY.',
+   );
+ }
+
+ export function isProviderAvailable(provider: LLMProvider): boolean {
+   switch (provider) {
+     case 'openai':
+       return !!process.env.OPENAI_API_KEY;
+     case 'deepseek':
+       return !!process.env.DEEPSEEK_API_KEY;
+     case 'anthropic':
+       return !!process.env.ANTHROPIC_API_KEY;
+     case 'kimi':
+       return !!process.env.KIMI_API_KEY;
+     default:
+       return false;
+   }
+ }
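+
+ // Providers are probed in a fixed priority order (anthropic > deepseek >
+ // kimi > openai). A sketch of preferring one provider with a detected
+ // fallback:
+ //
+ //   const provider = isProviderAvailable('anthropic')
+ //     ? 'anthropic'
+ //     : getAvailableProvider(); // throws if no API key is configured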
+
+ // ============================================================================
+ // Convenience Functions
+ // ============================================================================
+
+ export function createRequirementsClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   // Only pin a model for the providers this helper has an opinion about;
+   // otherwise leave it undefined so the provider's own default applies.
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.CHAT
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT_5_1
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.3,
+     ...options,
+   });
+ }
+
+ export function createCreativeClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.REASONER
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT4O
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.7,
+     ...options,
+   });
+ }
+
+ export function createFixClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.CHAT
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT4O_MINI
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.2,
+     ...options,
+   });
+ }
+
+ export function createDeepSeekClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'deepseek',
+     model: DEEPSEEK_MODELS.CHAT,
+     ...options,
+   });
+ }
+
+ export function createOpenAIClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'openai',
+     model: OPENAI_MODELS.GPT4O,
+     ...options,
+   });
+ }
+
+ export function createAnthropicClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'anthropic',
+     model: ANTHROPIC_MODELS.CLAUDE_SONNET_4_5,
+     ...options,
+   });
+ }
+
+ export function createKimiClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'kimi',
+     model: KIMI_MODELS.K2_5,
+     ...options,
+   });
+ }