@almadar/llm 1.0.0

package/src/client.ts ADDED
@@ -0,0 +1,967 @@
+ /**
+  * Shared LLM Client
+  *
+  * Multi-provider LLM client with:
+  * - OpenAI, DeepSeek, Anthropic, and Kimi support
+  * - Anthropic prompt caching (CachingChatAnthropic)
+  * - Rate limiting and retry logic
+  * - Token tracking
+  * - Structured output parsing with Zod
+  *
+  * @packageDocumentation
+  */
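+
+ // Minimal usage sketch (illustrative only; it assumes OPENAI_API_KEY is set
+ // and that the package's root export re-exports LLMClient). `call` parses and
+ // validates the model output against the supplied Zod schema:
+ //
+ //   import { LLMClient } from '@almadar/llm';
+ //   import { z } from 'zod';
+ //
+ //   const client = new LLMClient({ provider: 'openai' });
+ //   const Answer = z.object({ summary: z.string() });
+ //   const answer = await client.call({
+ //     systemPrompt: 'You are a concise summarizer. Respond with JSON.',
+ //     userPrompt: 'Summarize: TypeScript adds static types to JavaScript.',
+ //     schema: Answer,
+ //   });
+ //   console.log(answer.summary);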
+
+ import { ChatOpenAI } from '@langchain/openai';
+ import {
+   ChatAnthropic,
+   type ChatAnthropicCallOptions,
+ } from '@langchain/anthropic';
+ import Anthropic from '@anthropic-ai/sdk';
+ import { z } from 'zod';
+ import {
+   RateLimiter,
+   getGlobalRateLimiter,
+   type RateLimiterOptions,
+ } from './rate-limiter.js';
+ import { TokenTracker, getGlobalTokenTracker } from './token-tracker.js';
+ import { parseJsonResponse } from './json-parser.js';
+
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
+ type MessageLike = any;
+
+ // ============================================================================
+ // Caching Chat Anthropic Wrapper
+ // ============================================================================
+
+ /**
+  * ChatAnthropic subclass that marks system prompts for Anthropic's ephemeral
+  * prompt caching. A string system message is converted into a single text
+  * block with `cache_control`; a block-array system message gets
+  * `cache_control` on its last text block, unless the caller already set
+  * cache_control anywhere.
+  */
+ class CachingChatAnthropic extends ChatAnthropic {
+   async invoke(
+     input: MessageLike[] | string,
+     options?: Partial<ChatAnthropicCallOptions>,
+   ): Promise<MessageLike> {
+     let messages: MessageLike[];
+     if (typeof input === 'string') {
+       messages = [{ role: 'user', content: input }];
+     } else {
+       messages = input;
+     }
+
+     const transformedMessages = messages.map((msg: MessageLike) => {
+       const msgType = msg._getType?.() || msg.role || 'unknown';
+       const isSystem = msgType === 'system';
+
+       if (!isSystem) return msg;
+
+       if (typeof msg.content === 'string') {
+         return {
+           ...msg,
+           content: [
+             {
+               type: 'text',
+               text: msg.content,
+               cache_control: { type: 'ephemeral' },
+             },
+           ],
+         };
+       }
+
+       if (Array.isArray(msg.content)) {
+         const blocks = msg.content as Array<{
+           type?: string;
+           text?: string;
+           cache_control?: unknown;
+         }>;
+         const hasAnyCacheControl = blocks.some((b) => b.cache_control);
+
+         if (!hasAnyCacheControl) {
+           // A cache breakpoint on the last text block caches the whole prefix.
+           const transformedBlocks = blocks.map((block, idx) => {
+             if (block.type === 'text' && idx === blocks.length - 1) {
+               return {
+                 ...block,
+                 cache_control: { type: 'ephemeral' },
+               };
+             }
+             return block;
+           });
+
+           return { ...msg, content: transformedBlocks };
+         }
+       }
+
+       return msg;
+     });
+
+     return super.invoke(transformedMessages, options);
+   }
+ }
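+
+ // For reference, a plain string system message such as
+ //   { role: 'system', content: 'You are helpful.' }
+ // is rewritten by CachingChatAnthropic.invoke into
+ //   {
+ //     role: 'system',
+ //     content: [
+ //       {
+ //         type: 'text',
+ //         text: 'You are helpful.',
+ //         cache_control: { type: 'ephemeral' },
+ //       },
+ //     ],
+ //   }
+ // so Anthropic can reuse the cached system prompt on subsequent calls.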
+
+ type ChatModel = ChatOpenAI | CachingChatAnthropic;
+
+ // ============================================================================
+ // Types
+ // ============================================================================
+
+ export type LLMProvider = 'openai' | 'deepseek' | 'anthropic' | 'kimi';
+
+ export interface ProviderConfig {
+   apiKey: string;
+   baseUrl?: string;
+   defaultModel: string;
+ }
+
+ export interface LLMClientOptions {
+   provider?: LLMProvider;
+   model?: string;
+   temperature?: number;
+   streaming?: boolean;
+   rateLimiter?: RateLimiterOptions;
+   useGlobalRateLimiter?: boolean;
+   trackTokens?: boolean;
+ }
+
+ export interface LLMCallOptions<T = unknown> {
+   systemPrompt: string;
+   userPrompt: string;
+   schema?: z.ZodSchema<T>;
+   maxRetries?: number;
+   retryWithContext?: boolean;
+   maxTokens?: number;
+   skipSchemaValidation?: boolean;
+   temperature?: number;
+ }
+
+ export interface CacheableBlock {
+   type: 'text';
+   text: string;
+   cache_control?: { type: 'ephemeral' };
+ }
+
+ export interface CacheAwareLLMCallOptions<T = unknown>
+   extends LLMCallOptions<T> {
+   systemBlocks?: CacheableBlock[];
+   userBlocks?: CacheableBlock[];
+   rawText?: boolean;
+ }
+
+ export interface LLMUsage {
+   promptTokens: number;
+   completionTokens: number;
+   totalTokens: number;
+ }
+
+ export type LLMFinishReason =
+   | 'stop'
+   | 'length'
+   | 'content_filter'
+   | 'tool_calls'
+   | null;
+
+ export interface LLMResponse<T> {
+   data: T;
+   raw: string;
+   finishReason: LLMFinishReason;
+   usage: LLMUsage | null;
+ }
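+
+ // Illustrative sketch of consuming an LLMResponse via callWithMetadata
+ // (assumes a configured client as in the example at the top of this file):
+ //
+ //   const res = await client.callWithMetadata({
+ //     systemPrompt: 'Respond with JSON.',
+ //     userPrompt: 'List three colors as {"colors": ["..."]}.',
+ //     schema: z.object({ colors: z.array(z.string()) }),
+ //   });
+ //   if (res.finishReason === 'length') {
+ //     // The response was truncated; consider retrying with a larger maxTokens.
+ //   }
+ //   console.log(res.usage?.totalTokens, res.data.colors);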
+
+ // ============================================================================
+ // Provider Configuration
+ // ============================================================================
+
+ const PROVIDER_CONFIGS: Record<LLMProvider, () => ProviderConfig> = {
+   openai: () => {
+     const apiKey = process.env.OPENAI_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'OPENAI_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return { apiKey, baseUrl: undefined, defaultModel: 'gpt-4o' };
+   },
+   deepseek: () => {
+     const apiKey = process.env.DEEPSEEK_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'DEEPSEEK_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: 'https://api.deepseek.com/v1',
+       defaultModel: 'deepseek-chat',
+     };
+   },
+   anthropic: () => {
+     const apiKey = process.env.ANTHROPIC_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'ANTHROPIC_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: undefined,
+       defaultModel: 'claude-sonnet-4-5-20250929',
+     };
+   },
+   kimi: () => {
+     const apiKey = process.env.KIMI_API_KEY;
+     if (!apiKey) {
+       throw new Error(
+         'KIMI_API_KEY environment variable is not set. ' +
+           'Please set it in your .env file or environment.',
+       );
+     }
+     return {
+       apiKey,
+       baseUrl: 'https://api.moonshot.cn/v1',
+       defaultModel: 'kimi-k2.5',
+     };
+   },
+ };
+
+ export const DEEPSEEK_MODELS = {
+   CHAT: 'deepseek-chat',
+   CODER: 'deepseek-coder',
+   REASONER: 'deepseek-reasoner',
+ } as const;
+
+ export const OPENAI_MODELS = {
+   GPT4O: 'gpt-4o',
+   GPT4O_MINI: 'gpt-4o-mini',
+   GPT4_TURBO: 'gpt-4-turbo',
+   GPT35_TURBO: 'gpt-3.5-turbo',
+   GPT_5_1: 'gpt-5.1',
+ } as const;
+
+ export const ANTHROPIC_MODELS = {
+   CLAUDE_SONNET_4_5: 'claude-sonnet-4-5-20250929',
+   CLAUDE_SONNET_4: 'claude-sonnet-4-20250514',
+   CLAUDE_OPUS_4_5: 'claude-opus-4-5-20250929',
+   CLAUDE_3_5_HAIKU: 'claude-3-5-haiku-20241022',
+ } as const;
+
+ export const KIMI_MODELS = {
+   K2_5: 'kimi-k2.5',
+ } as const;
+
+ const DEFAULT_TEMPERATURE = 0.3;
+
+ // ============================================================================
+ // LLM Client
+ // ============================================================================
+
+ export class LLMClient {
+   private model: ChatModel;
+   private rateLimiter: RateLimiter;
+   private tokenTracker: TokenTracker | null;
+   private modelName: string;
+   private provider: LLMProvider;
+   private providerConfig: ProviderConfig;
+   private temperature: number;
+   private streaming: boolean;
+
+   constructor(options: LLMClientOptions = {}) {
+     this.provider = options.provider || 'openai';
+     this.temperature = options.temperature ?? DEFAULT_TEMPERATURE;
+     this.streaming = options.streaming ?? false;
+
+     this.providerConfig = PROVIDER_CONFIGS[this.provider]();
+     this.modelName = options.model || this.providerConfig.defaultModel;
+
+     // Log only the last four characters of the API key.
+     const keyPreview = this.providerConfig.apiKey.slice(-4);
+     console.log(
+       `[LLMClient] Provider: ${this.provider}, Model: ${this.modelName}, Key: ****${keyPreview}`,
+     );
+     if (this.providerConfig.baseUrl) {
+       console.log(
+         `[LLMClient] Using custom base URL: ${this.providerConfig.baseUrl}`,
+       );
+     }
+
+     this.model = this.createModel();
+
+     this.rateLimiter =
+       options.useGlobalRateLimiter !== false
+         ? getGlobalRateLimiter(options.rateLimiter)
+         : new RateLimiter(options.rateLimiter);
+
+     this.tokenTracker =
+       options.trackTokens !== false
+         ? getGlobalTokenTracker(this.modelName)
+         : null;
+   }
+
+   /**
+    * OpenAI reasoning-style models (o1/o3 and the gpt-5 family) take
+    * `max_completion_tokens` instead of `max_tokens` and do not accept a
+    * custom temperature, so createModel branches on this check.
+    */
+   private usesMaxCompletionTokens(): boolean {
+     const model = this.modelName.toLowerCase();
+     return (
+       model.startsWith('o1') ||
+       model.startsWith('gpt-5') ||
+       model.includes('o1-') ||
+       model.includes('o3')
+     );
+   }
+
+   private createModel(options?: {
+     maxTokens?: number;
+     temperature?: number;
+   }): ChatModel {
+     const maxTokens = options?.maxTokens;
+     const temperature = options?.temperature ?? this.temperature;
+
+     if (this.provider === 'anthropic') {
+       return new CachingChatAnthropic({
+         anthropicApiKey: this.providerConfig.apiKey,
+         modelName: this.modelName,
+         temperature,
+         streaming: this.streaming,
+         maxTokens: maxTokens || 8192,
+         callbacks: [
+           {
+             // Log Anthropic cache activity after every completion.
+             handleLLMEnd: (output) => {
+               const generation = output.generations?.[0]?.[0];
+               const usage = (
+                 generation as unknown as {
+                   message?: {
+                     usage_metadata?: {
+                       cache_creation_input_tokens?: number;
+                       cache_read_input_tokens?: number;
+                       input_tokens?: number;
+                       output_tokens?: number;
+                     };
+                   };
+                 }
+               )?.message?.usage_metadata;
+
+               if (usage) {
+                 const cacheCreated = usage.cache_creation_input_tokens ?? 0;
+                 const cacheRead = usage.cache_read_input_tokens ?? 0;
+                 const inputTokens = usage.input_tokens ?? 0;
+                 const outputTokens = usage.output_tokens ?? 0;
+
+                 if (cacheCreated > 0) {
+                   console.log(
+                     `[LLMClient:Anthropic] Cache WRITE: ${cacheCreated} tokens cached`,
+                   );
+                 }
+                 if (cacheRead > 0) {
+                   const savingsPercent = Math.round(
+                     (cacheRead / (cacheRead + inputTokens)) * 100,
+                   );
+                   console.log(
+                     `[LLMClient:Anthropic] Cache HIT: ${cacheRead} tokens (~${savingsPercent}% of prompt)`,
+                   );
+                 }
+                 if (cacheCreated === 0 && cacheRead === 0 && inputTokens > 0) {
+                   if (inputTokens < 500) {
+                     console.log(
+                       `[LLMClient:Anthropic] ${inputTokens} input, ${outputTokens} output tokens (likely cached)`,
+                     );
+                   } else {
+                     console.log(
+                       `[LLMClient:Anthropic] ${inputTokens} input, ${outputTokens} output tokens`,
+                     );
+                   }
+                 }
+               }
+             },
+           },
+         ],
+       });
+     }
+
+     const useCompletionTokens = this.usesMaxCompletionTokens();
+
+     const tokenConfig = maxTokens
+       ? useCompletionTokens
+         ? { modelKwargs: { max_completion_tokens: maxTokens } }
+         : { maxTokens }
+       : {};
+
+     // 600000 ms = 10-minute timeout for DeepSeek requests.
+     const timeout = this.provider === 'deepseek' ? 600000 : undefined;
+
+     return new ChatOpenAI({
+       openAIApiKey: this.providerConfig.apiKey,
+       modelName: this.modelName,
+       temperature: useCompletionTokens ? undefined : temperature,
+       streaming: this.streaming,
+       timeout,
+       ...tokenConfig,
+       configuration: {
+         apiKey: this.providerConfig.apiKey,
+         ...(this.providerConfig.baseUrl
+           ? { baseURL: this.providerConfig.baseUrl }
+           : {}),
+       },
+     });
+   }
+
+   private getModelWithOptions(options: {
+     maxTokens?: number;
+     temperature?: number;
+   }): ChatModel {
+     return this.createModel(options);
+   }
+
+   getProvider(): LLMProvider {
+     return this.provider;
+   }
+
+   getModelName(): string {
+     return this.modelName;
+   }
+
+   getModel(): ChatModel {
+     return this.model;
+   }
+
+   getRateLimiterStatus() {
+     return this.rateLimiter.getStatus();
+   }
+
+   getTokenUsage() {
+     return this.tokenTracker?.getSummary() ?? null;
+   }
+
+   /** Convenience wrapper around callWithMetadata that returns only the parsed data. */
+   async call<T>(options: LLMCallOptions<T>): Promise<T> {
+     const response = await this.callWithMetadata(options);
+     return response.data;
+   }
+
+   /**
+    * Invokes the model and returns the parsed data plus the raw text, finish
+    * reason, and token usage. On failure the call is retried up to maxRetries
+    * times; with retryWithContext, the previous error message is appended to
+    * the user prompt so the model can correct its output.
+    */
+   async callWithMetadata<T>(
+     options: LLMCallOptions<T>,
+   ): Promise<LLMResponse<T>> {
+     const {
+       systemPrompt,
+       userPrompt,
+       schema,
+       maxRetries = 2,
+       retryWithContext = true,
+       maxTokens,
+       skipSchemaValidation = false,
+       temperature,
+     } = options;
+
+     let currentPrompt = userPrompt;
+     let lastError: Error | null = null;
+
+     console.log(
+       `[LLMClient:call] Starting call to ${this.provider}/${this.modelName}`,
+     );
+     console.log(`[LLMClient:call] Prompt length: ${userPrompt.length} chars`);
+     if (maxTokens) {
+       console.log(`[LLMClient:call] Max tokens: ${maxTokens}`);
+     }
+
+     for (let attempt = 0; attempt <= maxRetries; attempt++) {
+       try {
+         console.log(
+           `[LLMClient:call] Attempt ${attempt + 1}/${maxRetries + 1}...`,
+         );
+         const attemptStartTime = Date.now();
+
+         const result = await this.rateLimiter.execute(async () => {
+           console.log(`[LLMClient:call] Invoking model...`);
+           const invokeStartTime = Date.now();
+
+           const modelToUse =
+             maxTokens || temperature !== undefined
+               ? this.getModelWithOptions({ maxTokens, temperature })
+               : this.model;
+
+           const response = await modelToUse.invoke([
+             { role: 'system', content: systemPrompt },
+             { role: 'user', content: currentPrompt },
+           ]);
+
+           console.log(
+             `[LLMClient:call] Model responded in ${Date.now() - invokeStartTime}ms`,
+           );
+
+           let usage: LLMUsage | null = null;
+           if (response.usage_metadata) {
+             const usageMeta = response.usage_metadata as {
+               input_tokens?: number;
+               output_tokens?: number;
+             };
+             usage = {
+               promptTokens: usageMeta.input_tokens || 0,
+               completionTokens: usageMeta.output_tokens || 0,
+               totalTokens:
+                 (usageMeta.input_tokens || 0) +
+                 (usageMeta.output_tokens || 0),
+             };
+             console.log(
+               `[LLMClient:call] Tokens used: ${usage.promptTokens} in, ${usage.completionTokens} out`,
+             );
+
+             if (this.tokenTracker) {
+               this.tokenTracker.addUsage(
+                 usage.promptTokens,
+                 usage.completionTokens,
+               );
+             }
+           }
+
+           const finishReason = this.extractFinishReason(response);
+           if (finishReason === 'length') {
+             console.warn(
+               `[LLMClient:call] Response truncated (finish_reason=length)`,
+             );
+           }
+
+           const content =
+             typeof response.content === 'string'
+               ? response.content
+               : JSON.stringify(response.content);
+
+           console.log(
+             `[LLMClient:call] Response length: ${content.length} chars, finish_reason: ${finishReason}`,
+           );
+
+           return { content, finishReason, usage };
+         });
+
+         console.log(
+           `[LLMClient:call] Attempt ${attempt + 1} completed in ${Date.now() - attemptStartTime}ms, parsing response...`,
+         );
+
+         const parsed = skipSchemaValidation
+           ? (parseJsonResponse(result.content, undefined) as T)
+           : parseJsonResponse(result.content, schema);
+         console.log(
+           `[LLMClient:call] Response parsed successfully${skipSchemaValidation ? ' (schema validation skipped)' : ''}`,
+         );
+
+         return {
+           data: parsed,
+           raw: result.content,
+           finishReason: result.finishReason,
+           usage: result.usage,
+         };
+       } catch (error) {
+         lastError = error instanceof Error ? error : new Error(String(error));
+         console.error(
+           `[LLMClient:call] Attempt ${attempt + 1} failed:`,
+           lastError.message,
+         );
+
+         if (this.isRateLimitError(lastError)) {
+           console.error(`[LLMClient:call] Rate limit error, not retrying`);
+           throw lastError;
+         }
+
+         if (attempt < maxRetries && retryWithContext) {
+           console.log(`[LLMClient:call] Will retry with error context`);
+           currentPrompt =
+             `${userPrompt}\n\n` +
+             `[Previous attempt failed with: ${lastError.message}]\n` +
+             `Please output valid JSON that matches the expected schema.`;
+         }
+       }
+     }
+
+     console.error(`[LLMClient:call] All attempts exhausted, throwing error`);
+     throw lastError;
+   }
+
+   private extractFinishReason(
+     response: Awaited<ReturnType<ChatOpenAI['invoke']>>,
+   ): LLMFinishReason {
+     const metadata = response.response_metadata as
+       | Record<string, unknown>
+       | undefined;
+     if (metadata?.finish_reason) {
+       const reason = metadata.finish_reason as string;
+       if (
+         reason === 'stop' ||
+         reason === 'length' ||
+         reason === 'content_filter' ||
+         reason === 'tool_calls'
+       ) {
+         return reason;
+       }
+     }
+     return null;
+   }
+
+   async callRaw(options: {
+     systemPrompt: string;
+     userPrompt: string;
+     maxTokens?: number;
+   }): Promise<string> {
+     const response = await this.callRawWithMetadata(options);
+     return response.raw;
+   }
+
+   async callRawWithMetadata(options: {
+     systemPrompt: string;
+     userPrompt: string;
+     maxTokens?: number;
+   }): Promise<Omit<LLMResponse<string>, 'data'> & { raw: string }> {
+     const { systemPrompt, userPrompt, maxTokens } = options;
+
+     return this.rateLimiter.execute(async () => {
+       const modelToUse = maxTokens
+         ? this.getModelWithOptions({ maxTokens })
+         : this.model;
+
+       const response = await modelToUse.invoke([
+         { role: 'system', content: systemPrompt },
+         { role: 'user', content: userPrompt },
+       ]);
+
+       let usage: LLMUsage | null = null;
+       if (response.usage_metadata) {
+         const usageMeta = response.usage_metadata as {
+           input_tokens?: number;
+           output_tokens?: number;
+         };
+         usage = {
+           promptTokens: usageMeta.input_tokens || 0,
+           completionTokens: usageMeta.output_tokens || 0,
+           totalTokens:
+             (usageMeta.input_tokens || 0) + (usageMeta.output_tokens || 0),
+         };
+
+         if (this.tokenTracker) {
+           this.tokenTracker.addUsage(
+             usage.promptTokens,
+             usage.completionTokens,
+           );
+         }
+       }
+
+       const finishReason = this.extractFinishReason(response);
+       const content =
+         typeof response.content === 'string'
+           ? response.content
+           : JSON.stringify(response.content);
+
+       return { raw: content, finishReason, usage };
+     });
+   }
+
+   private isRateLimitError(error: Error): boolean {
+     const message = error.message.toLowerCase();
+     return (
+       message.includes('rate limit') ||
+       message.includes('429') ||
+       message.includes('quota exceeded')
+     );
+   }
+
+   // ==========================================================================
+   // Anthropic Cache Control Support
+   // ==========================================================================
+
+   /**
+    * Cache-aware variant of callWithMetadata that talks to the Anthropic SDK
+    * directly so callers can mark individual system/user blocks with
+    * cache_control. Falls back to callWithMetadata for other providers.
+    */
+   async callWithCache<T>(
+     options: CacheAwareLLMCallOptions<T>,
+   ): Promise<LLMResponse<T>> {
+     const {
+       systemPrompt,
+       userPrompt,
+       systemBlocks,
+       userBlocks,
+       schema,
+       maxRetries = 2,
+       maxTokens,
+       skipSchemaValidation = false,
+       temperature,
+       rawText = false,
+     } = options;
+
+     if (this.provider !== 'anthropic') {
+       console.log(
+         `[LLMClient:callWithCache] Provider ${this.provider} doesn't support caching, using regular call`,
+       );
+       return this.callWithMetadata(options);
+     }
+
+     const cacheableCount =
+       (systemBlocks || []).filter((b) => b.cache_control).length +
+       (userBlocks || []).filter((b) => b.cache_control).length;
+     console.log(
+       `[LLMClient:callWithCache] ${cacheableCount} cacheable block(s)`,
+     );
+
+     let lastError: Error | null = null;
+
+     for (let attempt = 0; attempt <= maxRetries; attempt++) {
+       try {
+         console.log(
+           `[LLMClient:callWithCache] Attempt ${attempt + 1}/${maxRetries + 1}...`,
+         );
+
+         const result = await this.rateLimiter.execute(async () => {
+           // Use the key resolved at construction time rather than relying on
+           // the SDK reading ANTHROPIC_API_KEY from the environment again.
+           const anthropic = new Anthropic({
+             apiKey: this.providerConfig.apiKey,
+           });
+
+           const systemContent =
+             systemBlocks && systemBlocks.length > 0
+               ? systemBlocks.map((b) => ({
+                   type: 'text' as const,
+                   text: b.text,
+                   ...(b.cache_control
+                     ? { cache_control: b.cache_control }
+                     : {}),
+                 }))
+               : systemPrompt
+                 ? [{ type: 'text' as const, text: systemPrompt }]
+                 : [];
+
+           const userContent =
+             userBlocks && userBlocks.length > 0
+               ? userBlocks.map((b) => ({
+                   type: 'text' as const,
+                   text: b.text,
+                   ...(b.cache_control
+                     ? { cache_control: b.cache_control }
+                     : {}),
+                 }))
+               : userPrompt
+                 ? [{ type: 'text' as const, text: userPrompt }]
+                 : [];
+
+           const response = await anthropic.messages.create({
+             model: this.modelName,
+             max_tokens: maxTokens || 8192,
+             temperature: temperature ?? 0,
+             system: systemContent,
+             messages: [{ role: 'user', content: userContent }],
+           });
+
+           const textContent = response.content.find((c) => c.type === 'text');
+           const content =
+             textContent && 'text' in textContent ? textContent.text : '';
+
+           const apiUsage = response.usage as {
+             input_tokens: number;
+             output_tokens: number;
+             cache_creation_input_tokens?: number;
+             cache_read_input_tokens?: number;
+           };
+
+           const cacheRead = apiUsage.cache_read_input_tokens || 0;
+           const cacheCreation = apiUsage.cache_creation_input_tokens || 0;
+
+           if (cacheCreation > 0) {
+             console.log(
+               `[LLMClient:callWithCache] Cache WRITE: ${cacheCreation} tokens`,
+             );
+           }
+           if (cacheRead > 0) {
+             const savingsPercent = Math.round(
+               (cacheRead / (cacheRead + apiUsage.input_tokens)) * 100,
+             );
+             console.log(
+               `[LLMClient:callWithCache] Cache HIT: ${cacheRead} tokens (~${savingsPercent}% of prompt)`,
+             );
+           }
+           if (cacheCreation === 0 && cacheRead === 0) {
+             console.log(
+               `[LLMClient:callWithCache] No caching: ${apiUsage.input_tokens} input tokens`,
+             );
+           }
+
+           const usage: LLMUsage = {
+             promptTokens: apiUsage.input_tokens,
+             completionTokens: apiUsage.output_tokens,
+             totalTokens: apiUsage.input_tokens + apiUsage.output_tokens,
+           };
+
+           if (this.tokenTracker) {
+             this.tokenTracker.addUsage(
+               usage.promptTokens,
+               usage.completionTokens,
+             );
+           }
+
+           // Map Anthropic stop reasons onto the shared LLMFinishReason union.
+           const finishReason: LLMFinishReason =
+             response.stop_reason === 'end_turn' ||
+             response.stop_reason === 'stop_sequence'
+               ? 'stop'
+               : response.stop_reason === 'max_tokens'
+                 ? 'length'
+                 : response.stop_reason === 'tool_use'
+                   ? 'tool_calls'
+                   : null;
+
+           return { content, finishReason, usage };
+         });
+
+         let parsed: T;
+         if (rawText) {
+           parsed = result.content as unknown as T;
+         } else if (skipSchemaValidation) {
+           parsed = parseJsonResponse(result.content, undefined) as T;
+         } else {
+           parsed = parseJsonResponse(result.content, schema);
+         }
+
+         return {
+           data: parsed,
+           raw: result.content,
+           finishReason: result.finishReason,
+           usage: result.usage,
+         };
+       } catch (error) {
+         lastError = error instanceof Error ? error : new Error(String(error));
+         console.error(
+           `[LLMClient:callWithCache] Attempt ${attempt + 1} failed:`,
+           lastError.message,
+         );
+
+         if (this.isRateLimitError(lastError)) {
+           throw lastError;
+         }
+       }
+     }
+
+     throw lastError;
+   }
+
+   /** Builds a text block, optionally marked for ephemeral caching. */
+   static cacheableBlock(text: string, cache = true): CacheableBlock {
+     return cache
+       ? { type: 'text', text, cache_control: { type: 'ephemeral' } }
+       : { type: 'text', text };
+   }
+ }
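+
+ // Illustrative cache-aware call (a sketch; `sharedContext` and `question` are
+ // hypothetical placeholders, and ANTHROPIC_API_KEY must be set):
+ //
+ //   const client = new LLMClient({ provider: 'anthropic' });
+ //   const res = await client.callWithCache<string>({
+ //     systemPrompt: '',
+ //     userPrompt: '',
+ //     systemBlocks: [LLMClient.cacheableBlock(sharedContext)], // cached
+ //     userBlocks: [LLMClient.cacheableBlock(question, false)], // not cached
+ //     rawText: true, // return the text as-is instead of parsing JSON
+ //   });
+ //   console.log(res.data);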
+
+ // ============================================================================
+ // Singleton Instances
+ // ============================================================================
+
+ const sharedClients: Partial<Record<LLMProvider, LLMClient>> = {};
+
+ export function getSharedLLMClient(options?: LLMClientOptions): LLMClient {
+   const provider = options?.provider || 'openai';
+   if (!sharedClients[provider]) {
+     sharedClients[provider] = new LLMClient(options);
+   }
+   return sharedClients[provider]!;
+ }
+
+ export function resetSharedLLMClient(provider?: LLMProvider): void {
+   if (provider) {
+     delete sharedClients[provider];
+   } else {
+     for (const key of Object.keys(sharedClients) as LLMProvider[]) {
+       delete sharedClients[key];
+     }
+   }
+ }
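+
+ // getSharedLLMClient keeps one lazily created client per provider, so callers
+ // share its rate limiter and token tracker. Note that options only take
+ // effect on the first creation for a given provider:
+ //
+ //   const a = getSharedLLMClient({ provider: 'deepseek' });
+ //   const b = getSharedLLMClient({ provider: 'deepseek', temperature: 0.9 });
+ //   // a === b, and the temperature override above is ignored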
+
+ // ============================================================================
+ // Provider Detection
+ // ============================================================================
+
+ export function getAvailableProvider(): LLMProvider {
+   if (process.env.ANTHROPIC_API_KEY) return 'anthropic';
+   if (process.env.DEEPSEEK_API_KEY) return 'deepseek';
+   if (process.env.KIMI_API_KEY) return 'kimi';
+   if (process.env.OPENAI_API_KEY) return 'openai';
+   throw new Error(
+     'No LLM API key found. Please set ANTHROPIC_API_KEY, OPENAI_API_KEY, DEEPSEEK_API_KEY, or KIMI_API_KEY.',
+   );
+ }
+
+ export function isProviderAvailable(provider: LLMProvider): boolean {
+   switch (provider) {
+     case 'openai':
+       return !!process.env.OPENAI_API_KEY;
+     case 'deepseek':
+       return !!process.env.DEEPSEEK_API_KEY;
+     case 'anthropic':
+       return !!process.env.ANTHROPIC_API_KEY;
+     case 'kimi':
+       return !!process.env.KIMI_API_KEY;
+     default:
+       return false;
+   }
+ }
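+
+ // Providers are probed in a fixed priority order (anthropic > deepseek >
+ // kimi > openai). A sketch of preferring one provider with a detected
+ // fallback:
+ //
+ //   const provider = isProviderAvailable('anthropic')
+ //     ? 'anthropic'
+ //     : getAvailableProvider(); // throws if no API key is configured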
+
+ // ============================================================================
+ // Convenience Functions
+ // ============================================================================
+
+ export function createRequirementsClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   // Only pin a model for the providers this helper has an opinion about;
+   // otherwise leave it undefined so the provider's own default applies.
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.CHAT
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT_5_1
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.3,
+     ...options,
+   });
+ }
+
+ export function createCreativeClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.REASONER
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT4O
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.7,
+     ...options,
+   });
+ }
+
+ export function createFixClient(
+   options?: Partial<LLMClientOptions>,
+ ): LLMClient {
+   const provider = options?.provider || getAvailableProvider();
+   const defaultModel =
+     provider === 'deepseek'
+       ? DEEPSEEK_MODELS.CHAT
+       : provider === 'openai'
+         ? OPENAI_MODELS.GPT4O_MINI
+         : undefined;
+   return new LLMClient({
+     provider,
+     model: defaultModel,
+     temperature: 0.2,
+     ...options,
+   });
+ }
+
+ export function createDeepSeekClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'deepseek',
+     model: DEEPSEEK_MODELS.CHAT,
+     ...options,
+   });
+ }
+
+ export function createOpenAIClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'openai',
+     model: OPENAI_MODELS.GPT4O,
+     ...options,
+   });
+ }
+
+ export function createAnthropicClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'anthropic',
+     model: ANTHROPIC_MODELS.CLAUDE_SONNET_4_5,
+     ...options,
+   });
+ }
+
+ export function createKimiClient(
+   options?: Partial<Omit<LLMClientOptions, 'provider'>>,
+ ): LLMClient {
+   return new LLMClient({
+     provider: 'kimi',
+     model: KIMI_MODELS.K2_5,
+     ...options,
+   });
+ }