llmist 15.13.0 → 15.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -5432,12 +5432,13 @@ var init_anthropic = __esm({
5432
5432
  );
5433
5433
  }
5434
5434
  buildApiRequest(options, descriptor, spec, messages) {
5435
+ const cachingEnabled = options.caching?.enabled !== false;
5435
5436
  const systemMessages = messages.filter((message) => message.role === "system");
5436
5437
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
5437
5438
  type: "text",
5438
5439
  text: extractMessageText(m.content),
5439
- // Add cache_control to the LAST system message block
5440
- ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5440
+ // Add cache_control to the LAST system message block (only when caching is enabled)
5441
+ ...cachingEnabled && index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
5441
5442
  })) : void 0;
5442
5443
  const nonSystemMessages = messages.filter(
5443
5444
  (message) => message.role !== "system"
@@ -5450,7 +5451,7 @@ var init_anthropic = __esm({
5450
5451
  role: message.role,
5451
5452
  content: this.convertToAnthropicContent(
5452
5453
  message.content,
5453
- message.role === "user" && index === lastUserIndex
5454
+ cachingEnabled && message.role === "user" && index === lastUserIndex
5454
5455
  )
5455
5456
  }));
5456
5457
  const defaultMaxTokens = spec?.maxOutputTokens ?? ANTHROPIC_DEFAULT_MAX_OUTPUT_TOKENS;
@@ -5667,6 +5668,177 @@ var init_anthropic = __esm({
5667
5668
  }
5668
5669
  });
5669
5670
 
5671
+ // src/providers/gemini-cache-manager.ts
5672
+ var import_node_crypto3, GeminiCacheManager;
5673
+ var init_gemini_cache_manager = __esm({
5674
+ "src/providers/gemini-cache-manager.ts"() {
5675
+ "use strict";
5676
+ import_node_crypto3 = require("crypto");
GeminiCacheManager = class {
  /**
   * @param client - Gemini SDK client. Only `client.caches.create` and
   *   `client.caches.delete` are used by this class.
   */
  constructor(client) {
    this.client = client;
  }
  /**
   * Metadata of the cache most recently created by this manager
   * (`{ name, model, contentHash, expireTime }`), or `null` when none is tracked.
   */
  activeCache = null;
  /**
   * Get or create a cache for the given content.
   *
   * Returns `null` when caching was skipped: disabled config, nothing
   * cacheable, estimated size below the threshold, the API returned no cache
   * name, or the create call failed (failure is logged, never thrown).
   *
   * @param model - Gemini model name (e.g., "gemini-2.5-flash")
   * @param allContents - All Gemini-formatted contents (system + conversation)
   * @param config - Caching configuration (`enabled`, `scope`, `ttl`, `minTokenThreshold`)
   * @param lastUserMessageIndex - Index of the last user message (content from this index on is not cached in "conversation" scope)
   * @returns `{ cacheName, cachedContentCount }` or `null`
   */
  async getOrCreateCache(model, allContents, config, lastUserMessageIndex) {
    if (!config?.enabled) return null;
    const scope = config.scope ?? "conversation";
    const ttl = config.ttl ?? "3600s";
    const minTokenThreshold = config.minTokenThreshold ?? 32768;
    const cacheableContents = this.selectCacheableContents(
      allContents,
      scope,
      lastUserMessageIndex
    );
    if (cacheableContents.length === 0) return null;
    if (this.estimateTokenCount(cacheableContents) < minTokenThreshold) return null;
    const contentHash = this.computeContentHash(cacheableContents, model);
    if (this.activeCache && this.canReuseCache(this.activeCache, model, contentHash)) {
      return {
        cacheName: this.activeCache.name,
        cachedContentCount: cacheableContents.length
      };
    }
    try {
      // Only one cache is tracked at a time: drop the stale one before replacing it.
      await this.cleanupActiveCache();
      const requestedAt = Date.now();
      const response = await this.client.caches.create({
        model,
        config: {
          contents: cacheableContents,
          ttl,
          displayName: `llmist-${scope}-${requestedAt}`
        }
      });
      if (!response.name) {
        return null;
      }
      this.activeCache = {
        name: response.name,
        model,
        contentHash,
        // If the API omits the expiry, derive one from the requested TTL so the
        // cache is not reused indefinitely after it has expired server-side.
        expireTime: response.expireTime ?? this.estimateExpireTime(ttl, requestedAt)
      };
      return {
        cacheName: response.name,
        cachedContentCount: cacheableContents.length
      };
    } catch (error) {
      console.warn("Gemini cache creation failed, continuing without cache:", error);
      return null;
    }
  }
  /**
   * Clean up the active cache (best-effort).
   * Deletion errors are ignored by `cleanupActiveCache`.
   */
  async dispose() {
    await this.cleanupActiveCache();
  }
  /**
   * Select which contents to cache based on scope.
   *
   * - "system": the leading run of user/model pairs whose model turn is the
   *   single text part "Understood." (the shape this heuristic treats as
   *   system-derived messages)
   * - any other scope ("conversation"): everything before `lastUserMessageIndex`
   *
   * @returns A (possibly empty) prefix slice of `allContents`
   */
  selectCacheableContents(allContents, scope, lastUserMessageIndex) {
    if (scope === "system") {
      let systemEndIndex = 0;
      for (let i = 0; i < allContents.length; i++) {
        const content = allContents[i];
        if (this.isSystemAcknowledgement(content)) {
          systemEndIndex = i + 1;
          continue;
        }
        // A user turn belongs to the system prefix only when the very next
        // turn is the acknowledgement; anything else ends the prefix.
        if (content.role === "user" && this.isSystemAcknowledgement(allContents[i + 1])) {
          continue;
        }
        break;
      }
      return allContents.slice(0, systemEndIndex);
    }
    if (lastUserMessageIndex <= 0) return [];
    return allContents.slice(0, lastUserMessageIndex);
  }
  /**
   * True when `content` is a model turn consisting of exactly one text part
   * equal to "Understood.". Tolerates `undefined` and contents without `parts`.
   */
  isSystemAcknowledgement(content) {
    const parts = content?.parts;
    return content?.role === "model" && Array.isArray(parts) && parts.length === 1 && parts[0]?.text === "Understood.";
  }
  /**
   * Estimate token count from contents using a character-based heuristic
   * (4 characters per token). Each inline-data part is counted as a flat
   * 258 tokens; other non-text parts contribute nothing. Contents without
   * `parts` are treated as empty instead of throwing.
   */
  estimateTokenCount(contents) {
    const charsPerToken = 4;
    const inlineDataTokens = 258;
    let totalChars = 0;
    for (const content of contents) {
      for (const part of content.parts ?? []) {
        if (typeof part?.text === "string") {
          totalChars += part.text.length;
        } else if (part?.inlineData != null) {
          totalChars += inlineDataTokens * charsPerToken;
        }
      }
    }
    return Math.ceil(totalChars / charsPerToken);
  }
  /**
   * Compute a stable SHA-256 hex digest of the cacheable contents for change
   * detection.
   *
   * Every field is fed with a tag and length prefix so that different
   * part/turn boundaries cannot produce the same byte stream, and parts that
   * are neither text nor inline data are hashed via their JSON form instead
   * of being ignored.
   */
  computeContentHash(contents, model) {
    const hash = import_node_crypto3.createHash("sha256");
    const feed = (tag, value) => {
      const text = String(value ?? "");
      hash.update(`${tag}:${text.length}:`);
      hash.update(text);
    };
    feed("model", model);
    for (const content of contents) {
      feed("role", content.role);
      for (const part of content.parts ?? []) {
        if (typeof part?.text === "string") {
          feed("text", part.text);
        } else if (part?.inlineData != null) {
          feed("mime", part.inlineData.mimeType);
          feed("data", part.inlineData.data);
        } else {
          let serialized;
          try {
            serialized = JSON.stringify(part) ?? "";
          } catch {
            // Unserializable part: fall back to a coarse marker rather than
            // failing the request; worst case is an unnecessary cache miss/hit
            // no worse than ignoring the part entirely.
            serialized = String(part);
          }
          feed("part", serialized);
        }
      }
    }
    return hash.digest("hex");
  }
  /**
   * Check if an existing cache can be reused: same model, same content hash,
   * and at least 60 seconds left before `expireTime`. An empty `expireTime`
   * means the expiry is unknown and is not checked; an unparsable one is
   * treated as expired.
   */
  canReuseCache(cache, model, contentHash) {
    if (cache.model !== model || cache.contentHash !== contentHash) return false;
    if (!cache.expireTime) return true;
    const expiresAt = new Date(cache.expireTime).getTime();
    if (Number.isNaN(expiresAt)) return false;
    return expiresAt - Date.now() >= 6e4;
  }
  /**
   * Derive an ISO expiry timestamp from a TTL of the form "<seconds>s".
   *
   * @param ttl - TTL string as sent to `caches.create` (e.g. "3600s")
   * @param fromMs - Epoch milliseconds the TTL is counted from
   * @returns ISO-8601 timestamp, or "" when the TTL cannot be interpreted
   */
  estimateExpireTime(ttl, fromMs) {
    const match = /^(\d+(?:\.\d+)?)s$/.exec(String(ttl));
    if (!match) return "";
    const expiresAt = fromMs + Number(match[1]) * 1e3;
    // 8.64e15 ms is the largest magnitude a Date can represent.
    if (!Number.isFinite(expiresAt) || Math.abs(expiresAt) > 864e13) return "";
    return new Date(expiresAt).toISOString();
  }
  /**
   * Delete the active cache (best-effort). The local reference is cleared
   * before the delete call is awaited so the cache being removed is never
   * handed out for reuse in the meantime.
   */
  async cleanupActiveCache() {
    const stale = this.activeCache;
    if (!stale) return;
    this.activeCache = null;
    try {
      await this.client.caches.delete({ name: stale.name });
    } catch {
      // Deliberately ignored: deletion is best-effort.
    }
  }
};
5839
+ }
5840
+ });
5841
+
5670
5842
  // src/providers/gemini-image-models.ts
5671
5843
  function getGeminiImageModelSpec(modelId) {
5672
5844
  return geminiImageModels.find((m) => m.modelId === modelId);
@@ -6236,6 +6408,7 @@ var init_gemini = __esm({
6236
6408
  init_messages();
6237
6409
  init_base_provider();
6238
6410
  init_constants2();
6411
+ init_gemini_cache_manager();
6239
6412
  init_gemini_image_models();
6240
6413
  init_gemini_models();
6241
6414
  init_gemini_speech_models();
@@ -6261,12 +6434,62 @@ var init_gemini = __esm({
6261
6434
  };
6262
6435
  GeminiGenerativeProvider = class extends BaseProviderAdapter {
6263
6436
  providerId = "gemini";
6437
+ cacheManager;
6438
+ constructor(client) {
6439
+ super(client);
6440
+ this.cacheManager = new GeminiCacheManager(client);
6441
+ }
6264
6442
  supports(descriptor) {
6265
6443
  return descriptor.provider === this.providerId;
6266
6444
  }
6267
6445
  getModelSpecs() {
6268
6446
  return GEMINI_MODELS;
6269
6447
  }
6448
+ /**
6449
+ * Override the base stream method to inject cache logic.
6450
+ *
6451
+ * When caching is enabled, we:
6452
+ * 1. Prepare messages as usual
6453
+ * 2. Attempt to get/create a cache for the cacheable prefix
6454
+ * 3. If a cache is available, strip cached contents from the request and add cachedContent ref
6455
+ * 4. Otherwise, proceed normally (graceful degradation)
6456
+ */
6457
+ async *stream(options, descriptor, spec) {
6458
+ const preparedMessages = this.prepareMessages(options.messages);
6459
+ const contents = this.convertMessagesToContents(preparedMessages);
6460
+ const cachingConfig = options.caching;
6461
+ let cacheName = null;
6462
+ let cachedContentCount = 0;
6463
+ if (cachingConfig?.enabled) {
6464
+ let lastUserIndex = -1;
6465
+ for (let i = contents.length - 1; i >= 0; i--) {
6466
+ if (contents[i].role === "user") {
6467
+ lastUserIndex = i;
6468
+ break;
6469
+ }
6470
+ }
6471
+ const cacheResult = await this.cacheManager.getOrCreateCache(
6472
+ descriptor.name,
6473
+ contents,
6474
+ cachingConfig,
6475
+ lastUserIndex
6476
+ );
6477
+ if (cacheResult) {
6478
+ cacheName = cacheResult.cacheName;
6479
+ cachedContentCount = cacheResult.cachedContentCount;
6480
+ }
6481
+ }
6482
+ const payload = this.buildApiRequestFromContents(
6483
+ options,
6484
+ descriptor,
6485
+ spec,
6486
+ contents,
6487
+ cacheName,
6488
+ cachedContentCount
6489
+ );
6490
+ const rawStream = await this.executeStreamRequest(payload, options.signal);
6491
+ yield* this.normalizeProviderStream(rawStream);
6492
+ }
6270
6493
  // =========================================================================
6271
6494
  // Image Generation
6272
6495
  // =========================================================================
@@ -6402,6 +6625,17 @@ var init_gemini = __esm({
6402
6625
  }
6403
6626
  buildApiRequest(options, descriptor, _spec, messages) {
6404
6627
  const contents = this.convertMessagesToContents(messages);
6628
+ return this.buildApiRequestFromContents(options, descriptor, _spec, contents, null, 0);
6629
+ }
6630
+ /**
6631
+ * Build API request from pre-converted Gemini contents.
6632
+ *
6633
+ * When a cache name is provided, the cached prefix is stripped from contents
6634
+ * and the cache reference is added to the config. This tells Gemini to use
6635
+ * the pre-computed KV pairs instead of reprocessing the cached content.
6636
+ */
6637
+ buildApiRequestFromContents(options, descriptor, _spec, contents, cacheName, cachedContentCount) {
6638
+ const effectiveContents = cacheName ? contents.slice(cachedContentCount) : contents;
6405
6639
  const generationConfig = this.buildGenerationConfig(options);
6406
6640
  const thinkingConfig = resolveGeminiThinkingConfig(options.reasoning, descriptor.name);
6407
6641
  const config = {
@@ -6415,11 +6649,13 @@ var init_gemini = __esm({
6415
6649
  }
6416
6650
  },
6417
6651
  ...thinkingConfig ?? {},
6652
+ // Add cache reference if available
6653
+ ...cacheName ? { cachedContent: cacheName } : {},
6418
6654
  ...options.extra
6419
6655
  };
6420
6656
  return {
6421
6657
  model: descriptor.name,
6422
- contents,
6658
+ contents: effectiveContents,
6423
6659
  config
6424
6660
  };
6425
6661
  }
@@ -10397,6 +10633,7 @@ var init_builder = __esm({
10397
10633
  // When a gadget calls withParentContext(ctx), this config is shared
10398
10634
  sharedRetryConfig;
10399
10635
  reasoningConfig;
10636
+ cachingConfig;
10400
10637
  constructor(client) {
10401
10638
  this.client = client;
10402
10639
  }
@@ -11036,6 +11273,62 @@ var init_builder = __esm({
11036
11273
  this.reasoningConfig = { enabled: false };
11037
11274
  return this;
11038
11275
  }
11276
+ /**
11277
+ * Enable context caching for supported providers.
11278
+ *
11279
+ * Can be called with:
11280
+ * - No args: enables caching with defaults (`{ enabled: true }`)
11281
+ * - A full config object: `withCaching({ enabled: true, scope: "system", ttl: "7200s" })`
11282
+ *
11283
+ * Provider behavior:
11284
+ * - **Anthropic**: Caching is always-on by default via `cache_control` markers.
11285
+ * Calling `withCaching()` explicitly is a no-op (it's already enabled).
11286
+ * - **Gemini**: Creates an explicit cache via `caches.create()` for the configured scope.
11287
+ * - **OpenAI**: Server-side automatic caching (no-op).
11288
+ *
11289
+ * @param config - Optional caching configuration
11290
+ * @returns This builder for chaining
11291
+ *
11292
+ * @example
11293
+ * ```typescript
11294
+ * // Simple — enable with defaults
11295
+ * LLMist.createAgent()
11296
+ * .withModel("gemini:gemini-2.5-flash")
11297
+ * .withCaching()
11298
+ * .ask("Analyze this large codebase...");
11299
+ *
11300
+ * // Cache only system prompt with longer TTL
11301
+ * LLMist.createAgent()
11302
+ * .withModel("gemini:gemini-2.5-pro")
11303
+ * .withCaching({ enabled: true, scope: "system", ttl: "7200s" })
11304
+ * .ask("...");
11305
+ * ```
11306
+ */
11307
+ withCaching(config) {
11308
+ this.cachingConfig = config ?? { enabled: true };
11309
+ return this;
11310
+ }
11311
+ /**
11312
+ * Explicitly disable context caching.
11313
+ *
11314
+ * For Anthropic, this removes `cache_control` markers from requests,
11315
+ * opting out of prompt caching entirely.
11316
+ *
11317
+ * @returns This builder for chaining
11318
+ *
11319
+ * @example
11320
+ * ```typescript
11321
+ * // Disable Anthropic's automatic caching
11322
+ * LLMist.createAgent()
11323
+ * .withModel("sonnet")
11324
+ * .withoutCaching()
11325
+ * .ask("...");
11326
+ * ```
11327
+ */
11328
+ withoutCaching() {
11329
+ this.cachingConfig = { enabled: false };
11330
+ return this;
11331
+ }
11039
11332
  /**
11040
11333
  * Set subagent configuration overrides.
11041
11334
  *
@@ -11322,6 +11615,7 @@ ${endPrefix}`
11322
11615
  rateLimitConfig: this.rateLimitConfig,
11323
11616
  signal: this.signal,
11324
11617
  reasoning: this.reasoningConfig,
11618
+ caching: this.cachingConfig,
11325
11619
  subagentConfig: this.subagentConfig,
11326
11620
  // Tree context for shared tree model (subagents share parent's tree)
11327
11621
  parentTree: this.parentContext?.tree,
@@ -11510,6 +11804,7 @@ ${endPrefix}`
11510
11804
  rateLimitConfig: this.rateLimitConfig,
11511
11805
  signal: this.signal,
11512
11806
  reasoning: this.reasoningConfig,
11807
+ caching: this.cachingConfig,
11513
11808
  subagentConfig: this.subagentConfig,
11514
11809
  // Tree context for shared tree model (subagents share parent's tree)
11515
11810
  parentTree: this.parentContext?.tree,
@@ -14125,6 +14420,7 @@ var init_agent = __esm({
14125
14420
  // Cancellation
14126
14421
  signal;
14127
14422
  reasoning;
14423
+ caching;
14128
14424
  // Retry configuration
14129
14425
  retryConfig;
14130
14426
  // Rate limit tracker for proactive throttling
@@ -14217,6 +14513,7 @@ var init_agent = __esm({
14217
14513
  }
14218
14514
  this.signal = options.signal;
14219
14515
  this.reasoning = options.reasoning;
14516
+ this.caching = options.caching;
14220
14517
  this.retryConfig = options.sharedRetryConfig ?? resolveRetryConfig(options.retryConfig);
14221
14518
  if (options.sharedRateLimitTracker) {
14222
14519
  this.rateLimitTracker = options.sharedRateLimitTracker;
@@ -14934,6 +15231,19 @@ var init_agent = __esm({
14934
15231
  }
14935
15232
  return void 0;
14936
15233
  }
15234
+ /**
15235
+ * Resolve caching configuration.
15236
+ *
15237
+ * Priority: explicit config > default enabled (preserves Anthropic's existing behavior)
15238
+ * Default is `{ enabled: true }` which means:
15239
+ * - Anthropic: `cache_control` markers are added (existing behavior preserved)
15240
+ * - Gemini: Cache manager is consulted but skips if no explicit config was set
15241
+ * - OpenAI: No-op (server-side automatic)
15242
+ */
15243
+ resolveCachingConfig() {
15244
+ if (this.caching !== void 0) return this.caching;
15245
+ return { enabled: true };
15246
+ }
14937
15247
  /**
14938
15248
  * Prepare LLM call options, create tree node, and process beforeLLMCall controller.
14939
15249
  * @returns options, node ID, and optional skipWithSynthetic response if controller wants to skip
@@ -14941,13 +15251,15 @@ var init_agent = __esm({
14941
15251
  async prepareLLMCall(iteration) {
14942
15252
  const spec = this.client.modelRegistry?.getModelSpec?.(this.model);
14943
15253
  const reasoning = this.resolveReasoningConfig(spec);
15254
+ const caching = this.resolveCachingConfig();
14944
15255
  let llmOptions = {
14945
15256
  model: this.model,
14946
15257
  messages: this.conversation.getMessages(),
14947
15258
  temperature: this.temperature,
14948
15259
  maxTokens: this.defaultMaxTokens,
14949
15260
  signal: this.signal,
14950
- reasoning
15261
+ reasoning,
15262
+ caching
14951
15263
  };
14952
15264
  const llmNode = this.tree.addLLMCall({
14953
15265
  iteration,