llmist 12.3.5 → 12.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1423,6 +1423,241 @@ var init_model_shortcuts = __esm({
1423
1423
  }
1424
1424
  });
1425
1425
 
1426
+ // src/core/rate-limit.ts
1427
/**
 * Normalise a user-supplied rate limit configuration.
 *
 * - No config at all: rate limiting is disabled and only the default safety
 *   margin is reported.
 * - Otherwise the three optional limits are copied through unchanged, the
 *   safety margin falls back to the default when nullish, and `enabled`
 *   falls back to "at least one limit is set AND the default is enabled".
 *
 * @param config - Optional rate limit settings (may be undefined/null)
 * @returns Resolved configuration with `safetyMargin` and `enabled` always set
 */
function resolveRateLimitConfig(config) {
  if (!config) {
    return { safetyMargin: DEFAULT_RATE_LIMIT_CONFIG.safetyMargin, enabled: false };
  }
  const { requestsPerMinute, tokensPerMinute, tokensPerDay } = config;
  // A limit counts as configured as soon as it is anything other than undefined.
  const anyLimitConfigured = [requestsPerMinute, tokensPerMinute, tokensPerDay].some(
    (limit) => limit !== void 0
  );
  const resolved = {
    requestsPerMinute,
    tokensPerMinute,
    tokensPerDay,
    safetyMargin: config.safetyMargin ?? DEFAULT_RATE_LIMIT_CONFIG.safetyMargin,
    // An explicit `enabled` always wins; otherwise limits must be present.
    enabled: config.enabled ?? (anyLimitConfigured && DEFAULT_RATE_LIMIT_CONFIG.enabled)
  };
  return resolved;
}
1440
// Module body for src/core/rate-limit.ts. The assignments below run when the
// `__esm` wrapper (defined outside this view) invokes the function.
var DEFAULT_RATE_LIMIT_CONFIG, RateLimitTracker;
var init_rate_limit = __esm({
  "src/core/rate-limit.ts"() {
    "use strict";
    /** Length of the sliding window used for per-minute limits, in ms. */
    const WINDOW_MS = 6e4;
    /**
     * Coerce a token count to a safe non-negative number. A single NaN or
     * undefined would otherwise poison every running total.
     */
    const toTokenCount = (value) => Number.isFinite(value) && value > 0 ? value : 0;
    DEFAULT_RATE_LIMIT_CONFIG = {
      safetyMargin: 0.9,
      enabled: true
    };
    /**
     * Tracks request and token usage so callers can throttle proactively.
     *
     * Per-minute limits use a sliding 60-second window of recorded entries;
     * the daily token limit uses a counter that resets when the UTC date
     * changes. All thresholds are scaled by `config.safetyMargin`.
     */
    RateLimitTracker = class {
      /** Resolved configuration (see resolveRateLimitConfig). */
      config;
      /** Timestamps (ms) of requests inside the current minute window, oldest first. */
      requestTimestamps = [];
      /** `{ timestamp, tokens }` entries inside the current minute window, oldest first. */
      tokenUsage = [];
      /** Tokens consumed during the current UTC day. */
      dailyTokens = 0;
      /** Date string (YYYY-MM-DD UTC) the daily counter belongs to. */
      dailyResetDate;
      /**
       * @param config - Optional rate limit configuration; resolved with defaults
       */
      constructor(config) {
        this.config = resolveRateLimitConfig(config);
        this.dailyResetDate = this.getCurrentDateUTC();
      }
      /**
       * Record a completed request with its token usage.
       *
       * Non-finite or negative counts are treated as 0 so that a missing
       * usage report cannot turn the running totals into NaN.
       *
       * @param inputTokens - Number of input tokens used
       * @param outputTokens - Number of output tokens generated
       */
      recordUsage(inputTokens, outputTokens) {
        const now = Date.now();
        const totalTokens = toTokenCount(inputTokens) + toTokenCount(outputTokens);
        this.requestTimestamps.push(now);
        this.tokenUsage.push({ timestamp: now, tokens: totalTokens });
        // Roll the day over first so the tokens land in the correct day bucket.
        this.checkDailyReset();
        this.dailyTokens += totalTokens;
        this.pruneOldEntries(now);
      }
      /**
       * Calculate the delay needed before the next request.
       *
       * @returns Delay in whole milliseconds; 0 when disabled or no limit is hit
       */
      getRequiredDelayMs() {
        if (!this.config.enabled) {
          return 0;
        }
        const now = Date.now();
        this.pruneOldEntries(now);
        this.checkDailyReset();
        let maxDelay = 0;
        if (this.config.requestsPerMinute !== void 0) {
          maxDelay = Math.max(maxDelay, this.calculateRpmDelay(now));
        }
        if (this.config.tokensPerMinute !== void 0) {
          maxDelay = Math.max(maxDelay, this.calculateTpmDelay(now));
        }
        if (this.config.tokensPerDay !== void 0) {
          const effectiveLimit = this.config.tokensPerDay * this.config.safetyMargin;
          if (this.dailyTokens >= effectiveLimit) {
            // The daily budget only frees up when the UTC day rolls over.
            maxDelay = Math.max(maxDelay, this.getTimeUntilMidnightUTC());
          }
        }
        return Math.ceil(maxDelay);
      }
      /**
       * Check if any configured limit has reached its safety-margin threshold.
       *
       * @returns true if any limit is at or above `limit * safetyMargin`;
       *   always false when rate limiting is disabled
       */
      isApproachingLimit() {
        if (!this.config.enabled) {
          return false;
        }
        const now = Date.now();
        this.pruneOldEntries(now);
        this.checkDailyReset();
        const { requestsPerMinute, tokensPerMinute, tokensPerDay, safetyMargin } = this.config;
        if (requestsPerMinute !== void 0 && this.requestTimestamps.length >= requestsPerMinute * safetyMargin) {
          return true;
        }
        if (tokensPerMinute !== void 0) {
          const currentTpm = this.tokenUsage.reduce((sum, entry) => sum + entry.tokens, 0);
          if (currentTpm >= tokensPerMinute * safetyMargin) {
            return true;
          }
        }
        if (tokensPerDay !== void 0 && this.dailyTokens >= tokensPerDay * safetyMargin) {
          return true;
        }
        return false;
      }
      /**
       * Get current usage statistics.
       *
       * @returns `{ rpm, tpm, dailyTokens, isApproachingLimit, requiredDelayMs }`
       */
      getUsageStats() {
        const now = Date.now();
        this.pruneOldEntries(now);
        this.checkDailyReset();
        return {
          rpm: this.requestTimestamps.length,
          tpm: this.tokenUsage.reduce((sum, entry) => sum + entry.tokens, 0),
          dailyTokens: this.dailyTokens,
          isApproachingLimit: this.isApproachingLimit(),
          requiredDelayMs: this.getRequiredDelayMs()
        };
      }
      /**
       * Reset all tracking state (windows, daily counter, daily date marker).
       */
      reset() {
        this.requestTimestamps = [];
        this.tokenUsage = [];
        this.dailyTokens = 0;
        this.dailyResetDate = this.getCurrentDateUTC();
      }
      /**
       * Replace the configuration; recorded usage is kept.
       *
       * @param config - New configuration to apply
       */
      updateConfig(config) {
        this.config = resolveRateLimitConfig(config);
      }
      // ─────────────────────────────────────────────────────────────────────────
      // Private methods
      // ─────────────────────────────────────────────────────────────────────────
      /**
       * Calculate delay needed based on the RPM limit.
       *
       * Waits until enough of the oldest requests have left the window for the
       * count to drop strictly below the effective limit, not just the single
       * oldest one.
       */
      calculateRpmDelay(now) {
        const effectiveLimit = this.config.requestsPerMinute * this.config.safetyMargin;
        const currentRpm = this.requestTimestamps.length;
        if (currentRpm < effectiveLimit || currentRpm === 0) {
          return 0;
        }
        // Smallest k with (currentRpm - k) < effectiveLimit, capped to what we hold.
        const excess = currentRpm - effectiveLimit;
        const mustExpire = Number.isFinite(excess) ? Math.min(currentRpm, Math.floor(excess) + 1) : 1;
        const gatingTimestamp = this.requestTimestamps[mustExpire - 1];
        return Math.max(0, gatingTimestamp + WINDOW_MS - now);
      }
      /**
       * Calculate delay needed based on the TPM limit.
       *
       * Walks entries oldest-first and waits for the first expiry that brings
       * the window total strictly below the effective limit, matching the
       * `currentTpm < effectiveLimit` gate above. If no expiry achieves that
       * (e.g. a limit of 0), waits for the newest entry to expire.
       */
      calculateTpmDelay(now) {
        const effectiveLimit = this.config.tokensPerMinute * this.config.safetyMargin;
        const currentTpm = this.tokenUsage.reduce((sum, entry) => sum + entry.tokens, 0);
        if (currentTpm < effectiveLimit) {
          return 0;
        }
        const oldestFirst = [...this.tokenUsage].sort((a, b) => a.timestamp - b.timestamp);
        let remaining = currentTpm;
        let delay = 0;
        for (const entry of oldestFirst) {
          remaining -= entry.tokens;
          delay = entry.timestamp + WINDOW_MS - now;
          if (remaining < effectiveLimit) {
            break;
          }
        }
        return Math.max(0, delay);
      }
      /**
       * Remove entries older than the sliding window from both lists.
       */
      pruneOldEntries(now) {
        const cutoff = now - WINDOW_MS;
        while (this.requestTimestamps.length > 0 && this.requestTimestamps[0] < cutoff) {
          this.requestTimestamps.shift();
        }
        while (this.tokenUsage.length > 0 && this.tokenUsage[0].timestamp < cutoff) {
          this.tokenUsage.shift();
        }
      }
      /**
       * Reset the daily counter if the UTC date has changed since the last check.
       */
      checkDailyReset() {
        const currentDate = this.getCurrentDateUTC();
        if (currentDate !== this.dailyResetDate) {
          this.dailyTokens = 0;
          this.dailyResetDate = currentDate;
        }
      }
      /**
       * Get current date in YYYY-MM-DD format (UTC).
       */
      getCurrentDateUTC() {
        return (/* @__PURE__ */ new Date()).toISOString().split("T")[0];
      }
      /**
       * Calculate milliseconds until the next midnight UTC.
       */
      getTimeUntilMidnightUTC() {
        const now = /* @__PURE__ */ new Date();
        const midnight = new Date(
          Date.UTC(now.getUTCFullYear(), now.getUTCMonth(), now.getUTCDate() + 1)
        );
        return midnight.getTime() - now.getTime();
      }
    };
  }
});
1660
+
1426
1661
  // src/core/retry.ts
1427
1662
  function resolveRetryConfig(config) {
1428
1663
  if (!config) {
@@ -1437,7 +1672,9 @@ function resolveRetryConfig(config) {
1437
1672
  randomize: config.randomize ?? DEFAULT_RETRY_CONFIG.randomize,
1438
1673
  onRetry: config.onRetry,
1439
1674
  onRetriesExhausted: config.onRetriesExhausted,
1440
- shouldRetry: config.shouldRetry
1675
+ shouldRetry: config.shouldRetry,
1676
+ respectRetryAfter: config.respectRetryAfter ?? DEFAULT_RETRY_CONFIG.respectRetryAfter,
1677
+ maxRetryAfterMs: config.maxRetryAfterMs ?? DEFAULT_RETRY_CONFIG.maxRetryAfterMs
1441
1678
  };
1442
1679
  }
1443
1680
  function isRetryableError(error) {
@@ -1461,6 +1698,12 @@ function isRetryableError(error) {
1461
1698
  if (message.includes("overloaded") || message.includes("capacity")) {
1462
1699
  return true;
1463
1700
  }
1701
+ if (message.includes("resource_exhausted") || message.includes("quota exceeded") || message.includes("unavailable") || message.includes("deadline_exceeded")) {
1702
+ return true;
1703
+ }
1704
+ if (message.includes("overloaded_error") || message.includes("api_error")) {
1705
+ return true;
1706
+ }
1464
1707
  if (message.includes("401") || message.includes("403") || message.includes("400") || message.includes("404") || message.includes("authentication") || message.includes("unauthorized") || message.includes("forbidden") || message.includes("invalid") || message.includes("content policy") || name === "AuthenticationError" || name === "BadRequestError" || name === "NotFoundError" || name === "PermissionDeniedError") {
1465
1708
  return false;
1466
1709
  }
@@ -1535,10 +1778,54 @@ function formatLLMError(error) {
1535
1778
  if (firstPart && firstPart.length > 10 && firstPart.length < 150) {
1536
1779
  return firstPart.trim();
1537
1780
  }
1538
- return message.slice(0, 150).trim() + "...";
1781
+ return `${message.slice(0, 150).trim()}...`;
1539
1782
  }
1540
1783
  return message;
1541
1784
  }
1785
/**
 * Parse a Retry-After header value into a delay in milliseconds.
 *
 * Accepts the two forms the header can take:
 *  - a non-negative number of seconds (an optional trailing "s"/"sec"/"seconds"
 *    unit is tolerated), and
 *  - a date string understood by `Date.parse`.
 *
 * The value must be numeric as a whole to be read as seconds; date strings
 * that merely start with digits (e.g. ISO timestamps) are parsed as dates.
 *
 * @param value - Raw header value (string; a number or string[] is tolerated)
 * @returns Delay in ms, or null when the value is unusable or not in the future
 */
function parseRetryAfterHeader(value) {
  const raw = Array.isArray(value) ? value[0] : value;
  if (raw === void 0 || raw === null) {
    return null;
  }
  const text = String(raw).trim();
  if (text === "") {
    return null;
  }
  const numeric = /^(\d+(?:\.\d*)?|\.\d+)\s*(?:s|secs?|seconds?)?$/i.exec(text);
  if (numeric) {
    const seconds = Number.parseFloat(numeric[1]);
    return seconds > 0 ? Math.ceil(seconds * 1e3) : null;
  }
  const date = Date.parse(text);
  if (!Number.isNaN(date)) {
    const delay = date - Date.now();
    return delay > 0 ? delay : null;
  }
  return null;
}
/**
 * Work out how long a failed request asks us to wait before retrying.
 *
 * Lookup order:
 *  1. `retry-after` header on `error.headers` or `error.response.headers`
 *     (via `.get()` when available, otherwise a case-insensitive key lookup);
 *  2. "retry in/after Ns", "retry-after: N" or "wait Ns" in the error message;
 *  3. a fixed 60s fallback when the message mentions "resource_exhausted" or
 *     "quota exceeded".
 *
 * @param error - The thrown error; any shape (including null) is tolerated
 * @returns Delay in ms, or null when no hint is found
 */
function extractRetryAfterMs(error) {
  const errorWithHeaders = error ?? {};
  const headers = errorWithHeaders.headers || errorWithHeaders.response?.headers;
  if (headers) {
    let retryAfter;
    if (typeof headers.get === "function") {
      retryAfter = headers.get("retry-after");
    } else {
      retryAfter = headers["retry-after"];
      if (retryAfter === void 0 || retryAfter === null) {
        // Plain-object headers may preserve the server's casing ("Retry-After").
        const key = Object.keys(headers).find((name) => name.toLowerCase() === "retry-after");
        retryAfter = key === void 0 ? void 0 : headers[key];
      }
    }
    if (retryAfter) {
      const parsed = parseRetryAfterHeader(retryAfter);
      if (parsed !== null) {
        return parsed;
      }
    }
  }
  // Non-Error throwables may have no message; treat that as "no hint" instead of crashing.
  const message = typeof errorWithHeaders.message === "string" ? errorWithHeaders.message : "";
  const patterns = [
    /retry\s+(?:in|after)\s+(\d+(?:\.\d+)?)\s*s(?:econds?)?/i,
    /retry-after:\s*(\d+(?:\.\d+)?)/i,
    /wait\s+(\d+(?:\.\d+)?)\s*s(?:econds?)?/i
  ];
  for (const pattern of patterns) {
    const match = message.match(pattern);
    if (match) {
      const seconds = Number.parseFloat(match[1]);
      if (!Number.isNaN(seconds) && seconds > 0) {
        return Math.ceil(seconds * 1e3);
      }
    }
  }
  const lowered = message.toLowerCase();
  if (lowered.includes("resource_exhausted") || lowered.includes("quota exceeded")) {
    return 6e4;
  }
  return null;
}
1542
1829
  var DEFAULT_RETRY_CONFIG;
1543
1830
  var init_retry = __esm({
1544
1831
  "src/core/retry.ts"() {
@@ -1549,7 +1836,10 @@ var init_retry = __esm({
1549
1836
  minTimeout: 1e3,
1550
1837
  maxTimeout: 3e4,
1551
1838
  factor: 2,
1552
- randomize: true
1839
+ randomize: true,
1840
+ respectRetryAfter: true,
1841
+ maxRetryAfterMs: 12e4
1842
+ // 2 minutes cap
1553
1843
  };
1554
1844
  }
1555
1845
  });
@@ -3905,7 +4195,7 @@ function createProviderFromEnv(envVarName, ClientClass, ProviderClass, clientOpt
3905
4195
  if (!isNonEmpty(apiKey)) {
3906
4196
  return null;
3907
4197
  }
3908
- const client = new ClientClass({ apiKey: apiKey.trim(), ...clientOptions });
4198
+ const client = new ClientClass({ apiKey: apiKey.trim(), maxRetries: 0, ...clientOptions });
3909
4199
  return new ProviderClass(client);
3910
4200
  }
3911
4201
  var init_utils = __esm({
@@ -5865,8 +6155,8 @@ function createHuggingFaceProviderFromEnv() {
5865
6155
  baseURL,
5866
6156
  timeout: 6e4,
5867
6157
  // 60s timeout - HF free tier can be slower than OpenAI
5868
- maxRetries: 3
5869
- // Retry failed requests
6158
+ maxRetries: 0
6159
+ // Disable SDK retries - llmist handles all retries at application level
5870
6160
  });
5871
6161
  return new HuggingFaceProvider(client, endpointType);
5872
6162
  }
@@ -8080,6 +8370,7 @@ var init_builder = __esm({
8080
8370
  gadgetOutputLimitPercent;
8081
8371
  compactionConfig;
8082
8372
  retryConfig;
8373
+ rateLimitConfig;
8083
8374
  signal;
8084
8375
  trailingMessage;
8085
8376
  subagentConfig;
@@ -8587,6 +8878,42 @@ var init_builder = __esm({
8587
8878
  this.retryConfig = { enabled: false };
8588
8879
  return this;
8589
8880
  }
8881
+ /**
8882
+ * Configure proactive rate limiting to prevent rate limit errors.
8883
+ *
8884
+ * Set limits based on your API tier to automatically throttle requests
8885
+ * before hitting provider limits. Works in conjunction with reactive
8886
+ * retry/backoff for comprehensive rate limit handling.
8887
+ *
8888
+ * @param config - Rate limit configuration
8889
+ * @returns This builder for chaining
8890
+ *
8891
+ * @example
8892
+ * ```typescript
8893
+ * // Gemini free tier limits
8894
+ * .withRateLimits({
8895
+ * requestsPerMinute: 15,
8896
+ * tokensPerMinute: 1_000_000,
8897
+ * safetyMargin: 0.8, // Start throttling at 80%
8898
+ * })
8899
+ *
8900
+ * // OpenAI Tier 1 limits
8901
+ * .withRateLimits({
8902
+ * requestsPerMinute: 500,
8903
+ * tokensPerMinute: 200_000,
8904
+ * })
8905
+ *
8906
+ * // With daily limit (Gemini free tier)
8907
+ * .withRateLimits({
8908
+ * requestsPerMinute: 15,
8909
+ * tokensPerDay: 1_500_000,
8910
+ * })
8911
+ * ```
8912
+ */
8913
+ withRateLimits(config) {
8914
+ this.rateLimitConfig = config;
8915
+ return this;
8916
+ }
8590
8917
  /**
8591
8918
  * Set an abort signal for cancelling requests mid-flight.
8592
8919
  *
@@ -8885,6 +9212,7 @@ ${endPrefix}`
8885
9212
  gadgetOutputLimitPercent: this.gadgetOutputLimitPercent,
8886
9213
  compactionConfig: this.compactionConfig,
8887
9214
  retryConfig: this.retryConfig,
9215
+ rateLimitConfig: this.rateLimitConfig,
8888
9216
  signal: this.signal,
8889
9217
  subagentConfig: this.subagentConfig,
8890
9218
  // Tree context for shared tree model (subagents share parent's tree)
@@ -9070,6 +9398,7 @@ ${endPrefix}`
9070
9398
  gadgetOutputLimitPercent: this.gadgetOutputLimitPercent,
9071
9399
  compactionConfig: this.compactionConfig,
9072
9400
  retryConfig: this.retryConfig,
9401
+ rateLimitConfig: this.rateLimitConfig,
9073
9402
  signal: this.signal,
9074
9403
  subagentConfig: this.subagentConfig,
9075
9404
  // Tree context for shared tree model (subagents share parent's tree)
@@ -11576,6 +11905,7 @@ var init_agent = __esm({
11576
11905
  init_execution_tree();
11577
11906
  init_messages();
11578
11907
  init_model_shortcuts();
11908
+ init_rate_limit();
11579
11909
  init_retry();
11580
11910
  init_media_store();
11581
11911
  init_output_viewer();
@@ -11618,6 +11948,8 @@ var init_agent = __esm({
11618
11948
  signal;
11619
11949
  // Retry configuration
11620
11950
  retryConfig;
11951
+ // Rate limit tracker for proactive throttling
11952
+ rateLimitTracker;
11621
11953
  // Subagent configuration
11622
11954
  agentContextConfig;
11623
11955
  subagentConfig;
@@ -11705,6 +12037,10 @@ var init_agent = __esm({
11705
12037
  }
11706
12038
  this.signal = options.signal;
11707
12039
  this.retryConfig = resolveRetryConfig(options.retryConfig);
12040
+ const rateLimitConfig = resolveRateLimitConfig(options.rateLimitConfig);
12041
+ if (rateLimitConfig.enabled) {
12042
+ this.rateLimitTracker = new RateLimitTracker(options.rateLimitConfig);
12043
+ }
11708
12044
  this.agentContextConfig = {
11709
12045
  model: this.model,
11710
12046
  temperature: this.temperature
@@ -11919,6 +12255,8 @@ var init_agent = __esm({
11919
12255
  model: this.model,
11920
12256
  maxIterations: this.maxIterations
11921
12257
  });
12258
+ let currentLLMNodeId;
12259
+ let llmOptions;
11922
12260
  try {
11923
12261
  while (currentIteration < this.maxIterations) {
11924
12262
  if (await this.checkAbortAndNotify(currentIteration)) {
@@ -11933,8 +12271,6 @@ var init_agent = __esm({
11933
12271
  });
11934
12272
  }
11935
12273
  this.logger.debug("Starting iteration", { iteration: currentIteration });
11936
- let currentLLMNodeId;
11937
- let llmOptions;
11938
12274
  try {
11939
12275
  const compactionEvent = await this.checkAndPerformCompaction(currentIteration);
11940
12276
  if (compactionEvent) {
@@ -12086,14 +12422,54 @@ var init_agent = __esm({
12086
12422
  reason: currentIteration >= this.maxIterations ? "max_iterations" : "natural_completion"
12087
12423
  });
12088
12424
  } finally {
12425
+ if (currentLLMNodeId) {
12426
+ const node = this.tree.getNode(currentLLMNodeId);
12427
+ if (node && node.type === "llm_call" && !node.completedAt) {
12428
+ await this.safeObserve(async () => {
12429
+ if (this.hooks.observers?.onLLMCallComplete) {
12430
+ const subagentContext = getSubagentContextForNode(this.tree, currentLLMNodeId);
12431
+ const context = {
12432
+ iteration: currentIteration,
12433
+ options: llmOptions ?? {
12434
+ model: this.model,
12435
+ messages: this.conversation.getMessages(),
12436
+ temperature: this.temperature,
12437
+ maxTokens: this.defaultMaxTokens
12438
+ },
12439
+ finishReason: "interrupted",
12440
+ usage: void 0,
12441
+ rawResponse: "",
12442
+ // No response available for interrupted request
12443
+ finalMessage: "",
12444
+ // No final message for interrupted request
12445
+ logger: this.logger,
12446
+ subagentContext
12447
+ };
12448
+ await this.hooks.observers.onLLMCallComplete(context);
12449
+ }
12450
+ });
12451
+ this.tree.completeLLMCall(currentLLMNodeId, {
12452
+ finishReason: "interrupted"
12453
+ });
12454
+ }
12455
+ }
12089
12456
  unsubscribeBridge();
12090
12457
  }
12091
12458
  }
12092
12459
  /**
12093
- * Create LLM stream with retry logic.
12094
- * Wraps the stream creation with exponential backoff for transient failures.
12460
+ * Create LLM stream with two-layer rate limit protection:
12461
+ *
12462
+ * Layer 1 (Proactive): If rate limits are configured, delays requests to stay within limits.
12463
+ * Layer 2 (Reactive): Exponential backoff with Retry-After header support for transient failures.
12095
12464
  */
12096
12465
  async createStreamWithRetry(llmOptions, iteration) {
12466
+ if (this.rateLimitTracker) {
12467
+ const throttleDelay = this.rateLimitTracker.getRequiredDelayMs();
12468
+ if (throttleDelay > 0) {
12469
+ this.logger.debug("Rate limit throttling", { delayMs: throttleDelay });
12470
+ await this.sleep(throttleDelay);
12471
+ }
12472
+ }
12097
12473
  if (!this.retryConfig.enabled) {
12098
12474
  return this.client.stream(llmOptions);
12099
12475
  }
@@ -12105,11 +12481,23 @@ var init_agent = __esm({
12105
12481
  randomize,
12106
12482
  onRetry,
12107
12483
  onRetriesExhausted,
12108
- shouldRetry
12484
+ shouldRetry,
12485
+ respectRetryAfter,
12486
+ maxRetryAfterMs
12109
12487
  } = this.retryConfig;
12488
+ let retryAfterHintMs = null;
12110
12489
  try {
12111
12490
  return await (0, import_p_retry.default)(
12112
12491
  async (attemptNumber) => {
12492
+ if (retryAfterHintMs !== null && respectRetryAfter) {
12493
+ const cappedDelay = Math.min(retryAfterHintMs, maxRetryAfterMs);
12494
+ this.logger.debug("Using Retry-After delay", {
12495
+ retryAfterMs: retryAfterHintMs,
12496
+ cappedDelay
12497
+ });
12498
+ await this.sleep(cappedDelay);
12499
+ retryAfterHintMs = null;
12500
+ }
12113
12501
  this.logger.debug("Creating LLM stream", {
12114
12502
  attempt: attemptNumber,
12115
12503
  maxAttempts: retries + 1
@@ -12125,9 +12513,21 @@ var init_agent = __esm({
12125
12513
  signal: this.signal,
12126
12514
  onFailedAttempt: (context) => {
12127
12515
  const { error, attemptNumber, retriesLeft } = context;
12516
+ if (respectRetryAfter) {
12517
+ retryAfterHintMs = extractRetryAfterMs(error);
12518
+ if (retryAfterHintMs !== null) {
12519
+ this.logger.debug("Retry-After header detected", {
12520
+ delayMs: retryAfterHintMs
12521
+ });
12522
+ }
12523
+ }
12128
12524
  this.logger.warn(
12129
12525
  `LLM call failed (attempt ${attemptNumber}/${attemptNumber + retriesLeft}), retrying...`,
12130
- { error: error.message, retriesLeft }
12526
+ {
12527
+ error: error.message,
12528
+ retriesLeft,
12529
+ retryAfterMs: retryAfterHintMs
12530
+ }
12131
12531
  );
12132
12532
  onRetry?.(error, attemptNumber);
12133
12533
  },
@@ -12148,6 +12548,12 @@ var init_agent = __esm({
12148
12548
  throw error;
12149
12549
  }
12150
12550
  }
12551
+ /**
12552
+ * Simple sleep utility for rate limit delays.
12553
+ */
12554
+ sleep(ms) {
12555
+ return new Promise((resolve) => setTimeout(resolve, ms));
12556
+ }
12151
12557
  /**
12152
12558
  * Handle LLM error through controller.
12153
12559
  */
@@ -12386,12 +12792,18 @@ var init_agent = __esm({
12386
12792
  }
12387
12793
  /**
12388
12794
  * Calculate cost and complete LLM call in execution tree.
12795
+ * Also records usage to rate limit tracker for proactive throttling.
12389
12796
  */
12390
12797
  completeLLMCallInTree(nodeId, result) {
12798
+ const inputTokens = result.usage?.inputTokens ?? 0;
12799
+ const outputTokens = result.usage?.outputTokens ?? 0;
12800
+ if (this.rateLimitTracker) {
12801
+ this.rateLimitTracker.recordUsage(inputTokens, outputTokens);
12802
+ }
12391
12803
  const llmCost = this.client.modelRegistry?.estimateCost?.(
12392
12804
  this.model,
12393
- result.usage?.inputTokens ?? 0,
12394
- result.usage?.outputTokens ?? 0,
12805
+ inputTokens,
12806
+ outputTokens,
12395
12807
  result.usage?.cachedInputTokens ?? 0,
12396
12808
  result.usage?.cacheCreationInputTokens ?? 0
12397
12809
  )?.totalCost;
@@ -12520,6 +12932,7 @@ __export(index_exports, {
12520
12932
  DEFAULT_COMPACTION_CONFIG: () => DEFAULT_COMPACTION_CONFIG,
12521
12933
  DEFAULT_HINTS: () => DEFAULT_HINTS,
12522
12934
  DEFAULT_PROMPTS: () => DEFAULT_PROMPTS,
12935
+ DEFAULT_RATE_LIMIT_CONFIG: () => DEFAULT_RATE_LIMIT_CONFIG,
12523
12936
  DEFAULT_RETRY_CONFIG: () => DEFAULT_RETRY_CONFIG,
12524
12937
  DEFAULT_SUMMARIZATION_PROMPT: () => DEFAULT_SUMMARIZATION_PROMPT,
12525
12938
  ExecutionTree: () => ExecutionTree,
@@ -12544,6 +12957,7 @@ __export(index_exports, {
12544
12957
  ModelIdentifierParser: () => ModelIdentifierParser,
12545
12958
  ModelRegistry: () => ModelRegistry,
12546
12959
  OpenAIChatProvider: () => OpenAIChatProvider,
12960
+ RateLimitTracker: () => RateLimitTracker,
12547
12961
  SimpleSessionManager: () => SimpleSessionManager,
12548
12962
  SlidingWindowStrategy: () => SlidingWindowStrategy,
12549
12963
  StreamProcessor: () => StreamProcessor,
@@ -12570,6 +12984,7 @@ __export(index_exports, {
12570
12984
  detectImageMimeType: () => detectImageMimeType,
12571
12985
  discoverProviderAdapters: () => discoverProviderAdapters,
12572
12986
  extractMessageText: () => extractMessageText,
12987
+ extractRetryAfterMs: () => extractRetryAfterMs,
12573
12988
  filterByDepth: () => filterByDepth,
12574
12989
  filterByParent: () => filterByParent,
12575
12990
  filterRootEvents: () => filterRootEvents,
@@ -12612,11 +13027,13 @@ __export(index_exports, {
12612
13027
  parallelGadgetHint: () => parallelGadgetHint,
12613
13028
  parseDataUrl: () => parseDataUrl,
12614
13029
  parseManifest: () => parseManifest,
13030
+ parseRetryAfterHeader: () => parseRetryAfterHeader,
12615
13031
  randomDelay: () => randomDelay,
12616
13032
  resolveConfig: () => resolveConfig,
12617
13033
  resolveHintTemplate: () => resolveHintTemplate,
12618
13034
  resolveModel: () => resolveModel,
12619
13035
  resolvePromptTemplate: () => resolvePromptTemplate,
13036
+ resolveRateLimitConfig: () => resolveRateLimitConfig,
12620
13037
  resolveRetryConfig: () => resolveRetryConfig,
12621
13038
  resolveRulesTemplate: () => resolveRulesTemplate,
12622
13039
  resolveSubagentModel: () => resolveSubagentModel,
@@ -13592,6 +14009,7 @@ init_model_shortcuts();
13592
14009
  init_options();
13593
14010
  init_prompt_config();
13594
14011
  init_quick_methods();
14012
+ init_rate_limit();
13595
14013
  init_retry();
13596
14014
  init_create_gadget();
13597
14015
  init_exceptions();
@@ -14159,6 +14577,7 @@ function getHostExports2(ctx) {
14159
14577
  DEFAULT_COMPACTION_CONFIG,
14160
14578
  DEFAULT_HINTS,
14161
14579
  DEFAULT_PROMPTS,
14580
+ DEFAULT_RATE_LIMIT_CONFIG,
14162
14581
  DEFAULT_RETRY_CONFIG,
14163
14582
  DEFAULT_SUMMARIZATION_PROMPT,
14164
14583
  ExecutionTree,
@@ -14183,6 +14602,7 @@ function getHostExports2(ctx) {
14183
14602
  ModelIdentifierParser,
14184
14603
  ModelRegistry,
14185
14604
  OpenAIChatProvider,
14605
+ RateLimitTracker,
14186
14606
  SimpleSessionManager,
14187
14607
  SlidingWindowStrategy,
14188
14608
  StreamProcessor,
@@ -14209,6 +14629,7 @@ function getHostExports2(ctx) {
14209
14629
  detectImageMimeType,
14210
14630
  discoverProviderAdapters,
14211
14631
  extractMessageText,
14632
+ extractRetryAfterMs,
14212
14633
  filterByDepth,
14213
14634
  filterByParent,
14214
14635
  filterRootEvents,
@@ -14251,11 +14672,13 @@ function getHostExports2(ctx) {
14251
14672
  parallelGadgetHint,
14252
14673
  parseDataUrl,
14253
14674
  parseManifest,
14675
+ parseRetryAfterHeader,
14254
14676
  randomDelay,
14255
14677
  resolveConfig,
14256
14678
  resolveHintTemplate,
14257
14679
  resolveModel,
14258
14680
  resolvePromptTemplate,
14681
+ resolveRateLimitConfig,
14259
14682
  resolveRetryConfig,
14260
14683
  resolveRulesTemplate,
14261
14684
  resolveSubagentModel,