@probelabs/probe 0.6.0-rc293 → 0.6.0-rc294

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1341,9 +1341,16 @@ export class ProbeAgent {
1341
1341
  // Use fallback manager with retry for each provider
1342
1342
  return await this.fallbackManager.executeWithFallback(
1343
1343
  async (provider, model, config) => {
1344
+ // Wrap fallback model with per-call concurrency limiter if configured.
1345
+ // The original options.model was wrapped in streamTextWithRetryAndFallback,
1346
+ // but fallback replaces it with a new model that needs wrapping too.
1347
+ let fallbackModel = provider(model);
1348
+ if (this.concurrencyLimiter) {
1349
+ fallbackModel = ProbeAgent._wrapModelWithLimiter(fallbackModel, this.concurrencyLimiter, this.debug);
1350
+ }
1344
1351
  const fallbackOptions = {
1345
1352
  ...options,
1346
- model: provider(model),
1353
+ model: fallbackModel,
1347
1354
  abortSignal: controller.signal
1348
1355
  };
1349
1356
 
@@ -1377,6 +1384,143 @@ export class ProbeAgent {
1377
1384
  );
1378
1385
  }
1379
1386
 
1387
+ /**
1388
+ * Wrap a LanguageModelV1 model so each doStream/doGenerate call acquires and
1389
+ * releases a concurrency limiter slot. This gates individual LLM API calls
1390
+ * (seconds each) instead of entire multi-step agent sessions (minutes).
1391
+ *
1392
+ * @param {Object} model - LanguageModelV1 model instance
1393
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
1394
+ * @param {boolean} debug - Enable debug logging
1395
+ * @returns {Object} Wrapped model with per-call concurrency gating
1396
+ * @private
1397
+ */
1398
+ static _wrapModelWithLimiter(model, limiter, debug) {
1399
+ return new Proxy(model, {
1400
+ get(target, prop) {
1401
+ if (prop === 'doStream') {
1402
+ return async function (...args) {
1403
+ await limiter.acquire(null);
1404
+ if (debug) {
1405
+ const stats = limiter.getStats();
1406
+ console.log(`[DEBUG] Acquired AI slot for LLM call (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1407
+ }
1408
+ try {
1409
+ const result = await target.doStream(...args);
1410
+
1411
+ // Wrap the ReadableStream to release the slot when it completes,
1412
+ // errors, or is cancelled — covering all stream termination paths.
1413
+ // Guard against double-release: if cancel() races with an in-flight
1414
+ // pull() that is awaiting originalReader.read(), both paths could
1415
+ // try to release. The flag ensures exactly one release.
1416
+ const originalStream = result.stream;
1417
+ const originalReader = originalStream.getReader();
1418
+ let released = false;
1419
+ const releaseOnce = () => {
1420
+ if (released) return;
1421
+ released = true;
1422
+ limiter.release(null);
1423
+ };
1424
+ const wrappedStream = new ReadableStream({
1425
+ async pull(controller) {
1426
+ try {
1427
+ const { done, value } = await originalReader.read();
1428
+ if (done) {
1429
+ controller.close();
1430
+ releaseOnce();
1431
+ if (debug) {
1432
+ const stats = limiter.getStats();
1433
+ console.log(`[DEBUG] Released AI slot after LLM stream complete (${stats.globalActive}/${stats.maxConcurrent})`);
1434
+ }
1435
+ } else {
1436
+ controller.enqueue(value);
1437
+ }
1438
+ } catch (err) {
1439
+ releaseOnce();
1440
+ if (debug) {
1441
+ console.log(`[DEBUG] Released AI slot on LLM stream error`);
1442
+ }
1443
+ controller.error(err);
1444
+ }
1445
+ },
1446
+ cancel() {
1447
+ releaseOnce();
1448
+ if (debug) {
1449
+ console.log(`[DEBUG] Released AI slot on LLM stream cancel`);
1450
+ }
1451
+ originalReader.cancel();
1452
+ }
1453
+ });
1454
+
1455
+ return { ...result, stream: wrappedStream };
1456
+ } catch (err) {
1457
+ limiter.release(null);
1458
+ if (debug) {
1459
+ console.log(`[DEBUG] Released AI slot on doStream error`);
1460
+ }
1461
+ throw err;
1462
+ }
1463
+ };
1464
+ }
1465
+
1466
+ if (prop === 'doGenerate') {
1467
+ return async function (...args) {
1468
+ await limiter.acquire(null);
1469
+ if (debug) {
1470
+ const stats = limiter.getStats();
1471
+ console.log(`[DEBUG] Acquired AI slot for LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
1472
+ }
1473
+ try {
1474
+ const result = await target.doGenerate(...args);
1475
+ return result;
1476
+ } finally {
1477
+ limiter.release(null);
1478
+ if (debug) {
1479
+ const stats = limiter.getStats();
1480
+ console.log(`[DEBUG] Released AI slot after LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
1481
+ }
1482
+ }
1483
+ };
1484
+ }
1485
+
1486
+ const value = target[prop];
1487
+ return typeof value === 'function' ? value.bind(target) : value;
1488
+ }
1489
+ });
1490
+ }
1491
+
1492
+ /**
1493
+ * Wrap an engine stream result so its textStream async generator acquires
1494
+ * and releases a concurrency limiter slot. Acquire happens when iteration
1495
+ * begins; release happens in finally (completion, error, or break).
1496
+ *
1497
+ * @param {Object} result - Engine result with { textStream, usage, ... }
1498
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
1499
+ * @param {boolean} debug - Enable debug logging
1500
+ * @returns {Object} Result with wrapped textStream
1501
+ * @private
1502
+ */
1503
+ static _wrapEngineStreamWithLimiter(result, limiter, debug) {
1504
+ const originalStream = result.textStream;
1505
+ async function* gatedStream() {
1506
+ await limiter.acquire(null);
1507
+ if (debug) {
1508
+ const stats = limiter.getStats();
1509
+ console.log(`[DEBUG] Acquired AI slot for engine stream (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1510
+ }
1511
+ try {
1512
+ yield* originalStream;
1513
+ } finally {
1514
+ limiter.release(null);
1515
+ if (debug) {
1516
+ const stats = limiter.getStats();
1517
+ console.log(`[DEBUG] Released AI slot after engine stream (${stats.globalActive}/${stats.maxConcurrent})`);
1518
+ }
1519
+ }
1520
+ }
1521
+ return { ...result, textStream: gatedStream() };
1522
+ }
1523
+
1380
1524
  /**
1381
1525
  * Execute streamText with retry and fallback support
1382
1526
  * @param {Object} options - streamText options
@@ -1384,14 +1528,12 @@ export class ProbeAgent {
1384
1528
  * @private
1385
1529
  */
1386
1530
  async streamTextWithRetryAndFallback(options) {
1387
- // Acquire global concurrency slot if limiter is configured
1531
+ // Wrap the model with per-call concurrency gating if limiter is configured.
1532
+ // This acquires/releases the slot around each individual LLM API call (doStream/doGenerate)
1533
+ // instead of holding it for the entire multi-step agent session.
1388
1534
  const limiter = this.concurrencyLimiter;
1389
- if (limiter) {
1390
- await limiter.acquire(null);
1391
- if (this.debug) {
1392
- const stats = limiter.getStats();
1393
- console.log(`[DEBUG] Acquired global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1394
- }
1535
+ if (limiter && options.model) {
1536
+ options = { ...options, model: ProbeAgent._wrapModelWithLimiter(options.model, limiter, this.debug) };
1395
1537
  }
1396
1538
 
1397
1539
  // Create AbortController for overall operation timeout
@@ -1430,6 +1572,12 @@ export class ProbeAgent {
1430
1572
  if (useClaudeCode || useCodex) {
1431
1573
  try {
1432
1574
  result = await this._tryEngineStreamPath(options, controller, timeoutState);
1575
+ // Gate engine stream with concurrency limiter if configured.
1576
+ // Engine paths bypass the Vercel model wrapper, so we wrap the
1577
+ // textStream async generator with acquire/release instead.
1578
+ if (result && limiter) {
1579
+ result = ProbeAgent._wrapEngineStreamWithLimiter(result, limiter, this.debug);
1580
+ }
1433
1581
  } catch (error) {
1434
1582
  if (this.debug) {
1435
1583
  const engineType = useClaudeCode ? 'Claude Code' : 'Codex';
@@ -1444,47 +1592,7 @@ export class ProbeAgent {
1444
1592
  result = await this._executeWithVercelProvider(options, controller);
1445
1593
  }
1446
1594
 
1447
- // Wrap textStream so limiter slot is held until stream completes.
1448
- // result.textStream is a read-only getter on DefaultStreamTextResult,
1449
- // so we wrap the result in a Proxy that intercepts the textStream property.
1450
- if (limiter && result.textStream) {
1451
- const originalStream = result.textStream;
1452
- const debug = this.debug;
1453
- const wrappedStream = (async function* () {
1454
- try {
1455
- for await (const chunk of originalStream) {
1456
- yield chunk;
1457
- }
1458
- } finally {
1459
- limiter.release(null);
1460
- if (debug) {
1461
- const stats = limiter.getStats();
1462
- console.log(`[DEBUG] Released global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1463
- }
1464
- }
1465
- })();
1466
- return new Proxy(result, {
1467
- get(target, prop) {
1468
- if (prop === 'textStream') return wrappedStream;
1469
- const value = target[prop];
1470
- return typeof value === 'function' ? value.bind(target) : value;
1471
- }
1472
- });
1473
- } else if (limiter) {
1474
- // No textStream (shouldn't happen, but release just in case)
1475
- limiter.release(null);
1476
- }
1477
-
1478
1595
  return result;
1479
- } catch (error) {
1480
- // Release on error if limiter was acquired
1481
- if (limiter) {
1482
- limiter.release(null);
1483
- if (this.debug) {
1484
- console.log(`[DEBUG] Released global AI concurrency slot on error`);
1485
- }
1486
- }
1487
- throw error;
1488
1596
  } finally {
1489
1597
  // Clean up timeout (for non-engine paths; engine paths clean up in the generator)
1490
1598
  if (timeoutState.timeoutId) {
@@ -100620,9 +100620,13 @@ var init_ProbeAgent = __esm({
100620
100620
  }
100621
100621
  return await this.fallbackManager.executeWithFallback(
100622
100622
  async (provider, model, config2) => {
100623
+ let fallbackModel = provider(model);
100624
+ if (this.concurrencyLimiter) {
100625
+ fallbackModel = _ProbeAgent._wrapModelWithLimiter(fallbackModel, this.concurrencyLimiter, this.debug);
100626
+ }
100623
100627
  const fallbackOptions = {
100624
100628
  ...options,
100625
- model: provider(model),
100629
+ model: fallbackModel,
100626
100630
  abortSignal: controller.signal
100627
100631
  };
100628
100632
  if (config2.provider !== "google" && fallbackOptions.tools) {
@@ -100650,6 +100654,132 @@ var init_ProbeAgent = __esm({
100650
100654
  }
100651
100655
  );
100652
100656
  }
100657
+ /**
100658
+ * Wrap a LanguageModelV1 model so each doStream/doGenerate call acquires and
100659
+ * releases a concurrency limiter slot. This gates individual LLM API calls
100660
+ * (seconds each) instead of entire multi-step agent sessions (minutes).
100661
+ *
100662
+ * @param {Object} model - LanguageModelV1 model instance
100663
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
100664
+ * @param {boolean} debug - Enable debug logging
100665
+ * @returns {Object} Wrapped model with per-call concurrency gating
100666
+ * @private
100667
+ */
100668
+ static _wrapModelWithLimiter(model, limiter, debug) {
100669
+ return new Proxy(model, {
100670
+ get(target, prop) {
100671
+ if (prop === "doStream") {
100672
+ return async function(...args) {
100673
+ await limiter.acquire(null);
100674
+ if (debug) {
100675
+ const stats = limiter.getStats();
100676
+ console.log(`[DEBUG] Acquired AI slot for LLM call (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
100677
+ }
100678
+ try {
100679
+ const result = await target.doStream(...args);
100680
+ const originalStream = result.stream;
100681
+ const originalReader = originalStream.getReader();
100682
+ let released = false;
100683
+ const releaseOnce = () => {
100684
+ if (released) return;
100685
+ released = true;
100686
+ limiter.release(null);
100687
+ };
100688
+ const wrappedStream = new ReadableStream({
100689
+ async pull(controller) {
100690
+ try {
100691
+ const { done, value: value2 } = await originalReader.read();
100692
+ if (done) {
100693
+ controller.close();
100694
+ releaseOnce();
100695
+ if (debug) {
100696
+ const stats = limiter.getStats();
100697
+ console.log(`[DEBUG] Released AI slot after LLM stream complete (${stats.globalActive}/${stats.maxConcurrent})`);
100698
+ }
100699
+ } else {
100700
+ controller.enqueue(value2);
100701
+ }
100702
+ } catch (err) {
100703
+ releaseOnce();
100704
+ if (debug) {
100705
+ console.log(`[DEBUG] Released AI slot on LLM stream error`);
100706
+ }
100707
+ controller.error(err);
100708
+ }
100709
+ },
100710
+ cancel() {
100711
+ releaseOnce();
100712
+ if (debug) {
100713
+ console.log(`[DEBUG] Released AI slot on LLM stream cancel`);
100714
+ }
100715
+ originalReader.cancel();
100716
+ }
100717
+ });
100718
+ return { ...result, stream: wrappedStream };
100719
+ } catch (err) {
100720
+ limiter.release(null);
100721
+ if (debug) {
100722
+ console.log(`[DEBUG] Released AI slot on doStream error`);
100723
+ }
100724
+ throw err;
100725
+ }
100726
+ };
100727
+ }
100728
+ if (prop === "doGenerate") {
100729
+ return async function(...args) {
100730
+ await limiter.acquire(null);
100731
+ if (debug) {
100732
+ const stats = limiter.getStats();
100733
+ console.log(`[DEBUG] Acquired AI slot for LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
100734
+ }
100735
+ try {
100736
+ const result = await target.doGenerate(...args);
100737
+ return result;
100738
+ } finally {
100739
+ limiter.release(null);
100740
+ if (debug) {
100741
+ const stats = limiter.getStats();
100742
+ console.log(`[DEBUG] Released AI slot after LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
100743
+ }
100744
+ }
100745
+ };
100746
+ }
100747
+ const value = target[prop];
100748
+ return typeof value === "function" ? value.bind(target) : value;
100749
+ }
100750
+ });
100751
+ }
100752
+ /**
100753
+ * Wrap an engine stream result so its textStream async generator acquires
100754
+ * and releases a concurrency limiter slot. Acquire happens when iteration
100755
+ * begins; release happens in finally (completion, error, or break).
100756
+ *
100757
+ * @param {Object} result - Engine result with { textStream, usage, ... }
100758
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
100759
+ * @param {boolean} debug - Enable debug logging
100760
+ * @returns {Object} Result with wrapped textStream
100761
+ * @private
100762
+ */
100763
+ static _wrapEngineStreamWithLimiter(result, limiter, debug) {
100764
+ const originalStream = result.textStream;
100765
+ async function* gatedStream() {
100766
+ await limiter.acquire(null);
100767
+ if (debug) {
100768
+ const stats = limiter.getStats();
100769
+ console.log(`[DEBUG] Acquired AI slot for engine stream (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
100770
+ }
100771
+ try {
100772
+ yield* originalStream;
100773
+ } finally {
100774
+ limiter.release(null);
100775
+ if (debug) {
100776
+ const stats = limiter.getStats();
100777
+ console.log(`[DEBUG] Released AI slot after engine stream (${stats.globalActive}/${stats.maxConcurrent})`);
100778
+ }
100779
+ }
100780
+ }
100781
+ return { ...result, textStream: gatedStream() };
100782
+ }
100653
100783
  /**
100654
100784
  * Execute streamText with retry and fallback support
100655
100785
  * @param {Object} options - streamText options
@@ -100658,12 +100788,8 @@ var init_ProbeAgent = __esm({
100658
100788
  */
100659
100789
  async streamTextWithRetryAndFallback(options) {
100660
100790
  const limiter = this.concurrencyLimiter;
100661
- if (limiter) {
100662
- await limiter.acquire(null);
100663
- if (this.debug) {
100664
- const stats = limiter.getStats();
100665
- console.log(`[DEBUG] Acquired global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
100666
- }
100791
+ if (limiter && options.model) {
100792
+ options = { ...options, model: _ProbeAgent._wrapModelWithLimiter(options.model, limiter, this.debug) };
100667
100793
  }
100668
100794
  const controller = new AbortController();
100669
100795
  const timeoutState = { timeoutId: null };
@@ -100691,6 +100817,9 @@ var init_ProbeAgent = __esm({
100691
100817
  if (useClaudeCode || useCodex) {
100692
100818
  try {
100693
100819
  result = await this._tryEngineStreamPath(options, controller, timeoutState);
100820
+ if (result && limiter) {
100821
+ result = _ProbeAgent._wrapEngineStreamWithLimiter(result, limiter, this.debug);
100822
+ }
100694
100823
  } catch (error40) {
100695
100824
  if (this.debug) {
100696
100825
  const engineType = useClaudeCode ? "Claude Code" : "Codex";
@@ -100701,41 +100830,7 @@ var init_ProbeAgent = __esm({
100701
100830
  if (!result) {
100702
100831
  result = await this._executeWithVercelProvider(options, controller);
100703
100832
  }
100704
- if (limiter && result.textStream) {
100705
- const originalStream = result.textStream;
100706
- const debug = this.debug;
100707
- const wrappedStream = (async function* () {
100708
- try {
100709
- for await (const chunk of originalStream) {
100710
- yield chunk;
100711
- }
100712
- } finally {
100713
- limiter.release(null);
100714
- if (debug) {
100715
- const stats = limiter.getStats();
100716
- console.log(`[DEBUG] Released global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
100717
- }
100718
- }
100719
- })();
100720
- return new Proxy(result, {
100721
- get(target, prop) {
100722
- if (prop === "textStream") return wrappedStream;
100723
- const value = target[prop];
100724
- return typeof value === "function" ? value.bind(target) : value;
100725
- }
100726
- });
100727
- } else if (limiter) {
100728
- limiter.release(null);
100729
- }
100730
100833
  return result;
100731
- } catch (error40) {
100732
- if (limiter) {
100733
- limiter.release(null);
100734
- if (this.debug) {
100735
- console.log(`[DEBUG] Released global AI concurrency slot on error`);
100736
- }
100737
- }
100738
- throw error40;
100739
100834
  } finally {
100740
100835
  if (timeoutState.timeoutId) {
100741
100836
  clearTimeout(timeoutState.timeoutId);
package/cjs/index.cjs CHANGED
@@ -97531,9 +97531,13 @@ var init_ProbeAgent = __esm({
97531
97531
  }
97532
97532
  return await this.fallbackManager.executeWithFallback(
97533
97533
  async (provider, model, config2) => {
97534
+ let fallbackModel = provider(model);
97535
+ if (this.concurrencyLimiter) {
97536
+ fallbackModel = _ProbeAgent._wrapModelWithLimiter(fallbackModel, this.concurrencyLimiter, this.debug);
97537
+ }
97534
97538
  const fallbackOptions = {
97535
97539
  ...options,
97536
- model: provider(model),
97540
+ model: fallbackModel,
97537
97541
  abortSignal: controller.signal
97538
97542
  };
97539
97543
  if (config2.provider !== "google" && fallbackOptions.tools) {
@@ -97561,6 +97565,132 @@ var init_ProbeAgent = __esm({
97561
97565
  }
97562
97566
  );
97563
97567
  }
97568
+ /**
97569
+ * Wrap a LanguageModelV1 model so each doStream/doGenerate call acquires and
97570
+ * releases a concurrency limiter slot. This gates individual LLM API calls
97571
+ * (seconds each) instead of entire multi-step agent sessions (minutes).
97572
+ *
97573
+ * @param {Object} model - LanguageModelV1 model instance
97574
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
97575
+ * @param {boolean} debug - Enable debug logging
97576
+ * @returns {Object} Wrapped model with per-call concurrency gating
97577
+ * @private
97578
+ */
97579
+ static _wrapModelWithLimiter(model, limiter, debug) {
97580
+ return new Proxy(model, {
97581
+ get(target, prop) {
97582
+ if (prop === "doStream") {
97583
+ return async function(...args) {
97584
+ await limiter.acquire(null);
97585
+ if (debug) {
97586
+ const stats = limiter.getStats();
97587
+ console.log(`[DEBUG] Acquired AI slot for LLM call (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
97588
+ }
97589
+ try {
97590
+ const result = await target.doStream(...args);
97591
+ const originalStream = result.stream;
97592
+ const originalReader = originalStream.getReader();
97593
+ let released = false;
97594
+ const releaseOnce = () => {
97595
+ if (released) return;
97596
+ released = true;
97597
+ limiter.release(null);
97598
+ };
97599
+ const wrappedStream = new ReadableStream({
97600
+ async pull(controller) {
97601
+ try {
97602
+ const { done, value: value2 } = await originalReader.read();
97603
+ if (done) {
97604
+ controller.close();
97605
+ releaseOnce();
97606
+ if (debug) {
97607
+ const stats = limiter.getStats();
97608
+ console.log(`[DEBUG] Released AI slot after LLM stream complete (${stats.globalActive}/${stats.maxConcurrent})`);
97609
+ }
97610
+ } else {
97611
+ controller.enqueue(value2);
97612
+ }
97613
+ } catch (err) {
97614
+ releaseOnce();
97615
+ if (debug) {
97616
+ console.log(`[DEBUG] Released AI slot on LLM stream error`);
97617
+ }
97618
+ controller.error(err);
97619
+ }
97620
+ },
97621
+ cancel() {
97622
+ releaseOnce();
97623
+ if (debug) {
97624
+ console.log(`[DEBUG] Released AI slot on LLM stream cancel`);
97625
+ }
97626
+ originalReader.cancel();
97627
+ }
97628
+ });
97629
+ return { ...result, stream: wrappedStream };
97630
+ } catch (err) {
97631
+ limiter.release(null);
97632
+ if (debug) {
97633
+ console.log(`[DEBUG] Released AI slot on doStream error`);
97634
+ }
97635
+ throw err;
97636
+ }
97637
+ };
97638
+ }
97639
+ if (prop === "doGenerate") {
97640
+ return async function(...args) {
97641
+ await limiter.acquire(null);
97642
+ if (debug) {
97643
+ const stats = limiter.getStats();
97644
+ console.log(`[DEBUG] Acquired AI slot for LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
97645
+ }
97646
+ try {
97647
+ const result = await target.doGenerate(...args);
97648
+ return result;
97649
+ } finally {
97650
+ limiter.release(null);
97651
+ if (debug) {
97652
+ const stats = limiter.getStats();
97653
+ console.log(`[DEBUG] Released AI slot after LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
97654
+ }
97655
+ }
97656
+ };
97657
+ }
97658
+ const value = target[prop];
97659
+ return typeof value === "function" ? value.bind(target) : value;
97660
+ }
97661
+ });
97662
+ }
97663
+ /**
97664
+ * Wrap an engine stream result so its textStream async generator acquires
97665
+ * and releases a concurrency limiter slot. Acquire happens when iteration
97666
+ * begins; release happens in finally (completion, error, or break).
97667
+ *
97668
+ * @param {Object} result - Engine result with { textStream, usage, ... }
97669
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
97670
+ * @param {boolean} debug - Enable debug logging
97671
+ * @returns {Object} Result with wrapped textStream
97672
+ * @private
97673
+ */
97674
+ static _wrapEngineStreamWithLimiter(result, limiter, debug) {
97675
+ const originalStream = result.textStream;
97676
+ async function* gatedStream() {
97677
+ await limiter.acquire(null);
97678
+ if (debug) {
97679
+ const stats = limiter.getStats();
97680
+ console.log(`[DEBUG] Acquired AI slot for engine stream (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
97681
+ }
97682
+ try {
97683
+ yield* originalStream;
97684
+ } finally {
97685
+ limiter.release(null);
97686
+ if (debug) {
97687
+ const stats = limiter.getStats();
97688
+ console.log(`[DEBUG] Released AI slot after engine stream (${stats.globalActive}/${stats.maxConcurrent})`);
97689
+ }
97690
+ }
97691
+ }
97692
+ return { ...result, textStream: gatedStream() };
97693
+ }
97564
97694
  /**
97565
97695
  * Execute streamText with retry and fallback support
97566
97696
  * @param {Object} options - streamText options
@@ -97569,12 +97699,8 @@ var init_ProbeAgent = __esm({
97569
97699
  */
97570
97700
  async streamTextWithRetryAndFallback(options) {
97571
97701
  const limiter = this.concurrencyLimiter;
97572
- if (limiter) {
97573
- await limiter.acquire(null);
97574
- if (this.debug) {
97575
- const stats = limiter.getStats();
97576
- console.log(`[DEBUG] Acquired global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
97577
- }
97702
+ if (limiter && options.model) {
97703
+ options = { ...options, model: _ProbeAgent._wrapModelWithLimiter(options.model, limiter, this.debug) };
97578
97704
  }
97579
97705
  const controller = new AbortController();
97580
97706
  const timeoutState = { timeoutId: null };
@@ -97602,6 +97728,9 @@ var init_ProbeAgent = __esm({
97602
97728
  if (useClaudeCode || useCodex) {
97603
97729
  try {
97604
97730
  result = await this._tryEngineStreamPath(options, controller, timeoutState);
97731
+ if (result && limiter) {
97732
+ result = _ProbeAgent._wrapEngineStreamWithLimiter(result, limiter, this.debug);
97733
+ }
97605
97734
  } catch (error40) {
97606
97735
  if (this.debug) {
97607
97736
  const engineType = useClaudeCode ? "Claude Code" : "Codex";
@@ -97612,41 +97741,7 @@ var init_ProbeAgent = __esm({
97612
97741
  if (!result) {
97613
97742
  result = await this._executeWithVercelProvider(options, controller);
97614
97743
  }
97615
- if (limiter && result.textStream) {
97616
- const originalStream = result.textStream;
97617
- const debug = this.debug;
97618
- const wrappedStream = (async function* () {
97619
- try {
97620
- for await (const chunk of originalStream) {
97621
- yield chunk;
97622
- }
97623
- } finally {
97624
- limiter.release(null);
97625
- if (debug) {
97626
- const stats = limiter.getStats();
97627
- console.log(`[DEBUG] Released global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
97628
- }
97629
- }
97630
- })();
97631
- return new Proxy(result, {
97632
- get(target, prop) {
97633
- if (prop === "textStream") return wrappedStream;
97634
- const value = target[prop];
97635
- return typeof value === "function" ? value.bind(target) : value;
97636
- }
97637
- });
97638
- } else if (limiter) {
97639
- limiter.release(null);
97640
- }
97641
97744
  return result;
97642
- } catch (error40) {
97643
- if (limiter) {
97644
- limiter.release(null);
97645
- if (this.debug) {
97646
- console.log(`[DEBUG] Released global AI concurrency slot on error`);
97647
- }
97648
- }
97649
- throw error40;
97650
97745
  } finally {
97651
97746
  if (timeoutState.timeoutId) {
97652
97747
  clearTimeout(timeoutState.timeoutId);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@probelabs/probe",
3
- "version": "0.6.0-rc293",
3
+ "version": "0.6.0-rc294",
4
4
  "description": "Node.js wrapper for the probe code search tool",
5
5
  "main": "src/index.js",
6
6
  "module": "src/index.js",
@@ -1341,9 +1341,16 @@ export class ProbeAgent {
1341
1341
  // Use fallback manager with retry for each provider
1342
1342
  return await this.fallbackManager.executeWithFallback(
1343
1343
  async (provider, model, config) => {
1344
+ // Wrap fallback model with per-call concurrency limiter if configured.
1345
+ // The original options.model was wrapped in streamTextWithRetryAndFallback,
1346
+ // but fallback replaces it with a new model that needs wrapping too.
1347
+ let fallbackModel = provider(model);
1348
+ if (this.concurrencyLimiter) {
1349
+ fallbackModel = ProbeAgent._wrapModelWithLimiter(fallbackModel, this.concurrencyLimiter, this.debug);
1350
+ }
1344
1351
  const fallbackOptions = {
1345
1352
  ...options,
1346
- model: provider(model),
1353
+ model: fallbackModel,
1347
1354
  abortSignal: controller.signal
1348
1355
  };
1349
1356
 
@@ -1377,6 +1384,143 @@ export class ProbeAgent {
1377
1384
  );
1378
1385
  }
1379
1386
 
1387
+ /**
1388
+ * Wrap a LanguageModelV1 model so each doStream/doGenerate call acquires and
1389
+ * releases a concurrency limiter slot. This gates individual LLM API calls
1390
+ * (seconds each) instead of entire multi-step agent sessions (minutes).
1391
+ *
1392
+ * @param {Object} model - LanguageModelV1 model instance
1393
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
1394
+ * @param {boolean} debug - Enable debug logging
1395
+ * @returns {Object} Wrapped model with per-call concurrency gating
1396
+ * @private
1397
+ */
1398
+ static _wrapModelWithLimiter(model, limiter, debug) {
1399
+ return new Proxy(model, {
1400
+ get(target, prop) {
1401
+ if (prop === 'doStream') {
1402
+ return async function (...args) {
1403
+ await limiter.acquire(null);
1404
+ if (debug) {
1405
+ const stats = limiter.getStats();
1406
+ console.log(`[DEBUG] Acquired AI slot for LLM call (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1407
+ }
1408
+ try {
1409
+ const result = await target.doStream(...args);
1410
+
1411
+ // Wrap the ReadableStream to release the slot when it completes,
1412
+ // errors, or is cancelled — covering all stream termination paths.
1413
+ // Guard against double-release: if cancel() races with an in-flight
1414
+ // pull() that is awaiting originalReader.read(), both paths could
1415
+ // try to release. The flag ensures exactly one release.
1416
+ const originalStream = result.stream;
1417
+ const originalReader = originalStream.getReader();
1418
+ let released = false;
1419
+ const releaseOnce = () => {
1420
+ if (released) return;
1421
+ released = true;
1422
+ limiter.release(null);
1423
+ };
1424
+ const wrappedStream = new ReadableStream({
1425
+ async pull(controller) {
1426
+ try {
1427
+ const { done, value } = await originalReader.read();
1428
+ if (done) {
1429
+ controller.close();
1430
+ releaseOnce();
1431
+ if (debug) {
1432
+ const stats = limiter.getStats();
1433
+ console.log(`[DEBUG] Released AI slot after LLM stream complete (${stats.globalActive}/${stats.maxConcurrent})`);
1434
+ }
1435
+ } else {
1436
+ controller.enqueue(value);
1437
+ }
1438
+ } catch (err) {
1439
+ releaseOnce();
1440
+ if (debug) {
1441
+ console.log(`[DEBUG] Released AI slot on LLM stream error`);
1442
+ }
1443
+ controller.error(err);
1444
+ }
1445
+ },
1446
+ cancel() {
1447
+ releaseOnce();
1448
+ if (debug) {
1449
+ console.log(`[DEBUG] Released AI slot on LLM stream cancel`);
1450
+ }
1451
+ originalReader.cancel();
1452
+ }
1453
+ });
1454
+
1455
+ return { ...result, stream: wrappedStream };
1456
+ } catch (err) {
1457
+ limiter.release(null);
1458
+ if (debug) {
1459
+ console.log(`[DEBUG] Released AI slot on doStream error`);
1460
+ }
1461
+ throw err;
1462
+ }
1463
+ };
1464
+ }
1465
+
1466
+ if (prop === 'doGenerate') {
1467
+ return async function (...args) {
1468
+ await limiter.acquire(null);
1469
+ if (debug) {
1470
+ const stats = limiter.getStats();
1471
+ console.log(`[DEBUG] Acquired AI slot for LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
1472
+ }
1473
+ try {
1474
+ const result = await target.doGenerate(...args);
1475
+ return result;
1476
+ } finally {
1477
+ limiter.release(null);
1478
+ if (debug) {
1479
+ const stats = limiter.getStats();
1480
+ console.log(`[DEBUG] Released AI slot after LLM generate (${stats.globalActive}/${stats.maxConcurrent})`);
1481
+ }
1482
+ }
1483
+ };
1484
+ }
1485
+
1486
+ const value = target[prop];
1487
+ return typeof value === 'function' ? value.bind(target) : value;
1488
+ }
1489
+ });
1490
+ }
1491
+
1492
+ /**
1493
+ * Wrap an engine stream result so its textStream async generator acquires
1494
+ * and releases a concurrency limiter slot. Acquire happens when iteration
1495
+ * begins; release happens in finally (completion, error, or break).
1496
+ *
1497
+ * @param {Object} result - Engine result with { textStream, usage, ... }
1498
+ * @param {Object} limiter - Concurrency limiter with acquire/release/getStats
1499
+ * @param {boolean} debug - Enable debug logging
1500
+ * @returns {Object} Result with wrapped textStream
1501
+ * @private
1502
+ */
1503
+ static _wrapEngineStreamWithLimiter(result, limiter, debug) {
1504
+ const originalStream = result.textStream;
1505
+ async function* gatedStream() {
1506
+ await limiter.acquire(null);
1507
+ if (debug) {
1508
+ const stats = limiter.getStats();
1509
+ console.log(`[DEBUG] Acquired AI slot for engine stream (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1510
+ }
1511
+ try {
1512
+ yield* originalStream;
1513
+ } finally {
1514
+ limiter.release(null);
1515
+ if (debug) {
1516
+ const stats = limiter.getStats();
1517
+ console.log(`[DEBUG] Released AI slot after engine stream (${stats.globalActive}/${stats.maxConcurrent})`);
1518
+ }
1519
+ }
1520
+ }
1521
+ return { ...result, textStream: gatedStream() };
1522
+ }
1523
+
1380
1524
  /**
1381
1525
  * Execute streamText with retry and fallback support
1382
1526
  * @param {Object} options - streamText options
@@ -1384,14 +1528,12 @@ export class ProbeAgent {
1384
1528
  * @private
1385
1529
  */
1386
1530
  async streamTextWithRetryAndFallback(options) {
1387
- // Acquire global concurrency slot if limiter is configured
1531
+ // Wrap the model with per-call concurrency gating if limiter is configured.
1532
+ // This acquires/releases the slot around each individual LLM API call (doStream/doGenerate)
1533
+ // instead of holding it for the entire multi-step agent session.
1388
1534
  const limiter = this.concurrencyLimiter;
1389
- if (limiter) {
1390
- await limiter.acquire(null);
1391
- if (this.debug) {
1392
- const stats = limiter.getStats();
1393
- console.log(`[DEBUG] Acquired global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1394
- }
1535
+ if (limiter && options.model) {
1536
+ options = { ...options, model: ProbeAgent._wrapModelWithLimiter(options.model, limiter, this.debug) };
1395
1537
  }
1396
1538
 
1397
1539
  // Create AbortController for overall operation timeout
@@ -1430,6 +1572,12 @@ export class ProbeAgent {
1430
1572
  if (useClaudeCode || useCodex) {
1431
1573
  try {
1432
1574
  result = await this._tryEngineStreamPath(options, controller, timeoutState);
1575
+ // Gate engine stream with concurrency limiter if configured.
1576
+ // Engine paths bypass the Vercel model wrapper, so we wrap the
1577
+ // textStream async generator with acquire/release instead.
1578
+ if (result && limiter) {
1579
+ result = ProbeAgent._wrapEngineStreamWithLimiter(result, limiter, this.debug);
1580
+ }
1433
1581
  } catch (error) {
1434
1582
  if (this.debug) {
1435
1583
  const engineType = useClaudeCode ? 'Claude Code' : 'Codex';
@@ -1444,47 +1592,7 @@ export class ProbeAgent {
1444
1592
  result = await this._executeWithVercelProvider(options, controller);
1445
1593
  }
1446
1594
 
1447
- // Wrap textStream so limiter slot is held until stream completes.
1448
- // result.textStream is a read-only getter on DefaultStreamTextResult,
1449
- // so we wrap the result in a Proxy that intercepts the textStream property.
1450
- if (limiter && result.textStream) {
1451
- const originalStream = result.textStream;
1452
- const debug = this.debug;
1453
- const wrappedStream = (async function* () {
1454
- try {
1455
- for await (const chunk of originalStream) {
1456
- yield chunk;
1457
- }
1458
- } finally {
1459
- limiter.release(null);
1460
- if (debug) {
1461
- const stats = limiter.getStats();
1462
- console.log(`[DEBUG] Released global AI concurrency slot (${stats.globalActive}/${stats.maxConcurrent}, queue: ${stats.queueSize})`);
1463
- }
1464
- }
1465
- })();
1466
- return new Proxy(result, {
1467
- get(target, prop) {
1468
- if (prop === 'textStream') return wrappedStream;
1469
- const value = target[prop];
1470
- return typeof value === 'function' ? value.bind(target) : value;
1471
- }
1472
- });
1473
- } else if (limiter) {
1474
- // No textStream (shouldn't happen, but release just in case)
1475
- limiter.release(null);
1476
- }
1477
-
1478
1595
  return result;
1479
- } catch (error) {
1480
- // Release on error if limiter was acquired
1481
- if (limiter) {
1482
- limiter.release(null);
1483
- if (this.debug) {
1484
- console.log(`[DEBUG] Released global AI concurrency slot on error`);
1485
- }
1486
- }
1487
- throw error;
1488
1596
  } finally {
1489
1597
  // Clean up timeout (for non-engine paths; engine paths clean up in the generator)
1490
1598
  if (timeoutState.timeoutId) {