@openhoo/hoopilot 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -267,13 +267,15 @@ Incoming `x-request-id` headers are preserved on responses. If a request has no
267
267
 
268
268
  ## Metrics and usage
269
269
 
270
- Hoopilot tracks token usage, request counts, and latency in memory while the server runs. It can also report your GitHub Copilot account quota and premium-request usage.
270
+ Hoopilot tracks token usage, request counts, and latency in memory while the server runs. It can also report your GitHub Copilot account quota and premium-request usage, plus your GitHub REST API rate-limit budget.
271
271
 
272
- - `GET /metrics` returns Prometheus text (`text/plain; version=0.0.4`). It exposes request counters, upstream call counters, token counters by model and type, a request-duration histogram, an in-flight gauge, and Copilot quota gauges after `/v1/usage` has been fetched at least once. Counters reset to zero on restart, which Prometheus handles natively.
273
- - `GET /v1/usage` returns JSON combining the proxy metrics snapshot with live Copilot quota fetched from GitHub and cached for 60 seconds. If quota cannot be read, `copilot` is `null` and `copilot_error` explains why.
274
- - `hoopilot usage` prints your Copilot plan and quota from the command line.
272
+ - `GET /metrics` returns Prometheus text (`text/plain; version=0.0.4`). It exposes request counters, upstream call counters, token counters by model and type, a request-duration histogram, an in-flight gauge, Copilot quota gauges, and GitHub REST API rate-limit gauges (`hoopilot_github_ratelimit_limit`, `_remaining`, `_used`, `_reset_timestamp_seconds`, `_retry_after_seconds`, labelled by `resource`) — the quota and rate-limit series appear after `/v1/usage` has been fetched at least once. Counters reset to zero on restart, which Prometheus handles natively.
273
+ - `GET /v1/usage` returns JSON combining the proxy metrics snapshot with live Copilot quota fetched from GitHub and cached for 60 seconds. If quota cannot be read, `copilot` is `null` and `copilot_error` explains why. The snapshot's `proxy.githubRateLimit` field reports the most recent GitHub REST rate-limit budget per resource (`limit`, `remaining`, `used`, `resetAt`, `retryAfterSeconds`, `observedAt`).
274
+ - `hoopilot usage` prints your Copilot plan and quota — and, when GitHub returns them, your GitHub API rate-limit budget — from the command line.
275
275
 
276
- Token usage is read from the upstream `usage` object. For streaming chat completions, usage is only available when the client sends `stream_options: {"include_usage": true}`; Hoopilot does not inject that flag. Responses API streaming always reports usage, so streamed Responses requests are fully accounted.
276
+ Token usage is read from the upstream `usage` object. For streaming chat completions, usage is only available when the client sends `stream_options: {"include_usage": true}`; Hoopilot does not inject that flag. Responses API streaming always reports usage, so streamed Responses requests are fully accounted. The `hoopilot_token_extraction_total{outcome="extracted"|"missing"}` counter (mirrored in `/v1/usage` as `proxy.tokens.extraction`) tracks how often a completion reported usage versus not, so a rising `missing` count flags clients whose token usage is going unaccounted.
277
+
278
+ GitHub API usage is read from the `x-ratelimit-*` response headers that `api.github.com` returns on the `copilot_internal/user` quota call Hoopilot already makes, so it costs no extra request. (The Copilot completion host `api.githubcopilot.com` does not currently emit these headers, so per-completion rate-limit data is not yet available there.)
277
279
 
278
280
  `/metrics` and `/v1/usage` are subject to the same `HOOPILOT_API_KEY` gate as the other routes.
279
281
 
package/dist/cli.js CHANGED
@@ -179,6 +179,38 @@ function applyGithubApiHeaders(headers, token) {
179
179
  headers.set("x-github-api-version", COPILOT_USAGE_API_VERSION);
180
180
  return headers;
181
181
  }
182
/**
 * Read the rate-limit budget advertised by a response's headers.
 *
 * @param {{ get(name: string): string | null }} headers - Response headers.
 * @param {number} [nowMs=Date.now()] - Timestamp stored as `observedAtMs`.
 * @returns {object | undefined} The parsed budget with absent fields removed,
 *   or `undefined` when none of the numeric headers could be read.
 */
function parseRateLimitHeaders(headers, nowMs = Date.now()) {
  const parsed = {
    limit: headerInt(headers, "x-ratelimit-limit"),
    remaining: headerInt(headers, "x-ratelimit-remaining"),
    used: headerInt(headers, "x-ratelimit-used"),
    resetEpochSeconds: headerInt(headers, "x-ratelimit-reset"),
    retryAfterSeconds: headerInt(headers, "retry-after")
  };
  // The resource name alone is not a budget: without at least one numeric
  // header there is nothing worth recording.
  const hasAnyNumber = Object.values(parsed).some((value) => value !== void 0);
  if (!hasAnyNumber) {
    return void 0;
  }
  const resource = headers.get("x-ratelimit-resource")?.trim() || "unknown";
  return removeUndefinedRateLimit({
    limit: parsed.limit,
    observedAtMs: nowMs,
    remaining: parsed.remaining,
    resetEpochSeconds: parsed.resetEpochSeconds,
    resource,
    retryAfterSeconds: parsed.retryAfterSeconds,
    used: parsed.used
  });
}
201
/**
 * Read a header as a non-negative integer.
 *
 * The value must consist solely of decimal digits (surrounding whitespace is
 * ignored). `Number.parseInt` is deliberately not used: it stops at the first
 * non-digit, so "120abc" or "1.9" would be reported as 120 and 1.
 *
 * @param {{ get(name: string): string | null | undefined }} headers - Header
 *   container exposing `get`.
 * @param {string} name - Header name to read.
 * @returns {number | undefined} The parsed integer, or `undefined` when the
 *   header is absent, malformed, or too large to represent exactly.
 */
function headerInt(headers, name) {
  const raw = headers.get(name);
  if (raw === null || raw === undefined) {
    return undefined;
  }
  const text = raw.trim();
  if (!/^\d+$/.test(text)) {
    return undefined;
  }
  const value = Number(text);
  // Beyond 2^53 - 1 the number would be silently rounded.
  return Number.isSafeInteger(value) ? value : undefined;
}
209
/**
 * Copy a rate-limit record, leaving out every field whose value is
 * `undefined`. Other falsy values (0, "", null) are kept.
 *
 * @param {object} rateLimit - Record that may contain undefined fields.
 * @returns {object} A new object holding only the defined fields, in the
 *   original key order.
 */
function removeUndefinedRateLimit(rateLimit) {
  const kept = [];
  for (const pair of Object.entries(rateLimit)) {
    if (pair[1] !== void 0) {
      kept.push(pair);
    }
  }
  return Object.fromEntries(kept);
}
182
214
  var CopilotClient = class {
183
215
  #auth;
184
216
  #allowUnsafeUpstream;
@@ -1642,6 +1674,7 @@ var DURATION_BUCKETS_SECONDS = [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60];
1642
1674
  var USAGE_BUFFER_LIMIT_BYTES = 16 * 1024 * 1024;
1643
1675
  var MAX_TRACKED_MODELS = 200;
1644
1676
  var MAX_MODEL_LABEL_LENGTH = 200;
1677
+ var MAX_TRACKED_RATELIMIT_RESOURCES = 32;
1645
1678
  var LABEL_SEPARATOR = "";
1646
1679
  var UNKNOWN_MODEL = "unknown";
1647
1680
  function emptyModelTotals() {
@@ -1655,6 +1688,8 @@ var MetricsRegistry = class {
1655
1688
  #tokens = /* @__PURE__ */ new Map();
1656
1689
  #upstream = /* @__PURE__ */ new Map();
1657
1690
  #copilotQuota;
1691
+ #githubRateLimit = /* @__PURE__ */ new Map();
1692
+ #extraction = { extracted: 0, missing: 0 };
1658
1693
  constructor(options = {}) {
1659
1694
  this.#startedAtMs = (options.now ?? Date.now)();
1660
1695
  }
@@ -1671,6 +1706,19 @@ var MetricsRegistry = class {
1671
1706
  this.#requests.set(key, (this.#requests.get(key) ?? 0) + 1);
1672
1707
  this.#observeDuration(observation.route, observation.durationMs / 1e3);
1673
1708
  }
1709
+ /**
1710
+ * Record whether one upstream completion reported token usage. `missing`
1711
+ * counts responses that carried no usage object — most often streamed Chat
1712
+ * Completions sent without `stream_options: {"include_usage": true}` — so a
1713
+ * rising miss rate flags clients whose token usage is going unaccounted.
1714
+ */
1715
+ recordTokenExtraction(extracted) {
1716
+ if (extracted) {
1717
+ this.#extraction.extracted += 1;
1718
+ } else {
1719
+ this.#extraction.missing += 1;
1720
+ }
1721
+ }
1674
1722
  /** Accumulate token counts for a model from one upstream completion. */
1675
1723
  recordTokens(model, usage) {
1676
1724
  const name = this.#modelLabel(model);
@@ -1692,17 +1740,39 @@ var MetricsRegistry = class {
1692
1740
  recordCopilotQuota(usage) {
1693
1741
  this.#copilotQuota = usage;
1694
1742
  }
1695
- // Sanitize the model into a bounded, control-char-free label. The model can
1696
- // originate from a client request, so cap its length, strip characters that
1697
- // would corrupt the exposition format, and fold overflow past the cardinality
1698
- // limit into UNKNOWN_MODEL to keep the series count bounded.
1743
+ /**
1744
+ * Store the latest GitHub REST rate-limit budget, keyed by its resource bucket.
1745
+ * A no-op when `rateLimit` is undefined (the response carried no rate-limit
1746
+ * headers) so callers can pass {@link parseRateLimitHeaders} output directly.
1747
+ */
1748
+ recordGithubRateLimit(rateLimit) {
1749
+ if (!rateLimit) {
1750
+ return;
1751
+ }
1752
+ const resource = this.#rateLimitResource(rateLimit.resource);
1753
+ this.#githubRateLimit.set(resource, { ...rateLimit, resource });
1754
+ }
1755
+ // Sanitize the model into a bounded label. The model can originate from a
1756
+ // client request, so cap its length, strip characters that would corrupt the
1757
+ // exposition format, and fold overflow past the cardinality limit into
1758
+ // UNKNOWN_MODEL to keep the series count bounded.
1699
1759
  #modelLabel(model) {
1700
- const cleaned = model.replace(/[\u0000-\u001f\u007f]/g, "").trim().slice(0, MAX_MODEL_LABEL_LENGTH) || UNKNOWN_MODEL;
1760
+ const cleaned = cleanLabel(model).slice(0, MAX_MODEL_LABEL_LENGTH) || UNKNOWN_MODEL;
1701
1761
  if (!this.#tokens.has(cleaned) && this.#tokens.size >= MAX_TRACKED_MODELS) {
1702
1762
  return UNKNOWN_MODEL;
1703
1763
  }
1704
1764
  return cleaned;
1705
1765
  }
1766
+ // The resource comes from a trusted upstream header, but clean and bound it
1767
+ // with the same discipline as model labels: strip control characters that
1768
+ // would corrupt the exposition format and fold overflow into "unknown".
1769
+ #rateLimitResource(resource) {
1770
+ const cleaned = cleanLabel(resource).slice(0, MAX_MODEL_LABEL_LENGTH) || UNKNOWN_MODEL;
1771
+ if (!this.#githubRateLimit.has(cleaned) && this.#githubRateLimit.size >= MAX_TRACKED_RATELIMIT_RESOURCES) {
1772
+ return UNKNOWN_MODEL;
1773
+ }
1774
+ return cleaned;
1775
+ }
1706
1776
  #observeDuration(route, seconds) {
1707
1777
  const value = Number.isFinite(seconds) && seconds >= 0 ? seconds : 0;
1708
1778
  const entry = this.#durations.get(route) ?? {
@@ -1747,11 +1817,16 @@ var MetricsRegistry = class {
1747
1817
  upstreamErrors += count;
1748
1818
  }
1749
1819
  }
1820
+ const githubRateLimit = {};
1821
+ for (const [resource, rateLimit] of this.#githubRateLimit) {
1822
+ githubRateLimit[resource] = toRateLimitSnapshot(rateLimit);
1823
+ }
1750
1824
  return {
1825
+ githubRateLimit,
1751
1826
  inFlight: this.#inFlight,
1752
1827
  requests: { byRoute, byStatus, total: requestsTotal },
1753
1828
  startedAt: new Date(this.#startedAtMs).toISOString(),
1754
- tokens: { byModel, ...tokenTotals },
1829
+ tokens: { byModel, extraction: { ...this.#extraction }, ...tokenTotals },
1755
1830
  upstream: { errors: upstreamErrors, total: upstreamTotal },
1756
1831
  uptimeSeconds: Math.max(0, Math.round((now() - this.#startedAtMs) / 1e3))
1757
1832
  };
@@ -1801,6 +1876,16 @@ var MetricsRegistry = class {
1801
1876
  for (const [model, totals] of this.#tokens) {
1802
1877
  lines.push(`hoopilot_model_requests_total${labels({ model })} ${totals.requests}`);
1803
1878
  }
1879
+ lines.push(
1880
+ "# HELP hoopilot_token_extraction_total Completions by whether upstream reported token usage."
1881
+ );
1882
+ lines.push("# TYPE hoopilot_token_extraction_total counter");
1883
+ lines.push(
1884
+ `hoopilot_token_extraction_total${labels({ outcome: "extracted" })} ${this.#extraction.extracted}`
1885
+ );
1886
+ lines.push(
1887
+ `hoopilot_token_extraction_total${labels({ outcome: "missing" })} ${this.#extraction.missing}`
1888
+ );
1804
1889
  lines.push("# HELP hoopilot_request_duration_seconds Request duration by route.");
1805
1890
  lines.push("# TYPE hoopilot_request_duration_seconds histogram");
1806
1891
  for (const [route, entry] of this.#durations) {
@@ -1818,10 +1903,43 @@ var MetricsRegistry = class {
1818
1903
  lines.push(`hoopilot_request_duration_seconds_sum${labels({ route })} ${entry.sum}`);
1819
1904
  lines.push(`hoopilot_request_duration_seconds_count${labels({ route })} ${entry.count}`);
1820
1905
  }
1906
+ this.#renderGithubRateLimit(lines);
1821
1907
  this.#renderCopilotQuota(lines);
1822
1908
  return `${lines.join("\n")}
1823
1909
  `;
1824
1910
  }
1911
+ #renderGithubRateLimit(lines) {
1912
+ const entries = [...this.#githubRateLimit.values()];
1913
+ if (entries.length === 0) {
1914
+ return;
1915
+ }
1916
+ const gauge = (suffix, help, pick) => {
1917
+ const present = entries.filter((rateLimit) => pick(rateLimit) !== void 0);
1918
+ if (present.length === 0) {
1919
+ return;
1920
+ }
1921
+ lines.push(`# HELP hoopilot_github_ratelimit_${suffix} ${help}`);
1922
+ lines.push(`# TYPE hoopilot_github_ratelimit_${suffix} gauge`);
1923
+ for (const rateLimit of present) {
1924
+ lines.push(
1925
+ `hoopilot_github_ratelimit_${suffix}${labels({ resource: rateLimit.resource })} ${pick(rateLimit)}`
1926
+ );
1927
+ }
1928
+ };
1929
+ gauge("limit", "GitHub REST API request ceiling for the resource window.", (r) => r.limit);
1930
+ gauge("remaining", "Requests remaining in the GitHub REST API window.", (r) => r.remaining);
1931
+ gauge("used", "Requests used in the GitHub REST API window.", (r) => r.used);
1932
+ gauge(
1933
+ "reset_timestamp_seconds",
1934
+ "Unix epoch when the GitHub REST API window resets.",
1935
+ (r) => r.resetEpochSeconds
1936
+ );
1937
+ gauge(
1938
+ "retry_after_seconds",
1939
+ "Seconds to wait after a GitHub secondary-limit response.",
1940
+ (r) => r.retryAfterSeconds
1941
+ );
1942
+ }
1825
1943
  #renderCopilotQuota(lines) {
1826
1944
  const usage = this.#copilotQuota;
1827
1945
  if (!usage) {
@@ -1923,23 +2041,25 @@ var MetricsRegistry = class {
1923
2041
  }
1924
2042
  }
1925
2043
  };
1926
- function observeResponseUsage(response, fallbackModel, onUsage, signal) {
2044
+ function observeResponseUsage(response, fallbackModel, onUsage, signal, onOutcome) {
1927
2045
  const body = response.body;
1928
2046
  if (!body) {
1929
2047
  return response;
1930
2048
  }
1931
2049
  const [clientBranch, observerBranch] = body.tee();
1932
2050
  const isSse = response.headers.get("content-type")?.includes("text/event-stream") ?? false;
1933
- void consumeUsage(observerBranch, isSse, fallbackModel, onUsage, signal).catch(() => {
1934
- });
2051
+ void consumeUsage(observerBranch, isSse, fallbackModel, onUsage, signal, onOutcome).catch(
2052
+ () => {
2053
+ }
2054
+ );
1935
2055
  return new Response(clientBranch, {
1936
2056
  headers: response.headers,
1937
2057
  status: response.status,
1938
2058
  statusText: response.statusText
1939
2059
  });
1940
2060
  }
1941
- function recordResponseTextUsage(text, isSse, fallbackModel, onUsage) {
1942
- const accumulator = createUsageAccumulator(fallbackModel, onUsage);
2061
+ function recordResponseTextUsage(text, isSse, fallbackModel, onUsage, onOutcome) {
2062
+ const accumulator = createUsageAccumulator(fallbackModel, onUsage, onOutcome);
1943
2063
  if (isSse) {
1944
2064
  for (const line of text.split(/\r?\n/)) {
1945
2065
  considerSseLine(line, accumulator.consider);
@@ -1952,7 +2072,7 @@ function recordResponseTextUsage(text, isSse, fallbackModel, onUsage) {
1952
2072
  }
1953
2073
  accumulator.finish();
1954
2074
  }
1955
- async function consumeUsage(stream, isSse, fallbackModel, onUsage, signal) {
2075
+ async function consumeUsage(stream, isSse, fallbackModel, onUsage, signal, onOutcome) {
1956
2076
  const reader = stream.getReader();
1957
2077
  const onAbort = () => {
1958
2078
  reader.cancel().catch(() => {
@@ -1965,7 +2085,12 @@ async function consumeUsage(stream, isSse, fallbackModel, onUsage, signal) {
1965
2085
  signal?.addEventListener("abort", onAbort, { once: true });
1966
2086
  }
1967
2087
  const decoder = new TextDecoder();
1968
- const accumulator = createUsageAccumulator(fallbackModel, onUsage);
2088
+ const guardedOutcome = onOutcome ? (extracted) => {
2089
+ if (!signal?.aborted) {
2090
+ onOutcome(extracted);
2091
+ }
2092
+ } : void 0;
2093
+ const accumulator = createUsageAccumulator(fallbackModel, onUsage, guardedOutcome);
1969
2094
  let buffer = "";
1970
2095
  let bufferedBytes = 0;
1971
2096
  let overflowed = false;
@@ -2013,7 +2138,7 @@ async function consumeUsage(stream, isSse, fallbackModel, onUsage, signal) {
2013
2138
  }
2014
2139
  accumulator.finish();
2015
2140
  }
2016
- function createUsageAccumulator(fallbackModel, onUsage) {
2141
+ function createUsageAccumulator(fallbackModel, onUsage, onOutcome) {
2017
2142
  let model = fallbackModel;
2018
2143
  let usage;
2019
2144
  return {
@@ -2032,6 +2157,7 @@ function createUsageAccumulator(fallbackModel, onUsage) {
2032
2157
  if (usage) {
2033
2158
  onUsage(model, usage);
2034
2159
  }
2160
+ onOutcome?.(usage !== void 0);
2035
2161
  }
2036
2162
  };
2037
2163
  }
@@ -2062,6 +2188,37 @@ function modelText(value) {
2062
2188
  function nonNegative(value) {
2063
2189
  return Number.isFinite(value) && value > 0 ? value : 0;
2064
2190
  }
2191
/**
 * Strip ASCII control characters (code units 0–31 and 127) from a label
 * candidate and trim surrounding whitespace from what remains.
 *
 * @param {string} value - Raw label text.
 * @returns {string} The cleaned label; may be empty.
 */
function cleanLabel(value) {
  const isPrintable = (char) => {
    const code = char.charCodeAt(0);
    return code > 31 && code !== 127;
  };
  return Array.from(value).filter(isPrintable).join("").trim();
}
2201
/**
 * Convert a stored rate-limit record into the shape exposed in the JSON
 * snapshot: timestamps become ISO-8601 strings and absent fields are omitted.
 *
 * @param {object} rateLimit - Stored record with `observedAtMs` and optional
 *   `limit`, `remaining`, `used`, `resetEpochSeconds`, `retryAfterSeconds`.
 * @returns {object} Snapshot with `observedAt` plus whichever of `limit`,
 *   `remaining`, `used`, `resetAt`, `retryAfterSeconds` are available.
 */
function toRateLimitSnapshot(rateLimit) {
  const snapshot = {
    observedAt: new Date(rateLimit.observedAtMs).toISOString()
  };
  if (rateLimit.limit !== undefined) {
    snapshot.limit = rateLimit.limit;
  }
  if (rateLimit.remaining !== undefined) {
    snapshot.remaining = rateLimit.remaining;
  }
  if (rateLimit.used !== undefined) {
    snapshot.used = rateLimit.used;
  }
  if (rateLimit.resetEpochSeconds !== undefined) {
    const resetAt = new Date(rateLimit.resetEpochSeconds * 1e3);
    // The reset value comes from an upstream header; one outside the
    // representable Date range gives an Invalid Date, on which toISOString()
    // throws RangeError. Omit the field rather than fail the whole snapshot.
    if (!Number.isNaN(resetAt.getTime())) {
      snapshot.resetAt = resetAt.toISOString();
    }
  }
  if (rateLimit.retryAfterSeconds !== undefined) {
    snapshot.retryAfterSeconds = rateLimit.retryAfterSeconds;
  }
  return snapshot;
}
2065
2222
  function labelKey(...parts) {
2066
2223
  return parts.join(LABEL_SEPARATOR);
2067
2224
  }
@@ -2129,6 +2286,7 @@ function createHoopilotHandler(options = {}) {
2129
2286
  const metrics = options.metrics ?? new MetricsRegistry();
2130
2287
  const readUsage = createUsageReader(client, metrics);
2131
2288
  const recordTokens = (model, usage) => metrics.recordTokens(model, usage);
2289
+ const recordExtraction = (extracted) => metrics.recordTokenExtraction(extracted);
2132
2290
  const streamingProxyMode = resolveStreamingProxyMode(options);
2133
2291
  const bufferProxyBodies = shouldBufferProxyBodies(streamingProxyMode);
2134
2292
  return async (request) => {
@@ -2194,6 +2352,7 @@ function createHoopilotHandler(options = {}) {
2194
2352
  client,
2195
2353
  metrics,
2196
2354
  recordTokens,
2355
+ recordExtraction,
2197
2356
  request,
2198
2357
  requestLogger,
2199
2358
  bufferProxyBodies
@@ -2209,6 +2368,7 @@ function createHoopilotHandler(options = {}) {
2209
2368
  client,
2210
2369
  metrics,
2211
2370
  recordTokens,
2371
+ recordExtraction,
2212
2372
  request,
2213
2373
  requestLogger,
2214
2374
  bufferProxyBodies
@@ -2221,6 +2381,7 @@ function createHoopilotHandler(options = {}) {
2221
2381
  client,
2222
2382
  metrics,
2223
2383
  recordTokens,
2384
+ recordExtraction,
2224
2385
  request,
2225
2386
  requestLogger,
2226
2387
  bufferProxyBodies
@@ -2229,7 +2390,14 @@ function createHoopilotHandler(options = {}) {
2229
2390
  }
2230
2391
  if (request.method === "POST" && apiPath === "/v1/responses/compact") {
2231
2392
  return finish(
2232
- await handleResponsesCompact(client, metrics, recordTokens, request, requestLogger)
2393
+ await handleResponsesCompact(
2394
+ client,
2395
+ metrics,
2396
+ recordTokens,
2397
+ recordExtraction,
2398
+ request,
2399
+ requestLogger
2400
+ )
2233
2401
  );
2234
2402
  }
2235
2403
  if (request.method === "POST" && apiPath === "/v1/responses") {
@@ -2238,6 +2406,7 @@ function createHoopilotHandler(options = {}) {
2238
2406
  client,
2239
2407
  metrics,
2240
2408
  recordTokens,
2409
+ recordExtraction,
2241
2410
  request,
2242
2411
  requestLogger,
2243
2412
  bufferProxyBodies
@@ -2314,7 +2483,7 @@ function startHoopilotServer(options = {}) {
2314
2483
  url: `http://${urlHost(host)}:${server.port}`
2315
2484
  };
2316
2485
  }
2317
- async function handleAnthropicMessages(client, metrics, recordTokens, request, logger, bufferProxyBodies) {
2486
+ async function handleAnthropicMessages(client, metrics, recordTokens, recordExtraction, request, logger, bufferProxyBodies) {
2318
2487
  const anthropicRequest = await readJson(request);
2319
2488
  const responsesRequest = anthropicMessagesToResponsesRequest(anthropicRequest);
2320
2489
  const upstream = await client.responses(JSON.stringify(responsesRequest), request.signal);
@@ -2327,12 +2496,18 @@ async function handleAnthropicMessages(client, metrics, recordTokens, request, l
2327
2496
  if (isStreamingResponse(upstream) && upstream.body) {
2328
2497
  if (bufferProxyBodies) {
2329
2498
  const text = await upstream.text();
2330
- recordResponseTextUsage(text, true, model, recordTokens);
2499
+ recordResponseTextUsage(text, true, model, recordTokens, recordExtraction);
2331
2500
  return proxyResponse(
2332
2501
  responseFromText(upstream, responsesSseTextToAnthropicSseText(text, { model }))
2333
2502
  );
2334
2503
  }
2335
- const observed = observeResponseUsage(upstream, model, recordTokens, request.signal);
2504
+ const observed = observeResponseUsage(
2505
+ upstream,
2506
+ model,
2507
+ recordTokens,
2508
+ request.signal,
2509
+ recordExtraction
2510
+ );
2336
2511
  if (!observed.body) {
2337
2512
  return proxyResponse(observed);
2338
2513
  }
@@ -2350,6 +2525,7 @@ async function handleAnthropicMessages(client, metrics, recordTokens, request, l
2350
2525
  const responseModel = typeof body.model === "string" ? body.model.trim() : "";
2351
2526
  recordTokens(responseModel || model, usage);
2352
2527
  }
2528
+ recordExtraction(usage !== void 0);
2353
2529
  return jsonResponse(responsesResponseToAnthropicMessage(body, model));
2354
2530
  }
2355
2531
  function handleAnthropicCountTokens(body) {
@@ -2375,7 +2551,7 @@ async function handleModels(client, metrics, signal, logger) {
2375
2551
  logUpstreamSuccess(logger, "/models", upstream.status);
2376
2552
  return jsonResponse(normalizeModelsResponse(await upstream.json()));
2377
2553
  }
2378
- async function handleChatCompletions(client, metrics, recordTokens, request, logger, bufferProxyBodies) {
2554
+ async function handleChatCompletions(client, metrics, recordTokens, recordExtraction, request, logger, bufferProxyBodies) {
2379
2555
  const chatRequest = normalizeChatCompletionRequest(await readJson(request));
2380
2556
  const upstream = await client.chatCompletions(chatRequest, request.signal);
2381
2557
  metrics.recordUpstream("/chat/completions", upstream.ok);
@@ -2390,11 +2566,12 @@ async function handleChatCompletions(client, metrics, recordTokens, request, log
2390
2566
  model,
2391
2567
  recordTokens,
2392
2568
  request.signal,
2393
- bufferProxyBodies
2569
+ bufferProxyBodies,
2570
+ recordExtraction
2394
2571
  )
2395
2572
  );
2396
2573
  }
2397
- async function handleCompletions(client, metrics, recordTokens, request, logger, bufferProxyBodies) {
2574
+ async function handleCompletions(client, metrics, recordTokens, recordExtraction, request, logger, bufferProxyBodies) {
2398
2575
  const body = await readJson(request);
2399
2576
  const upstream = await client.chatCompletions(
2400
2577
  completionsRequestToChatCompletion(body),
@@ -2409,7 +2586,7 @@ async function handleCompletions(client, metrics, recordTokens, request, logger,
2409
2586
  if (isStreamingResponse(upstream) && upstream.body) {
2410
2587
  if (bufferProxyBodies) {
2411
2588
  const upstreamText = await upstream.text();
2412
- recordResponseTextUsage(upstreamText, true, model, recordTokens);
2589
+ recordResponseTextUsage(upstreamText, true, model, recordTokens, recordExtraction);
2413
2590
  const text = completionSseTextFromChatSseText(upstreamText);
2414
2591
  return proxyResponse(responseFromText(upstream, text));
2415
2592
  }
@@ -2422,7 +2599,8 @@ async function handleCompletions(client, metrics, recordTokens, request, logger,
2422
2599
  }),
2423
2600
  model,
2424
2601
  recordTokens,
2425
- request.signal
2602
+ request.signal,
2603
+ recordExtraction
2426
2604
  )
2427
2605
  );
2428
2606
  }
@@ -2432,9 +2610,10 @@ async function handleCompletions(client, metrics, recordTokens, request, logger,
2432
2610
  const responseModel = typeof completion.model === "string" ? completion.model.trim() : "";
2433
2611
  recordTokens(responseModel || model, usage);
2434
2612
  }
2613
+ recordExtraction(usage !== void 0);
2435
2614
  return jsonResponse(chatCompletionToCompletion(completion));
2436
2615
  }
2437
- async function handleResponses(client, metrics, recordTokens, request, logger, bufferProxyBodies) {
2616
+ async function handleResponses(client, metrics, recordTokens, recordExtraction, request, logger, bufferProxyBodies) {
2438
2617
  const body = await readJsonText(request);
2439
2618
  const upstream = await client.responses(body, request.signal);
2440
2619
  metrics.recordUpstream("/responses", upstream.ok);
@@ -2449,11 +2628,12 @@ async function handleResponses(client, metrics, recordTokens, request, logger, b
2449
2628
  model,
2450
2629
  recordTokens,
2451
2630
  request.signal,
2452
- bufferProxyBodies
2631
+ bufferProxyBodies,
2632
+ recordExtraction
2453
2633
  )
2454
2634
  );
2455
2635
  }
2456
- async function handleResponsesCompact(client, metrics, recordTokens, request, logger) {
2636
+ async function handleResponsesCompact(client, metrics, recordTokens, recordExtraction, request, logger) {
2457
2637
  const body = await readJson(request);
2458
2638
  const upstream = await client.responses(
2459
2639
  JSON.stringify({ ...body, stream: false }),
@@ -2466,17 +2646,23 @@ async function handleResponsesCompact(client, metrics, recordTokens, request, lo
2466
2646
  logUpstreamSuccess(logger, "/responses", upstream.status);
2467
2647
  const isSse = isStreamingResponse(upstream);
2468
2648
  const text = await upstream.text();
2469
- recordResponseTextUsage(text, isSse, normalizeRequestedModel(body.model), recordTokens);
2649
+ recordResponseTextUsage(
2650
+ text,
2651
+ isSse,
2652
+ normalizeRequestedModel(body.model),
2653
+ recordTokens,
2654
+ recordExtraction
2655
+ );
2470
2656
  return jsonResponse(responsesCompactionResult(text, isSse));
2471
2657
  }
2472
- async function responseWithObservedUsage(response, fallbackModel, recordTokens, signal, bufferBody) {
2658
+ async function responseWithObservedUsage(response, fallbackModel, recordTokens, signal, bufferBody, recordExtraction) {
2473
2659
  const isSse = isStreamingResponse(response);
2474
2660
  if (bufferBody && response.body) {
2475
2661
  const text = await response.text();
2476
- recordResponseTextUsage(text, isSse, fallbackModel, recordTokens);
2662
+ recordResponseTextUsage(text, isSse, fallbackModel, recordTokens, recordExtraction);
2477
2663
  return responseFromText(response, text);
2478
2664
  }
2479
- return observeResponseUsage(response, fallbackModel, recordTokens, signal);
2665
+ return observeResponseUsage(response, fallbackModel, recordTokens, signal, recordExtraction);
2480
2666
  }
2481
2667
  function responseFromText(source, text) {
2482
2668
  return new Response(text, {
@@ -2905,6 +3091,7 @@ function createUsageReader(client, metrics, now = Date.now, ttlMs = USAGE_CACHE_
2905
3091
  try {
2906
3092
  const upstream = await client.usage(signal);
2907
3093
  metrics.recordUpstream(usagePath, upstream.ok);
3094
+ metrics.recordGithubRateLimit(parseRateLimitHeaders(upstream.headers, now()));
2908
3095
  if (!upstream.ok) {
2909
3096
  return { error: `GitHub Copilot usage request failed with ${upstream.status}.` };
2910
3097
  }
@@ -3741,6 +3928,7 @@ async function runUsage(options = {}) {
3741
3928
  }
3742
3929
  throw new Error(message);
3743
3930
  }
3931
+ const rateLimit = parseRateLimitHeaders(response.headers);
3744
3932
  const usage = normalizeCopilotUsage(await response.json().catch(() => ({})));
3745
3933
  logger.debug(
3746
3934
  { event: "usage.fetch.succeeded", plan: usage.plan },
@@ -3749,8 +3937,30 @@ async function runUsage(options = {}) {
3749
3937
  for (const line of formatCopilotUsage(usage)) {
3750
3938
  console.log(line);
3751
3939
  }
3940
+ if (rateLimit) {
3941
+ console.log(formatGithubRateLimit(rateLimit));
3942
+ }
3752
3943
  return usage;
3753
3944
  }
3945
/**
 * Build the one-line, human-readable rate-limit summary printed by the usage
 * command.
 *
 * @param {object} rateLimit - Parsed budget with optional `limit`,
 *   `remaining`, `used`, `resetEpochSeconds`, `retryAfterSeconds`, `resource`.
 * @returns {string} A line of the form
 *   `GitHub API rate limit (<resource>): <details>`; the resource suffix is
 *   left out when it is empty or "unknown", and the details read "n/a" when
 *   nothing printable is available.
 */
function formatGithubRateLimit(rateLimit) {
  const parts = [];
  if (rateLimit.remaining !== undefined && rateLimit.limit !== undefined) {
    parts.push(`${rateLimit.remaining}/${rateLimit.limit} requests remaining`);
  } else if (rateLimit.remaining !== undefined) {
    parts.push(`${rateLimit.remaining} requests remaining`);
  } else if (rateLimit.used !== undefined) {
    parts.push(`${rateLimit.used} requests used`);
  }
  if (rateLimit.resetEpochSeconds !== undefined) {
    const resetAt = new Date(rateLimit.resetEpochSeconds * 1e3);
    // An out-of-range reset value gives an Invalid Date, and toISOString()
    // throws RangeError on one; skip the reset time instead of aborting.
    if (!Number.isNaN(resetAt.getTime())) {
      parts.push(`resets ${resetAt.toISOString()}`);
    }
  }
  if (rateLimit.retryAfterSeconds !== undefined) {
    parts.push(`retry after ${rateLimit.retryAfterSeconds}s`);
  }
  const detail = parts.length > 0 ? parts.join(", ") : "n/a";
  const resource =
    rateLimit.resource && rateLimit.resource !== "unknown" ? ` (${rateLimit.resource})` : "";
  return `GitHub API rate limit${resource}: ${detail}`;
}
3754
3964
  function formatCopilotUsage(usage) {
3755
3965
  const lines = [];
3756
3966
  if (usage.plan) {