langgraph-api 0.11.0.dev9__py3-none-any.whl → 0.12.0.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langgraph_api/__init__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.11.0.dev9"
1
+ __version__ = "0.12.0.dev1"
langgraph_api/api/meta.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import langgraph.version
2
2
  import structlog
3
+ from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
3
4
  from starlette.responses import JSONResponse, PlainTextResponse
4
5
 
5
6
  from langgraph_api import __version__, config, metadata
@@ -46,54 +47,6 @@ def _merge_pool_stats(local: PoolStats, remote: PoolStats) -> PoolStats:
46
47
  return merged
47
48
 
48
49
 
49
- def _pool_stats_to_prometheus_lines(
50
- stats: PoolStats,
51
- project_id: str | None,
52
- revision_id: str | None,
53
- deployment_type: str = "",
54
- ) -> list[str]:
55
- """Format merged pool stats as Prometheus text lines (same format as langgraph_runtime.database.pool_stats)."""
56
- labels = f'project_id="{project_id}", revision_id="{revision_id}", deployment_type="{deployment_type}"'
57
- lines = []
58
- if "postgres" in stats:
59
- pg = stats["postgres"]
60
- lines.extend(
61
- [
62
- "# HELP lg_api_pg_pool_max The maximum size of the postgres connection pool.",
63
- "# TYPE lg_api_pg_pool_max gauge",
64
- f"lg_api_pg_pool_max{{{labels}}} {pg.get('pool_max', 0)}",
65
- "# HELP lg_api_pg_pool_size Number of connections currently managed by the postgres connection pool (in the pool, given to clients, being prepared)",
66
- "# TYPE lg_api_pg_pool_size gauge",
67
- f"lg_api_pg_pool_size{{{labels}}} {pg.get('pool_size', 0)}",
68
- "# HELP lg_api_pg_pool_available Number of connections currently idle in the postgres connection pool",
69
- "# TYPE lg_api_pg_pool_available gauge",
70
- f"lg_api_pg_pool_available{{{labels}}} {pg.get('pool_available', 0)}",
71
- "# HELP lg_api_pg_pool_requests_queued Number of postgres connection requests queued because a postgres connection wasn't immediately available in the pool",
72
- "# TYPE lg_api_pg_pool_requests_queued counter",
73
- f"lg_api_pg_pool_requests_queued{{{labels}}} {pg.get('requests_queued', 0)}",
74
- "# HELP lg_api_pg_pool_requests_errors Number of postgres connection requests resulting in an error (timeouts, queue full...)",
75
- "# TYPE lg_api_pg_pool_requests_errors counter",
76
- f"lg_api_pg_pool_requests_errors{{{labels}}} {pg.get('requests_errors', 0)}",
77
- ]
78
- )
79
- if "redis" in stats:
80
- rd = stats["redis"]
81
- lines.extend(
82
- [
83
- "# HELP lg_api_redis_pool_available Number of connections currently idle in the redis connection pool",
84
- "# TYPE lg_api_redis_pool_available gauge",
85
- f"lg_api_redis_pool_available{{{labels}}} {rd.get('idle_connections', 0)}",
86
- "# HELP lg_api_redis_pool_size Number of connections currently in use in the redis connection pool",
87
- "# TYPE lg_api_redis_pool_size gauge",
88
- f"lg_api_redis_pool_size{{{labels}}} {rd.get('in_use_connections', 0)}",
89
- "# HELP lg_api_redis_pool_max The maximum size of the redis connection pool.",
90
- "# TYPE lg_api_redis_pool_max gauge",
91
- f"lg_api_redis_pool_max{{{labels}}} {rd.get('max_connections', 0)}",
92
- ]
93
- )
94
- return lines
95
-
96
-
97
50
  async def _grpc_pool_stats() -> PoolStats:
98
51
  """Fetch connection pool stats from the Core API (Go) via gRPC for metrics aggregation. Returns {} on error."""
99
52
  if not IS_POSTGRES_OR_GRPC_BACKEND:
@@ -107,21 +60,12 @@ async def _grpc_pool_stats() -> PoolStats:
107
60
  return {}
108
61
 
109
62
 
110
- async def meta_pool_stats(metrics_format: str) -> PoolStats | list[str]:
63
+ async def meta_pool_stats() -> PoolStats:
111
64
  local_pool_stats: PoolStats = pool_stats()
112
65
 
113
66
  # Aggregate with Core API (Go) pool stats when using gRPC backend
114
67
  grpc_pool_stats = await _grpc_pool_stats()
115
- merged_pool_stats = _merge_pool_stats(local_pool_stats, grpc_pool_stats)
116
- if metrics_format == "prometheus":
117
- return _pool_stats_to_prometheus_lines(
118
- merged_pool_stats,
119
- metadata.PROJECT_ID,
120
- metadata.HOST_REVISION_ID,
121
- metadata.DEPLOYMENT_TYPE,
122
- )
123
- else:
124
- return merged_pool_stats
68
+ return _merge_pool_stats(local_pool_stats, grpc_pool_stats)
125
69
 
126
70
 
127
71
  async def meta_info(request: ApiRequest):
@@ -153,81 +97,26 @@ async def meta_metrics(request: ApiRequest):
153
97
  if metrics_format not in METRICS_FORMATS:
154
98
  metrics_format = "prometheus"
155
99
 
156
- # collect stats
157
- metrics = get_metrics()
158
- worker_metrics = metrics["workers"]
159
- workers_max = worker_metrics["max"]
160
- workers_active = worker_metrics["active"]
161
- workers_available = worker_metrics["available"]
100
+ if metrics_format == "prometheus":
101
+ # Served straight from the OTLP Prometheus client's registry (see
102
+ # metrics_otlp._LSDPrometheusReader).
103
+ return PlainTextResponse(generate_latest(), media_type=CONTENT_TYPE_LATEST)
162
104
 
105
+ # JSON: hand-built snapshot of workers, queue depth, HTTP, and pool stats.
106
+ worker_metrics = get_metrics()["workers"]
163
107
  http_metrics = HTTP_METRICS_COLLECTOR.get_metrics(
164
108
  metadata.PROJECT_ID,
165
109
  metadata.HOST_REVISION_ID,
166
110
  metrics_format,
167
111
  metadata.DEPLOYMENT_TYPE,
168
112
  )
169
-
170
- merged_pool_stats = await meta_pool_stats(metrics_format)
171
-
172
- if metrics_format == "json":
173
- async with connect() as conn:
174
- resp = {
175
- **merged_pool_stats,
176
- "queue": await Runs.stats(conn),
177
- **http_metrics,
178
- }
179
- if config.N_JOBS_PER_WORKER > 0:
180
- resp["workers"] = worker_metrics
181
- return JSONResponse(resp)
182
- elif metrics_format == "prometheus":
183
- metrics = []
184
- try:
185
- async with connect() as conn:
186
- queue_stats = await Runs.stats(conn)
187
-
188
- labels = f'project_id="{metadata.PROJECT_ID}", revision_id="{metadata.HOST_REVISION_ID}", deployment_type="{metadata.DEPLOYMENT_TYPE}"'
189
- metrics.extend(
190
- [
191
- "# HELP lg_api_num_pending_runs The number of runs currently pending.",
192
- "# TYPE lg_api_num_pending_runs gauge",
193
- f"lg_api_num_pending_runs{{{labels}}} {queue_stats['n_pending']}",
194
- "# HELP lg_api_num_running_runs The number of runs currently running.",
195
- "# TYPE lg_api_num_running_runs gauge",
196
- f"lg_api_num_running_runs{{{labels}}} {queue_stats['n_running']}",
197
- "# HELP lg_api_pending_runs_wait_time_max The maximum time a run has been pending, in seconds.",
198
- "# TYPE lg_api_pending_runs_wait_time_max gauge",
199
- f"lg_api_pending_runs_wait_time_max{{{labels}}} {queue_stats.get('pending_runs_wait_time_max_secs') or 0}",
200
- "# HELP lg_api_pending_runs_wait_time_med The median pending wait time across runs, in seconds.",
201
- "# TYPE lg_api_pending_runs_wait_time_med gauge",
202
- f"lg_api_pending_runs_wait_time_med{{{labels}}} {queue_stats.get('pending_runs_wait_time_med_secs') or 0}",
203
- "# HELP lg_api_pending_unblocked_runs_wait_time_max The maximum time a run has been pending excluding runs blocked by another run on the same thread, in seconds.",
204
- "# TYPE lg_api_pending_unblocked_runs_wait_time_max gauge",
205
- f"lg_api_pending_unblocked_runs_wait_time_max{{{labels}}} {queue_stats.get('pending_unblocked_runs_wait_time_max_secs') or 0}",
206
- ]
207
- )
208
- except Exception as e:
209
- await logger.awarning(
210
- "Ignoring error while getting run stats for /metrics", exc_info=e
211
- )
212
-
113
+ merged_pool_stats = await meta_pool_stats()
114
+ async with connect() as conn:
115
+ resp = {
116
+ **merged_pool_stats,
117
+ "queue": await Runs.stats(conn),
118
+ **http_metrics,
119
+ }
213
120
  if config.N_JOBS_PER_WORKER > 0:
214
- worker_labels = f'project_id="{metadata.PROJECT_ID}", revision_id="{metadata.HOST_REVISION_ID}", deployment_type="{metadata.DEPLOYMENT_TYPE}"'
215
- metrics.extend(
216
- [
217
- "# HELP lg_api_workers_max The maximum number of workers available.",
218
- "# TYPE lg_api_workers_max gauge",
219
- f"lg_api_workers_max{{{worker_labels}}} {workers_max}",
220
- "# HELP lg_api_workers_active The number of currently active workers.",
221
- "# TYPE lg_api_workers_active gauge",
222
- f"lg_api_workers_active{{{worker_labels}}} {workers_active}",
223
- "# HELP lg_api_workers_available The number of available (idle) workers.",
224
- "# TYPE lg_api_workers_available gauge",
225
- f"lg_api_workers_available{{{worker_labels}}} {workers_available}",
226
- ]
227
- )
228
-
229
- metrics.extend(http_metrics)
230
- metrics.extend(merged_pool_stats)
231
-
232
- metrics_response = "\n".join(metrics)
233
- return PlainTextResponse(metrics_response)
121
+ resp["workers"] = worker_metrics
122
+ return JSONResponse(resp)
@@ -577,8 +577,13 @@ METRIC_MAX_EMITTING_TIER = env(
577
577
  "METRIC_MAX_EMITTING_TIER", cast=int, default=_METRIC_MAX_EMITTING_TIER_DEFAULT
578
578
  )
579
579
  DATADOG_METRICS_ENABLED = bool(LSD_DD_API_KEY)
580
- LSD_PROM_METRICS_ENABLED = env("LSD_PROM_METRICS_ENABLED", cast=bool, default=False)
581
- LSD_PROM_METRICS_PORT = env("LSD_PROM_METRICS_PORT", cast=int, default=9464)
580
+ # When true, the Prometheus scrape (/metrics) exposes ALL metrics, not just the
581
+ # lsd_web_metric (Deployment-UI) set. Record-time tier filtering
582
+ # (METRIC_MAX_EMITTING_TIER) still applies, so internal metrics must be within the
583
+ # max emitting tier to be recorded at all.
584
+ EXPOSE_INTERNAL_METRICS_PROMETHEUS = env(
585
+ "EXPOSE_INTERNAL_METRICS_PROMETHEUS", cast=bool, default=False
586
+ )
582
587
  LANGGRAPH_LOGS_ENDPOINT = env("LANGGRAPH_LOGS_ENDPOINT", cast=str, default=None)
583
588
  LANGGRAPH_LOGS_ENABLED = env("LANGGRAPH_LOGS_ENABLED", cast=bool, default=False)
584
589
 
@@ -635,6 +640,7 @@ __all__ = [
635
640
  "CRON_SCHEDULER_SLEEP_TIME",
636
641
  "DATABASE_URI",
637
642
  "DATADOG_METRICS_ENABLED",
643
+ "EXPOSE_INTERNAL_METRICS_PROMETHEUS",
638
644
  "FF_CRONS_ENABLED",
639
645
  "FF_LOG_DROPPED_EVENTS",
640
646
  "FF_LOG_QUERY_AND_PARAMS",
@@ -672,8 +678,6 @@ __all__ = [
672
678
  "LSD_GRPC_SERVER_ADDRESS",
673
679
  "LSD_GRPC_SERVER_MAX_RECV_MSG_BYTES",
674
680
  "LSD_GRPC_SERVER_MAX_SEND_MSG_BYTES",
675
- "LSD_PROM_METRICS_ENABLED",
676
- "LSD_PROM_METRICS_PORT",
677
681
  "MAX_STREAM_CHUNK_SIZE_BYTES",
678
682
  "METRIC_MAX_EMITTING_TIER",
679
683
  "METRIC_PREFIX",
@@ -0,0 +1,184 @@
1
+ """Periodic collector that pushes snapshot/state metrics to the OTLP client.
2
+
3
+ This background task samples worker, queue-depth, and connection-pool stats every ``STATS_INTERVAL_SECS`` and records them via the OTLP metrics reporter.
4
+
5
+ The loop runs in **every** process (on postgres both the API server and the
6
+ dedicated queue worker share the same lifespan; inmem is a single process). Each
7
+ metric group self-gates so it lands on the right process:
8
+
9
+ - **worker gauges** — recorded wherever workers run (``N_JOBS_PER_WORKER > 0``):
10
+ the queue worker, or a combined single-process deployment. A distributed API
11
+ process (``N_JOBS_PER_WORKER == 0``) has no workers and skips them.
12
+ - **queue depth** (``num_pending_runs``/``num_running_runs``) — a single global
13
+ value from ``Runs.stats`` (a gRPC call to the Go core). Emitted by the **API
14
+ process only** (``not IS_QUEUE_ENTRYPOINT``) on the **postgres** runtime; inmem
15
+ skips the DB round-trip entirely.
16
+ - **Postgres + Redis pool stats** — recorded on **both** processes (postgres
17
+ runtime only), each reporting its own pools via ``meta_pool_stats()``, which
18
+ merges the local Python pools with the Go-core pools. Redis stats in particular
19
+ come from the local Python pool — the Go core omits them unless it has a
20
+ non-cluster redis client — so a Go-core-only source would drop them. inmem has
21
+ no real Postgres/Redis pools, so nothing is reported.
22
+
23
+ The two pool request counters are cumulative, so we push the delta since the
24
+ previous sample (OTLP counters are additive).
25
+
26
+ The loop also logs the same samples (``Worker stats``, ``Postgres pool stats``,
27
+ ``Redis pool stats``) — folding in what the legacy per-process ``stats_loop``
28
+ functions used to log.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import asyncio
34
+
35
+ import structlog
36
+
37
+ from langgraph_api import config
38
+ from langgraph_api.api.meta import meta_pool_stats
39
+ from langgraph_api.feature_flags import IS_POSTGRES_OR_GRPC_BACKEND
40
+ from langgraph_api.metrics_otlp import (
41
+ COUNTER_PG_POOL_REQUESTS_ERRORS,
42
+ COUNTER_PG_POOL_REQUESTS_QUEUED,
43
+ GAUGE_NUM_PENDING_RUNS,
44
+ GAUGE_NUM_RUNNING_RUNS,
45
+ GAUGE_PG_POOL_AVAILABLE,
46
+ GAUGE_PG_POOL_MAX,
47
+ GAUGE_PG_POOL_SIZE,
48
+ GAUGE_REDIS_POOL_AVAILABLE,
49
+ GAUGE_REDIS_POOL_MAX,
50
+ GAUGE_REDIS_POOL_SIZE,
51
+ GAUGE_WORKERS_ACTIVE,
52
+ GAUGE_WORKERS_AVAILABLE,
53
+ GAUGE_WORKERS_MAX,
54
+ get_otlp_metrics_reporter,
55
+ )
56
+ from langgraph_runtime.database import connect
57
+ from langgraph_runtime.metrics import get_metrics
58
+
59
+ if IS_POSTGRES_OR_GRPC_BACKEND:
60
+ from langgraph_api.grpc.ops import Runs
61
+ else:
62
+ from langgraph_runtime.ops import Runs
63
+
64
+ logger = structlog.stdlib.get_logger(__name__)
65
+
66
+
67
+ async def _collect_queue_and_workers(reporter) -> None:
68
+ """Worker gauges (where workers run) + queue depth (API process only).
69
+
70
+ Worker counts are local to the process running this loop and emitted wherever
71
+ workers run (``N_JOBS_PER_WORKER > 0``) — the queue worker or a combined
72
+ single-process deployment; a distributed API process (N_JOBS == 0) skips them.
73
+
74
+ Queue depth is a single global value (from the run table, via ``Runs.stats`` —
75
+ a gRPC call to the Go core) and is emitted by the **API process only**
76
+ (``not IS_QUEUE_ENTRYPOINT``) on **postgres**: inmem skips the DB round-trip,
77
+ and the dedicated queue worker leaves it to the API process so the global value
78
+ is not double-reported across the queue/API split.
79
+ """
80
+ if config.N_JOBS_PER_WORKER > 0:
81
+ workers = get_metrics()["workers"]
82
+ reporter.record_gauge(GAUGE_WORKERS_MAX, workers["max"])
83
+ reporter.record_gauge(GAUGE_WORKERS_ACTIVE, workers["active"])
84
+ reporter.record_gauge(GAUGE_WORKERS_AVAILABLE, workers["available"])
85
+ await logger.ainfo(
86
+ "Worker stats",
87
+ max=workers["max"],
88
+ active=workers["active"],
89
+ available=workers["available"],
90
+ )
91
+
92
+ # Queue depth is read from the run table via Runs.stats (a gRPC call to the
93
+ # Go core on postgres). Emitted by the API process only — the queue worker
94
+ # (IS_QUEUE_ENTRYPOINT) skips it. inmem skips the DB round-trip and reports
95
+ # nothing.
96
+ if IS_POSTGRES_OR_GRPC_BACKEND and not config.IS_QUEUE_ENTRYPOINT:
97
+ async with connect() as conn:
98
+ stats = await Runs.stats(conn)
99
+ reporter.record_gauge(GAUGE_NUM_PENDING_RUNS, stats["n_pending"])
100
+ reporter.record_gauge(GAUGE_NUM_RUNNING_RUNS, stats["n_running"])
101
+
102
+
103
+ async def _collect_pool(reporter, prev_counters: dict[str, int]) -> None:
104
+ """Postgres + Redis pool gauges + cumulative request counters.
105
+
106
+ Postgres runtime only, recorded on **both** processes (API server and queue
107
+ worker) — each reports its own pools. Uses ``meta_pool_stats()``, which merges
108
+ the local Python pools with the Go-core pools — matching the legacy /metrics.
109
+ Redis stats come from the local Python pool (the Go core omits them unless it
110
+ has a non-cluster redis client), so a Go-core-only source would drop them.
111
+ """
112
+ # Limitation: under BG_JOB_ISOLATED_LOOPS each worker runs in its own thread with its own
113
+ # thread-local pg pool and redis client. This collector runs on the main
114
+ # thread, so meta_pool_stats() -> _get_pool()/redis_stats() only sees the main
115
+ # thread's pool (redis_stats() reads the global client unconditionally), and
116
+ # the per-thread isolated pools are NOT aggregated — so pg/redis pool gauges
117
+ # and the pg request counters under-report in isolated-loop mode.
118
+ stats = await meta_pool_stats()
119
+
120
+ pg = stats.get("postgres") or {}
121
+ if pg:
122
+ reporter.record_gauge(GAUGE_PG_POOL_MAX, pg.get("pool_max", 0))
123
+ reporter.record_gauge(GAUGE_PG_POOL_SIZE, pg.get("pool_size", 0))
124
+ reporter.record_gauge(GAUGE_PG_POOL_AVAILABLE, pg.get("pool_available", 0))
125
+ # Cumulative counters: record the delta since the last sample. Emit on a
126
+ # non-negative delta (>= 0) so the counter is created and reported from the
127
+ # first sample even when it is 0 — the legacy /metrics always reported
128
+ # these. Negative deltas (Go-core pool counter resets) are skipped to keep
129
+ # the OTLP counter monotonic.
130
+ for key, metric in (
131
+ ("requests_queued", COUNTER_PG_POOL_REQUESTS_QUEUED),
132
+ ("requests_errors", COUNTER_PG_POOL_REQUESTS_ERRORS),
133
+ ):
134
+ current = pg.get(key, 0)
135
+ delta = current - prev_counters.get(key, 0)
136
+ if delta >= 0:
137
+ reporter.inc_counter(metric, delta)
138
+ prev_counters[key] = current
139
+ await logger.ainfo("Postgres pool stats", **pg)
140
+
141
+ redis = stats.get("redis") or {}
142
+ if redis:
143
+ reporter.record_gauge(
144
+ GAUGE_REDIS_POOL_AVAILABLE, redis.get("idle_connections", 0)
145
+ )
146
+ reporter.record_gauge(GAUGE_REDIS_POOL_SIZE, redis.get("in_use_connections", 0))
147
+ reporter.record_gauge(GAUGE_REDIS_POOL_MAX, redis.get("max_connections", 0))
148
+ await logger.ainfo("Redis pool stats", **redis)
149
+
150
+
151
+ async def _collect_once(prev_counters: dict[str, int]) -> None:
152
+ reporter = get_otlp_metrics_reporter()
153
+ if not reporter.enabled:
154
+ return
155
+
156
+ # Worker gauges are emitted wherever workers run; _collect_queue_and_workers
157
+ # adds queue depth on the API process only (postgres; inmem skips the DB
158
+ # round-trip).
159
+ try:
160
+ await _collect_queue_and_workers(reporter)
161
+ except Exception as exc:
162
+ await logger.awarning(
163
+ "metrics collector: queue/worker sample failed", exc_info=exc
164
+ )
165
+
166
+ # Pool stats exist only on the postgres runtime (inmem has no real Postgres/Redis pools).
167
+ if IS_POSTGRES_OR_GRPC_BACKEND:
168
+ try:
169
+ await _collect_pool(reporter, prev_counters)
170
+ except Exception as exc:
171
+ await logger.awarning("metrics collector: pool sample failed", exc_info=exc)
172
+
173
+
174
+ async def collector_loop() -> None:
175
+ """Sample snapshot metrics into the OTLP client every STATS_INTERVAL_SECS."""
176
+ interval = config.STATS_INTERVAL_SECS
177
+ prev_counters: dict[str, int] = {}
178
+ await logger.ainfo("Starting OTLP metrics collector loop", interval_secs=interval)
179
+ try:
180
+ while True:
181
+ await _collect_once(prev_counters)
182
+ await asyncio.sleep(interval)
183
+ except asyncio.CancelledError:
184
+ pass
@@ -4,18 +4,20 @@ import os
4
4
  import threading
5
5
  import time
6
6
  from contextlib import contextmanager
7
- from dataclasses import dataclass
7
+ from dataclasses import dataclass, replace
8
8
  from datetime import timedelta
9
9
  from typing import TYPE_CHECKING, Any, Literal
10
10
 
11
11
  import structlog
12
12
 
13
- from langgraph_api import __version__, config
13
+ from langgraph_api import __version__, config, metadata
14
14
 
15
15
  if TYPE_CHECKING:
16
16
  from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
17
17
  OTLPMetricExporter,
18
18
  )
19
+ from opentelemetry.exporter.prometheus import PrometheusMetricReader
20
+ from opentelemetry.metrics import Observation
19
21
  from opentelemetry.sdk.metrics import Counter, Histogram, MeterProvider
20
22
  from opentelemetry.sdk.metrics.export import (
21
23
  AggregationTemporality,
@@ -28,6 +30,7 @@ else:
28
30
  from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
29
31
  OTLPMetricExporter,
30
32
  )
33
+ from opentelemetry.metrics import Observation
31
34
  from opentelemetry.sdk.metrics import Counter, Histogram, MeterProvider
32
35
  from opentelemetry.sdk.metrics.export import (
33
36
  AggregationTemporality,
@@ -39,6 +42,7 @@ else:
39
42
  OTEL_AVAILABLE = True
40
43
  except ModuleNotFoundError:
41
44
  OTLPMetricExporter = None
45
+ Observation = None
42
46
  MeterProvider = None
43
47
  PeriodicExportingMetricReader = None
44
48
  Resource = None
@@ -50,12 +54,11 @@ else:
50
54
 
51
55
  try:
52
56
  from opentelemetry.exporter.prometheus import PrometheusMetricReader
53
- from prometheus_client import start_http_server
54
57
 
55
58
  PROMETHEUS_EXPORTER_AVAILABLE = True
56
59
  except ModuleNotFoundError:
57
- PrometheusMetricReader = None
58
- start_http_server = None
60
+ # Fall back to ``object`` so that _LSDPrometheusReader, which inherits from PrometheusMetricReader, can still be defined when the exporter is not installed.
61
+ PrometheusMetricReader = object
59
62
  PROMETHEUS_EXPORTER_AVAILABLE = False
60
63
 
61
64
  logger = structlog.stdlib.get_logger(__name__)
@@ -79,28 +82,62 @@ class MetricDef:
79
82
  metric_type: MetricType
80
83
  name: str
81
84
  tier: int
82
-
83
-
84
- def def_counter(name: str, tier: int) -> MetricDef:
85
+ # True for metrics surfaced on the LSD Deployment UI. This flag partitions the
86
+ # two backends: the Prometheus scrape endpoint serves only these (see
87
+ # _LSDPrometheusReader) so GCP indexes just the Deployment-UI metrics, while
88
+ # Datadog gets only the internal complement (see _DatadogExporter).
89
+ lsd_web_metric: bool = False
90
+ # Human-readable help text. Passed to the OTel instrument as its description,
91
+ # which the Prometheus exporter exposes as the metric's ``# HELP`` line.
92
+ description: str = ""
93
+
94
+
95
+ def def_counter(
96
+ name: str, tier: int, lsd_web_metric: bool = False, description: str = ""
97
+ ) -> MetricDef:
85
98
  return MetricDef(
86
- metric_type="counter", name=f"{METRIC_NAME_PREFIX}{name}", tier=tier
99
+ metric_type="counter",
100
+ name=f"{METRIC_NAME_PREFIX}{name}",
101
+ tier=tier,
102
+ lsd_web_metric=lsd_web_metric,
103
+ description=description,
87
104
  )
88
105
 
89
106
 
90
- def def_histogram(name: str, tier: int) -> MetricDef:
107
+ def def_histogram(
108
+ name: str, tier: int, lsd_web_metric: bool = False, description: str = ""
109
+ ) -> MetricDef:
91
110
  return MetricDef(
92
- metric_type="histogram", name=f"{METRIC_NAME_PREFIX}{name}", tier=tier
111
+ metric_type="histogram",
112
+ name=f"{METRIC_NAME_PREFIX}{name}",
113
+ tier=tier,
114
+ lsd_web_metric=lsd_web_metric,
115
+ description=description,
93
116
  )
94
117
 
95
118
 
96
- def def_latency(name: str, tier: int) -> MetricDef:
119
+ def def_latency(
120
+ name: str, tier: int, lsd_web_metric: bool = False, description: str = ""
121
+ ) -> MetricDef:
97
122
  return MetricDef(
98
- metric_type="latency", name=f"{METRIC_NAME_PREFIX}{name}", tier=tier
123
+ metric_type="latency",
124
+ name=f"{METRIC_NAME_PREFIX}{name}",
125
+ tier=tier,
126
+ lsd_web_metric=lsd_web_metric,
127
+ description=description,
99
128
  )
100
129
 
101
130
 
102
- def def_gauge(name: str, tier: int) -> MetricDef:
103
- return MetricDef(metric_type="gauge", name=f"{METRIC_NAME_PREFIX}{name}", tier=tier)
131
+ def def_gauge(
132
+ name: str, tier: int, lsd_web_metric: bool = False, description: str = ""
133
+ ) -> MetricDef:
134
+ return MetricDef(
135
+ metric_type="gauge",
136
+ name=f"{METRIC_NAME_PREFIX}{name}",
137
+ tier=tier,
138
+ lsd_web_metric=lsd_web_metric,
139
+ description=description,
140
+ )
104
141
 
105
142
 
106
143
  # Pre-defined counter metrics.
@@ -152,23 +189,119 @@ COUNTER_PROTOCOL_V2_RESUME_GAP = def_counter(
152
189
  COUNTER_PROTOCOL_V2_TRANSPORT_SEND_FAILURE = def_counter(
153
190
  "protocol_v2_transport_send_failure_counter", METRIC_TIER_INFO
154
191
  )
192
+ # Migrated from meta.py /metrics. Named to expose as `lg_api_http_requests_total`
193
+ # (this exporter version does not double-append `_total`).
194
+ COUNTER_HTTP_REQUESTS = def_counter(
195
+ "http_requests_total", METRIC_TIER_INFO, lsd_web_metric=True
196
+ )
197
+ # Migrated from meta.py /metrics. The exporter appends `_total` to counter names,
198
+ # so these expose as `lg_api_pg_pool_requests_{queued,errors}_total` (idiomatic).
199
+ COUNTER_PG_POOL_REQUESTS_QUEUED = def_counter(
200
+ "pg_pool_requests_queued",
201
+ METRIC_TIER_CRITICAL,
202
+ lsd_web_metric=True,
203
+ description=(
204
+ "Number of postgres connection requests queued because a postgres "
205
+ "connection wasn't immediately available in the pool"
206
+ ),
207
+ )
208
+ COUNTER_PG_POOL_REQUESTS_ERRORS = def_counter(
209
+ "pg_pool_requests_errors",
210
+ METRIC_TIER_CRITICAL,
211
+ lsd_web_metric=True,
212
+ description=(
213
+ "Number of postgres connection requests resulting in an error "
214
+ "(timeouts, queue full...)"
215
+ ),
216
+ )
155
217
 
156
218
  # Pre-defined latency metrics.
157
219
  LATENCY_RUN_EXECUTION = def_latency("run_execution_latency", METRIC_TIER_INFO)
158
220
  LATENCY_RUN_QUEUE_WAIT_TIME_1ST_ATTEMPT = def_latency(
159
- "run_queue_wait_time_1st_attempt", METRIC_TIER_INFO
221
+ "run_queue_wait_time_1st_attempt",
222
+ METRIC_TIER_INFO,
223
+ lsd_web_metric=True,
224
+ description=(
225
+ "Time (milliseconds) spent by jobs waiting in the queue"
226
+ " before getting processed for the first time. "
227
+ ),
160
228
  )
161
229
  LATENCY_RUN_QUEUE_WAIT_TIME_RETRY_ATTEMPT = def_latency(
162
230
  "run_queue_wait_time_retry_attempt", METRIC_TIER_INFO
163
231
  )
164
232
  LATENCY_STREAM_PUBLISH = def_latency("stream_publish_latency", METRIC_TIER_INFO)
233
+ LATENCY_HTTP_REQUEST = def_latency(
234
+ "http_requests_latency",
235
+ METRIC_TIER_INFO,
236
+ lsd_web_metric=True,
237
+ description="HTTP request latency in milliseconds",
238
+ )
165
239
 
166
- # Pre-defined gauge metrics.
167
- GAUGE_WORKERS_ACTIVE = def_gauge("workers_active", METRIC_TIER_CRITICAL)
168
- GAUGE_WORKERS_AVAILABLE = def_gauge("workers_available", METRIC_TIER_CRITICAL)
240
+ GAUGE_WORKERS_ACTIVE = def_gauge(
241
+ "workers_active", METRIC_TIER_CRITICAL, lsd_web_metric=True
242
+ )
243
+ GAUGE_WORKERS_AVAILABLE = def_gauge(
244
+ "workers_available", METRIC_TIER_CRITICAL, lsd_web_metric=True
245
+ )
169
246
  GAUGE_PUBLISH_QUEUE_AVAILABILITY = def_gauge(
170
247
  "publish_queue_availability", METRIC_TIER_CRITICAL
171
248
  )
249
+ # Snapshot/state gauges pushed by the periodic
250
+ # metrics collector loop (langgraph_api.metrics_collector).
251
+ # Worker gauges are recorded wherever workers run (N_JOBS_PER_WORKER > 0); queue depth
252
+ # and pool stats are recorded on the postgres runtime only.
253
+ GAUGE_WORKERS_MAX = def_gauge("workers_max", METRIC_TIER_CRITICAL, lsd_web_metric=True)
254
+ GAUGE_NUM_PENDING_RUNS = def_gauge(
255
+ "num_pending_runs",
256
+ METRIC_TIER_INFO,
257
+ lsd_web_metric=True,
258
+ description="The number of runs currently pending.",
259
+ )
260
+ GAUGE_NUM_RUNNING_RUNS = def_gauge(
261
+ "num_running_runs",
262
+ METRIC_TIER_INFO,
263
+ lsd_web_metric=True,
264
+ description="The number of runs currently running.",
265
+ )
266
+ GAUGE_PG_POOL_MAX = def_gauge(
267
+ "pg_pool_max",
268
+ METRIC_TIER_CRITICAL,
269
+ lsd_web_metric=True,
270
+ description="The maximum size of the postgres connection pool.",
271
+ )
272
+ GAUGE_PG_POOL_SIZE = def_gauge(
273
+ "pg_pool_size",
274
+ METRIC_TIER_CRITICAL,
275
+ lsd_web_metric=True,
276
+ description=(
277
+ "Number of connections currently managed by the postgres connection "
278
+ "pool (in the pool, given to clients, being prepared)"
279
+ ),
280
+ )
281
+ GAUGE_PG_POOL_AVAILABLE = def_gauge(
282
+ "pg_pool_available",
283
+ METRIC_TIER_INFO,
284
+ lsd_web_metric=True,
285
+ description="Number of connections currently idle in the postgres connection pool",
286
+ )
287
+ GAUGE_REDIS_POOL_AVAILABLE = def_gauge(
288
+ "redis_pool_available",
289
+ METRIC_TIER_INFO,
290
+ lsd_web_metric=True,
291
+ description="Number of connections currently idle in the redis connection pool",
292
+ )
293
+ GAUGE_REDIS_POOL_SIZE = def_gauge(
294
+ "redis_pool_size",
295
+ METRIC_TIER_INFO,
296
+ lsd_web_metric=True,
297
+ description="Number of connections currently in use in the redis connection pool",
298
+ )
299
+ GAUGE_REDIS_POOL_MAX = def_gauge(
300
+ "redis_pool_max",
301
+ METRIC_TIER_INFO,
302
+ lsd_web_metric=True,
303
+ description="The maximum size of the redis connection pool.",
304
+ )
172
305
  # Protocol v2 sessions retain a bounded replay buffer per run. Track the
173
306
  # observed occupancy so operators can tune LSD_PROTOCOL_V2_BUFFER_SIZE before
174
307
  # reconnects start seeing resume gaps.
@@ -182,6 +315,71 @@ HISTOGRAM_PROTOCOL_V2_REPLAYED_EVENTS = def_histogram(
182
315
  )
183
316
 
184
317
 
318
+ # Names of metrics surfaced on the LSD Deployment UI. By default the two metric
319
+ # backends are partitioned by this set: the Prometheus scrape endpoint serves only
320
+ # these (see ``_LSDPrometheusReader``), and Datadog receives only the complement
321
+ # (see ``_DatadogExporter``). Setting EXPOSE_INTERNAL_METRICS_PROMETHEUS lifts the
322
+ # Prometheus filter so it serves every metric. Computed from the definitions above.
323
+ LSD_WEB_METRIC_NAMES: frozenset[str] = frozenset(
324
+ m.name for m in globals().values() if isinstance(m, MetricDef) and m.lsd_web_metric
325
+ )
326
+
327
+
328
+ def _select_metrics(metrics_data: Any, keep) -> Any:
329
+ """Return a copy of ``metrics_data`` keeping only metrics where ``keep(name)``.
330
+
331
+ Rebuilt with ``dataclasses.replace`` (never mutated in place) since the two
332
+ backend readers share the same SDK metric objects. Scope and resource groups
333
+ left empty by the filter are dropped entirely, so a non-empty
334
+ ``resource_metrics`` on the result guarantees at least one metric point.
335
+ """
336
+ if metrics_data is None or not metrics_data.resource_metrics:
337
+ return metrics_data
338
+ resource_metrics = []
339
+ for rm in metrics_data.resource_metrics:
340
+ scope_metrics = []
341
+ for sm in rm.scope_metrics:
342
+ kept = [m for m in sm.metrics if keep(m.name)]
343
+ if kept:
344
+ scope_metrics.append(replace(sm, metrics=kept))
345
+ if scope_metrics:
346
+ resource_metrics.append(replace(rm, scope_metrics=scope_metrics))
347
+ return replace(metrics_data, resource_metrics=resource_metrics)
348
+
349
+
350
+ def _filter_web_metrics(metrics_data: Any) -> Any:
351
+ """Copy of ``metrics_data`` with only LSD web metrics (for Prometheus/GCP)."""
352
+ return _select_metrics(metrics_data, lambda name: name in LSD_WEB_METRIC_NAMES)
353
+
354
+
355
+ def _drop_web_metrics(metrics_data: Any) -> Any:
356
+ """Copy of ``metrics_data`` without LSD web metrics (for Datadog)."""
357
+ return _select_metrics(metrics_data, lambda name: name not in LSD_WEB_METRIC_NAMES)
358
+
359
+
360
+ class _LSDPrometheusReader(PrometheusMetricReader):
361
+ """The Prometheus reader for this service.
362
+
363
+ By default it serves only the LSD Deployment-UI (``lsd_web_metric``) set —
364
+ Prometheus feeds the LSD web UI, while internal metrics go to Datadog instead
365
+ (see ``_DatadogExporter``). When ``EXPOSE_INTERNAL_METRICS_PROMETHEUS`` is set,
366
+ the web filter is skipped and every recorded metric is exposed (record-time
367
+ tier filtering via ``METRIC_MAX_EMITTING_TIER`` still applies). The base
368
+ ``_receive_metrics`` simply hands the data to its collector, so filtering here
369
+ is sufficient.
370
+ """
371
+
372
+ def _receive_metrics(
373
+ self,
374
+ metrics_data: Any,
375
+ timeout_millis: float = 10_000,
376
+ **kwargs: Any,
377
+ ) -> None:
378
+ if not config.EXPOSE_INTERNAL_METRICS_PROMETHEUS:
379
+ metrics_data = _filter_web_metrics(metrics_data)
380
+ super()._receive_metrics(metrics_data, timeout_millis, **kwargs)
381
+
382
+
185
383
  def _normalize_emitting_tier(value: int) -> int:
186
384
  if value < METRIC_TIER_CRITICAL:
187
385
  return METRIC_TIER_CRITICAL
@@ -190,8 +388,15 @@ def _normalize_emitting_tier(value: int) -> int:
190
388
  return value
191
389
 
192
390
 
193
- class _FilteringExporter(MetricExporter):
194
- """Wrapper that skips export when there are no metric points."""
391
+ class _DatadogExporter(MetricExporter):
392
+ """Datadog exporter wrapper: drops LSD web metrics, and skips export when
393
+ nothing remains.
394
+
395
+ Web metrics are served to the LSD Deployment UI via Prometheus only (see
396
+ ``_LSDPrometheusReader``); Datadog receives only the internal complement. The
397
+ drop happens here rather than at record time because the same SDK metric
398
+ objects feed both backend readers.
399
+ """
195
400
 
196
401
  def __init__(self, exporter: MetricExporter):
197
402
  super().__init__()
@@ -205,16 +410,12 @@ class _FilteringExporter(MetricExporter):
205
410
  timeout_millis: float = 10_000,
206
411
  **kwargs: Any,
207
412
  ) -> Any:
208
- if not metrics_data or not metrics_data.resource_metrics:
413
+ # _drop_web_metrics prunes emptied groups, so a non-empty resource_metrics
414
+ # guarantees there is at least one internal metric left to export.
415
+ filtered = _drop_web_metrics(metrics_data)
416
+ if not filtered or not filtered.resource_metrics:
209
417
  return None
210
- for resource_metric in metrics_data.resource_metrics:
211
- if resource_metric.scope_metrics:
212
- for scope_metric in resource_metric.scope_metrics:
213
- if scope_metric.metrics:
214
- return self._exporter.export(
215
- metrics_data, timeout_millis, **kwargs
216
- )
217
- return None
418
+ return self._exporter.export(filtered, timeout_millis, **kwargs)
218
419
 
219
420
  def shutdown(self, timeout_millis: float = 30_000, **kwargs: Any) -> None:
220
421
  self._exporter.shutdown(timeout_millis, **kwargs)
@@ -232,8 +433,17 @@ class OTelMetricsReporter:
232
433
  self._meter = None
233
434
  self._max_tier = _normalize_emitting_tier(config.METRIC_MAX_EMITTING_TIER)
234
435
  self._instruments: dict[str, Any] = {}
436
+ # Cache of the last recorded gauge values, read whenever /metrics is scraped.
437
+ # Synchronous gauges flap: a metric is not reported unless its value was
438
+ # set recently. Serving values from this cache reports every gauge
439
+ # consistently on each scrape. Guarded by ``_gauge_lock`` because the gauge
440
+ # callbacks run on the SDK collection thread, ``record_gauge`` on others.
441
+ self._gauge_lock = threading.Lock()
442
+ self._gauge_values: dict[str, dict[tuple, tuple[dict[str, Any], float]]] = {}
443
+ self._observable_gauges: dict[str, Any] = {}
444
+ # Labels attached to every metric (set in ``initialize``)
445
+ self._common_attributes: dict[str, str] = {}
235
446
  self._prom_enabled = False
236
- self._prom_server: Any | None = None
237
447
 
238
448
  @property
239
449
  def enabled(self) -> bool:
@@ -245,14 +455,6 @@ class OTelMetricsReporter:
245
455
  return
246
456
  self._initialized = True
247
457
 
248
- if (
249
- not config.DATADOG_METRICS_ENABLED
250
- and not config.LSD_PROM_METRICS_ENABLED
251
- ):
252
- logger.info(
253
- "OTel metrics disabled (no DD API key and Prometheus not enabled)"
254
- )
255
- return
256
458
  if not OTEL_AVAILABLE:
257
459
  logger.warning(
258
460
  "OTel metrics disabled because OpenTelemetry dependencies are not installed"
@@ -301,22 +503,23 @@ class OTelMetricsReporter:
301
503
 
302
504
  readers.append(
303
505
  PeriodicExportingMetricReader(
304
- _FilteringExporter(base_exporter),
506
+ _DatadogExporter(base_exporter),
305
507
  export_interval_millis=10_000,
306
508
  )
307
509
  )
308
510
 
309
- if config.LSD_PROM_METRICS_ENABLED:
310
- if not PROMETHEUS_EXPORTER_AVAILABLE:
311
- logger.warning(
312
- "Prometheus metrics disabled: opentelemetry-exporter-prometheus not installed"
313
- )
314
- else:
315
- # PrometheusMetricReader registers its collector with the
316
- # global prometheus_client REGISTRY, which start_http_server
317
- # serves by default. MeterProvider.shutdown() unregisters it.
318
- readers.append(PrometheusMetricReader())
319
- self._prom_enabled = True
511
+ # Prometheus metrics are always exported when the exporter package is installed (served via /metrics).
512
+ if not PROMETHEUS_EXPORTER_AVAILABLE:
513
+ logger.error(
514
+ "Prometheus metrics disabled: opentelemetry-exporter-prometheus not installed"
515
+ )
516
+ else:
517
+ # PrometheusMetricReader registers its collector with the
518
+ # global prometheus_client REGISTRY, which the /metrics
519
+ # endpoint serves via generate_latest() (see api/meta.py).
520
+ # By default, Prometheus serves only Deployment-UI metrics.
521
+ readers.append(_LSDPrometheusReader())
522
+ self._prom_enabled = True
320
523
 
321
524
  if not readers:
322
525
  logger.info(
@@ -328,6 +531,12 @@ class OTelMetricsReporter:
328
531
  resource=resource, metric_readers=readers
329
532
  )
330
533
  self._meter = self._meter_provider.get_meter(SERVICE_NAME)
534
+ # Labels added to every metric, matching the legacy /metrics.
535
+ self._common_attributes = {
536
+ "project_id": metadata.PROJECT_ID or "",
537
+ "revision_id": metadata.HOST_REVISION_ID or "",
538
+ "deployment_type": metadata.DEPLOYMENT_TYPE or "",
539
+ }
331
540
  self._enabled = True
332
541
 
333
542
  if config.DATADOG_METRICS_ENABLED:
@@ -335,8 +544,6 @@ class OTelMetricsReporter:
335
544
  "Datadog OTLP metrics reader initialized",
336
545
  endpoint=f"https://{config.LSD_DD_ENDPOINT}/v1/metrics",
337
546
  )
338
- if self._prom_enabled:
339
- self._start_prometheus_server()
340
547
 
341
548
  logger.info(
342
549
  "OTel metrics reporter initialized",
@@ -358,38 +565,25 @@ class OTelMetricsReporter:
358
565
  logger.exception("Failed to initialize OTel metrics reporter")
359
566
  raise
360
567
 
361
- def _start_prometheus_server(self) -> None:
362
- port = config.LSD_PROM_METRICS_PORT
363
- # ``start_http_server`` spins up a WSGI server in a daemon thread serving
364
- # the global prometheus_client REGISTRY, and returns the (server, thread)
365
- # handle. Keep the server so ``shutdown`` can stop it cleanly (e.g. in
366
- # tests); in production the daemon thread exits with the process anyway.
367
- server, _thread = start_http_server(port=port)
368
- self._prom_server = server
369
- logger.info("Prometheus metrics scrape server started", port=port)
370
-
371
568
  def shutdown(self) -> None:
372
569
  with self._lock:
373
570
  if self._meter_provider:
374
571
  try:
572
+ # Unregisters the Prometheus reader's collector from the global
573
+ # prometheus_client REGISTRY (and flushes/stops other readers).
375
574
  self._meter_provider.shutdown()
376
575
  except Exception:
377
576
  logger.exception("Failed to shutdown OTel metrics reporter")
378
577
  finally:
379
578
  self._meter_provider = None
380
579
  self._meter = None
381
- if self._prom_server is not None:
382
- try:
383
- self._prom_server.shutdown()
384
- self._prom_server.server_close()
385
- except Exception:
386
- logger.exception("Failed to stop Prometheus scrape server")
387
- finally:
388
- self._prom_server = None
389
580
  self._prom_enabled = False
390
581
  self._enabled = False
391
582
  self._initialized = False
392
583
  self._instruments.clear()
584
+ with self._gauge_lock:
585
+ self._gauge_values.clear()
586
+ self._observable_gauges.clear()
393
587
 
394
588
  def _instrument_name(self, metric_name: str) -> str:
395
589
  return metric_name
@@ -397,22 +591,69 @@ class OTelMetricsReporter:
397
591
  def _tier_enabled(self, tier: int) -> bool:
398
592
  return _normalize_emitting_tier(tier) <= self._max_tier
399
593
 
594
+ def _should_emit(self, metric: MetricDef) -> bool:
595
+ """Whether a sample for ``metric`` should be recorded.
596
+
597
+ ``lsd_web_metric`` metrics bypass tier filtering: they back the LSD
598
+ Deployment UI (served by the Prometheus reader) and must be emitted even
599
+ on low-tier deployments (dev/dev_free default ``METRIC_MAX_EMITTING_TIER``
600
+ to 1/CRITICAL). The tier gate runs before the MeterProvider, so a dropped
601
+ sample never reaches any reader — Prometheus included.
602
+ """
603
+ if not self._enabled or not self._meter:
604
+ return False
605
+ return metric.lsd_web_metric or self._tier_enabled(metric.tier)
606
+
400
607
  def _get_or_create_instrument(self, metric: MetricDef):
401
608
  name = self._instrument_name(metric.name)
402
609
  instrument = self._instruments.get(name)
403
610
  if instrument is not None:
404
611
  return instrument
405
612
  if metric.metric_type == "counter":
406
- instrument = self._meter.create_counter(name=name)
613
+ instrument = self._meter.create_counter(
614
+ name=name, description=metric.description
615
+ )
407
616
  elif metric.metric_type in {"histogram", "latency"}:
408
- instrument = self._meter.create_histogram(name=name)
409
- elif metric.metric_type == "gauge":
410
- instrument = self._meter.create_gauge(name=name)
617
+ instrument = self._meter.create_histogram(
618
+ name=name, description=metric.description
619
+ )
411
620
  else:
621
+ # Gauges are handled via observable instruments (see _set_gauge).
412
622
  raise ValueError(f"Unsupported metric type: {metric.metric_type}")
413
623
  self._instruments[name] = instrument
414
624
  return instrument
415
625
 
626
+ def _make_gauge_callback(self, name: str):
627
+ """Build the observable-gauge callback that the SDK invokes on each scrape.
628
+
629
+ It yields one Observation per recorded attribute-set from the cache, so the
630
+ last sampled value is re-reported on every collect (no flapping).
631
+ """
632
+
633
+ def _callback(_options: Any):
634
+ with self._gauge_lock:
635
+ points = list(self._gauge_values.get(name, {}).values())
636
+ return [Observation(value, attributes=attrs) for attrs, value in points]
637
+
638
+ return _callback
639
+
640
+ def _with_common(self, attributes: dict[str, Any] | None) -> dict[str, Any]:
641
+ """Merge the shared labels (project_id/revision_id/deployment_type) with
642
+ any per-call attributes. Per-call values win on key conflicts."""
643
+ return {**self._common_attributes, **(attributes or {})}
644
+
645
+ def _set_gauge(self, metric: MetricDef, value: float, attributes: dict) -> None:
646
+ name = metric.name
647
+ key = tuple(sorted(attributes.items()))
648
+ with self._gauge_lock:
649
+ self._gauge_values.setdefault(name, {})[key] = (attributes, float(value))
650
+ if name not in self._observable_gauges:
651
+ self._observable_gauges[name] = self._meter.create_observable_gauge(
652
+ name=name,
653
+ description=metric.description,
654
+ callbacks=[self._make_gauge_callback(name)],
655
+ )
656
+
416
657
  def inc_counter(
417
658
  self,
418
659
  metric: MetricDef,
@@ -421,11 +662,11 @@ class OTelMetricsReporter:
421
662
  ) -> None:
422
663
  if metric.metric_type != "counter":
423
664
  raise ValueError(f"{metric.name} is not a counter metric")
424
- if not self._enabled or not self._meter or not self._tier_enabled(metric.tier):
665
+ if not self._should_emit(metric):
425
666
  return
426
667
  instrument = self._get_or_create_instrument(metric)
427
668
  try:
428
- instrument.add(value, attributes or {})
669
+ instrument.add(value, self._with_common(attributes))
429
670
  except Exception:
430
671
  logger.warning("Failed to add counter", metric_name=metric.name)
431
672
 
@@ -437,11 +678,11 @@ class OTelMetricsReporter:
437
678
  ) -> None:
438
679
  if metric.metric_type != "histogram":
439
680
  raise ValueError(f"{metric.name} is not a histogram metric")
440
- if not self._enabled or not self._meter or not self._tier_enabled(metric.tier):
681
+ if not self._should_emit(metric):
441
682
  return
442
683
  instrument = self._get_or_create_instrument(metric)
443
684
  try:
444
- instrument.record(value, attributes or {})
685
+ instrument.record(value, self._with_common(attributes))
445
686
  except Exception:
446
687
  logger.warning("Failed to record histogram", metric_name=metric.name)
447
688
 
@@ -458,11 +699,11 @@ class OTelMetricsReporter:
458
699
  else:
459
700
  seconds = float(duration_seconds)
460
701
  value = seconds * 1000
461
- if not self._enabled or not self._meter or not self._tier_enabled(metric.tier):
702
+ if not self._should_emit(metric):
462
703
  return
463
704
  instrument = self._get_or_create_instrument(metric)
464
705
  try:
465
- instrument.record(value, attributes or {})
706
+ instrument.record(value, self._with_common(attributes))
466
707
  except Exception:
467
708
  logger.warning("Failed to record latency", metric_name=metric.name)
468
709
 
@@ -474,11 +715,11 @@ class OTelMetricsReporter:
474
715
  ) -> None:
475
716
  if metric.metric_type != "gauge":
476
717
  raise ValueError(f"{metric.name} is not a gauge metric")
477
- if not self._enabled or not self._meter or not self._tier_enabled(metric.tier):
718
+ if not self._should_emit(metric):
478
719
  return
479
- instrument = self._get_or_create_instrument(metric)
480
720
  try:
481
- instrument.set(value, attributes or {})
721
+ # Cache the value; an observable gauge re-reports it on every scrape.
722
+ self._set_gauge(metric, value, self._with_common(attributes))
482
723
  except Exception:
483
724
  logger.warning("Failed to record gauge", metric_name=metric.name)
484
725
 
@@ -488,7 +729,7 @@ class OTelMetricsReporter:
488
729
  metric: MetricDef,
489
730
  attributes: dict[str, Any] | None = None,
490
731
  ):
491
- if not self._enabled or not self._meter or not self._tier_enabled(metric.tier):
732
+ if not self._should_emit(metric):
492
733
  yield
493
734
  return
494
735
  start = time.perf_counter()
@@ -7,6 +7,12 @@ from starlette.types import Message, Receive, Scope, Send
7
7
 
8
8
  from langgraph_api.config import MOUNT_PREFIX
9
9
  from langgraph_api.http_metrics import HTTP_METRICS_COLLECTOR
10
+ from langgraph_api.http_metrics_utils import get_route, should_filter_route
11
+ from langgraph_api.metrics_otlp import (
12
+ COUNTER_HTTP_REQUESTS,
13
+ LATENCY_HTTP_REQUEST,
14
+ get_otlp_metrics_reporter,
15
+ )
10
16
  from langgraph_api.utils.headers import should_include_header_in_logs
11
17
 
12
18
  asgi = structlog.stdlib.get_logger("asgi")
@@ -150,6 +156,24 @@ class AccessLoggerMiddleware:
150
156
 
151
157
  if method and route and status:
152
158
  HTTP_METRICS_COLLECTOR.record_request(method, route, status, latency)
159
+ route_path = get_route(route)
160
+ if route_path is not None and not should_filter_route(route_path):
161
+ reporter = get_otlp_metrics_reporter()
162
+ reporter.inc_counter(
163
+ COUNTER_HTTP_REQUESTS,
164
+ attributes={
165
+ "method": method,
166
+ "path": route_path,
167
+ "status": str(status),
168
+ },
169
+ )
170
+ # record_latency takes seconds and stores milliseconds; latency
171
+ # is already in ms, so convert it back to seconds before passing it.
172
+ reporter.record_latency(
173
+ LATENCY_HTTP_REQUEST,
174
+ latency / 1000.0,
175
+ attributes={"method": method, "path": route_path},
176
+ )
153
177
  qs = scope.get("query_string")
154
178
  first_byte_time = info["first_byte_time"]
155
179
  ttfb_ms = (
@@ -65,6 +65,7 @@ def _ensure_port_available(host: str, port: int) -> None:
65
65
 
66
66
  async def health_and_metrics_server():
67
67
  import uvicorn # noqa: PLC0415
68
+ from prometheus_client import CONTENT_TYPE_LATEST, generate_latest # noqa: PLC0415
68
69
  from starlette.applications import Starlette # noqa: PLC0415
69
70
  from starlette.requests import Request # noqa: PLC0415
70
71
  from starlette.responses import JSONResponse, PlainTextResponse # noqa: PLC0415
@@ -91,42 +92,18 @@ async def health_and_metrics_server():
91
92
  )
92
93
  metrics_format = "prometheus"
93
94
 
94
- metrics = get_metrics()
95
- worker_metrics = metrics["workers"]
96
- workers_max = worker_metrics["max"]
97
- workers_active = worker_metrics["active"]
98
- workers_available = worker_metrics["available"]
99
-
100
- project_id = os.getenv("LANGSMITH_HOST_PROJECT_ID")
101
- revision_id = os.getenv("LANGSMITH_HOST_REVISION_ID")
102
-
103
- pg_redis_stats = await meta_pool_stats(metrics_format)
104
-
105
- if metrics_format == "json":
106
- resp = {
107
- **pg_redis_stats,
108
- "workers": worker_metrics,
109
- }
110
- return JSONResponse(resp)
111
- elif metrics_format == "prometheus":
112
- metrics_lines = [
113
- "# HELP lg_api_workers_max The maximum number of workers available.",
114
- "# TYPE lg_api_workers_max gauge",
115
- f'lg_api_workers_max{{project_id="{project_id}", revision_id="{revision_id}"}} {workers_max}',
116
- "# HELP lg_api_workers_active The number of currently active workers.",
117
- "# TYPE lg_api_workers_active gauge",
118
- f'lg_api_workers_active{{project_id="{project_id}", revision_id="{revision_id}"}} {workers_active}',
119
- "# HELP lg_api_workers_available The number of available (idle) workers.",
120
- "# TYPE lg_api_workers_available gauge",
121
- f'lg_api_workers_available{{project_id="{project_id}", revision_id="{revision_id}"}} {workers_available}',
122
- ]
123
-
124
- metrics_lines.extend(pg_redis_stats)
125
-
126
- return PlainTextResponse(
127
- "\n".join(metrics_lines),
128
- media_type="text/plain; version=0.0.4; charset=utf-8",
129
- )
95
+ if metrics_format == "prometheus":
96
+ # Served from THIS process's OTLP Prometheus registry. The collector
97
+ # runs in every process, so this queue worker exposes its worker
98
+ # gauges and its own pool stats — but NOT queue depth
99
+ # (num_pending/num_running), which the collector emits on the API
100
+ # process only. Mirrors the main API /metrics (api/meta.py:meta_metrics).
101
+ return PlainTextResponse(generate_latest(), media_type=CONTENT_TYPE_LATEST)
102
+
103
+ # JSON: hand-built snapshot of workers + pool stats.
104
+ worker_metrics = get_metrics()["workers"]
105
+ pg_redis_stats = await meta_pool_stats()
106
+ return JSONResponse({**pg_redis_stats, "workers": worker_metrics})
130
107
 
131
108
  routes = [
132
109
  Route("/ok", health_endpoint),
@@ -287,7 +287,7 @@ def _get_pool_stats():
287
287
  # so we submit this as a coro to run in the main event loop
288
288
  async def _fetch_pool_stats():
289
289
  try:
290
- return await meta_pool_stats("json")
290
+ return await meta_pool_stats()
291
291
  except Exception as e:
292
292
  logger.warning("Failed to get pool stats", exc_info=e)
293
293
  return {"postgres": {}, "redis": {}}
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langgraph-api
3
- Version: 0.11.0.dev9
3
+ Version: 0.12.0.dev1
4
4
  Author-email: Will Fu-Hinthorn <will@langchain.dev>, Josh Rogers <josh@langchain.dev>, Parker Rule <parker@langchain.dev>
5
5
  License: Elastic-2.0
6
6
  License-File: LICENSE
@@ -16,7 +16,7 @@ Requires-Dist: jsonschema-rs<0.45,>=0.20.0
16
16
  Requires-Dist: langchain-core>=0.3.64
17
17
  Requires-Dist: langchain-protocol<0.1,>=0.0.16
18
18
  Requires-Dist: langgraph-checkpoint<5,>=3.0.1
19
- Requires-Dist: langgraph-runtime-inmem<0.31.0.dev0,>=0.30.0.dev0
19
+ Requires-Dist: langgraph-runtime-inmem<0.32.0.dev0,>=0.31.0.dev0
20
20
  Requires-Dist: langgraph-sdk>=0.3.5
21
21
  Requires-Dist: langgraph<2,>=0.4.10
22
22
  Requires-Dist: langsmith[otel]>=0.6.3
@@ -29,7 +29,7 @@ Requires-Dist: prometheus-client>=0.0.1
29
29
  Requires-Dist: protobuf<7.0.0,>=6.32.1
30
30
  Requires-Dist: pyjwt>=2.9.0
31
31
  Requires-Dist: sse-starlette<3.4.0,>=2.1.3
32
- Requires-Dist: starlette>=0.38.6
32
+ Requires-Dist: starlette>=1.0.1
33
33
  Requires-Dist: structlog<26,>=24.1.0
34
34
  Requires-Dist: tenacity>=8.0.0
35
35
  Requires-Dist: truststore>=0.1
@@ -1,4 +1,4 @@
1
- langgraph_api/__init__.py,sha256=ODvnQh6zY2Mdw6hMdx11i_MRvcQ-Kv5LMYazITUYfdo,28
1
+ langgraph_api/__init__.py,sha256=mzCq9ao7wd_jc3lCw0iVM8xH_PSoH6l9__-Lr3pxcKA,28
2
2
  langgraph_api/_factory_utils.py,sha256=5JsiJbg_YocVSryN2jwoZTg03-eyymlWMK6sKCmXwz0,5756
3
3
  langgraph_api/asgi_transport.py,sha256=XApY3lIWBZTMbbsl8dDJzl0cLGirmAGE0SifqZUnXvs,11896
4
4
  langgraph_api/asyncio.py,sha256=c-YE-14N7_AP1GzifsbP14XnhLsmxT2P916KXruerpI,10573
@@ -14,15 +14,16 @@ langgraph_api/http_metrics.py,sha256=etxbZNmYxdb58DVLNkHP7S-N6njXPTiQh2OWKMaIZi8
14
14
  langgraph_api/http_metrics_utils.py,sha256=sjxF7SYGTzY0Wz_G0dzatsYNnWr31S6ujej4JmBG2yo,866
15
15
  langgraph_api/logging.py,sha256=V1RCnqVLuMvJtrBiyMMLfaEdbS3k5A2M8Unhr4FUUdQ,6801
16
16
  langgraph_api/metadata.py,sha256=ih2et_R0prFsCzikQ4_L0j9up7t0rObAMVKyEk7ienI,9778
17
- langgraph_api/metrics_otlp.py,sha256=TxK96ks8fok_g0phk24QDvLcXm-Mh9TbVThoiyTJv6Y,18925
17
+ langgraph_api/metrics_collector.py,sha256=gMLHL18rJyYl985AOmu9eH7W1ttdRdkPHzeyczjCOBw,8280
18
+ langgraph_api/metrics_otlp.py,sha256=t9oJrxfxY2O5jY4JW2gONPKoBiBuklhzCrnZvn1qTxQ,28730
18
19
  langgraph_api/otel_context.py,sha256=DWFwW4Yu88QY4W2J0IRcURR450Th9J2DupvDDkSkMBA,7166
19
20
  langgraph_api/patch.py,sha256=ViUknYvyQWS6y0f5XuaEoci2qB_mQv8vZl-oaUxsI6M,1448
20
- langgraph_api/queue_entrypoint.py,sha256=JitIsrnJXnfAke2qwsvlvMKwvna9GMMm55wBYoMtJFM,12166
21
+ langgraph_api/queue_entrypoint.py,sha256=-9YnY_GhmDxEiGCc3k-7UqRKK_M3dPriits2iGgYlgU,11327
21
22
  langgraph_api/release_tags.py,sha256=BjgGj2vFcA7I0MDRXLw1sUA4jquz-DaKVS0Eq-dYSjE,9091
22
23
  langgraph_api/route.py,sha256=_KE8A8Q-J-QfqjGlyM2Kc6n5cirmgt8xmI5-pI8kVEE,8837
23
24
  langgraph_api/schema.py,sha256=I_ciXy4YE3Ri4-PAWIvqLNRH2FpC4goTkKwfFwk6wIQ,15100
24
25
  langgraph_api/self_hosted_logs.py,sha256=FoUkPdtpt-nuEhejne8o1Q2phE9CccoHdoR_PvXPcBU,4442
25
- langgraph_api/self_hosted_metrics.py,sha256=wgjVYrEQNlgcn6TzAAf14Sdyz-i2l7bqAdhMn-qEiHI,16850
26
+ langgraph_api/self_hosted_metrics.py,sha256=pWsQQ-2ukoFIbmVfzNOSkwCqZ5Cnts6pRSWTII44Ll4,16844
26
27
  langgraph_api/serde.py,sha256=V3fO9bkUOlBX3okw5Qi31nlcr59fcuXMgL7DHNyarZY,8855
27
28
  langgraph_api/server.py,sha256=1eAZPim0Pkgh5oGS4EvW-_7Zh_82iGOZtR1rpX08FoA,11216
28
29
  langgraph_api/sse.py,sha256=cChZ7raQUHp8p5BreE_5wMBR8lFO0n7746sV8_HQOrc,4822
@@ -41,7 +42,7 @@ langgraph_api/api/__init__.py,sha256=Zu1ew3dxYZu7cLRAjn-6HcYmtuQBdihlVFMKMJ77Y3c
41
42
  langgraph_api/api/a2a.py,sha256=VPllgqfoLUQD6Eqob3RjcegjtKgLhphNGTrTqbNLoIY,95135
42
43
  langgraph_api/api/assistants.py,sha256=4v1TpkeeSF7vFrbnOKIvh7BY4K0WamzEdMeTAzwRElE,20786
43
44
  langgraph_api/api/event_streaming.py,sha256=nvoaKz4QGklX5YUmY9WQ3vSwhQ1Q81QeQWNR8aEXUz8,17571
44
- langgraph_api/api/meta.py,sha256=jVRBWZio8x__LbSp4e4WPyPB0sJJ5cgHGjbitLd6yGE,11045
45
+ langgraph_api/api/meta.py,sha256=4vqfgJYIqaqwZPfdmvHN00Co7SdeYSnQ1STxyvGWVDw,4698
45
46
  langgraph_api/api/openapi.py,sha256=Zkdlb9mjrQyHro1TtrDIWVuaBDovxx-uGWJ1fZMOg54,12604
46
47
  langgraph_api/api/profile.py,sha256=CA1ZkHALOuP8orYTICnEhcG_JnnA2wnyjbWyeb117jA,3455
47
48
  langgraph_api/api/runs.py,sha256=h5droLgaz_aAyILCRJIpbj2KH1PbijCeXcggOSa3Zww,35178
@@ -63,7 +64,7 @@ langgraph_api/auth/studio_user.py,sha256=gNCicIo6cYaHmFj2sEdsvDYkKW7NWfGXGS2tTAM
63
64
  langgraph_api/auth/langsmith/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
65
  langgraph_api/auth/langsmith/backend.py,sha256=Y6-VxD7zfV1jzGdjmQ66CgNa3SenLbo3d_375CcKZ9U,3770
65
66
  langgraph_api/auth/langsmith/client.py,sha256=79kwCVeHU64nsHsxWipfZhf44lM6vfs2nlfTxlJF6LU,4142
66
- langgraph_api/config/__init__.py,sha256=gHQgH-0qR3nD0woEZjgQwKNv71FieLhX9Hr6NlrdYzs,25327
67
+ langgraph_api/config/__init__.py,sha256=qj7HF1XmojpG6WXahlFxrMZiO8hUI6QlPNVT-32xfvs,25528
67
68
  langgraph_api/config/_parse.py,sha256=VXQPKzqtIsZrRy-nUEBMDESBxXzqFRQNiqsvAZeX3HU,3921
68
69
  langgraph_api/config/schemas.py,sha256=rYqu67fZxmtCOU-Zc1s3265KbRbqK8PmfvfwvrAmd-Q,20863
69
70
  langgraph_api/encryption/__init__.py,sha256=gaCZ00CocSbqSqrDn6XJHaSp2CZCnC8qnrD9G4fbzyI,363
@@ -125,7 +126,7 @@ langgraph_api/lc_security/policy.py,sha256=tW0IACvPCeJZbcsUKv9egk0LapF5gL3hxF9Ao
125
126
  langgraph_api/lc_security/transport.py,sha256=AgiAsPLNP135ag30XPwVDkMNhomiYJwGYUKs0TfaWSI,7183
126
127
  langgraph_api/middleware/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
127
128
  langgraph_api/middleware/ensure_store.py,sha256=KzgAsLPloPD8mcQG-5v7kguzDMFrs9PJz2-xK5-rV0Q,1351
128
- langgraph_api/middleware/http_logger.py,sha256=Y5E1NgDJuJHWI3pMtpu0cdspZA3Jikbv2PEsiWFctaA,7149
129
+ langgraph_api/middleware/http_logger.py,sha256=jjqLBPqoGRC1UfB2VYKPY2tkq6gT7Rm88DnAHBlwuTw,8231
129
130
  langgraph_api/middleware/private_network.py,sha256=eQEzWI8epBNUCiNsMu9O27ofHBQ45M0p2OZy5YdUYos,2097
130
131
  langgraph_api/middleware/request_id.py,sha256=-p230Q5jDJAJLmSZRqQvB4dFFkJS9B4Vwg6pUgQtI24,1259
131
132
  langgraph_api/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -228,8 +229,8 @@ langgraph_grpc_common/proto/errors_pb2.py,sha256=JI6x-vBK1AE7DHZ5DQwN1mZWF6C4xTR
228
229
  langgraph_grpc_common/proto/errors_pb2.pyi,sha256=rd3-BYUH8V-aO66taL7OOblaLgdrDtf1Vcd38GUoVVM,2181
229
230
  langgraph_grpc_common/proto/errors_pb2_grpc.py,sha256=2-LwQ0OPGo-NtC0269q7Fw6GPBxnTLYWq3xP5Eq0_YA,886
230
231
  langgraph_grpc_common/proto/errors_pb2_grpc.pyi,sha256=uC9Wnq6uyg488QiONpJ0ba1s_iouQCOYsjd_FDd1XUM,495
231
- langgraph_api-0.11.0.dev9.dist-info/METADATA,sha256=4oMpCP0-X5iIKGh10-YW2PnAbkNVSMWCjBMj-pEyBjA,4630
232
- langgraph_api-0.11.0.dev9.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
233
- langgraph_api-0.11.0.dev9.dist-info/entry_points.txt,sha256=hGedv8n7cgi41PypMfinwS_HfCwA7xJIfS0jAp8htV8,78
234
- langgraph_api-0.11.0.dev9.dist-info/licenses/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
235
- langgraph_api-0.11.0.dev9.dist-info/RECORD,,
232
+ langgraph_api-0.12.0.dev1.dist-info/METADATA,sha256=T_4d-LSGhM29_XbrhbUhbSvblmXtgsfPeDugmsCBuLU,4629
233
+ langgraph_api-0.12.0.dev1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
234
+ langgraph_api-0.12.0.dev1.dist-info/entry_points.txt,sha256=hGedv8n7cgi41PypMfinwS_HfCwA7xJIfS0jAp8htV8,78
235
+ langgraph_api-0.12.0.dev1.dist-info/licenses/LICENSE,sha256=ZPwVR73Biwm3sK6vR54djCrhaRiM4cAD2zvOQZV8Xis,3859
236
+ langgraph_api-0.12.0.dev1.dist-info/RECORD,,