forgesight-prometheus 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .eggs/
6
+ build/
7
+ dist/
8
+ *.so
9
+
10
+ # venv / tooling
11
+ .venv/
12
+ venv/
13
+ .uv/
14
+ uv.lock
15
+
16
+ # test / type / lint caches
17
+ .pytest_cache/
18
+ .mypy_cache/
19
+ .ruff_cache/
20
+ .coverage
21
+ .coverage.*
22
+ coverage.xml
23
+ htmlcov/
24
+
25
+ # secrets / local env (never commit)
26
+ .env
27
+ .env.*
28
+
29
+ # editor / OS
30
+ .DS_Store
31
+ .idea/
32
+ .vscode/
33
+
34
+ # local-only session working state (per the workspace pipeline)
35
+ .claude/state/
36
+
37
+ # local-only launch planning (not part of the published repo)
38
+ /launch/
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: forgesight-prometheus
3
+ Version: 0.1.0
4
+ Summary: ForgeSight Prometheus exporter — pull /metrics + push-gateway for agent telemetry.
5
+ Project-URL: Homepage, https://github.com/Scaffoldic/forgesight
6
+ Project-URL: Repository, https://github.com/Scaffoldic/forgesight
7
+ Project-URL: Issues, https://github.com/Scaffoldic/forgesight/issues
8
+ Project-URL: Changelog, https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md
9
+ Author: kjoshi
10
+ License-Expression: Apache-2.0
11
+ Keywords: ai-agents,forgesight,metrics,observability,prometheus
12
+ Classifier: Development Status :: 2 - Pre-Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Information Technology
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: System :: Monitoring
21
+ Classifier: Typing :: Typed
22
+ Requires-Python: >=3.11
23
+ Requires-Dist: forgesight-core
24
+ Requires-Dist: prometheus-client>=0.20
25
+ Description-Content-Type: text/markdown
26
+
27
+ # forgesight-prometheus
28
+
29
+ The Prometheus exporter for [ForgeSight](https://github.com/Scaffoldic/forgesight).
30
+ Bridges ForgeSight's product metrics + GenAI histograms onto a Prometheus registry
31
+ with a pull `/metrics` endpoint (and an optional Pushgateway for short-lived runs).
32
+
33
+ ```bash
34
+ pip install forgesight-prometheus
35
+ ```
36
+
37
+ ```python
38
+ import forgesight
39
+ from forgesight_prometheus import PrometheusExporter
40
+
41
+ forgesight.configure(exporters=[PrometheusExporter(port=9464, prefix="agentforge")])
42
+ # Prometheus scrapes http://<host>:9464/metrics
43
+ ```
44
+
45
+ Or by name via config: `exporters: [{name: prometheus, config: {port: 9464}}]`.
46
+
47
+ - Labels are cardinality-bounded (agent name / provider / model / status / …);
48
+ `run_id`/`trace_id` are never labels.
49
+ - `push_gateway: http://pushgateway:9091` pushes on flush and on shutdown, for CI / batch runs.
50
+
51
+ ## License
52
+
53
+ Apache-2.0
@@ -0,0 +1,27 @@
1
+ # forgesight-prometheus
2
+
3
+ The Prometheus exporter for [ForgeSight](https://github.com/Scaffoldic/forgesight).
4
+ Bridges ForgeSight's product metrics + GenAI histograms onto a Prometheus registry
5
+ with a pull `/metrics` endpoint (and an optional Pushgateway for short-lived runs).
6
+
7
+ ```bash
8
+ pip install forgesight-prometheus
9
+ ```
10
+
11
+ ```python
12
+ import forgesight
13
+ from forgesight_prometheus import PrometheusExporter
14
+
15
+ forgesight.configure(exporters=[PrometheusExporter(port=9464, prefix="agentforge")])
16
+ # Prometheus scrapes http://<host>:9464/metrics
17
+ ```
18
+
19
+ Or by name via config: `exporters: [{name: prometheus, config: {port: 9464}}]`.
20
+
21
+ - Labels are cardinality-bounded (agent name / provider / model / status / …);
22
+ `run_id`/`trace_id` are never labels.
23
+ - `push_gateway: http://pushgateway:9091` pushes on flush and on shutdown, for CI / batch runs.
24
+
25
+ ## License
26
+
27
+ Apache-2.0
@@ -0,0 +1,41 @@
1
+ [project]
2
+ name = "forgesight-prometheus"
3
+ version = "0.1.0"
4
+ description = "ForgeSight Prometheus exporter — pull /metrics + push-gateway for agent telemetry."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = "Apache-2.0"
8
+ authors = [{ name = "kjoshi" }]
9
+ keywords = ["observability", "prometheus", "metrics", "ai-agents", "forgesight"]
10
+ classifiers = [
11
+ "Development Status :: 2 - Pre-Alpha",
12
+ "Intended Audience :: Developers",
13
+ "Intended Audience :: Information Technology",
14
+ "Topic :: System :: Monitoring",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ "License :: OSI Approved :: Apache Software License",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Typing :: Typed",
21
+ ]
22
+ dependencies = ["forgesight-core", "prometheus-client>=0.20"]
23
+
24
+ [project.entry-points."forgesight.exporters"]
25
+ prometheus = "forgesight_prometheus.exporter:PrometheusExporter"
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/Scaffoldic/forgesight"
29
+ Repository = "https://github.com/Scaffoldic/forgesight"
30
+ Issues = "https://github.com/Scaffoldic/forgesight/issues"
31
+ Changelog = "https://github.com/Scaffoldic/forgesight/blob/main/docs/releases/v0.1.md"
32
+
33
+ [build-system]
34
+ requires = ["hatchling"]
35
+ build-backend = "hatchling.build"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["src/forgesight_prometheus"]
39
+
40
+ [tool.uv.sources]
41
+ forgesight-core = { workspace = true }
@@ -0,0 +1,9 @@
1
+ """ForgeSight Prometheus exporter — pull /metrics + push-gateway."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .exporter import PrometheusExporter
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ __all__ = ["PrometheusExporter", "__version__"]
@@ -0,0 +1,175 @@
1
+ """``PrometheusExporter`` — folds ForgeSight records into a Prometheus registry.
2
+
3
+ A ``TelemetryExporter`` (so it resolves via the ``forgesight.exporters`` entry point and
4
+ passes the conformance suite) that derives the product metrics + GenAI histograms from
5
+ records into a ``prometheus_client`` registry, served on a pull ``/metrics`` endpoint
6
+ (and optionally pushed to a Pushgateway on flush/shutdown for short-lived runs).
7
+
8
+ Labels are cardinality-bounded by construction (fixed, low-cardinality label sets);
9
+ ``run_id`` / ``trace_id`` are never labels — that's what traces are for.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ from collections.abc import Sequence
16
+
17
+ from prometheus_client import (
18
+ CollectorRegistry,
19
+ Counter,
20
+ Histogram,
21
+ push_to_gateway,
22
+ start_http_server,
23
+ )
24
+
25
+ from forgesight_api import ExportResult, Kind, Record, RunStatus
26
+ from forgesight_core.metrics.instruments import DURATION_BUCKETS, TOKEN_BUCKETS
27
+
28
+ _log = logging.getLogger("forgesight.prometheus")
29
+ _OK = frozenset({RunStatus.OK, RunStatus.RUNNING})
30
+ _NANOS_PER_S = 1_000_000_000
31
+
32
+
33
def _seconds(record: Record) -> float | None:
    """Return the record's elapsed time in seconds.

    Yields ``None`` when the record has no ``end_unix_nanos`` yet, so callers
    can skip the duration observation instead of recording a bogus value.
    """
    finished_at = record.end_unix_nanos
    if finished_at is not None:
        return (finished_at - record.start_unix_nanos) / _NANOS_PER_S
    return None
37
+
38
+
39
class PrometheusExporter:
    """Bridge SDK metrics onto a Prometheus registry + pull endpoint / push-gateway.

    Records handed to :meth:`export` are folded into counters and histograms on a
    ``prometheus_client`` registry. The registry is served over HTTP (started lazily
    on the first successful export unless ``port`` is ``0``) and, when a
    ``push_gateway`` is configured, pushed on :meth:`force_flush` / :meth:`shutdown`.
    """

    def __init__(
        self,
        *,
        host: str = "0.0.0.0",
        port: int = 9464,
        prefix: str = "agentforge",
        push_gateway: str | None = None,
        push_job: str = "forgesight",
        registry: CollectorRegistry | None = None,
    ) -> None:
        """Create the instruments on ``registry`` (or on a fresh private registry).

        Args:
            host: Address the ``/metrics`` HTTP server binds to.
            port: Port for the HTTP server; ``0`` disables the server entirely.
            prefix: Prefix prepended to every metric name.
            push_gateway: Pushgateway address; ``None`` disables pushing.
            push_job: Job name used when pushing to the gateway.
            registry: Registry to register the instruments on; a new
                ``CollectorRegistry`` is created when omitted.
        """
        self._host = host
        self._port = port
        self._push_gateway = push_gateway
        self._push_job = push_job
        self._registry = registry if registry is not None else CollectorRegistry()
        self._server: object | None = None
        self._server_started = False
        p = prefix
        reg = self._registry
        self._runs = Counter(
            f"{p}_agent_runs", "Agent runs", ["agent_name", "status"], registry=reg
        )
        self._failures = Counter(
            f"{p}_agent_failures", "Agent failures", ["agent_name", "error_type"], registry=reg
        )
        self._cost = Counter(
            f"{p}_agent_cost_usd", "Agent cost (USD)", ["gen_ai_provider_name"], registry=reg
        )
        self._agent_duration = Histogram(
            f"{p}_agent_duration_milliseconds",
            "Agent run duration (ms)",
            ["agent_name", "status"],
            # This histogram is observed in milliseconds (see ``_fold``), so the
            # library's default second-scale buckets would push nearly every run
            # into +Inf. Reuse the bounds the seconds-based operation histogram
            # below is given, scaled to milliseconds.
            buckets=tuple(bound * 1000.0 for bound in DURATION_BUCKETS),
            registry=reg,
        )
        self._tool = Counter(
            f"{p}_tool_invocations", "Tool invocations", ["tool_name", "status"], registry=reg
        )
        self._mcp = Counter(
            f"{p}_mcp_invocations", "MCP invocations", ["mcp_method_name", "status"], registry=reg
        )
        self._tokens = Histogram(
            f"{p}_gen_ai_client_token_usage",
            "GenAI token usage",
            ["gen_ai_provider_name", "gen_ai_operation_name", "gen_ai_token_type"],
            buckets=TOKEN_BUCKETS,
            registry=reg,
        )
        self._op_duration = Histogram(
            f"{p}_gen_ai_client_operation_duration_seconds",
            "GenAI operation duration (s)",
            ["gen_ai_provider_name", "gen_ai_operation_name"],
            buckets=DURATION_BUCKETS,
            registry=reg,
        )

    # --- TelemetryExporter Protocol --------------------------------------
    def export(self, records: Sequence[Record]) -> ExportResult:
        """Fold ``records`` into the registry and make sure the server is running.

        Returns ``ExportResult.FAILURE`` (after logging) if folding raises; the
        remaining records of that batch are not folded. Otherwise returns
        ``ExportResult.SUCCESS``.
        """
        try:
            for record in records:
                self._fold(record)
        except Exception:  # pragma: no cover - defensive; export must never raise (P6)
            _log.exception("prometheus fold failed")
            return ExportResult.FAILURE
        self._ensure_server()
        return ExportResult.SUCCESS

    def force_flush(self, timeout_millis: int = 30_000) -> bool:
        """Push the registry to the gateway, if configured; ``True`` on success or no-op."""
        return self._push(timeout_millis)

    def shutdown(self, timeout_millis: int = 30_000) -> None:
        """Push one last time, then stop the HTTP server and release its socket."""
        self._push(timeout_millis)
        # Detach the server first so a repeated shutdown() is a no-op.
        server, self._server = self._server, None
        # "shutdown" stops the serve loop; "server_close" closes the listening
        # socket so the port is released immediately rather than at GC time.
        for method_name in ("shutdown", "server_close"):
            stop = getattr(server, method_name, None)
            if callable(stop):
                try:
                    stop()
                except Exception:  # pragma: no cover - best-effort
                    _log.exception("prometheus server shutdown failed")

    # --- internals --------------------------------------------------------
    def _fold(self, record: Record) -> None:
        """Update the instruments that correspond to ``record.kind``.

        Agent records feed the run counter, the duration histogram (milliseconds)
        and, for statuses outside ``_OK``, the failure counter. LLM, tool and MCP
        records are only folded when their payload is present.
        """
        status = record.status.value
        if record.kind is Kind.AGENT:
            self._runs.labels(record.name, status).inc()
            seconds = _seconds(record)
            if seconds is not None:
                self._agent_duration.labels(record.name, status).observe(seconds * 1000.0)
            if record.status not in _OK:
                # Fall back to the status string when no structured error is attached.
                error_type = record.error.error_type if record.error else status
                self._failures.labels(record.name, error_type).inc()
        elif record.kind is Kind.LLM and record.llm is not None:
            self._fold_llm(record)
        elif record.kind is Kind.TOOL and record.tool is not None:
            self._tool.labels(record.tool.name, status).inc()
        elif record.kind is Kind.MCP and record.mcp is not None:
            self._mcp.labels(record.mcp.method, status).inc()

    def _fold_llm(self, record: Record) -> None:
        """Record token usage, operation duration and cost for an LLM record."""
        llm = record.llm
        assert llm is not None
        usage = llm.usage
        for token_type, value in (
            ("input", usage.input),
            ("output", usage.output),
            ("cache_read", usage.cache_read),
            ("cache_creation", usage.cache_creation),
            ("reasoning", usage.reasoning),
        ):
            # Falsy counts (zero / unset) are skipped so they create no series.
            if value:
                self._tokens.labels(llm.provider, "chat", token_type).observe(value)
        seconds = _seconds(record)
        if seconds is not None:
            self._op_duration.labels(llm.provider, "chat").observe(seconds)
        if llm.cost_usd is not None:
            self._cost.labels(llm.provider).inc(llm.cost_usd)

    def _ensure_server(self) -> None:
        """Start the ``/metrics`` HTTP server once; a failed start is logged, not raised."""
        if self._server_started or self._port == 0:
            return
        # Marked before the attempt, so a failed bind is not retried on every export.
        self._server_started = True
        try:
            result = start_http_server(self._port, addr=self._host, registry=self._registry)
            self._server = result[0] if isinstance(result, tuple) else None
        except Exception:  # pragma: no cover - bind failure is isolated; export must never raise
            # Broader than OSError on purpose: an out-of-range port raises
            # OverflowError, which would otherwise escape from export().
            _log.warning("prometheus /metrics server could not bind %s:%d", self._host, self._port)

    def _push(self, timeout_millis: int = 30_000) -> bool:
        """Push the registry to the configured gateway.

        Returns ``True`` when no gateway is configured or the push succeeded, and
        ``False`` (after logging the cause) when the push raised.
        """
        if self._push_gateway is None:
            return True
        try:
            push_to_gateway(
                self._push_gateway,
                job=self._push_job,
                registry=self._registry,
                timeout=timeout_millis / 1000.0,  # push_to_gateway takes seconds
            )
        except Exception:
            _log.warning("prometheus push to %s failed", self._push_gateway, exc_info=True)
            return False
        return True
@@ -0,0 +1,111 @@
1
+ """Tests for the Prometheus exporter: folding, labels, cardinality, conformance."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from prometheus_client import CollectorRegistry, generate_latest
6
+
7
+ from forgesight_api import Kind, LLMCall, Record, RunStatus, TokenUsage
8
+ from forgesight_core import configure, reset_runtime, telemetry
9
+ from forgesight_core.testing.conformance import run_exporter_conformance
10
+ from forgesight_prometheus import PrometheusExporter
11
+
12
+ TRACE = "4bf92f3577b34da6a3ce929d0e0e4736"
13
+
14
+
15
def _exporter() -> tuple[PrometheusExporter, CollectorRegistry]:
    """Build an exporter bound to a fresh registry and return both."""
    registry = CollectorRegistry()
    # port=0 ⇒ no HTTP server
    exporter = PrometheusExporter(port=0, prefix="fs", registry=registry)
    return exporter, registry
18
+
19
+
20
def _llm_record(span: str = "00f067aa0ba902b7") -> Record:
    """Return a finished, successful LLM record with the given span id."""
    call = LLMCall(
        provider="anthropic",
        request_model="claude-sonnet-4-5",
        usage=TokenUsage(input=100, output=50),
        cost_usd=0.01,
    )
    fields = {
        "kind": Kind.LLM,
        "run_id": "01J9Z3K7P8QF2R5V6W7X8Y9Z0A",
        "trace_id": TRACE,
        "span_id": span,
        "parent_span_id": None,
        "name": "claude-sonnet-4-5",
        "status": RunStatus.OK,
        "start_unix_nanos": 1_000_000_000,
        "end_unix_nanos": 3_000_000_000,
        "llm": call,
    }
    return Record(**fields)
38
+
39
+
40
def test_conformance() -> None:
    """Run the shared exporter conformance suite against a server-less exporter."""

    def _make_exporter() -> PrometheusExporter:
        # A fresh registry per instance, as the original lambda provided.
        return PrometheusExporter(port=0, registry=CollectorRegistry())

    run_exporter_conformance(_make_exporter)
42
+
43
+
44
def test_llm_record_folds_into_metrics() -> None:
    """An LLM record must export successfully and surface token + cost metrics."""
    # Imported locally so this test does not depend on the module import block.
    from forgesight_api import ExportResult

    exporter, reg = _exporter()
    # `is not None` would also accept ExportResult.FAILURE; require SUCCESS.
    assert exporter.export([_llm_record()]) is ExportResult.SUCCESS
    text = generate_latest(reg).decode()
    assert "fs_gen_ai_client_token_usage" in text
    assert 'gen_ai_token_type="input"' in text
    assert "fs_agent_cost_usd_total" in text
    assert 'gen_ai_provider_name="anthropic"' in text
52
+
53
+
54
def test_no_run_id_or_trace_id_labels() -> None:
    """Neither ``run_id`` nor ``trace_id`` may appear anywhere in the exposition."""
    exporter, registry = _exporter()
    exporter.export([_llm_record()])
    exposition = generate_latest(registry).decode()
    for forbidden in ("run_id", "trace_id"):
        assert forbidden not in exposition
60
+
61
+
62
def test_cardinality_bounded_across_many_runs() -> None:
    """Fifty records with distinct span ids must not multiply the exported series."""
    exporter, reg = _exporter()
    exporter.export([_llm_record(span=f"{i:016x}") for i in range(50)])
    lines = generate_latest(reg).decode().splitlines()
    # one series per (provider, operation, token_type), NOT per run/span.
    # The fixture sets only input and output token counts, so two series are
    # expected (assumes the remaining TokenUsage fields default to a falsy value).
    token_series = [ln for ln in lines if ln.startswith("fs_gen_ai_client_token_usage_count{")]
    assert len(token_series) == 2
    # cost counter is a single series keyed by provider only
    cost_lines = [ln for ln in lines if ln.startswith("fs_agent_cost_usd_total{")]
    assert len(cost_lines) == 1
73
+
74
+
75
def test_end_to_end_through_runtime() -> None:
    """Drive the exporter through the configured runtime, not by calling it directly."""
    registry = CollectorRegistry()
    exporter = PrometheusExporter(port=0, prefix="fs", registry=registry)
    configure(exporters=[exporter], sync_export=True)
    try:
        with telemetry.agent_run("classifier") as run:
            with run.tool_call("search"):
                pass
        exposition = generate_latest(registry).decode()
        assert 'fs_agent_runs_total{agent_name="classifier"' in exposition
        assert "fs_tool_invocations_total{" in exposition
    finally:
        # Always restore the global runtime so later tests start clean.
        reset_runtime()
86
+
87
+
88
def test_failure_records_failures_metric() -> None:
    """An agent record with ERROR status must produce a failures series."""
    exporter, registry = _exporter()
    failed_run = Record(
        kind=Kind.AGENT,
        run_id="01J9Z3K7P8QF2R5V6W7X8Y9Z0A",
        trace_id=TRACE,
        span_id="00f067aa0ba902b7",
        parent_span_id=None,
        name="c",
        status=RunStatus.ERROR,
        start_unix_nanos=1,
        end_unix_nanos=2,
    )
    exporter.export([failed_run])
    exposition = generate_latest(registry).decode()
    assert "fs_agent_failures_total{" in exposition
104
+
105
+
106
def test_push_gateway_failure_is_isolated() -> None:
    """A gateway that cannot be reached yields ``False`` from flush and never raises."""
    unreachable_gateway = "http://127.0.0.1:1/nope"
    exporter = PrometheusExporter(
        port=0,
        push_gateway=unreachable_gateway,
        registry=CollectorRegistry(),
    )
    flushed = exporter.force_flush()
    assert flushed is False  # unreachable gateway → False, never raises
    exporter.shutdown()  # must not raise