agentgraf 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .venv/
8
+ venv/
9
+
10
+ # Node
11
+ node_modules/
12
+ .output/
13
+ /public/build/
14
+
15
+ # IDE
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+
21
+ # OS
22
+ .DS_Store
23
+ Thumbs.db
24
+
25
+ # Env
26
+ .env
27
+ .env.local
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.4
2
+ Name: agentgraf
3
+ Version: 0.1.0
4
+ Summary: Zero-infrastructure AI agent tracing for Grafana + Loki
5
+ Project-URL: Homepage, https://github.com/Berg-it/agentgraf
6
+ Project-URL: Documentation, https://github.com/Berg-it/agentgraf#readme
7
+ Project-URL: Repository, https://github.com/Berg-it/agentgraf
8
+ Project-URL: Issues, https://github.com/Berg-it/agentgraf/issues
9
+ Author-email: Mohamed Amine Berguiga <m.a.berguiga@gmail.com>
10
+ License: MIT
11
+ Keywords: ai-agent,grafana,langchain,llm,loki,observability,tracing
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: System :: Monitoring
21
+ Requires-Python: >=3.10
22
+ Requires-Dist: httpx>=0.27.0
23
+ Requires-Dist: pydantic>=2.0.0
24
+ Provides-Extra: dev
25
+ Requires-Dist: mypy>=1.10; extra == 'dev'
26
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
27
+ Requires-Dist: pytest>=8.0; extra == 'dev'
28
+ Requires-Dist: respx>=0.21; extra == 'dev'
29
+ Requires-Dist: ruff>=0.4; extra == 'dev'
30
+ Provides-Extra: langchain
31
+ Requires-Dist: langchain-core>=0.2.0; extra == 'langchain'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # agentgraf — Python Tracer
35
+
36
+ Zero-infrastructure AI agent tracing for Grafana + Loki.
37
+
38
+ ```bash
39
+ pip install agentgraf # core only
40
+ pip install agentgraf[langchain] # with LangChain callback
41
+ ```
@@ -0,0 +1,8 @@
1
+ # agentgraf — Python Tracer
2
+
3
+ Zero-infrastructure AI agent tracing for Grafana + Loki.
4
+
5
+ ```bash
6
+ pip install agentgraf # core only
7
+ pip install agentgraf[langchain] # with LangChain callback
8
+ ```
@@ -0,0 +1,70 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "agentgraf"
7
+ version = "0.1.0"
8
+ description = "Zero-infrastructure AI agent tracing for Grafana + Loki"
9
+ readme = "README.md"
10
+ license = { text = "MIT" }
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Mohamed Amine Berguiga", email = "m.a.berguiga@gmail.com" }
14
+ ]
15
+ keywords = ["tracing", "llm", "langchain", "grafana", "loki", "observability", "ai-agent"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ "Topic :: System :: Monitoring",
26
+ ]
27
+
28
+ dependencies = [
29
+ "httpx>=0.27.0",
30
+ "pydantic>=2.0.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ langchain = [
35
+ "langchain-core>=0.2.0",
36
+ ]
37
+ dev = [
38
+ "pytest>=8.0",
39
+ "pytest-asyncio>=0.23",
40
+ "mypy>=1.10",
41
+ "ruff>=0.4",
42
+ "respx>=0.21",
43
+ ]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/Berg-it/agentgraf"
47
+ Documentation = "https://github.com/Berg-it/agentgraf#readme"
48
+ Repository = "https://github.com/Berg-it/agentgraf"
49
+ Issues = "https://github.com/Berg-it/agentgraf/issues"
50
+
51
+ [tool.hatch.build.targets.wheel]
52
+ packages = ["tracer/src/agentgraf"]
53
+
54
+ [tool.ruff.lint]
55
+ select = ["E", "F", "I", "N", "W", "UP", "B", "C4", "SIM"]
56
+ ignore = ["E501"]
57
+
58
+ [tool.ruff.lint.pydocstyle]
59
+ convention = "google"
60
+
61
+ [tool.mypy]
62
+ python_version = "3.10"
63
+ strict = true
64
+ warn_return_any = true
65
+ warn_unused_ignores = true
66
+ show_error_codes = true
67
+
68
+ [tool.pytest.ini_options]
69
+ asyncio_mode = "auto"
70
+ testpaths = ["tracer/tests"]
@@ -0,0 +1,40 @@
1
+ """AgentGraf — Zero-infrastructure AI agent tracing for Grafana + Loki.
2
+
3
+ Usage::
4
+
5
+ from agentgraf import LokiClient, BatchSpanProcessor, TraceSpan
6
+
7
+ client = LokiClient(loki_url="http://loki:3100/loki/api/v1/push")
8
+ processor = BatchSpanProcessor(exporter=client.send_spans_sync)
9
+ processor.start()
10
+
11
+ # ... add spans manually or use AgentGrafTracer (LangChain) ...
12
+
13
+ processor.shutdown()
14
+
15
+ Optional LangChain integration (``pip install agentgraf[langchain]``)::
16
+
17
+ from agentgraf import AgentGrafTracer
18
+ tracer = AgentGrafTracer(processor=processor)
19
+ graph.astream(state, config={"callbacks": [tracer]})
20
+ """
21
+
22
+ from .models import SpanKind, SpanStatus, TraceSpan
23
+ from .client import LokiClient
24
+ from .processor import BatchSpanProcessor
25
+
26
+ __all__ = [
27
+ "TraceSpan",
28
+ "SpanKind",
29
+ "SpanStatus",
30
+ "LokiClient",
31
+ "BatchSpanProcessor",
32
+ ]
33
+
34
+ # Lazy import for LangChain tracer — only available if langchain-core is installed.
35
+ try:
36
+ from .tracer import AgentGrafTracer # noqa: F401
37
+
38
+ __all__.append("AgentGrafTracer")
39
+ except ImportError:
40
+ pass
@@ -0,0 +1,208 @@
1
+ """Loki-direct HTTP client — dual sync/async, auto-detects execution context.
2
+
3
+ Push spans directly to Loki's push API. No Gateway required in v0.1.0.
4
+
5
+ Loki API reference: POST /loki/api/v1/push
6
+ Payload: {"streams": [{"stream": {...labels...}, "values": [[ts_ns, line, metadata]]}]}
7
+
8
+ Timestamps must be **string nanosecond-epoch** or Loki returns 400.
9
+ Structured metadata (3rd tuple element) is a Loki >=3.0 feature — older Loki
10
+ versions silently ignore it, so the fallback is the JSON body parsed via ``| json``
11
+ in LogQL.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import logging
17
+ import time
18
+ from typing import Any, Dict, List, Optional
19
+
20
+ import httpx
21
+
22
+ from .models import TraceSpan
23
+
24
+ logger = logging.getLogger("agentgraf.client")
25
+
26
+ # Loki labels kept intentionally low-cardinality — never put run_id/trace_id here.
27
+ _STATIC_LABELS = {"job": "agentgraf"}
28
+
29
+
30
+ class LokiClient:
31
+ """Push spans to Loki HTTP API. Dual sync/async, auto-detects context.
32
+
33
+ Args:
34
+ loki_url: Full Loki push endpoint (e.g. ``http://loki:3100/loki/api/v1/push``).
35
+ max_retries: Number of retry attempts on transient failures (5xx, timeouts).
36
+ retry_delay: Base delay in seconds before first retry (default 1.0).
37
+ timeout: HTTP request timeout in seconds (default 10).
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ loki_url: str,
43
+ max_retries: int = 3,
44
+ retry_delay: float = 1.0,
45
+ timeout: float = 10.0,
46
+ ):
47
+ self._url = loki_url.rstrip("/")
48
+ self._max_retries = max_retries
49
+ self._retry_delay = retry_delay
50
+ self._timeout = timeout
51
+ self._sync_client: Optional[httpx.Client] = None
52
+ self._async_client: Optional[httpx.AsyncClient] = None
53
+
54
+ # ------------------------------------------------------------------
55
+ # Public API — sync
56
+ # ------------------------------------------------------------------
57
+ def send_spans_sync(self, spans: List[TraceSpan]) -> bool:
58
+ """Push a batch of spans to Loki from a synchronous context.
59
+
60
+ Retries on 5xx and network errors. Does NOT retry 4xx
61
+ (client errors will not resolve on their own).
62
+ """
63
+ if self._sync_client is None:
64
+ self._sync_client = httpx.Client(timeout=self._timeout)
65
+ payload = _build_loki_payload(spans)
66
+ for attempt in range(1, self._max_retries + 1):
67
+ try:
68
+ resp = self._sync_client.post(
69
+ self._url,
70
+ json=payload,
71
+ headers={"Content-Type": "application/json"},
72
+ )
73
+ if resp.status_code < 300:
74
+ return True
75
+
76
+ if 400 <= resp.status_code < 500:
77
+ # Client error — not retryable
78
+ logger.warning(
79
+ "Loki push returned %d (client error, not retrying): %s",
80
+ resp.status_code,
81
+ resp.text[:200],
82
+ )
83
+ return False
84
+
85
+ # Server error (5xx) — retryable
86
+ logger.warning(
87
+ "Loki push returned %d (attempt %d/%d): %s",
88
+ resp.status_code,
89
+ attempt,
90
+ self._max_retries,
91
+ resp.text[:200],
92
+ )
93
+ except httpx.HTTPError as exc:
94
+ logger.warning(
95
+ "Loki push failed (attempt %d/%d): %s",
96
+ attempt,
97
+ self._max_retries,
98
+ exc,
99
+ )
100
+ if attempt < self._max_retries:
101
+ time.sleep(self._retry_delay * (2 ** (attempt - 1)))
102
+ logger.error(
103
+ "Failed to push %d spans to Loki after %d attempts", len(spans), self._max_retries
104
+ )
105
+ return False
106
+
107
+ # ------------------------------------------------------------------
108
+ # Public API — async
109
+ # ------------------------------------------------------------------
110
+ async def send_spans_async(self, spans: List[TraceSpan]) -> bool:
111
+ """Push a batch of spans to Loki from an async context.
112
+
113
+ Retries on 5xx and network errors. Does NOT retry 4xx.
114
+ """
115
+ if self._async_client is None:
116
+ self._async_client = httpx.AsyncClient(timeout=self._timeout)
117
+ payload = _build_loki_payload(spans)
118
+ for attempt in range(1, self._max_retries + 1):
119
+ try:
120
+ resp = await self._async_client.post(
121
+ self._url,
122
+ json=payload,
123
+ headers={"Content-Type": "application/json"},
124
+ )
125
+ if resp.status_code < 300:
126
+ return True
127
+
128
+ if 400 <= resp.status_code < 500:
129
+ logger.warning(
130
+ "Loki push returned %d (client error, not retrying): %s",
131
+ resp.status_code,
132
+ resp.text[:200],
133
+ )
134
+ return False
135
+
136
+ logger.warning(
137
+ "Loki push returned %d (attempt %d/%d): %s",
138
+ resp.status_code,
139
+ attempt,
140
+ self._max_retries,
141
+ resp.text[:200],
142
+ )
143
+ except httpx.HTTPError as exc:
144
+ logger.warning(
145
+ "Loki push failed (attempt %d/%d): %s",
146
+ attempt,
147
+ self._max_retries,
148
+ exc,
149
+ )
150
+ if attempt < self._max_retries:
151
+ await asyncio.sleep(self._retry_delay * (2 ** (attempt - 1)))
152
+ logger.error(
153
+ "Failed to push %d spans to Loki after %d attempts", len(spans), self._max_retries
154
+ )
155
+ return False
156
+
157
+ # ------------------------------------------------------------------
158
+ # Lifecycle
159
+ # ------------------------------------------------------------------
160
+ def close_sync(self) -> None:
161
+ """Close the synchronous HTTP client."""
162
+ if self._sync_client is not None:
163
+ self._sync_client.close()
164
+ self._sync_client = None
165
+
166
+ async def close_async(self) -> None:
167
+ """Close the asynchronous HTTP client."""
168
+ if self._async_client is not None:
169
+ await self._async_client.aclose()
170
+ self._async_client = None
171
+
172
+
173
+ # ======================================================================
174
+ # Internal helpers
175
+ # ======================================================================
176
+
177
+
178
+ def _build_loki_payload(spans: List[TraceSpan]) -> Dict[str, Any]:
179
+ """Build a Loki push-API payload from a batch of spans.
180
+
181
+ Groups spans by (project, kind) to minimise stream count.
182
+ Timestamps are string nanoseconds (Loki requirement).
183
+ Structured metadata carries trace_id/span_id/run_id for Loki >=3.0;
184
+ the same data is in the JSON body for users on older Loki with ``| json``.
185
+ """
186
+ groups: Dict[tuple, List[tuple]] = {}
187
+ for span in spans:
188
+ key = (span.project, span.kind.value)
189
+ ts_ns = str(int(span.start_time * 1_000_000_000))
190
+ line = span.to_json()
191
+ meta = {
192
+ "trace_id": span.trace_id,
193
+ "span_id": span.span_id,
194
+ "run_id": span.run_id,
195
+ }
196
+ if span.parent_span_id:
197
+ meta["parent_span_id"] = span.parent_span_id
198
+ groups.setdefault(key, []).append((ts_ns, line, meta))
199
+
200
+ streams = []
201
+ for (project, kind), values in groups.items():
202
+ streams.append(
203
+ {
204
+ "stream": {**_STATIC_LABELS, "project": project, "kind": kind},
205
+ "values": values,
206
+ }
207
+ )
208
+ return {"streams": streams}
@@ -0,0 +1,165 @@
1
+ """AgentGraf data models — Pydantic v2 span representation for LLM agent traces.
2
+
3
+ All timestamps are float unix-epoch seconds (compatible with Loki nanosecond push).
4
+ The model mirrors OpenTelemetry conventions where possible so that future
5
+ exporters (OTLP, Jaeger, Zipkin) are trivial to add.
6
+
7
+ contract-version: 1
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import time
12
+ import uuid
13
+ from enum import Enum
14
+ from typing import Any, Dict, List, Optional
15
+
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
class SpanKind(str, Enum):
    """OpenTelemetry SpanKind adapted for LLM workloads.

    Subclasses ``str`` so members compare and serialize as their plain
    string values.
    """

    LLM = "llm"
    TOOL = "tool"
    CHAIN = "chain"
    AGENT = "agent"
    RETRIEVER = "retriever"
+
28
+
29
class SpanStatus(str, Enum):
    """Terminal outcome of a span: ``ok`` or ``error``."""

    OK = "ok"
    ERROR = "error"
+
33
+
34
class TraceSpan(BaseModel):
    """A single span in an AI-agent trace.

    This is the stable data contract — backward compatible for life.
    Fields mirror OpenTelemetry conventions where possible.

    Attributes:
        trace_id: 32-hex-char UUID, shared by all spans in one logical run.
        span_id: 16-hex-char UUID, unique per span.
        parent_span_id: 16-hex-char UUID or ``None`` for root spans.
        run_id: Stable ID across a full agent invocation (LangChain ``run_id``).
        run_name: Optional human-readable label for the run.
        project: Logical grouping (e.g. ``k-fix``, ``support-bot``).
        kind: Semantic classification of the span.
        name: Human-readable operation name (``gpt-4o``, ``kubectl_get_pods``).
        start_time: Unix-epoch seconds.
        end_time: Unix-epoch seconds (``None`` until span is closed).
        latency_ms: Computed from (end_time - start_time) * 1000.
        model: LLM model name (OpenAI, Anthropic, etc.).
        input_tokens: Token count consumed by the prompt.
        output_tokens: Token count produced by the completion.
        total_tokens: ``input_tokens + output_tokens``.
        input_data: Truncated/sanitized input payload (JSON string).
        output_data: Truncated/sanitized output payload (JSON string).
        status: ``"ok"`` or ``"error"``.
        error: Error message when status is ``"error"``.
        tags: Free-form key/value labels.
        metadata: Structured metadata (extensible).
        agentgraf_version: Protocol version (current = 1).
    """

    # ── OTel-compatible IDs ──
    trace_id: str = Field(
        default_factory=lambda: uuid.uuid4().hex,
        description="32-char hex (OTel format)",
    )
    span_id: str = Field(
        default_factory=lambda: uuid.uuid4().hex[:16],
        description="16-char hex",
    )
    parent_span_id: Optional[str] = Field(
        default=None,
        description="16-char hex or None (root span)",
    )

    # ── Identification ──
    run_id: str = Field(
        default_factory=lambda: uuid.uuid4().hex[:16],
        description="Stable across a full agent invocation",
    )
    run_name: Optional[str] = None
    project: str = "default"

    # ── Span metadata ──
    kind: SpanKind = SpanKind.CHAIN
    name: str = "unnamed"
    start_time: float = Field(default_factory=time.time)
    end_time: Optional[float] = None
    latency_ms: Optional[int] = None

    # ── LLM-specific ──
    model: Optional[str] = None
    input_tokens: int = 0
    output_tokens: int = 0
    total_tokens: int = 0

    # ── I/O (sanitized, truncated) ──
    input_data: Optional[str] = None
    output_data: Optional[str] = None

    # ── Status ──
    status: SpanStatus = SpanStatus.OK
    error: Optional[str] = None

    # ── Extensibility ──
    tags: Dict[str, Any] = Field(default_factory=dict)
    metadata: Dict[str, Any] = Field(default_factory=dict)

    # ── AgentGraf protocol version ──
    agentgraf_version: int = 1

    # ------------------------------------------------------------------
    # Life-cycle helpers
    # ------------------------------------------------------------------
    def finish(
        self,
        end_time: Optional[float] = None,
        status: Optional[SpanStatus] = None,
        error: Optional[str] = None,
    ) -> "TraceSpan":
        """Close the span, recording end_time, latency, and optional status."""
        # Explicit ``is None`` check (not truthiness): a caller-supplied
        # end_time of 0.0 is a valid epoch timestamp and must not be
        # silently replaced by "now".
        self.end_time = time.time() if end_time is None else end_time
        self.latency_ms = int((self.end_time - self.start_time) * 1000)
        if status is not None:
            self.status = status
        if error is not None:
            self.error = error
        return self

    def set_tag(self, key: str, value: Any) -> "TraceSpan":
        """Fluent helper to add a single tag."""
        self.tags[key] = value
        return self

    def set_metadata(self, key: str, value: Any) -> "TraceSpan":
        """Fluent helper to add a single metadata entry."""
        self.metadata[key] = value
        return self

    def set_input(self, data: str, truncate: int = 10_000) -> "TraceSpan":
        """Set input_data, truncated to at most ``truncate`` characters."""
        # Slicing is already a no-op for short strings — no length check needed.
        self.input_data = data[:truncate]
        return self

    def set_output(self, data: str, truncate: int = 10_000) -> "TraceSpan":
        """Set output_data, truncated to at most ``truncate`` characters."""
        self.output_data = data[:truncate]
        return self

    def set_tokens(self, input_tokens: int, output_tokens: int) -> "TraceSpan":
        """Record token usage and keep total_tokens consistent."""
        self.input_tokens = input_tokens
        self.output_tokens = output_tokens
        self.total_tokens = input_tokens + output_tokens
        return self

    # ------------------------------------------------------------------
    # Serialization
    # ------------------------------------------------------------------
    def to_json(self) -> str:
        """Compact JSON string (one line → friendly for Loki / stdout)."""
        return self.model_dump_json(exclude_none=True)