puffinflow 2.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- puffinflow/__init__.py +132 -0
- puffinflow/core/__init__.py +110 -0
- puffinflow/core/agent/__init__.py +320 -0
- puffinflow/core/agent/base.py +1635 -0
- puffinflow/core/agent/checkpoint.py +50 -0
- puffinflow/core/agent/context.py +521 -0
- puffinflow/core/agent/decorators/__init__.py +90 -0
- puffinflow/core/agent/decorators/builder.py +454 -0
- puffinflow/core/agent/decorators/flexible.py +714 -0
- puffinflow/core/agent/decorators/inspection.py +144 -0
- puffinflow/core/agent/dependencies.py +57 -0
- puffinflow/core/agent/scheduling/__init__.py +21 -0
- puffinflow/core/agent/scheduling/builder.py +160 -0
- puffinflow/core/agent/scheduling/exceptions.py +35 -0
- puffinflow/core/agent/scheduling/inputs.py +137 -0
- puffinflow/core/agent/scheduling/parser.py +209 -0
- puffinflow/core/agent/scheduling/scheduler.py +413 -0
- puffinflow/core/agent/state.py +141 -0
- puffinflow/core/config.py +62 -0
- puffinflow/core/coordination/__init__.py +137 -0
- puffinflow/core/coordination/agent_group.py +359 -0
- puffinflow/core/coordination/agent_pool.py +629 -0
- puffinflow/core/coordination/agent_team.py +577 -0
- puffinflow/core/coordination/coordinator.py +720 -0
- puffinflow/core/coordination/deadlock.py +1759 -0
- puffinflow/core/coordination/fluent_api.py +421 -0
- puffinflow/core/coordination/primitives.py +478 -0
- puffinflow/core/coordination/rate_limiter.py +520 -0
- puffinflow/core/observability/__init__.py +47 -0
- puffinflow/core/observability/agent.py +139 -0
- puffinflow/core/observability/alerting.py +73 -0
- puffinflow/core/observability/config.py +127 -0
- puffinflow/core/observability/context.py +88 -0
- puffinflow/core/observability/core.py +147 -0
- puffinflow/core/observability/decorators.py +105 -0
- puffinflow/core/observability/events.py +71 -0
- puffinflow/core/observability/interfaces.py +196 -0
- puffinflow/core/observability/metrics.py +137 -0
- puffinflow/core/observability/tracing.py +209 -0
- puffinflow/core/reliability/__init__.py +27 -0
- puffinflow/core/reliability/bulkhead.py +96 -0
- puffinflow/core/reliability/circuit_breaker.py +149 -0
- puffinflow/core/reliability/leak_detector.py +122 -0
- puffinflow/core/resources/__init__.py +77 -0
- puffinflow/core/resources/allocation.py +790 -0
- puffinflow/core/resources/pool.py +645 -0
- puffinflow/core/resources/quotas.py +567 -0
- puffinflow/core/resources/requirements.py +217 -0
- puffinflow/version.py +21 -0
- puffinflow-2.dev0.dist-info/METADATA +334 -0
- puffinflow-2.dev0.dist-info/RECORD +55 -0
- puffinflow-2.dev0.dist-info/WHEEL +5 -0
- puffinflow-2.dev0.dist-info/entry_points.txt +3 -0
- puffinflow-2.dev0.dist-info/licenses/LICENSE +21 -0
- puffinflow-2.dev0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
import uuid
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from enum import Enum
|
|
8
|
+
from typing import Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SpanType(Enum):
    """Category labels that can be attached to a span."""

    WORKFLOW = "workflow"
    STATE = "state"
    RESOURCE = "resource"
    BUSINESS = "business"
    SYSTEM = "system"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MetricType(Enum):
    """Kinds of metric a metrics provider can create."""

    COUNTER = "counter"
    GAUGE = "gauge"
    HISTOGRAM = "histogram"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class AlertSeverity(Enum):
    """Severity levels that can be assigned to an alert."""

    INFO = "info"
    WARNING = "warning"
    ERROR = "error"
    CRITICAL = "critical"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class SpanContext:
    """Identifiers that correlate a span with its trace and its origin.

    ``trace_id`` and ``span_id`` default to freshly generated UUID4 strings;
    every other field defaults to ``None``.
    """

    trace_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    span_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    parent_span_id: Optional[str] = None
    workflow_id: Optional[str] = None
    agent_name: Optional[str] = None
    state_name: Optional[str] = None
    user_id: Optional[str] = None
    session_id: Optional[str] = None

    def child_context(self) -> "SpanContext":
        """Return a new context that is a child of this one.

        The child keeps this context's trace id and descriptive fields,
        receives a new random span id, and records this context's span id
        as its parent.
        """
        # Fields copied unchanged from parent to child.
        inherited = {
            field_name: getattr(self, field_name)
            for field_name in (
                "trace_id",
                "workflow_id",
                "agent_name",
                "state_name",
                "user_id",
                "session_id",
            )
        }
        return SpanContext(
            span_id=str(uuid.uuid4()),
            parent_span_id=self.span_id,
            **inherited,
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class ObservabilityEvent:
    """A single structured observability record.

    The first five fields are required. ``attributes`` defaults to a new
    empty dict for each instance and ``span_context`` defaults to ``None``.
    """

    timestamp: datetime
    event_type: str
    source: str
    level: str
    message: str
    attributes: dict[str, Any] = field(default_factory=dict)
    span_context: Optional[SpanContext] = None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Span(ABC):
    """Interface that every span implementation must provide."""

    @abstractmethod
    def set_attribute(self, key: str, value: Any) -> None:
        """Attach the attribute ``key`` with ``value`` to the span."""

    @abstractmethod
    def set_status(self, status: str, description: Optional[str] = None) -> None:
        """Set the span's status, with an optional description."""

    @abstractmethod
    def add_event(self, name: str, attributes: Optional[dict[str, Any]] = None) -> None:
        """Add a named event, with optional attributes, to the span."""

    @abstractmethod
    def record_exception(self, exception: Exception) -> None:
        """Record ``exception`` on the span."""

    @abstractmethod
    def end(self) -> None:
        """Finish the span."""

    @property
    @abstractmethod
    def context(self) -> SpanContext:
        """The ``SpanContext`` associated with this span."""
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class TracingProvider(ABC):
    """Interface for tracing backends, plus a ready-made span context manager."""

    @abstractmethod
    def start_span(
        self,
        name: str,
        span_type: SpanType = SpanType.SYSTEM,
        parent: Optional[SpanContext] = None,
        **attributes: Any,
    ) -> Span:
        """Create and return a new span called ``name``."""

    @abstractmethod
    def get_current_span(self) -> Optional[Span]:
        """Return the currently active span, or ``None`` if there is none."""

    @contextmanager
    def span(
        self,
        name: str,
        span_type: SpanType = SpanType.SYSTEM,
        parent: Optional[SpanContext] = None,
        **attributes: Any,
    ) -> Iterator[Span]:
        """Run the ``with`` body inside a span started via ``start_span``.

        The span is marked ``"ok"`` when the body finishes normally. If the
        body raises an ``Exception`` it is recorded on the span and
        re-raised. The span is always ended on exit.
        """
        active = self.start_span(name, span_type, parent, **attributes)
        try:
            yield active
            # Kept inside the ``try`` so that a failure while setting the
            # status is recorded on the span as well.
            active.set_status("ok")
        except Exception as exc:
            active.record_exception(exc)
            raise
        finally:
            active.end()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class Metric(ABC):
    """Interface that every metric implementation must provide."""

    @abstractmethod
    def record(self, value: float, **labels: Any) -> None:
        """Record ``value`` for the series identified by ``labels``."""
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class MetricsProvider(ABC):
    """Interface for backends that create metrics and export them."""

    @abstractmethod
    def counter(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return a counter metric called ``name``."""

    @abstractmethod
    def gauge(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return a gauge metric called ``name``."""

    @abstractmethod
    def histogram(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return a histogram metric called ``name``."""

    @abstractmethod
    def export_metrics(self) -> str:
        """Return the metrics rendered as a string in Prometheus format."""
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class AlertingProvider(ABC):
    """Interface for backends that deliver alerts."""

    @abstractmethod
    async def send_alert(
        self,
        message: str,
        severity: AlertSeverity,
        attributes: Optional[dict[str, Any]] = None,
    ) -> None:
        """Send an alert with ``message``, ``severity`` and optional attributes."""
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
class EventProcessor(ABC):
    """Interface for components that consume observability events."""

    @abstractmethod
    async def process_event(self, event: ObservabilityEvent) -> None:
        """Handle a single ``ObservabilityEvent``."""
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
from typing import Any, Optional
|
|
3
|
+
|
|
4
|
+
from prometheus_client import CollectorRegistry, generate_latest
|
|
5
|
+
from prometheus_client import Counter as PrometheusCounter
|
|
6
|
+
from prometheus_client import Gauge as PrometheusGauge
|
|
7
|
+
from prometheus_client import Histogram as PrometheusHistogram
|
|
8
|
+
|
|
9
|
+
from .config import MetricsConfig
|
|
10
|
+
from .interfaces import Metric, MetricsProvider, MetricType
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PrometheusMetric(Metric):
    """Wrapper around a ``prometheus_client`` metric with a cardinality cap.

    The cap limits the number of *distinct label combinations* (series)
    recorded through this wrapper. Samples for a series that has already been
    seen are always recorded; only samples that would create a new series
    beyond ``cardinality_limit`` are dropped.
    """

    def __init__(
        self, prometheus_metric: Any, metric_type: MetricType, cardinality_limit: int
    ) -> None:
        """Wrap ``prometheus_metric``.

        Args:
            prometheus_metric: The underlying prometheus_client metric object.
            metric_type: Selects which operation ``record`` performs
                (``inc`` / ``set`` / ``observe``).
            cardinality_limit: Maximum number of distinct label combinations
                this wrapper will record.
        """
        self._prometheus_metric = prometheus_metric
        self._metric_type = metric_type
        self._cardinality_limit = cardinality_limit
        # Number of distinct series recorded so far (== len(_seen_series)).
        self._series_count = 0
        self._seen_series: set[tuple[tuple[str, str], ...]] = set()
        self._lock = threading.Lock()

    def record(self, value: float, **labels: Any) -> None:
        """Record ``value`` for the series identified by ``labels``.

        Label values are converted to strings and labels whose value is
        ``None`` are omitted. Failures raised by the underlying metric are
        printed and swallowed so that metrics never break the application.
        """
        # Convert label values to strings
        str_labels = {k: str(v) for k, v in labels.items() if v is not None}
        # Order-independent identity of the series.
        series_key = tuple(sorted(str_labels.items()))

        # The check, the write and the bookkeeping share one critical section
        # so concurrent callers cannot overshoot the limit or lose an update.
        with self._lock:
            is_new_series = series_key not in self._seen_series

            # Basic cardinality protection: refuse only *new* series once the
            # limit is reached; existing series keep receiving samples.
            if is_new_series and self._series_count >= self._cardinality_limit:
                return  # Skip to prevent memory issues

            try:
                target = (
                    self._prometheus_metric.labels(**str_labels)
                    if str_labels
                    else self._prometheus_metric
                )
                if self._metric_type == MetricType.COUNTER:
                    target.inc(value)
                elif self._metric_type == MetricType.GAUGE:
                    target.set(value)
                elif self._metric_type == MetricType.HISTOGRAM:
                    target.observe(value)
            except Exception as e:
                # Log error but don't fail the application
                print(f"Failed to record metric: {e}")
                return

            if is_new_series:
                self._seen_series.add(series_key)
                self._series_count += 1
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class PrometheusMetricsProvider(MetricsProvider):
    """``MetricsProvider`` backed by ``prometheus_client``.

    Metrics are registered in a registry private to this provider and cached
    by their namespaced name.
    """

    def __init__(self, config: MetricsConfig):
        self.config = config
        self._registry = CollectorRegistry()
        self._metrics_cache: dict[str, Metric] = {}
        self._lock = threading.Lock()

    def counter(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return the counter called ``name``, creating it on first use."""
        label_names = labels or []
        return self._get_or_create_metric(
            name, MetricType.COUNTER, description, label_names
        )

    def gauge(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return the gauge called ``name``, creating it on first use."""
        label_names = labels or []
        return self._get_or_create_metric(
            name, MetricType.GAUGE, description, label_names
        )

    def histogram(
        self, name: str, description: str = "", labels: Optional[list[str]] = None
    ) -> Metric:
        """Return the histogram called ``name``, creating it on first use."""
        label_names = labels or []
        return self._get_or_create_metric(
            name, MetricType.HISTOGRAM, description, label_names
        )

    def _get_or_create_metric(
        self, name: str, metric_type: MetricType, description: str, labels: list[str]
    ) -> Metric:
        """Return the cached metric for ``name`` or build and cache a new one.

        The cache key is ``"<namespace>_<name>"``; the metric type is not part
        of the key.

        Raises:
            ValueError: If ``metric_type`` is not a supported type.
        """
        metric_key = f"{self.config.namespace}_{name}"

        with self._lock:
            if metric_key in self._metrics_cache:
                return self._metrics_cache[metric_key]

            labelnames = labels or []

            # Pick the prometheus_client class first, then build it once.
            metric_cls: Any
            if metric_type == MetricType.COUNTER:
                metric_cls = PrometheusCounter
            elif metric_type == MetricType.GAUGE:
                metric_cls = PrometheusGauge
            elif metric_type == MetricType.HISTOGRAM:
                metric_cls = PrometheusHistogram
            else:
                raise ValueError(f"Unsupported metric type: {metric_type}")

            prometheus_metric: Any = metric_cls(
                metric_key,
                description,
                labelnames=labelnames,
                registry=self._registry,
            )

            wrapped = PrometheusMetric(
                prometheus_metric, metric_type, self.config.cardinality_limit
            )
            self._metrics_cache[metric_key] = wrapped
            return wrapped

    def export_metrics(self) -> str:
        """Return this provider's registry rendered as UTF-8 text."""
        payload: bytes = generate_latest(self._registry)
        return payload.decode("utf-8")
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
import time
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
try:
|
|
8
|
+
from opentelemetry import trace
|
|
9
|
+
from opentelemetry.exporter.jaeger.thrift import JaegerExporter
|
|
10
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
11
|
+
from opentelemetry.sdk.resources import Resource
|
|
12
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
13
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
|
|
14
|
+
from opentelemetry.trace import Status, StatusCode
|
|
15
|
+
|
|
16
|
+
_OPENTELEMETRY_AVAILABLE = True
|
|
17
|
+
except ImportError:
|
|
18
|
+
# Create mock classes for when OpenTelemetry is not available
|
|
19
|
+
trace = None
|
|
20
|
+
JaegerExporter = None
|
|
21
|
+
OTLPSpanExporter = None
|
|
22
|
+
Resource = None
|
|
23
|
+
TracerProvider = None
|
|
24
|
+
BatchSpanProcessor = None
|
|
25
|
+
ConsoleSpanExporter = None
|
|
26
|
+
Status = None
|
|
27
|
+
StatusCode = None
|
|
28
|
+
_OPENTELEMETRY_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
from .config import TracingConfig
|
|
31
|
+
from .interfaces import Span, SpanContext, SpanType, TracingProvider
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class OpenTelemetrySpan(Span):
    """``Span`` implementation that forwards to an OpenTelemetry span.

    Every forwarding method does nothing when OpenTelemetry could not be
    imported or when no underlying span was supplied.
    """

    def __init__(self, otel_span: Any, span_context: SpanContext):
        self._span = otel_span
        self._context = span_context
        self._start_time = time.time()

        # Copy the populated context fields onto the underlying span.
        if _OPENTELEMETRY_AVAILABLE and self._span:
            context_attributes = (
                ("workflow.id", span_context.workflow_id),
                ("agent.name", span_context.agent_name),
                ("state.name", span_context.state_name),
                ("user.id", span_context.user_id),
            )
            for attr_key, attr_value in context_attributes:
                if attr_value:
                    self._span.set_attribute(attr_key, attr_value)

    def set_attribute(self, key: str, value: Any) -> None:
        """Set ``key`` to ``value`` on the underlying span.

        Empty keys and ``None`` values are ignored; dict and list values are
        converted with ``str()`` before being forwarded.
        """
        if not (_OPENTELEMETRY_AVAILABLE and self._span and key and value is not None):
            return
        forwarded = str(value) if isinstance(value, (dict, list)) else value
        self._span.set_attribute(key, forwarded)

    def set_status(self, status: str, description: Optional[str] = None) -> None:
        """Translate ``status`` into an OpenTelemetry status.

        ``"ok"``/``"success"`` map to ``StatusCode.OK`` and
        ``"error"``/``"failed"`` map to ``StatusCode.ERROR``
        (case-insensitive). Any other value is ignored.
        """
        if not (_OPENTELEMETRY_AVAILABLE and self._span):
            return
        normalized = status.lower()
        if normalized in ["ok", "success"]:
            self._span.set_status(Status(StatusCode.OK, description))
        elif normalized in ["error", "failed"]:
            self._span.set_status(Status(StatusCode.ERROR, description))

    def add_event(self, name: str, attributes: Optional[dict[str, Any]] = None) -> None:
        """Add event ``name`` to the span, dropping ``None``-valued attributes."""
        if not (_OPENTELEMETRY_AVAILABLE and self._span):
            return
        supplied = attributes or {}
        cleaned = {k: v for k, v in supplied.items() if v is not None}
        self._span.add_event(name, cleaned)

    def record_exception(self, exception: Exception) -> None:
        """Record ``exception`` on the span and set the status to error."""
        if not (_OPENTELEMETRY_AVAILABLE and self._span):
            return
        self._span.record_exception(exception)
        self.set_status("error", str(exception))

    def end(self) -> None:
        """Store the elapsed time as ``span.duration_ms`` and end the span."""
        elapsed_seconds = time.time() - self._start_time
        self.set_attribute("span.duration_ms", elapsed_seconds * 1000)
        if _OPENTELEMETRY_AVAILABLE and self._span:
            self._span.end()

    @property
    def context(self) -> SpanContext:
        """The ``SpanContext`` this span was created with."""
        return self._context
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class OpenTelemetryTracingProvider(TracingProvider):
    """``TracingProvider`` backed by OpenTelemetry.

    The currently active span is stored per thread. When OpenTelemetry could
    not be imported no tracer is created, and spans only carry their
    ``SpanContext``.
    """

    def __init__(self, config: TracingConfig):
        self.config = config
        # Per-thread holder for the currently active span.
        self._current_context = threading.local()
        self._tracer: Any = None
        if _OPENTELEMETRY_AVAILABLE:
            self._setup_tracing()

    def _setup_tracing(self) -> None:
        """Create the tracer provider, attach the configured exporters and
        obtain a tracer. Does nothing when OpenTelemetry is unavailable."""
        if not _OPENTELEMETRY_AVAILABLE:
            return

        resource = Resource.create(
            {
                "service.name": self.config.service_name,
                "service.version": self.config.service_version,
            }
        )

        provider = TracerProvider(resource=resource)
        trace.set_tracer_provider(provider)

        # Setup exporters
        processors = []

        if self.config.otlp_endpoint:
            otlp_exporter = OTLPSpanExporter(endpoint=self.config.otlp_endpoint)
            processors.append(BatchSpanProcessor(otlp_exporter))

        if self.config.jaeger_endpoint:
            # Endpoint is "host" or "host:port"; 6831 is used when no port
            # is given.
            endpoint_parts = self.config.jaeger_endpoint.split(":")
            jaeger_exporter = JaegerExporter(
                agent_host_name=endpoint_parts[0],
                agent_port=(
                    int(endpoint_parts[1]) if len(endpoint_parts) > 1 else 6831
                ),
            )
            processors.append(BatchSpanProcessor(jaeger_exporter))

        if self.config.console_enabled:
            console_exporter = ConsoleSpanExporter()
            processors.append(BatchSpanProcessor(console_exporter))

        for processor in processors:
            provider.add_span_processor(processor)

        self._tracer = trace.get_tracer(
            instrumenting_module_name="puffinflow.observability",
            instrumenting_library_version="1.0.0",
        )

    def start_span(
        self,
        name: str,
        span_type: SpanType = SpanType.SYSTEM,
        parent: Optional[SpanContext] = None,
        **attributes: Any,
    ) -> Span:
        """Start a span and make it the current span for this thread.

        The span's context is derived from ``parent`` when given, otherwise
        from the current span if there is one, otherwise it is a new root
        context. ``span.type`` and any extra ``attributes`` are set on the
        span.
        """
        # Create span context
        if parent:
            span_context = parent.child_context()
        else:
            current_span = self.get_current_span()
            if current_span:
                span_context = current_span.context.child_context()
            else:
                span_context = SpanContext()

        # Start OpenTelemetry span if available
        otel_span = None
        if _OPENTELEMETRY_AVAILABLE and self._tracer:
            otel_span = self._tracer.start_span(name)

        # Create wrapper
        span = OpenTelemetrySpan(otel_span, span_context)

        # Set additional attributes
        span.set_attribute("span.type", span_type.value)
        for key, value in attributes.items():
            span.set_attribute(key, value)

        self._set_current_span(span)
        return span

    def get_current_span(self) -> Optional[Span]:
        """Return this thread's current span, or ``None`` if there is none."""
        return getattr(self._current_context, "current_span", None)

    def _set_current_span(self, span: Optional[Span]) -> None:
        """Set this thread's current span (``None`` clears it)."""
        self._current_context.current_span = span

    @contextmanager
    def span(
        self,
        name: str,
        span_type: SpanType = SpanType.SYSTEM,
        parent: Optional[SpanContext] = None,
        **attributes: Any,
    ) -> Iterator[Span]:
        """Run the ``with`` body inside a new span.

        The span is marked ``"ok"`` on normal completion; an ``Exception``
        raised by the body is recorded on the span and re-raised. On exit the
        span is ended and the span that was current before entry becomes
        current again, so nested ``span()`` blocks keep their parent.
        """
        # Captured before start_span() replaces the current span.
        previous_span = self.get_current_span()
        span = self.start_span(name, span_type, parent, **attributes)
        try:
            yield span
            span.set_status("ok")
        except Exception as e:
            span.record_exception(e)
            raise
        finally:
            try:
                span.end()
            finally:
                # Restore rather than clear: resetting to None here would
                # detach the enclosing span once a nested span finished.
                self._set_current_span(previous_span)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Reliability patterns for production workflows."""
|
|
2
|
+
|
|
3
|
+
# Import submodules for import path tests
|
|
4
|
+
from . import bulkhead, circuit_breaker, leak_detector
|
|
5
|
+
from .bulkhead import Bulkhead, BulkheadConfig, BulkheadFullError
|
|
6
|
+
from .circuit_breaker import (
|
|
7
|
+
CircuitBreaker,
|
|
8
|
+
CircuitBreakerConfig,
|
|
9
|
+
CircuitBreakerError,
|
|
10
|
+
CircuitState,
|
|
11
|
+
)
|
|
12
|
+
from .leak_detector import ResourceLeak, ResourceLeakDetector
|
|
13
|
+
|
|
14
|
+
# Names exported by ``from <package> import *``: the re-exported classes
# first, then the submodules themselves.
__all__ = [
    # bulkhead
    "Bulkhead",
    "BulkheadConfig",
    "BulkheadFullError",
    # circuit_breaker
    "CircuitBreaker",
    "CircuitBreakerConfig",
    "CircuitBreakerError",
    "CircuitState",
    # leak_detector
    "ResourceLeak",
    "ResourceLeakDetector",
    # submodules
    "bulkhead",
    "circuit_breaker",
    "leak_detector",
]
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Bulkhead pattern for resource isolation."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from collections.abc import AsyncIterator
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class BulkheadConfig:
    """Settings for a single ``Bulkhead``."""

    # Name used in error messages and metrics.
    name: str
    # Initial value of the bulkhead's semaphore.
    max_concurrent: int
    # Bound checked against the bulkhead's queue counter before admission.
    max_queue_size: int = 100
    # Timeout passed to ``asyncio.wait_for`` while waiting for a slot.
    timeout: float = 30.0
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BulkheadFullError(Exception):
    """Error raised when a bulkhead cannot admit another caller."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Bulkhead:
    """Limit how many callers may run a protected section concurrently.

    A semaphore caps concurrency at ``config.max_concurrent``. Callers that
    are waiting for or holding a slot are counted in ``_queue_size``, which is
    bounded by ``config.max_queue_size``.
    """

    def __init__(self, config: "BulkheadConfig"):
        self.config = config
        self._semaphore = asyncio.Semaphore(config.max_concurrent)
        # Callers currently inside isolate() (waiting for or holding a slot).
        self._queue_size = 0
        # Tasks that currently hold a slot.
        self._active_tasks: set[asyncio.Task] = set()

    @asynccontextmanager
    async def isolate(self) -> AsyncIterator[None]:
        """Async context manager that runs its body while holding a slot.

        Raises:
            BulkheadFullError: If ``_queue_size`` has already reached
                ``config.max_queue_size``, or if no slot becomes available
                within ``config.timeout`` seconds.
        """
        # Check queue capacity
        if self._queue_size >= self.config.max_queue_size:
            raise BulkheadFullError(f"Bulkhead {self.config.name} queue full")

        self._queue_size += 1
        try:
            # Wait for semaphore with timeout
            try:
                await asyncio.wait_for(
                    self._semaphore.acquire(), timeout=self.config.timeout
                )
            except asyncio.TimeoutError as e:
                raise BulkheadFullError(
                    f"Bulkhead {self.config.name} timeout waiting for slot"
                ) from e

            # Track the task holding the slot so the "active_tasks" metric
            # reflects reality. Only the outermost entry of a given task
            # registers it, so a nested isolate() in the same task does not
            # unregister it early.
            task = asyncio.current_task()
            newly_tracked = task is not None and task not in self._active_tasks
            if newly_tracked:
                self._active_tasks.add(task)

            try:
                yield
            finally:
                if newly_tracked:
                    self._active_tasks.discard(task)
                self._semaphore.release()
        finally:
            self._queue_size -= 1

    def get_metrics(self) -> dict[str, Any]:
        """Return a snapshot of the bulkhead's configuration and counters."""
        return {
            "name": self.config.name,
            "max_concurrent": self.config.max_concurrent,
            "available_slots": self._semaphore._value,
            "queue_size": self._queue_size,
            "max_queue_size": self.config.max_queue_size,
            "active_tasks": len(self._active_tasks),
        }
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# Global bulkhead registry
|
|
72
|
+
class BulkheadRegistry:
    """Name-keyed collection of ``Bulkhead`` instances."""

    def __init__(self) -> None:
        self._bulkheads: dict[str, Bulkhead] = {}

    def get_or_create(
        self, name: str, config: Optional[BulkheadConfig] = None
    ) -> Bulkhead:
        """Return the bulkhead registered as ``name``, creating it if absent.

        ``config`` is only used when the bulkhead does not exist yet. When it
        is omitted, a default config with ``max_concurrent=5`` is used.
        """
        try:
            return self._bulkheads[name]
        except KeyError:
            pass

        effective_config = config
        if effective_config is None:
            effective_config = BulkheadConfig(name=name, max_concurrent=5)

        created = Bulkhead(effective_config)
        self._bulkheads[name] = created
        return created

    def get_all_metrics(self) -> dict[str, dict[str, Any]]:
        """Return ``get_metrics()`` of every registered bulkhead, keyed by name."""
        snapshot: dict[str, dict[str, Any]] = {}
        for registered_name, registered in self._bulkheads.items():
            snapshot[registered_name] = registered.get_metrics()
        return snapshot
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# Global registry instance
|
|
96
|
+
# Module-level BulkheadRegistry instance, created when this module is imported.
bulkhead_registry = BulkheadRegistry()
|