ecip-observability-stack 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +48 -0
- package/README.md +75 -0
- package/alerts/analysis-backlog.yaml +39 -0
- package/alerts/cache-degradation.yaml +44 -0
- package/alerts/dlq-depth.yaml +56 -0
- package/alerts/lsp-daemon.yaml +43 -0
- package/alerts/mcp-latency.yaml +46 -0
- package/alerts/security-anomaly.yaml +59 -0
- package/alerts/sla-latency.yaml +61 -0
- package/chaos/kafka-broker-restart.sh +168 -0
- package/chaos/kill-lsp-daemon.sh +148 -0
- package/chaos/redis-node-failure.sh +318 -0
- package/ci/check-observability-contract.js +285 -0
- package/ci/eslint-plugin-ecip/index.js +209 -0
- package/ci/eslint-plugin-ecip/package.json +12 -0
- package/ci/github-actions-observability-gate.yaml +180 -0
- package/ci/ruff-shared.toml +41 -0
- package/collector/otel-collector-config.yaml +226 -0
- package/collector/otel-collector-daemonset.yaml +168 -0
- package/collector/sampling-config.yaml +83 -0
- package/dashboards/_provisioning/grafana-dashboards.yaml +16 -0
- package/dashboards/analysis-throughput.json +166 -0
- package/dashboards/cache-performance.json +129 -0
- package/dashboards/cross-repo-fanout.json +93 -0
- package/dashboards/event-bus-dlq.json +129 -0
- package/dashboards/lsp-daemon-health.json +104 -0
- package/dashboards/mcp-call-graph.json +114 -0
- package/dashboards/query-latency.json +160 -0
- package/dashboards/security-events.json +131 -0
- package/docs/M08-Observability-Design.md +639 -0
- package/docs/PROGRESS.md +375 -0
- package/docs/module-documentation.md +64 -0
- package/elasticsearch/ilm-policy.json +57 -0
- package/elasticsearch/index-template.json +62 -0
- package/elasticsearch/kibana-space.yaml +53 -0
- package/helm/Chart.yaml +30 -0
- package/helm/templates/configmaps.yaml +25 -0
- package/helm/templates/elasticsearch.yaml +68 -0
- package/helm/templates/grafana-secret.yaml +22 -0
- package/helm/templates/grafana.yaml +19 -0
- package/helm/templates/loki.yaml +33 -0
- package/helm/templates/otel-collector.yaml +119 -0
- package/helm/templates/prometheus.yaml +43 -0
- package/helm/templates/tempo.yaml +16 -0
- package/helm/values.prod.yaml +159 -0
- package/helm/values.yaml +146 -0
- package/logging-lib/nodejs/package.json +57 -0
- package/logging-lib/nodejs/pnpm-lock.yaml +4576 -0
- package/logging-lib/python/pyproject.toml +45 -0
- package/logging-lib/python/src/__init__.py +19 -0
- package/logging-lib/python/src/logger.py +131 -0
- package/logging-lib/python/src/security_events.py +150 -0
- package/logging-lib/python/src/tracer.py +185 -0
- package/logging-lib/python/tests/test_logger.py +113 -0
- package/package.json +21 -0
- package/prometheus/prometheus-values.yaml +170 -0
- package/prometheus/recording-rules.yaml +97 -0
- package/prometheus/scrape-configs.yaml +122 -0
- package/runbooks/SDK-INTEGRATION.md +239 -0
- package/runbooks/alert-response/ANALYSIS_BACKLOG.md +128 -0
- package/runbooks/alert-response/DLQ_DEPTH_EXCEEDED.md +150 -0
- package/runbooks/alert-response/HIGH_QUERY_LATENCY.md +134 -0
- package/runbooks/alert-response/LSP_DAEMON_RESTART.md +118 -0
- package/runbooks/alert-response/SECURITY_ANOMALY.md +160 -0
- package/runbooks/dashboard-guide.md +169 -0
- package/scripts/lint-dashboards.js +184 -0
- package/tempo/tempo-datasource.yaml +46 -0
- package/tempo/tempo-values.yaml +94 -0
- package/tests/alert-threshold-config.test.ts +283 -0
- package/tests/log-schema-validation.test.ts +246 -0
- package/tests/metric-label-validation.test.ts +292 -0
- package/tests/otel-pipeline-integration.test.ts +420 -0
- package/tests/security-events.test.ts +417 -0
- package/tsconfig.json +17 -0
- package/vitest.config.ts +21 -0
- package/vitest.integration.config.ts +9 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ecip-observability"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "ECIP shared observability library — structured logging, tracing, and security event helpers for Python services"
|
|
9
|
+
authors = [
|
|
10
|
+
{ name = "ECIP Platform Team" }
|
|
11
|
+
]
|
|
12
|
+
requires-python = ">=3.11"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"structlog>=23.2.0",
|
|
15
|
+
"opentelemetry-api>=1.22.0",
|
|
16
|
+
"opentelemetry-sdk>=1.22.0",
|
|
17
|
+
"opentelemetry-exporter-otlp-proto-grpc>=1.22.0",
|
|
18
|
+
"opentelemetry-exporter-otlp-proto-http>=1.22.0",
|
|
19
|
+
"opentelemetry-instrumentation>=0.43b0",
|
|
20
|
+
"opentelemetry-instrumentation-grpc>=0.43b0",
|
|
21
|
+
"opentelemetry-instrumentation-requests>=0.43b0",
|
|
22
|
+
"opentelemetry-instrumentation-aiohttp-client>=0.43b0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
dev = [
|
|
27
|
+
"pytest>=7.4.0",
|
|
28
|
+
"pytest-asyncio>=0.23.0",
|
|
29
|
+
"ruff>=0.2.0",
|
|
30
|
+
"mypy>=1.8.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[tool.setuptools.packages.find]
|
|
34
|
+
where = ["src"]
|
|
35
|
+
|
|
36
|
+
[tool.ruff]
|
|
37
|
+
target-version = "py311"
|
|
38
|
+
line-length = 120
|
|
39
|
+
|
|
40
|
+
[tool.ruff.lint]
|
|
41
|
+
select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
|
|
42
|
+
|
|
43
|
+
[tool.mypy]
|
|
44
|
+
python_version = "3.11"
|
|
45
|
+
strict = true
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ecip_observability — Package init
|
|
3
|
+
|
|
4
|
+
Re-exports all public APIs.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from .logger import get_logger, MissingObservabilityContext
|
|
8
|
+
from .tracer import init_tracer, get_tracer, traced
|
|
9
|
+
from .security_events import emit_auth_failure, emit_rbac_denial
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"get_logger",
|
|
13
|
+
"MissingObservabilityContext",
|
|
14
|
+
"init_tracer",
|
|
15
|
+
"get_tracer",
|
|
16
|
+
"traced",
|
|
17
|
+
"emit_auth_failure",
|
|
18
|
+
"emit_rbac_denial",
|
|
19
|
+
]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ecip_observability — Structured Logger (Python)
|
|
3
|
+
|
|
4
|
+
structlog-based logger with mandatory ECIP context fields.
|
|
5
|
+
Missing fields raise MissingObservabilityContext at logger creation time.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from ecip_observability import get_logger
|
|
9
|
+
log = get_logger(repo="acme/auth", branch="main", user_id="u_abc", module="M02")
|
|
10
|
+
log.info("Analysis complete", duration_ms=14200, files_indexed=47)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
import structlog
|
|
22
|
+
from opentelemetry import trace
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
# Exceptions
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
|
|
29
|
+
class MissingObservabilityContext(Exception):
    """Signal that a mandatory ECIP observability field is absent or invalid.

    ``get_logger`` raises this when ``repo``, ``branch``, ``user_id`` or
    ``module`` is empty, and when ``module`` is not a known identifier.
    """
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Valid module identifiers
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
# Module identifiers accepted by get_logger(); any other value is rejected
# there with MissingObservabilityContext.
VALID_MODULES = {"M01", "M02", "M03", "M04", "M05", "M06", "M07", "M08"}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# structlog configuration
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
def _configure_structlog() -> None:
    """Configure structlog globally for JSON output on stdout.

    The processor chain merges context variables, adds the log level and a
    UTC ISO-8601 timestamp, attaches trace/span ids via
    ``_add_trace_context``, renders stack and exception info, and finally
    serialises the event as JSON.

    The minimum level is read from the ``LOG_LEVEL`` environment variable
    (case-insensitive, surrounding whitespace ignored, default ``INFO``).
    An unrecognised value falls back to ``INFO`` rather than raising: this
    function is invoked at module import, so a typo in the environment must
    not make the whole library un-importable.
    """
    requested_level = os.environ.get("LOG_LEVEL", "INFO").strip().upper()
    min_level = logging.getLevelName(requested_level)
    if not isinstance(min_level, int):
        # For an unknown name getLevelName() returns the string
        # "Level <name>", which make_filtering_bound_logger() cannot map to
        # a filtering logger class.
        min_level = logging.INFO

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso", utc=True),
            _add_trace_context,
            structlog.processors.StackInfoRenderer(),
            structlog.processors.format_exc_info,
            structlog.processors.UnicodeDecoder(),
            structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(min_level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(file=sys.stdout),
        cache_logger_on_first_use=True,
    )
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _add_trace_context(
    logger: Any, method_name: str, event_dict: dict[str, Any]
) -> dict[str, Any]:
    """structlog processor that stamps each event with trace and span ids.

    When the current OpenTelemetry span is recording, ``trace_id`` is written
    as 32 lowercase hex digits and ``span_id`` as 16. Otherwise the
    placeholder strings ``"no-active-trace"`` / ``"no-active-span"`` are
    used, so both keys are always present.

    Args:
        logger: Unused; part of the structlog processor signature.
        method_name: Unused; part of the structlog processor signature.
        event_dict: Event being built; mutated in place.

    Returns:
        The same ``event_dict`` with ``trace_id`` and ``span_id`` set.
    """
    current_span = trace.get_current_span()
    if not (current_span and current_span.is_recording()):
        event_dict["trace_id"] = "no-active-trace"
        event_dict["span_id"] = "no-active-span"
        return event_dict

    span_context = current_span.get_span_context()
    event_dict["trace_id"] = format(span_context.trace_id, "032x")
    event_dict["span_id"] = format(span_context.span_id, "016x")
    return event_dict
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Initialize on module import
|
|
83
|
+
_configure_structlog()
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ---------------------------------------------------------------------------
|
|
87
|
+
# Logger factory
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
def get_logger(
    *,
    repo: str,
    branch: str,
    user_id: str,
    module: str,
) -> structlog.BoundLogger:
    """Build a structlog logger pre-bound with the mandatory ECIP context.

    Every argument is keyword-only and must be non-empty. Fields are checked
    in the order ``repo``, ``branch``, ``user_id``, ``module``; the first
    empty one is reported.

    Args:
        repo: Repository in the form {org}/{repo}
        branch: Branch being operated on
        user_id: Hashed user identifier (no raw PII)
        module: ECIP module identifier; must be a member of ``VALID_MODULES``

    Returns:
        A bound logger carrying ``repo``, ``branch``, ``user_id``, ``module``
        and ``env``. ``env`` is taken from ``NODE_ENV``, else ``ENVIRONMENT``,
        else ``"development"``.

    Raises:
        MissingObservabilityContext: If a field is empty or ``module`` is not
            a recognised identifier.
    """
    required_fields = (
        ("repo", repo),
        ("branch", branch),
        ("user_id", user_id),
        ("module", module),
    )
    for field_name, field_value in required_fields:
        if not field_value:
            raise MissingObservabilityContext(f"'{field_name}' is required")

    if module not in VALID_MODULES:
        raise MissingObservabilityContext(
            f"'module' must be one of {VALID_MODULES}, got '{module}'"
        )

    fallback_env = os.environ.get("ENVIRONMENT", "development")
    bound_context = {
        "repo": repo,
        "branch": branch,
        "user_id": user_id,
        "module": module,
        "env": os.environ.get("NODE_ENV", fallback_env),
    }
    return structlog.get_logger().bind(**bound_context)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ecip_observability — Security Event Helpers (Python)
|
|
3
|
+
|
|
4
|
+
Security events route to a dedicated Elasticsearch pipeline — NEVER to
|
|
5
|
+
the general log store. Use these helpers exclusively for auth/RBAC events.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from ecip_observability import emit_auth_failure, emit_rbac_denial
|
|
9
|
+
|
|
10
|
+
emit_auth_failure(
|
|
11
|
+
user_id="u_abc",
|
|
12
|
+
reason="jwt_expired",
|
|
13
|
+
source_ip="10.0.14.22",
|
|
14
|
+
module="M02",
|
|
15
|
+
)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
import json
|
|
22
|
+
import sys
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from typing import Any, Literal
|
|
25
|
+
|
|
26
|
+
from opentelemetry import trace
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
# Types
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
# Reason codes accepted by emit_auth_failure() for its `reason` argument.
# This is a typing-level constraint only; this module does not check it at runtime.
AuthFailureReason = Literal["jwt_expired", "jwt_invalid", "jwt_missing", "mtls_rejected"]
|
|
34
|
+
# Actions accepted by emit_rbac_denial() for its `action` argument.
# This is a typing-level constraint only; this module does not check it at runtime.
RbacAction = Literal["read", "write", "admin"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------------------
|
|
38
|
+
# Internal helpers
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
def _hash_user_id(user_id: str) -> str:
|
|
42
|
+
"""Hash a user ID to prevent raw PII in security logs."""
|
|
43
|
+
if user_id.startswith("u_"):
|
|
44
|
+
return user_id
|
|
45
|
+
hash_hex = hashlib.sha256(user_id.encode()).hexdigest()[:12]
|
|
46
|
+
return f"u_{hash_hex}"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _get_trace_id() -> str:
    """Return the active trace id as 32 lowercase hex digits.

    Falls back to the literal ``"no-active-trace"`` when there is no current
    span or the current span is not recording.
    """
    active_span = trace.get_current_span()
    if not active_span or not active_span.is_recording():
        return "no-active-trace"
    return format(active_span.get_span_context().trace_id, "032x")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _emit_event(event: dict[str, Any]) -> None:
    """Write *event* to stderr as a single JSON line and flush immediately.

    Values that ``json`` cannot serialise natively are converted with
    ``str``. Per the module's stated design, a collector is expected to pick
    the line up from stderr and route it to the security pipeline; that part
    is outside this file.
    """
    serialized_line = json.dumps(event, default=str) + "\n"
    stream = sys.stderr
    stream.write(serialized_line)
    stream.flush()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Public API
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
def emit_auth_failure(
    *,
    user_id: str,
    reason: AuthFailureReason,
    source_ip: str,
    module: str,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Emit an authentication-failure security event.

    Builds a flat, dotted-key record (``event.category`` is
    ``"authentication"``, ``event.type`` is ``"denied"``, ``event.outcome``
    is ``"failure"``) stamped with the current UTC time and trace id, then
    hands it to ``_emit_event``. Module policy is to use this helper, never
    the general logger, for such events.

    Args:
        user_id: User identifier; passed through ``_hash_user_id`` before
            being written.
        reason: Failure reason code.
        source_ip: Source IP address of the request.
        module: ECIP module that detected the failure.
        metadata: Optional extra context; added under ``"metadata"`` only
            when non-empty.
    """
    # Annotated explicitly: every initial value is a str, so a type checker
    # would otherwise infer dict[str, str] and reject the metadata dict below.
    event: dict[str, Any] = {
        "@timestamp": datetime.now(timezone.utc).isoformat(),
        "event.kind": "event",
        "event.category": "authentication",
        "event.type": "denied",
        "event.outcome": "failure",
        "trace.id": _get_trace_id(),
        "user.id": _hash_user_id(user_id),
        "source.ip": source_ip,
        "reason": reason,
        "module": module,
    }
    if metadata:
        event["metadata"] = metadata

    _emit_event(event)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def emit_rbac_denial(
    *,
    user_id: str,
    resource: str,
    action: RbacAction,
    reason: str,
    module: str,
    metadata: dict[str, Any] | None = None,
) -> None:
    """Emit an RBAC-denial security event.

    Builds a flat, dotted-key record (``event.category`` is
    ``"authorization"``, ``event.type`` is ``"denied"``, ``event.outcome``
    is ``"failure"``) stamped with the current UTC time and trace id, then
    hands it to ``_emit_event``. Module policy is to use this helper, never
    the general logger, for such events.

    Args:
        user_id: User identifier; passed through ``_hash_user_id`` before
            being written.
        resource: Resource that was being accessed.
        action: Action that was attempted.
        reason: Denial reason.
        module: ECIP module that denied access.
        metadata: Optional extra context; added under ``"metadata"`` only
            when non-empty.
    """
    # Annotated explicitly: every initial value is a str, so a type checker
    # would otherwise infer dict[str, str] and reject the metadata dict below.
    event: dict[str, Any] = {
        "@timestamp": datetime.now(timezone.utc).isoformat(),
        "event.kind": "event",
        "event.category": "authorization",
        "event.type": "denied",
        "event.outcome": "failure",
        "trace.id": _get_trace_id(),
        "user.id": _hash_user_id(user_id),
        "resource": resource,
        "action": action,
        "reason": reason,
        "module": module,
    }
    if metadata:
        event["metadata"] = metadata

    _emit_event(event)
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ecip_observability — Distributed Tracer (Python)
|
|
3
|
+
|
|
4
|
+
Initializes the OpenTelemetry Python SDK and provides a @traced decorator
|
|
5
|
+
for automatic span creation.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from ecip_observability import init_tracer, traced
|
|
9
|
+
|
|
10
|
+
init_tracer(service_name="ecip-analysis-engine")
|
|
11
|
+
|
|
12
|
+
@traced(name="lsp.symbol_extraction")
|
|
13
|
+
def extract_symbols(file_path: str) -> list:
|
|
14
|
+
... # span automatically started/ended; exceptions auto-captured
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import functools
|
|
20
|
+
import os
|
|
21
|
+
from typing import Any, Callable, TypeVar, ParamSpec
|
|
22
|
+
|
|
23
|
+
from opentelemetry import trace
|
|
24
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
25
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
26
|
+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
27
|
+
from opentelemetry.sdk.resources import Resource, SERVICE_NAME, SERVICE_VERSION
|
|
28
|
+
from opentelemetry.trace import StatusCode, Span
|
|
29
|
+
from opentelemetry.instrumentation.grpc import GrpcInstrumentorClient, GrpcInstrumentorServer
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Types
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
P = ParamSpec("P")
|
|
37
|
+
T = TypeVar("T")
|
|
38
|
+
|
|
39
|
+
# Set to True at the end of init_tracer(); while True, further init_tracer()
# calls only emit a warning and return.
_initialized = False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# SDK Initialization
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
def init_tracer(
    *,
    service_name: str,
    service_version: str = "0.0.0",
    otlp_endpoint: str | None = None,
    environment: str | None = None,
    resource_attributes: dict[str, str] | None = None,
) -> None:
    """Initialize the OpenTelemetry Python SDK for this process.

    Installs a ``TracerProvider`` that exports spans in batches over
    OTLP/gRPC, then attempts to auto-instrument gRPC clients and servers.
    Intended to be called once at process entry; a repeated call emits a
    warning and changes nothing.

    Resource attributes are layered, later sources overriding earlier ones:
    the service name/version and ``deployment.environment`` built here, then
    ``key=value`` pairs from ``OTEL_RESOURCE_ATTRIBUTES``, then
    ``resource_attributes``.

    Args:
        service_name: Service name (e.g., 'ecip-analysis-engine').
        service_version: Service version string.
        otlp_endpoint: OTLP collector endpoint. Falls back to
            ``OTEL_EXPORTER_OTLP_ENDPOINT``, then to
            ``http://otel-collector.monitoring:4317``.
        environment: Deployment environment. Falls back to ``ENVIRONMENT``,
            then to ``"development"``.
        resource_attributes: Additional resource attributes.
    """
    global _initialized
    if _initialized:
        import warnings

        warnings.warn("Tracer already initialized — skipping re-initialization", stacklevel=2)
        return

    endpoint = (
        otlp_endpoint
        or os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
        or "http://otel-collector.monitoring:4317"
    )

    env = environment or os.environ.get("ENVIRONMENT", "development")

    attrs: dict[str, str] = {
        SERVICE_NAME: service_name,
        SERVICE_VERSION: service_version,
        "deployment.environment": env,
    }

    # Parse OTEL_RESOURCE_ATTRIBUTES ("k1=v1,k2=v2"). Entries without "=" or
    # with an empty key are ignored so no attribute named "" is created.
    for pair in os.environ.get("OTEL_RESOURCE_ATTRIBUTES", "").split(","):
        key, separator, value = pair.partition("=")
        key = key.strip()
        if separator and key:
            attrs[key] = value.strip()

    if resource_attributes:
        attrs.update(resource_attributes)

    resource = Resource.create(attrs)

    exporter = OTLPSpanExporter(endpoint=endpoint, insecure=True)
    processor = BatchSpanProcessor(exporter)

    provider = TracerProvider(resource=resource)
    provider.add_span_processor(processor)

    trace.set_tracer_provider(provider)

    # Auto-instrument gRPC. This stays best-effort (tracing must still come
    # up when it fails), but the failure is logged at DEBUG instead of being
    # discarded so it can be diagnosed.
    try:
        GrpcInstrumentorClient().instrument()
        GrpcInstrumentorServer().instrument()
    except Exception:
        import logging

        logging.getLogger(__name__).debug(
            "gRPC auto-instrumentation skipped", exc_info=True
        )

    _initialized = True
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def get_tracer(name: str = "ecip-observability") -> trace.Tracer:
    """Return the tracer registered under *name* from the OpenTelemetry API.

    Args:
        name: Instrumentation name passed to ``trace.get_tracer``.
    """
    tracer = trace.get_tracer(name)
    return tracer
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ---------------------------------------------------------------------------
|
|
123
|
+
# @traced decorator
|
|
124
|
+
# ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
def traced(
    name: str | None = None,
    attributes: dict[str, str] | None = None,
) -> Callable[[Callable[P, T]], Callable[P, T]]:
    """Decorator that runs a sync or async function inside an OpenTelemetry span.

    The span starts on entry and ends on exit. On normal return the status is
    set to OK. If the function raises an ``Exception``, the status is set to
    ERROR with ``str(exc)`` as description, the exception is recorded on the
    span exactly once, and it is re-raised unchanged.

    Args:
        name: Span name (defaults to ``<module>.<qualified name>`` of the
            decorated function).
        attributes: Additional span attributes, set on every invocation.

    Returns:
        A decorator that preserves the wrapped function's signature and
        metadata (via ``functools.wraps``).

    Usage:
        @traced(name="lsp.symbol_extraction")
        def extract_symbols(file_path: str) -> list:
            ...
    """

    def decorator(fn: Callable[P, T]) -> Callable[P, T]:
        # inspect.iscoroutinefunction is the supported replacement for the
        # deprecated asyncio.iscoroutinefunction.
        import inspect

        span_name = name or f"{fn.__module__}.{fn.__qualname__}"

        def _open_span() -> Any:
            # start_as_current_span records the exception and sets ERROR by
            # default when one propagates. The wrappers below already do both,
            # so the defaults are switched off to avoid a duplicate exception
            # event and an overwritten status description.
            return get_tracer().start_as_current_span(
                span_name,
                record_exception=False,
                set_status_on_exception=False,
            )

        def _apply_attributes(span: Span) -> None:
            # Copy the caller-supplied static attributes onto the span.
            if attributes:
                for key, value in attributes.items():
                    span.set_attribute(key, value)

        def _mark_failed(span: Span, exc: Exception) -> None:
            # Flag the span as failed and attach the exception as an event.
            span.set_status(StatusCode.ERROR, str(exc))
            span.record_exception(exc)

        if inspect.iscoroutinefunction(fn):

            @functools.wraps(fn)
            async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
                with _open_span() as span:
                    _apply_attributes(span)
                    try:
                        result = await fn(*args, **kwargs)  # type: ignore[misc]
                    except Exception as exc:
                        _mark_failed(span, exc)
                        raise
                    span.set_status(StatusCode.OK)
                    return result

            return async_wrapper  # type: ignore[return-value]

        @functools.wraps(fn)
        def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
            with _open_span() as span:
                _apply_attributes(span)
                try:
                    result = fn(*args, **kwargs)
                except Exception as exc:
                    _mark_failed(span, exc)
                    raise
                span.set_status(StatusCode.OK)
                return result

        return sync_wrapper  # type: ignore[return-value]

    return decorator
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Tests for the ECIP Python structured logger."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from unittest.mock import patch, MagicMock
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestGetLogger:
    """Exercise ``get_logger``: the happy path and each validation failure."""

    def test_creates_logger_with_valid_context(self):
        from src.logger import get_logger

        context = {
            "repo": "acme-corp/auth-service",
            "branch": "main",
            "user_id": "u_8f3a1c",
            "module": "M02",
        }
        assert get_logger(**context) is not None

    def test_raises_on_missing_repo(self):
        from src.logger import get_logger, MissingObservabilityContext

        # An empty repo must be rejected and named in the message.
        with pytest.raises(MissingObservabilityContext, match="repo"):
            get_logger(repo="", branch="main", user_id="u_abc", module="M02")

    def test_raises_on_missing_branch(self):
        from src.logger import get_logger, MissingObservabilityContext

        # An empty branch must be rejected and named in the message.
        with pytest.raises(MissingObservabilityContext, match="branch"):
            get_logger(repo="acme/auth", branch="", user_id="u_abc", module="M02")

    def test_raises_on_missing_user_id(self):
        from src.logger import get_logger, MissingObservabilityContext

        # An empty user_id must be rejected and named in the message.
        with pytest.raises(MissingObservabilityContext, match="user_id"):
            get_logger(repo="acme/auth", branch="main", user_id="", module="M02")

    def test_raises_on_missing_module(self):
        from src.logger import get_logger, MissingObservabilityContext

        # An empty module must be rejected and named in the message.
        with pytest.raises(MissingObservabilityContext, match="module"):
            get_logger(repo="acme/auth", branch="main", user_id="u_abc", module="")

    def test_raises_on_invalid_module(self):
        from src.logger import get_logger, MissingObservabilityContext

        # A non-empty but unknown module id gets the "must be one of" error.
        with pytest.raises(MissingObservabilityContext, match="must be one of"):
            get_logger(repo="acme/auth", branch="main", user_id="u_abc", module="M99")

    def test_all_valid_modules(self):
        from src.logger import get_logger, VALID_MODULES

        # Every identifier in the allow-list must yield a logger.
        for module_id in VALID_MODULES:
            bound = get_logger(
                repo="test/repo",
                branch="main",
                user_id="u_test",
                module=module_id,
            )
            assert bound is not None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class TestSecurityEvents:
    """Check the JSON written by the security-event helpers and the id hashing."""

    def test_emit_auth_failure_format(self, capsys):
        from src.security_events import emit_auth_failure

        emit_auth_failure(
            user_id="u_abc123",
            reason="jwt_expired",
            source_ip="10.0.14.22",
            module="M01",
        )

        # The helper writes one JSON line to stderr.
        payload = json.loads(capsys.readouterr().err.strip())
        assert payload["event.category"] == "authentication"
        assert payload["event.type"] == "denied"
        assert payload["event.outcome"] == "failure"
        assert payload["reason"] == "jwt_expired"
        assert payload["user.id"] == "u_abc123"  # already hashed format

    def test_emit_rbac_denial_format(self, capsys):
        from src.security_events import emit_rbac_denial

        emit_rbac_denial(
            user_id="u_abc123",
            resource="acme-corp/auth-service",
            action="read",
            reason="rbac_insufficient_role",
            module="M06",
        )

        # The helper writes one JSON line to stderr.
        payload = json.loads(capsys.readouterr().err.strip())
        assert payload["event.category"] == "authorization"
        assert payload["resource"] == "acme-corp/auth-service"
        assert payload["action"] == "read"

    def test_user_id_hashing(self):
        from src.security_events import _hash_user_id

        # A value already in u_<token> form is passed through untouched.
        assert _hash_user_id("u_abc123") == "u_abc123"

        # A raw identifier is replaced by u_ plus 12 hex characters.
        pseudonym = _hash_user_id("john.doe@example.com")
        assert pseudonym.startswith("u_")
        assert len(pseudonym) == 15  # u_ + 12 hex chars
|
package/package.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ecip-observability-stack",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"private": false,
|
|
5
|
+
"description": "ECIP M08 — Observability Stack: OTel Collector, Prometheus, Tempo, Grafana, Elasticsearch",
|
|
6
|
+
"scripts": {
|
|
7
|
+
"test": "vitest run",
|
|
8
|
+
"test:watch": "vitest",
|
|
9
|
+
"test:coverage": "vitest run --coverage",
|
|
10
|
+
"lint:dashboards": "node scripts/lint-dashboards.js",
|
|
11
|
+
"lint:alerts": "promtool check rules alerts/*.yaml",
|
|
12
|
+
"validate": "npm run test && npm run lint:dashboards && npm run lint:alerts"
|
|
13
|
+
},
|
|
14
|
+
"devDependencies": {
|
|
15
|
+
"typescript": "^5.4.0",
|
|
16
|
+
"vitest": "^1.3.0",
|
|
17
|
+
"@vitest/coverage-v8": "^1.3.0",
|
|
18
|
+
"yaml": "^2.4.0",
|
|
19
|
+
"testcontainers": "^10.7.0"
|
|
20
|
+
}
|
|
21
|
+
}
|