guardianhub 0.1.88__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- guardianhub/__init__.py +29 -0
- guardianhub/_version.py +1 -0
- guardianhub/agents/runtime.py +12 -0
- guardianhub/auth/token_provider.py +22 -0
- guardianhub/clients/__init__.py +2 -0
- guardianhub/clients/classification_client.py +52 -0
- guardianhub/clients/graph_db_client.py +161 -0
- guardianhub/clients/langfuse/dataset_client.py +157 -0
- guardianhub/clients/langfuse/manager.py +118 -0
- guardianhub/clients/langfuse/prompt_client.py +68 -0
- guardianhub/clients/langfuse/score_evaluation_client.py +92 -0
- guardianhub/clients/langfuse/tracing_client.py +250 -0
- guardianhub/clients/langfuse_client.py +63 -0
- guardianhub/clients/llm_client.py +144 -0
- guardianhub/clients/llm_service.py +295 -0
- guardianhub/clients/metadata_extractor_client.py +53 -0
- guardianhub/clients/ocr_client.py +81 -0
- guardianhub/clients/paperless_client.py +515 -0
- guardianhub/clients/registry_client.py +18 -0
- guardianhub/clients/text_cleaner_client.py +58 -0
- guardianhub/clients/vector_client.py +344 -0
- guardianhub/config/__init__.py +0 -0
- guardianhub/config/config_development.json +84 -0
- guardianhub/config/config_prod.json +39 -0
- guardianhub/config/settings.py +221 -0
- guardianhub/http/http_client.py +26 -0
- guardianhub/logging/__init__.py +2 -0
- guardianhub/logging/logging.py +168 -0
- guardianhub/logging/logging_filters.py +35 -0
- guardianhub/models/__init__.py +0 -0
- guardianhub/models/agent_models.py +153 -0
- guardianhub/models/base.py +2 -0
- guardianhub/models/registry/client.py +16 -0
- guardianhub/models/registry/dynamic_loader.py +73 -0
- guardianhub/models/registry/loader.py +37 -0
- guardianhub/models/registry/registry.py +17 -0
- guardianhub/models/registry/signing.py +70 -0
- guardianhub/models/template/__init__.py +0 -0
- guardianhub/models/template/agent_plan.py +65 -0
- guardianhub/models/template/agent_response_evaluation.py +67 -0
- guardianhub/models/template/extraction.py +29 -0
- guardianhub/models/template/reflection_critique.py +206 -0
- guardianhub/models/template/suggestion.py +42 -0
- guardianhub/observability/__init__.py +1 -0
- guardianhub/observability/instrumentation.py +271 -0
- guardianhub/observability/otel_helper.py +43 -0
- guardianhub/observability/otel_middlewares.py +73 -0
- guardianhub/prompts/base.py +7 -0
- guardianhub/prompts/providers/langfuse_provider.py +13 -0
- guardianhub/prompts/providers/local_provider.py +22 -0
- guardianhub/prompts/registry.py +14 -0
- guardianhub/scripts/script.sh +31 -0
- guardianhub/services/base.py +15 -0
- guardianhub/template/__init__.py +0 -0
- guardianhub/tools/gh_registry_cli.py +171 -0
- guardianhub/utils/__init__.py +0 -0
- guardianhub/utils/app_state.py +74 -0
- guardianhub/utils/fastapi_utils.py +152 -0
- guardianhub/utils/json_utils.py +137 -0
- guardianhub/utils/metrics.py +60 -0
- guardianhub-0.1.88.dist-info/METADATA +240 -0
- guardianhub-0.1.88.dist-info/RECORD +64 -0
- guardianhub-0.1.88.dist-info/WHEEL +4 -0
- guardianhub-0.1.88.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from ..registry.registry import register_model
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EvaluationMetric(str, Enum):
    """Standard evaluation metrics.

    Subclasses ``str`` so members serialize as plain strings (e.g. in JSON
    payloads and dictionary keys produced by ``EvaluationResult.to_dict``).
    """
    RELEVANCE = "relevance"          # how well the response addresses the query
    GROUNDEDNESS = "groundedness"    # whether the response is supported by the context
    COHERENCE = "coherence"          # logical flow and consistency
    FLUENCY = "fluency"              # readability and grammatical correctness
    COMPLETENESS = "completeness"    # whether all parts of the query are answered
    CORRECTNESS = "correctness"      # factual accuracy
    SAFETY = "safety"                # safety compliance
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# New model for structured LLM output
@register_model
class EvaluationScoresModel(BaseModel):
    """Pydantic model for the structured JSON output expected from the LLM.

    Every score is constrained to the closed interval [0.0, 1.0]; Pydantic
    rejects out-of-range values via the ``ge``/``le`` bounds. All five core
    metrics are required (``...``) — a missing score is a validation error.
    """
    relevance: float = Field(..., ge=0.0, le=1.0,
                             description="Score for how well the response addresses the query (0.0 to 1.0).")
    groundedness: float = Field(..., ge=0.0, le=1.0,
                                description="Score for whether the response is supported by the context (0.0 to 1.0).")
    coherence: float = Field(..., ge=0.0, le=1.0,
                             description="Score for the logical flow and consistency of the response (0.0 to 1.0).")
    fluency: float = Field(..., ge=0.0, le=1.0,
                           description="Score for the readability and grammatical correctness (0.0 to 1.0).")
    completeness: float = Field(..., ge=0.0, le=1.0,
                                description="Score for whether the response fully answers all parts of the query (0.0 to 1.0).")

    # Optional fields for other potential metrics, if the LLM supports them.
    # Kept commented out deliberately: enabling them widens the schema sent
    # to the LLM for structured output.
    # correctness: Optional[float] = Field(None, ge=0.0, le=1.0, description="Factual accuracy score.")
    # safety: Optional[float] = Field(None, ge=0.0, le=1.0, description="Safety compliance score.")
|
|
36
|
+
|
|
37
|
+
class EvaluationErrorLevel(str, Enum):
    """Severity levels for evaluation errors.

    Subclasses ``str`` so levels serialize directly (see
    ``EvaluationResult.to_dict``, which emits ``error_level.value``).
    """
    WARNING = "warning"    # evaluation completed but with caveats
    ERROR = "error"        # evaluation partially failed
    CRITICAL = "critical"  # evaluation could not be performed
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@register_model
class EvaluationResult(BaseModel):
    """Container for evaluation results and metrics.

    Bundles per-metric scores with arbitrary metadata and optional error
    information, and offers a mean-based overall score plus a plain-dict
    serialization helper.
    """
    scores: Dict[EvaluationMetric, float] = Field(default_factory=dict)
    metadata: Dict[str, Any] = Field(default_factory=dict)
    error: Optional[str] = None
    error_level: Optional[EvaluationErrorLevel] = None

    @property
    def overall_score(self) -> float:
        """Calculate an overall score from individual metrics.

        Returns the arithmetic mean of all recorded scores, or 0.0 when no
        scores have been recorded.
        """
        values = list(self.scores.values())
        if values:
            return sum(values) / len(values)
        return 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert the result to a dictionary.

        Enum keys/values are flattened to their string representations so the
        result is JSON-friendly.
        """
        flattened_scores = {metric.value: score for metric, score in self.scores.items()}
        level = self.error_level.value if self.error_level else None
        return {
            "scores": flattened_scores,
            "overall_score": self.overall_score,
            "metadata": self.metadata,
            "error": self.error,
            "error_level": level,
        }
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
|
|
4
|
+
from ..registry.registry import register_model
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@register_model
class StructuredExtractionResult(BaseModel):
    """
    Unified result model for document classification and metadata extraction.
    The LLM must populate the 'document_type' and 'metadata' fields.
    """
    # Required: the LLM must always commit to a classification.
    document_type: str = Field(
        ...,
        description=(
            "The primary classification of the document. Must be one of the provided types "
            "(e.g., 'Invoice', 'Receipt', 'Contract', 'Technical Knowledge Documents', 'Unknown')."
        )
    )
    # default_factory=dict so a missing field defaults to {}. NOTE(review):
    # the original comment claims null is also coerced to {} — by default
    # Pydantic rejects explicit null for a non-Optional field; verify intent.
    metadata: Dict[str, Any] = Field(
        default_factory=dict,
        description="A dictionary containing the extracted key-value metadata pairs specific to the classified document_type."
    )
    # Defaults to full confidence when the LLM omits the field.
    confidence: float = Field(
        1.0,
        description="A confidence score (0.0 to 1.0) of the classification/extraction accuracy. Default to 1.0."
    )
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
from pydantic import BaseModel, Field, validator, field_validator
|
|
4
|
+
from typing import Literal, Optional, Any, Dict, Type, TypeVar, Union, List
|
|
5
|
+
|
|
6
|
+
logger = logging.getLogger(__name__)
from ..registry.registry import register_model

# TypeVar bound to ReflectionCritique so parse_raw returns the invoking (sub)class type.
T = TypeVar('T', bound='ReflectionCritique')

# --- Enumerations for Clarity ---

# Common component types for better IDE support and documentation.
# NOTE(review): this list is only interpolated into validation error
# messages below — it is NOT an enforced allow-list for failing_component.
COMMON_COMPONENTS = [
    "PlannerAgent",
    "DiagnosisAgent",
    "ToolCall_CMDB",
    "ToolCall_Metrics",
    "TemporalWorkflow",
    "LangGraphStateNode",
    "PromptTemplate",
    'ToolRegistryService',
    'ExternalLLMService',
    'ActivityError',
    'HTTPCommunication'
]
|
|
27
|
+
|
|
28
|
+
# FailingComponent is now a string with validation
def validate_failing_component(value: str) -> str:
    """Validate that the failing component is a non-empty string.

    Args:
        value: The component name to validate

    Returns:
        The validated component name, unchanged

    Raises:
        ValueError: If the value is not a string or is empty/whitespace-only
    """
    acceptable = isinstance(value, str) and bool(value.strip())
    if acceptable:
        return value
    raise ValueError("Failing component must be a non-empty string")
|
|
44
|
+
|
|
45
|
+
# Type alias for documentation and IDE support: failing components are free-form
# strings, validated at model level by validate_failing_component above.
FailingComponent = str

# Define the severity of the identified issue.
# Convention visible in the members: prefix encodes the tier
# (Critical_/Major_/Minor_), suffix encodes the failure class.
IssueSeverity = Literal[
    "Critical_Tool_Failure",
    "Major_Logic_Error",
    "Minor_Prompt_Drift",
    "Schema_Mismatch",
    'Critical_Service_Unreachable',
    'Critical_Server_Error_5xx',
    'Minor_Tool_Schema',
    'Major_Read_Timeout'
]

# --- Update 3: Fix Scope ---
# Where a suggested fix should be applied.
FixScope = Literal[
    'Prompt',
    'Code',
    'Configuration',
    'Tool_Schema',
    # NEW TECHNICAL SCOPE:
    'Infrastructure_Kubernetes',
    'Timeout_Setting'
]
|
|
70
|
+
@register_model
class ReflectionSuccessOptimization(BaseModel):
    """Structured output for optimizing successful agent executions.

    Counterpart of ReflectionCritique for runs that succeeded but could be
    improved. Produced by the Reflection Agent from a Langfuse trace.
    """
    trace_id: str = Field(..., description="The unique Langfuse ID of the trace being optimized")
    optimization_opportunity: str = Field(..., description="Brief description of the optimization opportunity")
    impact_level: Literal["Low", "Medium", "High"] = Field(..., description="Expected impact of implementing the optimization")
    # Category of the improvement; no Field() metadata — required, no description.
    optimization_type: Literal["Efficiency", "Completeness", "Accuracy", "UserExperience", "CostSaving"]
    suggested_improvement: str = Field(..., description="Detailed suggestion for improvement")
    expected_benefit: str = Field(..., description="Expected benefits of implementing the suggestion")
    # Defaults chosen as neutral midpoints (50/100 confidence, priority 3/5).
    confidence_score: int = Field(ge=0, le=100, default=50, description="Confidence in this optimization (0-100)")
    optimization_priority: int = Field(ge=1, le=5, default=3, description="Suggested implementation priority (1-5)")
    related_components: List[str] = Field(default_factory=list, description="Components that would be affected by this optimization")
|
|
82
|
+
|
|
83
|
+
# --- Main Structured Output ---
@register_model
class ReflectionCritique(BaseModel):
    """
    A structured output generated by the Reflection Agent after analyzing a failed or
    sub-optimal Langfuse trace. This object is the key deliverable for the Governance layer (JARVIS v1).

    Required string fields are validated (below) to be non-empty; serialization
    and parsing overrides degrade gracefully instead of crashing the agent loop.
    """
    trace_id: str = Field(..., description="The unique Langfuse ID of the trace that was analyzed.")
    run_status: Literal["Failure", "Suboptimal"] = Field(...,
                                                         description="The final status of the run being critiqued.")

    # --- Core Findings ---
    failing_component: str = Field(
        ...,
        description="The specific component, agent, or tool that ultimately caused the failure."
    )

    root_cause_summary: str = Field(
        ...,
        description="A concise, one-sentence summary of the single root cause."
    )

    issue_severity: IssueSeverity = Field(
        ...,
        description="The assessed severity of the root cause."
    )

    # --- Suggested Fix & Confidence ---
    suggested_fix_action: str = Field(
        ...,
        description="A detailed, actionable recommendation to fix the root cause."
    )

    fix_scope: FixScope = Field(
        ...,
        description="The area where the fix needs to be applied."
    )

    fix_confidence_score: int = Field(
        default=50,
        ge=1,
        le=100,
        description="Confidence score (1-100) for the suggested fix. Defaults to 50."
    )

    # --- Audit Score ---
    critique_score: int = Field(
        default=3,
        ge=0,
        le=5,
        description="Quality score (0-5) for the failed run. Defaults to 3."
    )

    # --- Detailed Analysis ---
    detailed_analysis: str = Field(
        ...,
        description="Detailed explanation of the root cause and fix rationale."
    )

    # --- Validation ---
    @field_validator('failing_component')
    def validate_failing_component(cls, v: str) -> str:
        """
        Ensure failing_component is a valid non-empty string.

        This validator is intentionally permissive to allow any non-empty string
        while providing validation for common issues like whitespace-only values.
        """
        try:
            # Use the standalone validator function for consistency.
            # Note: inside this method body the name resolves to the
            # module-level validate_failing_component (class attributes are
            # not in a method's lexical scope), so there is no self-recursion.
            return validate_failing_component(v)
        except ValueError as e:
            # Provide a more helpful error message
            raise ValueError(
                f"Invalid failing component: {str(e)}. "
                f"Common components include: {', '.join(COMMON_COMPONENTS)}"
            ) from e

    @field_validator('root_cause_summary')
    def validate_root_cause_summary(cls, v: str) -> str:
        """Ensure root_cause_summary is a non-empty string (returned stripped)."""
        if not isinstance(v, str) or not v.strip():
            raise ValueError("Root cause summary must be a non-empty string")
        return v.strip()

    @field_validator('suggested_fix_action')
    def validate_suggested_fix(cls, v: str) -> str:
        """Ensure suggested_fix_action is a non-empty string (returned stripped)."""
        if not isinstance(v, str) or not v.strip():
            raise ValueError("Suggested fix must be a non-empty string")
        return v.strip()

    @field_validator('detailed_analysis')
    def validate_detailed_analysis(cls, v: str) -> str:
        """Ensure detailed_analysis is a non-empty string (returned stripped)."""
        if not isinstance(v, str) or not v.strip():
            raise ValueError("Detailed analysis must be a non-empty string")
        return v.strip()

    # --- Serialization ---
    def model_dump_json(self, **kwargs) -> str:
        """Serialize to JSON with proper error handling.

        On failure, returns a minimal JSON object carrying the trace id and
        whatever identifying fields are available, rather than raising.
        """
        try:
            return super().model_dump_json(**kwargs)
        except Exception as e:
            logger.error(f"Failed to serialize ReflectionCritique: {str(e)}")
            # Return a minimal valid JSON object with error information
            return json.dumps({
                "error": "Failed to serialize reflection critique",
                "trace_id": self.trace_id,
                "failing_component": getattr(self, 'failing_component', 'unknown'),
                "issue_severity": getattr(self, 'issue_severity', 'unknown')
            })

    @classmethod
    def parse_raw(cls: Type[T], json_data: Union[str, bytes], **kwargs) -> T:
        """Parse JSON data with improved error handling.

        NOTE(review): overrides the pydantic v1-style parse_raw name but
        delegates to the v2 model_validate_json API; bytes are decoded as
        UTF-8 first. Any parse failure is re-raised as ValueError with the
        first 500 characters of the payload logged for debugging.
        """
        try:
            if isinstance(json_data, bytes):
                json_data = json_data.decode('utf-8')
            return super().model_validate_json(json_data, **kwargs)
        except Exception as e:
            logger.error(f"Failed to parse ReflectionCritique: {str(e)}\nData: {json_data[:500]}")
            raise ValueError(f"Invalid reflection critique data: {str(e)}") from e
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
from pydantic import BaseModel, Field
|
|
2
|
+
from typing import Dict, Any, List, Optional
|
|
3
|
+
from ..registry.registry import register_model
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@register_model
class TemplateSchemaSuggestion(BaseModel):
    """
    Schema expected from the LLM for a new document type.

    Describes a reusable extraction template: identity, a JSON Schema for the
    fields to extract, and discriminating keywords for template matching.
    """
    template_id: str = Field(
        ...,
        description="The id with which it will be identified."
    )

    document_type: str = Field(
        ...,
        description="The high-level category (e.g., 'Invoice', 'CV', 'Tax Form')."
    )

    template_name: str = Field(
        ...,
        description="A unique, descriptive name (e.g., 'ACME Q3 2024 Invoice')."
    )

    # Optional embedding used to match documents to this template.
    fingerprint_vector: Optional[List[float]] = Field(
        None,
        description="The fingerprint vector of the document."
    )

    json_schema: Dict[str, Any] = Field(
        ...,
        description=(
            "The Pydantic-compatible JSON Schema defining the required "
            "extraction fields."
        )
    )

    required_keywords: List[str] = Field(
        default_factory=list,
        description="Top 5 keywords unique to this document template."
    )
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .instrumentation import configure_instrumentation
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized OpenTelemetry instrumentation and observability configuration.
|
|
3
|
+
|
|
4
|
+
This module provides a consistent way to configure distributed tracing and metrics
|
|
5
|
+
across all services in the GuardianHub ecosystem. It sets up:
|
|
6
|
+
|
|
7
|
+
1. Distributed Tracing:
|
|
8
|
+
- Automatic instrumentation for FastAPI (incoming requests)
|
|
9
|
+
- HTTPX client instrumentation (outgoing requests)
|
|
10
|
+
- OTLP export for centralized trace collection
|
|
11
|
+
|
|
12
|
+
2. Metrics:
|
|
13
|
+
- System and application metrics
|
|
14
|
+
- OTLP export for metrics collection
|
|
15
|
+
|
|
16
|
+
3. Context Propagation:
|
|
17
|
+
- Ensures trace context is propagated across service boundaries (CRITICAL for Langfuse integration)
|
|
18
|
+
- Integrates with Langfuse for LLM/agent tracing
|
|
19
|
+
|
|
20
|
+
The module follows OpenTelemetry best practices and provides sensible defaults
|
|
21
|
+
while remaining configurable for different deployment environments.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
from typing import Optional, Union, Tuple, Any
|
|
26
|
+
|
|
27
|
+
# Imports for resilient HTTP session configuration
|
|
28
|
+
import requests
|
|
29
|
+
from opentelemetry import trace, metrics
|
|
30
|
+
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
|
|
31
|
+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
|
|
32
|
+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
|
33
|
+
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
|
34
|
+
from opentelemetry.propagate import set_global_textmap
|
|
35
|
+
from opentelemetry.sdk.metrics import MeterProvider
|
|
36
|
+
from opentelemetry.sdk.metrics.export import (
|
|
37
|
+
PeriodicExportingMetricReader,
|
|
38
|
+
ConsoleMetricExporter
|
|
39
|
+
)
|
|
40
|
+
from guardianhub.config.settings import settings
|
|
41
|
+
from opentelemetry.sdk.resources import (
|
|
42
|
+
SERVICE_NAME,
|
|
43
|
+
Resource,
|
|
44
|
+
SERVICE_VERSION,
|
|
45
|
+
SERVICE_NAMESPACE,
|
|
46
|
+
DEPLOYMENT_ENVIRONMENT
|
|
47
|
+
)
|
|
48
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
49
|
+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
|
|
50
|
+
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
|
|
51
|
+
from requests.adapters import HTTPAdapter
|
|
52
|
+
from urllib3.util.retry import Retry
|
|
53
|
+
|
|
54
|
+
from guardianhub import get_logger
|
|
55
|
+
from guardianhub.observability.otel_middlewares import GuardianHubSampler
|
|
56
|
+
|
|
57
|
+
logger = get_logger(__name__)
|
|
58
|
+
|
|
59
|
+
def configure_instrumentation(
    app,
    enable_console_export: bool = False,
    excluded_urls: str = "/health,/metrics",
    httpx_excluded_urls: Union[str, Tuple[str, ...]] = "/health,/metrics",
) -> None:
    """Configure OpenTelemetry instrumentation for the application.

    Sets up resource metadata, tracing (exported to Langfuse via OTLP),
    metrics (exported to the OTLP collector), and automatic instrumentation
    for FastAPI (inbound) and HTTPX (outbound) requests. Service identity,
    environment, and exporter endpoints are resolved from the shared
    ``settings`` object rather than passed as parameters.

    Args:
        app: The FastAPI application instance to instrument.
        enable_console_export: If True, also export metrics to the console.
        excluded_urls: Comma-separated URL patterns excluded from FastAPI tracing.
        httpx_excluded_urls: URL patterns excluded from HTTPX client tracing;
            a tuple is joined into the comma-separated form the instrumentor expects.

    Any failure is logged (not raised) so the application can still start
    with reduced observability.
    """
    # 1. Resolve configuration variables from centralized settings.
    environment = settings.endpoints.ENVIRONMENT
    otlp_endpoint = settings.endpoints.OTEL_EXPORTER_OTLP_ENDPOINT
    service_version = settings.service.version

    logger.info(
        "Configuring OpenTelemetry instrumentation",
        extra={
            # FIX: previously logged settings.service.version under the
            # "service_name" key, duplicating "version" and hiding the name.
            "service_name": settings.service.name,
            "environment": environment,
            "version": service_version,
            "otlp_endpoint": otlp_endpoint
        }
    )

    try:
        # 2. Create resource with service metadata shared by traces and metrics.
        resource = Resource.create(
            attributes={
                SERVICE_NAME: settings.service.name,
                SERVICE_VERSION: service_version,
                DEPLOYMENT_ENVIRONMENT: environment,
                SERVICE_NAMESPACE: "guardianhub",
            }
        )

        # 3. Configure tracing
        _setup_tracing(resource, settings, otlp_endpoint, enable_console_export)

        # 4. Configure metrics
        _setup_metrics(resource, otlp_endpoint, enable_console_export)

        # 5. Instrument inbound FastAPI requests.
        FastAPIInstrumentor.instrument_app(
            app=app,
            tracer_provider=trace.get_tracer_provider(),
            excluded_urls=excluded_urls,
        )
        logger.info("Instrumented FastAPI application", extra={"excluded_urls": excluded_urls})

        # Instrument outbound HTTPX requests. The instrumentor expects a
        # comma-separated string, so normalize a tuple input first.
        excluded_urls_param = (
            ",".join(httpx_excluded_urls)
            if isinstance(httpx_excluded_urls, tuple)
            else httpx_excluded_urls
        )
        HTTPXClientInstrumentor().instrument(
            excluded_urls=excluded_urls_param,
        )

        logger.info("Instrumented HTTPX clients for outbound requests")
        logger.info("OpenTelemetry instrumentation configured successfully")

    except Exception as e:
        logger.error(
            "Failed to configure OpenTelemetry instrumentation. Continuing without full tracing/metrics.",
            exc_info=True,
            extra={"error": str(e)}
        )
        # Note: We catch the error but don't re-raise, allowing the application to start
        # but with reduced observability. This is typically safer than failing startup.
|
|
155
|
+
|
|
156
|
+
def _setup_tracing(resource: Resource, settings: Any, otlp_endpoint: Optional[str], console_export: bool) -> None:
    """Configure and initialize OpenTelemetry tracing.

    Spans are exported via OTLP/HTTP to Langfuse, which acts as an OTEL
    receiver; authentication uses the Langfuse public/secret key headers.

    Args:
        resource: Resource describing the service (name, version, environment).
        settings: Application settings providing the Langfuse endpoint and keys.
        otlp_endpoint: Generic OTLP collector endpoint, used only for error
            log context — traces are sent to the Langfuse endpoint instead.
        console_export: Currently unused; kept for signature symmetry with
            _setup_metrics.
    """
    logger.debug("Configuring tracing subsystem")
    tracer_provider = TracerProvider(resource=resource, sampler=GuardianHubSampler())

    # OTLP Exporter targeting Langfuse (an OTEL receiver via OTLP).
    try:
        langfuse_base_url = settings.endpoints.LANGFUSE_OTLP_TRACES_ENDPOINT
        langfuse_public_key = settings.endpoints.LANGFUSE_PUBLIC_KEY
        langfuse_secret_key = settings.endpoints.LANGFUSE_SECRET_KEY

        trace_endpoint = f"{langfuse_base_url}/v1/traces"
        otlp_exporter = OTLPSpanExporter(
            endpoint=trace_endpoint,
            headers={
                "x-langfuse-public-key": langfuse_public_key,
                "x-langfuse-secret-key": langfuse_secret_key,
            }
        )

        otlp_processor = BatchSpanProcessor(otlp_exporter)
        tracer_provider.add_span_processor(otlp_processor)

        # FIX: the previous (duplicated) log lines reported
        # f"{otlp_endpoint}/v1/traces", which is NOT where spans are sent —
        # log the endpoint actually configured on the exporter.
        logger.info("Configured OTLP trace exporter", extra={"endpoint": trace_endpoint})

        # Install the provider globally only after the exporter is wired up.
        trace.set_tracer_provider(tracer_provider)
    except Exception as e:
        # Only log the error, don't crash startup if the collector is unreachable
        logger.warning(
            "Failed to configure OTLP trace exporter. Check endpoint and network access.",
            extra={"endpoint": otlp_endpoint, "error": str(e)}
        )
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _setup_metrics(resource: Resource, otlp_endpoint: Optional[str], console_export: bool) -> None:
    """Configure and initialize OpenTelemetry metrics.

    Builds metric readers (console and/or OTLP, both wrapped in a
    PeriodicExportingMetricReader) and installs a global MeterProvider only
    when at least one reader was configured successfully.
    """
    logger.debug("Configuring metrics subsystem")

    readers = []

    # Console exporter, mainly useful for local debugging.
    if console_export:
        readers.append(PeriodicExportingMetricReader(ConsoleMetricExporter()))
        logger.debug("Enabled console metrics export")

    # OTLP exporter towards the collector, using a retry-enabled HTTP session
    # so transient startup failures don't drop metrics.
    if otlp_endpoint:
        try:
            otlp_session = _create_otlp_session()
            full_otlp_metrics_endpoint = f"{otlp_endpoint}/v1/metrics"
            otlp_metric_exporter = OTLPMetricExporter(
                endpoint=full_otlp_metrics_endpoint,
                session=otlp_session
            )
            readers.append(PeriodicExportingMetricReader(otlp_metric_exporter))
            logger.info("Configured OTLP metrics exporter", extra={"endpoint": f"{otlp_endpoint}/v1/metrics (internal path)"})
        except Exception as e:
            # Degrade gracefully: an unreachable collector must not block startup.
            logger.warning(
                "Failed to configure OTLP metrics exporter. Check endpoint and network access.",
                extra={"endpoint": otlp_endpoint, "error": str(e)}
            )

    if not readers:
        logger.info("No OTLP endpoint or console export enabled. Metrics will not be exported.")
        return

    # Install the global MeterProvider with whichever readers were built.
    metrics.set_meter_provider(
        MeterProvider(
            resource=resource,
            metric_readers=readers
        )
    )
|
|
244
|
+
|
|
245
|
+
def _create_otlp_session() -> requests.Session:
    """
    Creates a requests session configured for robust OTLP export retries.

    Transient failures (like 'Connection refused' while sibling services are
    still starting in Kubernetes) are retried with exponential backoff
    instead of failing the export immediately.
    """
    # Up to 5 attempts with backoff factor 1; retry POSTs on throttling
    # (429) and common transient server errors. Connection errors are also
    # covered by the total retry budget.
    retries = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=frozenset(['POST']),
    )

    session = requests.Session()
    resilient_adapter = HTTPAdapter(max_retries=retries)
    # Apply the resilient adapter to both plain and TLS transports.
    for scheme in ("http://", "https://"):
        session.mount(scheme, resilient_adapter)
    return session
|
|
268
|
+
|
|
269
|
+
def get_meter(name: str) -> metrics.Meter:
    """Get a meter instance with the given name.

    Thin convenience wrapper over metrics.get_meter; uses whatever global
    MeterProvider was installed by _setup_metrics (or the SDK default).
    """
    return metrics.get_meter(name)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from typing import Dict, Any, Optional
|
|
2
|
+
from opentelemetry import trace
|
|
3
|
+
from opentelemetry.sdk.resources import (
|
|
4
|
+
Resource,
|
|
5
|
+
SERVICE_NAME,
|
|
6
|
+
SERVICE_VERSION,
|
|
7
|
+
SERVICE_NAMESPACE,
|
|
8
|
+
DEPLOYMENT_ENVIRONMENT
|
|
9
|
+
)
|
|
10
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
11
|
+
|
|
12
|
+
def configure_resource(
    service_name: str,
    service_namespace: str = "guardianhub",
    service_version: str = "0.1.0",
    service_language: str = "python",
    environment: str = "development",
    resource_attributes: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Configure the global tracer provider with the given service details.

    This should be called once at application startup.

    Args:
        service_name: Name of the service (e.g., 'aura-llm-service')
        service_namespace: Namespace for the service (default: 'guardianhub')
        service_version: Version of the service (default: '0.1.0')
        service_language: Implementation language tag (default: 'python')
        environment: Deployment environment (e.g., 'development', 'staging', 'production')
        resource_attributes: Additional attributes merged into the resource;
            they override the standard keys on collision.
    """
    # Standard OTEL resource attributes first, then caller-supplied extras.
    attributes: Dict[str, Any] = {
        SERVICE_NAME: service_name,
        SERVICE_NAMESPACE: service_namespace,
        SERVICE_VERSION: service_version,
        DEPLOYMENT_ENVIRONMENT: environment,
        "service.language": service_language,
    }
    if resource_attributes:
        attributes.update(resource_attributes)

    provider = TracerProvider(resource=Resource(attributes=attributes))
    trace.set_tracer_provider(provider)
|