logdetective 0.4.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff shows the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- logdetective/constants.py +33 -12
- logdetective/extractors.py +137 -68
- logdetective/logdetective.py +102 -33
- logdetective/models.py +99 -0
- logdetective/prompts-summary-first.yml +20 -0
- logdetective/prompts-summary-only.yml +13 -0
- logdetective/prompts.yml +90 -0
- logdetective/remote_log.py +67 -0
- logdetective/server/compressors.py +186 -0
- logdetective/server/config.py +78 -0
- logdetective/server/database/base.py +34 -26
- logdetective/server/database/models/__init__.py +33 -0
- logdetective/server/database/models/exceptions.py +17 -0
- logdetective/server/database/models/koji.py +143 -0
- logdetective/server/database/models/merge_request_jobs.py +623 -0
- logdetective/server/database/models/metrics.py +427 -0
- logdetective/server/emoji.py +148 -0
- logdetective/server/exceptions.py +37 -0
- logdetective/server/gitlab.py +451 -0
- logdetective/server/koji.py +159 -0
- logdetective/server/llm.py +309 -0
- logdetective/server/metric.py +75 -30
- logdetective/server/models.py +426 -23
- logdetective/server/plot.py +432 -0
- logdetective/server/server.py +580 -468
- logdetective/server/templates/base_response.html.j2 +59 -0
- logdetective/server/templates/gitlab_full_comment.md.j2 +73 -0
- logdetective/server/templates/gitlab_short_comment.md.j2 +62 -0
- logdetective/server/utils.py +98 -32
- logdetective/skip_snippets.yml +12 -0
- logdetective/utils.py +187 -73
- logdetective-2.11.0.dist-info/METADATA +568 -0
- logdetective-2.11.0.dist-info/RECORD +40 -0
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/WHEEL +1 -1
- logdetective/server/database/models.py +0 -88
- logdetective-0.4.0.dist-info/METADATA +0 -333
- logdetective-0.4.0.dist-info/RECORD +0 -19
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/entry_points.txt +0 -0
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info/licenses}/LICENSE +0 -0
logdetective/server/models.py
CHANGED
```diff
@@ -1,6 +1,34 @@
+import asyncio
+from collections import defaultdict
+import datetime
 from logging import BASIC_FORMAT
-from typing import List, Dict, Optional
-from pydantic import
+from typing import List, Dict, Optional
+from pydantic import (
+    BaseModel,
+    Field,
+    model_validator,
+    field_validator,
+    NonNegativeFloat,
+    HttpUrl,
+    PrivateAttr,
+)
+
+import aiohttp
+
+from aiolimiter import AsyncLimiter
+from gitlab import Gitlab
+import koji
+
+from logdetective.constants import (
+    DEFAULT_TEMPERATURE,
+    LLM_DEFAULT_MAX_QUEUE_SIZE,
+    LLM_DEFAULT_REQUESTS_PER_MINUTE,
+    SYSTEM_ROLE_DEFAULT,
+    USER_ROLE_DEFAULT,
+)
+
+from logdetective.extractors import Extractor, DrainExtractor, CSGrepExtractor
+from logdetective.utils import check_csgrep
 
 
 class BuildLog(BaseModel):
@@ -21,7 +49,7 @@ class JobHook(BaseModel):
 
     # The identifier of the job. We only care about 'build_rpm' and
     # 'build_centos_stream_rpm' jobs.
-    build_name: str = Field(pattern=r"^build
+    build_name: str = Field(pattern=r"^build.*rpm$")
 
     # A string representing the job status. We only care about 'failed' jobs.
     build_status: str = Field(pattern=r"^failed$")
```
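The new `build_name` pattern accepts any job name that begins with `build` and ends with `rpm`, which covers both job names mentioned in the comment. A quick check with hypothetical job names, using only the pattern shown above:

```python
import re

# Pattern taken from the new JobHook.build_name field above.
BUILD_NAME_PATTERN = r"^build.*rpm$"

# Both job names called out in the comment match.
assert re.match(BUILD_NAME_PATTERN, "build_rpm")
assert re.match(BUILD_NAME_PATTERN, "build_centos_stream_rpm")

# Unrelated job names (hypothetical) do not.
assert re.match(BUILD_NAME_PATTERN, "build_docs") is None
```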
```diff
@@ -37,6 +65,51 @@ class JobHook(BaseModel):
     project_id: int
 
 
+class EmojiMergeRequest(BaseModel):
+    """Model of the 'merge_request' subsection of Emoji webhook messages.
+    This model implements only the fields that we care about. The webhook
+    sends many more fields that we will ignore."""
+
+    # The identifier of the target project
+    target_project_id: int
+
+    # The internal identifier (relative to the target project)
+    iid: int
+
+
+class EmojiHook(BaseModel):
+    """Model of Job Hook events sent from GitLab.
+    Full details of the specification are available at
+    https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
+    This model implements only the fields that we care about. The webhook
+    sends many more fields that we will ignore."""
+
+    # The kind of webhook message. We are only interested in 'emoji' messages
+    # which represents awarding or revoking emoji reactions on notes.
+    object_kind: str = Field(pattern=r"^emoji$")
+
+    # Information about the merge request this emoji applies to, if any.
+    merge_request: EmojiMergeRequest = Field(default=None)
+
+
+class SnippetAnalysis(BaseModel):
+    """Model of snippet analysis from LLM."""
+
+    text: str = Field(description="Analysis of log snippet contents.")
+
+
+class RatedSnippetAnalysis(SnippetAnalysis):
+    """Model for rated snippet analysis. This model is used to generate
+    json schema for inference with structured output."""
+
+    relevance: int = Field(
+        ge=0,
+        le=100,
+        description="Estimate of likelyhood that snippet contains an error, "
+        "with 0 standing for completely unlikely, 100 for absolutely certain.",
+    )
+
+
 class Explanation(BaseModel):
     """Model of snippet or general log explanation from Log Detective"""
 
```
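As its docstring says, `RatedSnippetAnalysis` exists mainly so a JSON schema can be handed to the inference backend for structured output. A minimal sketch of that round trip using standard Pydantic v2 calls; the two models are restated locally for self-containment and the example reply is made up:

```python
from pydantic import BaseModel, Field


class SnippetAnalysis(BaseModel):
    """Plain snippet analysis, mirroring the model above."""
    text: str = Field(description="Analysis of log snippet contents.")


class RatedSnippetAnalysis(SnippetAnalysis):
    """Adds a 0-100 relevance estimate, mirroring the model above."""
    relevance: int = Field(ge=0, le=100)


# Pydantic v2 emits the JSON schema that can be passed to a backend
# supporting structured output.
schema = RatedSnippetAnalysis.model_json_schema()
print(schema["properties"]["relevance"])  # includes 'minimum': 0 and 'maximum': 100

# Validating a raw (hypothetical) LLM reply against the model:
reply = '{"text": "gcc exited with an error", "relevance": 87}'
analysis = RatedSnippetAnalysis.model_validate_json(reply)
print(analysis.relevance)  # 87
```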
```diff
@@ -44,6 +117,7 @@ class Explanation(BaseModel):
     logprobs: Optional[List[Dict]] = None
 
     def __str__(self):
+        """Return text of the Explanation"""
         return self.text
 
 
@@ -54,7 +128,8 @@ class AnalyzedSnippet(BaseModel):
     text: original snippet text
     line_number: location of snippet in original log
    """
-
+
+    explanation: SnippetAnalysis | RatedSnippetAnalysis
    text: str
    line_number: int
 
```
```diff
@@ -82,14 +157,35 @@ class StagedResponse(Response):
     snippets: List[AnalyzedSnippet]
 
 
-class
+class KojiStagedResponse(BaseModel):
+    """Model of data returned by Log Detective API when called when a Koji build
+    analysis is requested. Contains list of reponses to prompts for individual
+    snippets.
+    """
+
+    task_id: int
+    log_file_name: str
+    response: StagedResponse
+
+
+class InferenceConfig(BaseModel): # pylint: disable=too-many-instance-attributes
     """Model for inference configuration of logdetective server."""
 
     max_tokens: int = -1
-    log_probs:
-
-
-
+    log_probs: bool = True
+    url: str = ""
+    # OpenAI client library requires a string to be specified for API token
+    # even if it is not checked on the server side
+    api_token: str = "None"
+    model: str = ""
+    temperature: NonNegativeFloat = DEFAULT_TEMPERATURE
+    max_queue_size: int = LLM_DEFAULT_MAX_QUEUE_SIZE
+    http_timeout: float = 5.0
+    user_role: str = USER_ROLE_DEFAULT
+    system_role: str = SYSTEM_ROLE_DEFAULT
+    llm_api_timeout: float = 15.0
+    _limiter: AsyncLimiter = PrivateAttr(
+        default_factory=lambda: AsyncLimiter(LLM_DEFAULT_REQUESTS_PER_MINUTE))
 
     def __init__(self, data: Optional[dict] = None):
         super().__init__()
```
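The new private `_limiter` attribute wraps `aiolimiter.AsyncLimiter`, which the next hunk exposes through `get_limiter()`. A self-contained sketch of the throttling pattern it enables; the rate of 60 requests per minute and the placeholder inference call are assumptions for illustration only:

```python
import asyncio

from aiolimiter import AsyncLimiter

# Illustrative rate only; the server reads its value from the
# "requests_per_minute" setting or LLM_DEFAULT_REQUESTS_PER_MINUTE.
limiter = AsyncLimiter(60)  # 60 acquisitions per 60-second window


async def analyze_snippet(snippet: str) -> str:
    # AsyncLimiter is an async context manager: entering it waits until
    # capacity is available in the current window.
    async with limiter:
        await asyncio.sleep(0.1)  # stand-in for the real LLM call
        return f"analysis of: {snippet}"


async def main():
    results = await asyncio.gather(*(analyze_snippet(f"snippet {i}") for i in range(5)))
    print(results)


asyncio.run(main())
```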
```diff
@@ -97,53 +193,262 @@ class InferenceConfig(BaseModel):
             return
 
         self.max_tokens = data.get("max_tokens", -1)
-        self.log_probs = data.get("log_probs",
-        self.
+        self.log_probs = data.get("log_probs", True)
+        self.url = data.get("url", "")
+        self.http_timeout = data.get("http_timeout", 5.0)
+        self.api_token = data.get("api_token", "None")
+        self.model = data.get("model", "default-model")
+        self.temperature = data.get("temperature", DEFAULT_TEMPERATURE)
+        self.max_queue_size = data.get("max_queue_size", LLM_DEFAULT_MAX_QUEUE_SIZE)
+        self.user_role = data.get("user_role", USER_ROLE_DEFAULT)
+        self.system_role = data.get("system_role", SYSTEM_ROLE_DEFAULT)
+        self._requests_per_minute = data.get(
+            "requests_per_minute", LLM_DEFAULT_REQUESTS_PER_MINUTE
+        )
+        self.llm_api_timeout = data.get("llm_api_timeout", 15.0)
+        self._limiter = AsyncLimiter(self._requests_per_minute)
+
+    def get_limiter(self):
+        """Return the limiter object so it can be used as a context manager"""
+        return self._limiter
 
 
 class ExtractorConfig(BaseModel):
     """Model for extractor configuration of logdetective server."""
 
-    context: bool = True
     max_clusters: int = 8
     verbose: bool = False
+    max_snippet_len: int = 2000
+    csgrep: bool = False
+
+    _extractors: List[Extractor] = PrivateAttr(default_factory=list)
+
+    def _setup_extractors(self):
+        """Initialize extractors with common settings."""
+        self._extractors = [
+            DrainExtractor(
+                verbose=self.verbose,
+                max_snippet_len=self.max_snippet_len,
+                max_clusters=self.max_clusters,
+            )
+        ]
+
+        if self.csgrep:
+            self._extractors.append(
+                CSGrepExtractor(
+                    verbose=self.verbose,
+                    max_snippet_len=self.max_snippet_len,
+                )
+            )
 
     def __init__(self, data: Optional[dict] = None):
-        super().__init__()
+        super().__init__(data=data)
+
         if data is None:
+            self._setup_extractors()
             return
 
-        self.context = data.get("context", True)
         self.max_clusters = data.get("max_clusters", 8)
         self.verbose = data.get("verbose", False)
+        self.max_snippet_len = data.get("max_snippet_len", 2000)
+        self.csgrep = data.get("csgrep", False)
 
+        self._setup_extractors()
 
-
+    def get_extractors(self) -> List[Extractor]:
+        """Return list of initialized extractors, each will be applied in turn
+        on original log text to retrieve snippets."""
+        return self._extractors
+
+    @field_validator("csgrep", mode="after")
+    @classmethod
+    def validate_csgrep(cls, value: bool) -> bool:
+        """Verify that csgrep is available if requested."""
+        if not check_csgrep():
+            raise ValueError(
+                "Requested csgrep extractor but `csgrep` binary is not in the PATH"
+            )
+        return value
+
+
+class GitLabInstanceConfig(BaseModel): # pylint: disable=too-many-instance-attributes
     """Model for GitLab configuration of logdetective server."""
 
+    name: str = None
     url: str = None
-
+    # Path to API of the gitlab instance, assuming `url` as prefix.
+    api_path: str = None
     api_token: str = None
 
+    # This is a list to support key rotation.
+    # When the key is being changed, we will add the new key as a new entry in
+    # the configuration and then remove the old key once all of the client
+    # webhook configurations have been updated.
+    # If this option is left empty or unspecified, all requests will be
+    # considered authorized.
+    webhook_secrets: Optional[List[str]] = None
+
+    timeout: float = 5.0
+    _conn: Gitlab | None = PrivateAttr(default=None)
+    _http_session: aiohttp.ClientSession | None = PrivateAttr(default=None)
+
     # Maximum size of artifacts.zip in MiB. (default: 300 MiB)
-    max_artifact_size: int = 300
+    max_artifact_size: int = 300 * 1024 * 1024
 
-    def __init__(self, data: Optional[dict] = None):
+    def __init__(self, name: str, data: Optional[dict] = None):
         super().__init__()
         if data is None:
             return
 
+        self.name = name
         self.url = data.get("url", "https://gitlab.com")
-        self.
+        self.api_path = data.get("api_path", "/api/v4")
         self.api_token = data.get("api_token", None)
-        self.
+        self.webhook_secrets = data.get("webhook_secrets", None)
+        self.max_artifact_size = int(data.get("max_artifact_size", 300)) * 1024 * 1024
+
+        self.timeout = data.get("timeout", 5.0)
+        self._conn = Gitlab(
+            url=self.url,
+            private_token=self.api_token,
+            timeout=self.timeout,
+        )
+
+    def get_connection(self):
+        """Get the Gitlab connection object"""
+        return self._conn
+
+    def get_http_session(self):
+        """Return the internal HTTP session so it can be used to contect the
+        Gitlab server. May be used as a context manager."""
+
+        # Create the session on the first attempt. We need to do this "lazily"
+        # because it needs to happen once the event loop is running, even
+        # though the initialization itself is synchronous.
+        if not self._http_session:
+            self._http_session = aiohttp.ClientSession(
+                base_url=self.url,
+                headers={"Authorization": f"Bearer {self.api_token}"},
+                timeout=aiohttp.ClientTimeout(
+                    total=self.timeout,
+                    connect=3.07,
+                ),
+            )
+
+        return self._http_session
+
+    def __del__(self):
+        # Close connection when this object is destroyed
+        if self._http_session:
+            try:
+                loop = asyncio.get_running_loop()
+                loop.create_task(self._http_session.close())
+            except RuntimeError:
+                # No loop running, so create one to close the session
+                loop = asyncio.new_event_loop()
+                loop.run_until_complete(self._http_session.close())
+                loop.close()
+            except Exception: # pylint: disable=broad-exception-caught
+                # We should only get here if we're shutting down, so we don't
+                # really care if the close() completes cleanly.
+                pass
+
+
+class GitLabConfig(BaseModel):
+    """Model for GitLab configuration of logdetective server."""
+
+    instances: Dict[str, GitLabInstanceConfig] = {}
+
+    def __init__(self, data: Optional[dict] = None):
+        super().__init__()
+        if data is None:
+            return
+
+        for instance_name, instance_data in data.items():
+            instance = GitLabInstanceConfig(instance_name, instance_data)
+            self.instances[instance.url] = instance
+
+
+class KojiInstanceConfig(BaseModel):
+    """Model for Koji configuration of logdetective server."""
+
+    name: str = ""
+    xmlrpc_url: str = ""
+    tokens: List[str] = []
+
+    _conn: Optional[koji.ClientSession] = PrivateAttr(default=None)
+    _callbacks: defaultdict[int, set[str]] = PrivateAttr(default_factory=lambda: defaultdict(set))
+
+    def __init__(self, name: str, data: Optional[dict] = None):
+        super().__init__()
+
+        self.name = name
+        if data is None:
+            # Set some reasonable defaults
+            self.xmlrpc_url = "https://koji.fedoraproject.org/kojihub"
+            self.tokens = []
+            self.max_artifact_size = 1024 * 1024
+            return
+
+        self.xmlrpc_url = data.get(
+            "xmlrpc_url", "https://koji.fedoraproject.org/kojihub"
+        )
+        self.tokens = data.get("tokens", [])
+
+    def get_connection(self):
+        """Get the Koji connection object"""
+        if not self._conn:
+            self._conn = koji.ClientSession(self.xmlrpc_url)
+        return self._conn
+
+    def register_callback(self, task_id: int, callback: str):
+        """Register a callback for a task"""
+        self._callbacks[task_id].add(callback)
+
+    def clear_callbacks(self, task_id: int):
+        """Unregister a callback for a task"""
+        try:
+            del self._callbacks[task_id]
+        except KeyError:
+            pass
+
+    def get_callbacks(self, task_id: int) -> set[str]:
+        """Get the callbacks for a task"""
+        return self._callbacks[task_id]
+
+
+class KojiConfig(BaseModel):
+    """Model for Koji configuration of logdetective server."""
+
+    instances: Dict[str, KojiInstanceConfig] = {}
+    analysis_timeout: int = 15
+    max_artifact_size: int = 300 * 1024 * 1024
+
+    def __init__(self, data: Optional[dict] = None):
+        super().__init__()
+        if data is None:
+            return
+
+        # Handle analysis_timeout with default 15
+        self.analysis_timeout = data.get("analysis_timeout", 15)
+
+        # Handle max_artifact_size with default 300
+        self.max_artifact_size = data.get("max_artifact_size", 300) * 1024 * 1024
+
+        # Handle instances dictionary
+        instances_data = data.get("instances", {})
+        for instance_name, instance_data in instances_data.items():
+            self.instances[instance_name] = KojiInstanceConfig(
+                instance_name, instance_data
+            )
 
 
 class LogConfig(BaseModel):
     """Logging configuration"""
 
     name: str = "logdetective"
-
+    level_stream: str | int = "INFO"
+    level_file: str | int = "INFO"
     path: str | None = None
     format: str = BASIC_FORMAT
 
```
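The Koji configuration models added above are plain dict-driven wrappers, so a configuration fragment maps onto them directly. A hypothetical fragment and how `KojiConfig` reads it (assuming the 2.11.0 wheel is installed; the instance name, URL, and token are made up):

```python
from logdetective.server.models import KojiConfig

# Keys mirror what KojiConfig.__init__ and KojiInstanceConfig.__init__ read.
koji_section = {
    "analysis_timeout": 30,
    "max_artifact_size": 100,  # MiB; stored internally in bytes
    "instances": {
        "fedora": {
            "xmlrpc_url": "https://koji.fedoraproject.org/kojihub",
            "tokens": ["hypothetical-token"],
        },
    },
}

koji_config = KojiConfig(koji_section)
print(koji_config.max_artifact_size)  # 104857600 (100 * 1024 * 1024)

fedora = koji_config.instances["fedora"]
session = fedora.get_connection()  # koji.ClientSession, created lazily
```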
```diff
@@ -153,7 +458,8 @@ class LogConfig(BaseModel):
             return
 
         self.name = data.get("name", "logdetective")
-        self.
+        self.level_stream = data.get("level_stream", "INFO").upper()
+        self.level_file = data.get("level_file", "INFO").upper()
         self.path = data.get("path")
         self.format = data.get("format", BASIC_FORMAT)
 
```
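The single log level is split into separate stream and file levels. A sketch of how such a configuration could be applied with the standard `logging` module; the handler wiring below is an assumption for illustration, not the package's actual setup code:

```python
import logging

# Hypothetical values following LogConfig's fields.
log_cfg = {
    "name": "logdetective",
    "level_stream": "INFO",
    "level_file": "DEBUG",
    "path": "logdetective.log",
    "format": logging.BASIC_FORMAT,
}

logger = logging.getLogger(log_cfg["name"])
logger.setLevel(logging.DEBUG)  # let the handlers do the filtering

stream_handler = logging.StreamHandler()
stream_handler.setLevel(log_cfg["level_stream"])

file_handler = logging.FileHandler(log_cfg["path"])
file_handler.setLevel(log_cfg["level_file"])

formatter = logging.Formatter(log_cfg["format"])
for handler in (stream_handler, file_handler):
    handler.setFormatter(formatter)
    logger.addHandler(handler)

logger.debug("written to the file only")
logger.info("written to both the stream and the file")
```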
```diff
@@ -161,7 +467,12 @@ class LogConfig(BaseModel):
 class GeneralConfig(BaseModel):
     """General config options for Log Detective"""
 
-    packages: List[str] =
+    packages: List[str] = []
+    excluded_packages: List[str] = []
+    devmode: bool = False
+    sentry_dsn: HttpUrl | None = None
+    collect_emojis_interval: int = 60 * 60 # seconds
+    top_k_snippets: int = 0
 
     def __init__(self, data: Optional[dict] = None):
         super().__init__()
@@ -169,6 +480,13 @@ class GeneralConfig(BaseModel):
             return
 
         self.packages = data.get("packages", [])
+        self.excluded_packages = data.get("excluded_packages", [])
+        self.devmode = data.get("devmode", False)
+        self.sentry_dsn = data.get("sentry_dsn")
+        self.collect_emojis_interval = data.get(
+            "collect_emojis_interval", 60 * 60
+        ) # seconds
+        self.top_k_snippets = data.get("top_k_snippets", 0)
 
 
 class Config(BaseModel):
@@ -176,8 +494,11 @@ class Config(BaseModel):
 
     log: LogConfig = LogConfig()
     inference: InferenceConfig = InferenceConfig()
+    snippet_inference: InferenceConfig = InferenceConfig()
+    # TODO(jpodivin): Extend to work with multiple extractor configs
     extractor: ExtractorConfig = ExtractorConfig()
     gitlab: GitLabConfig = GitLabConfig()
+    koji: KojiConfig = KojiConfig()
     general: GeneralConfig = GeneralConfig()
 
     def __init__(self, data: Optional[dict] = None):
```
```diff
@@ -190,4 +511,86 @@ class Config(BaseModel):
         self.inference = InferenceConfig(data.get("inference"))
         self.extractor = ExtractorConfig(data.get("extractor"))
         self.gitlab = GitLabConfig(data.get("gitlab"))
+        self.koji = KojiConfig(data.get("koji"))
         self.general = GeneralConfig(data.get("general"))
+
+        if snippet_inference := data.get("snippet_inference", None):
+            self.snippet_inference = InferenceConfig(snippet_inference)
+        else:
+            self.snippet_inference = self.inference
+
+
+class TimePeriod(BaseModel):
+    """Specification for a period of time.
+
+    If no indication is given
+    it falls back to a 2 days period of time.
+
+    Can't be smaller than a hour"""
+
+    weeks: Optional[int] = None
+    days: Optional[int] = None
+    hours: Optional[int] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_exclusive_fields(cls, data):
+        """Check that only one key between weeks, days and hours is defined,
+        if no period is specified, fall back to 2 days."""
+        if isinstance(data, dict):
+            how_many_fields = sum(
+                1
+                for field in ["weeks", "days", "hours"]
+                if field in data and data[field] is not None
+            )
+
+            if how_many_fields == 0:
+                data["days"] = 2 # by default fallback to a 2 days period
+
+            if how_many_fields > 1:
+                raise ValueError("Only one of months, weeks, days, or hours can be set")
+
+        return data
+
+    @field_validator("weeks", "days", "hours")
+    @classmethod
+    def check_positive(cls, v):
+        """Check that the given value is positive"""
+        if v is not None and v <= 0:
+            raise ValueError("Time period must be positive")
+        return v
+
+    def get_time_period(self) -> datetime.timedelta:
+        """Get the period of time represented by this input model.
+        Will default to 2 days, if no period is set.
+
+        Returns:
+            datetime.timedelta: The time period as a timedelta object.
+        """
+        delta = None
+        if self.weeks:
+            delta = datetime.timedelta(weeks=self.weeks)
+        elif self.days:
+            delta = datetime.timedelta(days=self.days)
+        elif self.hours:
+            delta = datetime.timedelta(hours=self.hours)
+        else:
+            delta = datetime.timedelta(days=2)
+        return delta
+
+    def get_period_start_time(
+        self, end_time: Optional[datetime.datetime] = None
+    ) -> datetime.datetime:
+        """Calculate the start time of this period based on the end time.
+
+        Args:
+            end_time (datetime.datetime, optional): The end time of the period.
+                Defaults to current UTC time if not provided.
+
+        Returns:
+            datetime.datetime: The start time of the period.
+        """
+        time = end_time or datetime.datetime.now(datetime.timezone.utc)
+        if time.tzinfo is None:
+            time = time.replace(tzinfo=datetime.timezone.utc)
+        return time - self.get_time_period()
```