logdetective 2.0.1__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. logdetective/extractors.py +134 -23
  2. logdetective/logdetective.py +39 -23
  3. logdetective/models.py +26 -0
  4. logdetective/prompts-summary-first.yml +0 -2
  5. logdetective/prompts.yml +0 -3
  6. logdetective/server/compressors.py +7 -10
  7. logdetective/server/config.py +3 -2
  8. logdetective/server/database/base.py +31 -26
  9. logdetective/server/database/models/__init__.py +2 -2
  10. logdetective/server/database/models/exceptions.py +4 -0
  11. logdetective/server/database/models/koji.py +47 -30
  12. logdetective/server/database/models/merge_request_jobs.py +205 -186
  13. logdetective/server/database/models/metrics.py +87 -61
  14. logdetective/server/emoji.py +57 -55
  15. logdetective/server/exceptions.py +4 -0
  16. logdetective/server/gitlab.py +18 -11
  17. logdetective/server/llm.py +19 -10
  18. logdetective/server/metric.py +18 -13
  19. logdetective/server/models.py +65 -48
  20. logdetective/server/plot.py +13 -11
  21. logdetective/server/server.py +52 -30
  22. logdetective/server/templates/base_response.html.j2 +59 -0
  23. logdetective/server/templates/gitlab_full_comment.md.j2 +58 -53
  24. logdetective/server/templates/gitlab_short_comment.md.j2 +52 -47
  25. logdetective/server/utils.py +15 -27
  26. logdetective/utils.py +115 -49
  27. {logdetective-2.0.1.dist-info → logdetective-2.11.0.dist-info}/METADATA +95 -21
  28. logdetective-2.11.0.dist-info/RECORD +40 -0
  29. {logdetective-2.0.1.dist-info → logdetective-2.11.0.dist-info}/WHEEL +1 -1
  30. logdetective-2.0.1.dist-info/RECORD +0 -39
  31. {logdetective-2.0.1.dist-info → logdetective-2.11.0.dist-info}/entry_points.txt +0 -0
  32. {logdetective-2.0.1.dist-info → logdetective-2.11.0.dist-info/licenses}/LICENSE +0 -0
logdetective/extractors.py CHANGED
@@ -1,57 +1,168 @@
  import os
  import logging
+ import subprocess as sp
  from typing import Tuple

  import drain3
  from drain3.template_miner_config import TemplateMinerConfig
+ from pydantic import ValidationError

  from logdetective.utils import get_chunks, filter_snippet_patterns
- from logdetective.models import SkipSnippets
+ from logdetective.models import SkipSnippets, CSGrepOutput

  LOG = logging.getLogger("logdetective")


- class DrainExtractor:
-     """A class that extracts information from logs using a template miner algorithm."""
+ class Extractor:
+     """Base extractor class."""

      def __init__(
          self,
          verbose: bool = False,
-         context: bool = False,
-         max_clusters=8,
          skip_snippets: SkipSnippets = SkipSnippets({}),
-         max_snippet_len: int = 2000
-     ):  # pylint: disable=R0913,R0917
-         config = TemplateMinerConfig()
-         config.load(f"{os.path.dirname(__file__)}/drain3.ini")
-         config.profiling_enabled = verbose
-         config.drain_max_clusters = max_clusters
-         self.miner = drain3.TemplateMiner(config=config)
+         max_snippet_len: int = 2000,
+     ):
          self.verbose = verbose
-         self.context = context
          self.skip_snippets = skip_snippets
          self.max_snippet_len = max_snippet_len

+         if self.verbose:
+             LOG.setLevel(logging.DEBUG)
+
      def __call__(self, log: str) -> list[Tuple[int, str]]:
-         out = []
-         # Create chunks
-         chunks = list(get_chunks(log, self.max_snippet_len))
-         # Keep only chunks that don't match any of the excluded patterns
+         raise NotImplementedError
+
+     def filter_snippet_patterns(
+         self, chunks: list[tuple[int, str]]
+     ) -> list[tuple[int, str]]:
+         """Keep only chunks that don't match any of the excluded patterns"""
          chunks = [
              (_, chunk)
              for _, chunk in chunks
              if not filter_snippet_patterns(chunk, self.skip_snippets)
          ]
-         # First pass create clusters
+         return chunks
+
+
+ class DrainExtractor(Extractor):
+     """A class that extracts information from logs using a template miner algorithm."""
+
+     _clusters: list
+
+     def __init__(
+         self,
+         verbose: bool = False,
+         skip_snippets: SkipSnippets = SkipSnippets({}),
+         max_snippet_len: int = 2000,
+         max_clusters: int = 8,
+     ):
+         super().__init__(verbose, skip_snippets, max_snippet_len)
+         config = TemplateMinerConfig()
+         config.load(f"{os.path.dirname(__file__)}/drain3.ini")
+         config.profiling_enabled = verbose
+         config.drain_max_clusters = max_clusters
+         self.miner = drain3.TemplateMiner(config=config)
+
+     def __call__(self, log: str) -> list[Tuple[int, str]]:
+         # Create chunks
+         chunks = list(get_chunks(log, self.max_snippet_len))
+
+         chunks = self.filter_snippet_patterns(chunks)
+
+         # First pass to create clusters
+         self._create_clusters(chunks=chunks)
+
+         # Second pass, only matching lines with clusters,
+         # to recover original text
+         snippets = self._extract_messages(chunks=chunks)
+         return snippets
+
+     def _create_clusters(self, chunks: list[tuple[int, str]]):
+         """First pass to create clusters"""
          for _, chunk in chunks:
              processed_chunk = self.miner.add_log_message(chunk)
              LOG.debug(processed_chunk)
-         clusters = list(self.miner.drain.clusters)
-         # Second pass, only matching lines with clusters,
-         # to recover original text
+         self._clusters = list(self.miner.drain.clusters)
+
+     def _extract_messages(self, chunks: list[tuple[int, str]]) -> list[tuple[int, str]]:
+         """Second pass with drain using patterns from the first,
+         to extract matching lines and their numbers."""
+         out = []
+
          for chunk_start, chunk in chunks:
              cluster = self.miner.match(chunk, "always")
-             if cluster in clusters:
+             if cluster in self._clusters:
                  out.append((chunk_start, chunk))
-                 clusters.remove(cluster)
+                 self._clusters.remove(cluster)
          return out
+
+
+ class CSGrepExtractor(DrainExtractor):
+     """Extract messages using csgrep.
+     This extractor is only effective at retrieving messages from the GCC
+     compiler and associated utilities; it is not capable of safely
+     extracting other messages from the logs. Therefore, it must only
+     be used together with the Drain based extractor."""
+
+     def __init__(
+         self,
+         verbose: bool = False,
+         skip_snippets: SkipSnippets = SkipSnippets({}),
+         max_snippet_len: int = 2000,
+         max_clusters: int = 8,
+     ):
+         super().__init__(verbose, skip_snippets, max_snippet_len, max_clusters)
+
+     def __call__(self, log: str) -> list[Tuple[int, str]]:
+         """Extract error messages from log using csgrep"""
+         chunks = []
+         try:
+             # We are not running the binary in check mode, since csgrep
+             # can produce many errors due to log file syntax
+             result = sp.run(
+                 [
+                     "csgrep",
+                     "--event=error",
+                     "--remove-duplicates",
+                     "--mode=json",
+                     "--quiet",
+                 ],
+                 input=log,
+                 shell=False,
+                 check=False,
+                 capture_output=True,
+                 text=True,
+                 timeout=1.0,
+             )
+         except sp.TimeoutExpired as ex:
+             LOG.exception("Exception encountered while parsing log with csgrep %s", ex)
+             raise ex
+         if result.returncode != 0:
+             # This can happen even if `csgrep` managed to extract useful info.
+             # Most commonly, when it encountered unexpected syntax in the log.
+             LOG.warning("csgrep call resulted in an error")
+             LOG.debug("csgrep error: `%s`", result.stderr)
+         if not result.stdout:
+             return []
+
+         # Parse JSON output from csgrep
+         try:
+             report = CSGrepOutput.model_validate_json(result.stdout)
+         except ValidationError as ex:
+             LOG.exception("Exception encountered while parsing csgrep output %s", ex)
+             raise ex
+         for defect in report.defects:
+             # A single original error message can be split across multiple events;
+             # before returning, we turn them back into a single string.
+             # We must also extract the original line number. The line number is NOT
+             # the location of the message in the log, but the location of the issue
+             # in the source; we can't mix the two, so we set it to `0`.
+
+             chunks.append((0, "\n".join([event.message for event in defect.events])))
+
+         chunks = self.filter_snippet_patterns(chunks)
+         LOG.info("Total %d messages extracted with csgrep", len(chunks))
+         self._create_clusters(chunks=chunks)
+         snippets = self._extract_messages(chunks=chunks)
+
+         return snippets
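
Note: the refactor splits the former monolithic `DrainExtractor` into an `Extractor` base class and two chainable subclasses, with `CSGrepExtractor` reusing the Drain clustering passes to deduplicate csgrep hits. A minimal sketch of how the two might be run side by side; the `summarize` helper and the final sort are illustrative assumptions, since the released CLI delegates this to `mine_logs`:

```python
from logdetective.extractors import DrainExtractor, CSGrepExtractor


def summarize(log: str) -> list[tuple[int, str]]:
    # csgrep only recovers GCC-style diagnostics, so it runs alongside
    # the Drain extractor, never instead of it.
    extractors = [DrainExtractor(verbose=True), CSGrepExtractor(verbose=True)]
    snippets: list[tuple[int, str]] = []
    for extractor in extractors:
        snippets.extend(extractor(log))
    # Keep snippets roughly in log order; csgrep snippets carry line 0.
    return sorted(snippets, key=lambda pair: pair[0])


print(summarize(open("build.log", encoding="utf-8").read()))
```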
logdetective/logdetective.py CHANGED
@@ -15,8 +15,10 @@ from logdetective.utils import (
      compute_certainty,
      load_prompts,
      load_skip_snippet_patterns,
+     check_csgrep,
+     mine_logs,
  )
- from logdetective.extractors import DrainExtractor
+ from logdetective.extractors import DrainExtractor, CSGrepExtractor

  LOG = logging.getLogger("logdetective")

@@ -89,10 +91,13 @@ def setup_args():
          default=f"{os.path.dirname(__file__)}/skip_snippets.yml",
          help="Path to patterns for skipping snippets.",
      )
+     parser.add_argument(
+         "--csgrep", action="store_true", help="Use csgrep to process the log."
+     )
      return parser.parse_args()


- async def run():  # pylint: disable=too-many-statements,too-many-locals
+ async def run():  # pylint: disable=too-many-statements,too-many-locals,too-many-branches
      """Main execution function."""
      args = setup_args()

@@ -134,13 +139,25 @@ async def run():  # pylint: disable=too-many-statements,too-many-locals
          sys.exit(5)

      # Log file summarizer initialization
-     extractor = DrainExtractor(
-         args.verbose > 1,
-         context=True,
-         max_clusters=args.n_clusters,
-         skip_snippets=skip_snippets,
+     extractors = []
+     extractors.append(
+         DrainExtractor(
+             args.verbose > 1,
+             max_clusters=args.n_clusters,
+             skip_snippets=skip_snippets,
+         )
      )

+     if args.csgrep:
+         if not check_csgrep():
+             LOG.error(
+                 "You have requested use of `csgrep` when it isn't available on your system."
+             )
+             sys.exit(6)
+         extractors.append(
+             CSGrepExtractor(args.verbose > 1, skip_snippets=skip_snippets)
+         )
+
      LOG.info("Getting summary")

      async with aiohttp.ClientSession() as http:
@@ -150,22 +167,13 @@
              # file does not exist
              LOG.error(e)
              sys.exit(4)
-         log_summary = extractor(log)
-
-         ratio = len(log_summary) / len(log.split("\n"))
-
-         LOG.info("Compression ratio: %s", ratio)

+         log_summary = mine_logs(log=log, extractors=extractors)
          LOG.info("Analyzing the text")

          log_summary = format_snippets(log_summary)
          LOG.info("Log summary: \n %s", log_summary)

-         prompt = (
-             f"{prompts_configuration.default_system_prompt}\n"
-             f"{prompts_configuration.prompt_template}"
-         )
-
          stream = True
          if args.no_stream:
              stream = False
@@ -173,30 +181,38 @@
              log_summary,
              model,
              stream,
-             prompt_template=prompt,
+             prompt_templates=prompts_configuration,
              temperature=args.temperature,
          )
          probs = []
          print("Explanation:")
          # We need to extract top token probability from the response
-         # CreateCompletionResponse structure of llama-cpp-python.
+         # CreateChatCompletionResponse structure of llama-cpp-python.
          # `compute_certainty` function expects list of dictionaries with form
          # { 'logprob': <float> } as expected from the OpenAI API.

          if args.no_stream:
-             print(response["choices"][0]["text"])
+             print(response["choices"][0]["message"]["content"])
              probs = [
-                 {"logprob": e} for e in response["choices"][0]["logprobs"]["token_logprobs"]
+                 {"logprob": e["logprob"]} for e in response["choices"][0]["logprobs"]["content"]
              ]

          else:
              # Stream the output
              for chunk in response:
+                 # The first (or possibly any other) chunk may not contain the fields
+                 # choices[0].delta.content or choices[0].logprobs -> if so, we just skip it
+                 if any([
+                     'content' not in chunk["choices"][0]["delta"],
+                     'logprobs' not in chunk["choices"][0]
+                 ]):
+                     continue
+
                  if isinstance(chunk["choices"][0]["logprobs"], dict):
                      probs.append(
-                         {"logprob": chunk["choices"][0]["logprobs"]["token_logprobs"][0]}
+                         {"logprob": chunk["choices"][0]["logprobs"]["content"][0]["logprob"]}
                      )
-                 delta = chunk["choices"][0]["text"]
+                 delta = chunk["choices"][0]["delta"]["content"]
                  print(delta, end="", flush=True)
          certainty = compute_certainty(probs)

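Note: the guard added to the streaming loop reflects the switch from `CreateCompletionResponse` to chat-completion responses, whose first streamed chunk typically carries only a role delta and whose last carries only a finish reason. A self-contained sketch against fabricated chunk dicts (not captured API output), showing the shapes the loop now expects:

```python
# Fabricated chat-completion stream chunks, for illustration only.
chunks = [
    {"choices": [{"delta": {"role": "assistant"}}]},  # no content, no logprobs
    {"choices": [{"delta": {"content": "Build failed: "},
                  "logprobs": {"content": [{"logprob": -0.12}]}}]},
    {"choices": [{"delta": {"content": "missing header."},
                  "logprobs": {"content": [{"logprob": -0.34}]}}]},
]

probs = []
for chunk in chunks:
    choice = chunk["choices"][0]
    # Skip chunks that carry neither text nor logprobs, such as the
    # initial role-only delta or a final finish_reason-only chunk.
    if "content" not in choice["delta"] or "logprobs" not in choice:
        continue
    if isinstance(choice["logprobs"], dict):
        probs.append({"logprob": choice["logprobs"]["content"][0]["logprob"]})
    print(choice["delta"]["content"], end="", flush=True)
print(f"\ncollected {len(probs)} logprobs")
```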
logdetective/models.py CHANGED
@@ -71,3 +71,29 @@ class SkipSnippets(BaseModel):
              ) from ex

          return data
+
+
+ class CSGrepEvent(BaseModel):
+     """`csgrep` splits error and warning messages into individual events."""
+
+     file_name: str
+     line: int
+     event: str
+     message: str
+     verbosity_level: int
+
+
+ class CSGrepDefect(BaseModel):
+     """Defects detected by `csgrep`"""
+
+     checker: str
+     language: str
+     tool: str
+     key_event_idx: int
+     events: list[CSGrepEvent]
+
+
+ class CSGrepOutput(BaseModel):
+     """Parsed output of `csgrep`"""
+
+     defects: list[CSGrepDefect]
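
Note: the three models mirror the structure of csgrep's `--mode=json` report: a top-level `defects` list whose entries group related `events`. A hedged sketch validating a hand-written payload (fabricated for illustration, not real csgrep output):

```python
from logdetective.models import CSGrepOutput

SAMPLE = """
{
  "defects": [
    {
      "checker": "COMPILER_WARNING",
      "language": "c/c++",
      "tool": "gcc",
      "key_event_idx": 0,
      "events": [
        {
          "file_name": "src/main.c",
          "line": 42,
          "event": "error",
          "message": "implicit declaration of function 'foo'",
          "verbosity_level": 0
        }
      ]
    }
  ]
}
"""

report = CSGrepOutput.model_validate_json(SAMPLE)
for defect in report.defects:
    # Mirror the extractor: join per-event messages back into one string.
    print("\n".join(event.message for event in defect.events))
```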
logdetective/prompts-summary-first.yml CHANGED
@@ -18,5 +18,3 @@ prompt_template: |
    Snippets:

    {}
-
-   Analysis:
logdetective/prompts.yml CHANGED
@@ -19,7 +19,6 @@ prompt_template: |

    {}

-   Analysis:

  snippet_prompt_template: |
    Analyse following RPM build log snippet. Describe contents accurately, without speculation or suggestions for resolution
@@ -30,7 +29,6 @@ snippet_prompt_template: |

    {}

-   Analysis:

  prompt_template_staged: |
    Given following log snippets, their explanation, and nothing else, explain what failure, if any, occurred during build of this package.
@@ -47,7 +45,6 @@ prompt_template_staged: |

    {}

-   Analysis:

  # System prompts
  # System prompts are meant to serve as general guide for model behavior,
logdetective/server/compressors.py CHANGED
@@ -36,20 +36,17 @@ class TextCompressor:
          zip_buffer.seek(0)
          return zip_buffer.getvalue()

-     def unzip(self, zip_data: Union[bytes, io.BytesIO]) -> str:
+     def unzip(self, zip_data: bytes) -> Dict[str, str]:
          """
          Uncompress data created by TextCompressor.zip().

          Args:
-             zip_data: A zipped stream of bytes or BytesIO object
+             zip_data: A zipped stream of bytes

          Returns:
              {file_name: str}: The decompressed content as a dict of file names and UTF-8 strings
          """
-         if isinstance(zip_data, bytes):
-             zip_buffer = io.BytesIO(zip_data)
-         else:
-             zip_buffer = zip_data
+         zip_buffer = io.BytesIO(zip_data)

          content = {}
          with zipfile.ZipFile(zip_buffer, "r") as zip_file:
@@ -95,12 +92,12 @@ class RemoteLogCompressor:
          return self.zip_text(content_text)

      @classmethod
-     def unzip(cls, zip_data: Union[bytes, io.BytesIO]) -> str:
+     def unzip(cls, zip_data: bytes) -> str:
          """
          Uncompress the zipped content of the remote log.

          Args:
-             zip_data: Compressed data as bytes or BytesIO
+             zip_data: Compressed data as bytes

          Returns:
              str: The decompressed log content
@@ -147,13 +144,13 @@ class LLMResponseCompressor:

      @classmethod
      def unzip(
-         cls, zip_data: Union[bytes, io.BytesIO]
+         cls, zip_data: bytes
      ) -> Union[StagedResponse, Response]:
          """
          Uncompress the zipped content of the LLM response.

          Args:
-             zip_data: Compressed data as bytes or BytesIO
+             zip_data: Compressed data as bytes

          Returns:
              Union[StagedResponse, Response]: The decompressed (partial) response object,
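
Note: narrowing `unzip()` from `Union[bytes, io.BytesIO]` to plain `bytes` drops the isinstance branch and moves the `BytesIO` wrapping inside the method. A minimal round-trip sketch of the bytes-only contract; the helper names `zip_text` and `unzip_bytes` are illustrative, not the package's API:

```python
import io
import zipfile


def zip_text(name: str, text: str) -> bytes:
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.writestr(name, text)
    buf.seek(0)
    return buf.getvalue()


def unzip_bytes(zip_data: bytes) -> dict[str, str]:
    # Callers now always hand over bytes; wrapping in BytesIO happens here.
    buf = io.BytesIO(zip_data)
    content = {}
    with zipfile.ZipFile(buf, "r") as zf:
        for name in zf.namelist():
            content[name] = zf.read(name).decode("utf-8")
    return content


data = zip_text("build.log", "error: linker failed")
assert unzip_bytes(data) == {"build.log": "error: linker failed"}
```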
logdetective/server/config.py CHANGED
@@ -52,10 +52,11 @@ def get_log(config: Config):
      return log


- def get_openai_api_client(ineference_config: InferenceConfig):
+ def get_openai_api_client(inference_config: InferenceConfig):
      """Set up AsyncOpenAI client with default configuration."""
      return AsyncOpenAI(
-         api_key=ineference_config.api_token, base_url=ineference_config.url
+         api_key=inference_config.api_token, base_url=inference_config.url,
+         timeout=inference_config.llm_api_timeout
      )

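Note: besides the `ineference_config` spelling fix, the factory now passes an explicit request timeout to the client. A sketch of the corrected shape; the `FakeInferenceConfig` stand-in and its values are made up, only the field names come from the diff:

```python
from openai import AsyncOpenAI


class FakeInferenceConfig:
    """Stand-in for the package's InferenceConfig model."""

    api_token = "sk-example"          # made-up credential
    url = "http://localhost:8000/v1"  # made-up inference endpoint
    llm_api_timeout = 60.0            # seconds


def get_openai_api_client(inference_config) -> AsyncOpenAI:
    # Without an explicit timeout the openai client falls back to its own,
    # much longer, default, tying up server workers on a slow backend.
    return AsyncOpenAI(
        api_key=inference_config.api_token,
        base_url=inference_config.url,
        timeout=inference_config.llm_api_timeout,
    )


client = get_openai_api_client(FakeInferenceConfig())
```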
logdetective/server/database/base.py CHANGED
@@ -1,15 +1,14 @@
  from os import getenv
- from contextlib import contextmanager
- from sqlalchemy import create_engine
- from sqlalchemy.orm import sessionmaker, declarative_base
-
+ from contextlib import asynccontextmanager
+ from sqlalchemy.orm import DeclarativeBase
+ from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker
  from logdetective import logger


  def get_pg_url() -> str:
      """create postgresql connection string"""
      return (
-         f"postgresql+psycopg2://{getenv('POSTGRESQL_USER')}"
+         f"postgresql+asyncpg://{getenv('POSTGRESQL_USER')}"
          f":{getenv('POSTGRESQL_PASSWORD')}@{getenv('POSTGRESQL_HOST', 'postgres')}"
          f":{getenv('POSTGRESQL_PORT', '5432')}/{getenv('POSTGRESQL_DATABASE')}"
      )
@@ -23,13 +22,16 @@ sqlalchemy_echo = getenv("SQLALCHEMY_ECHO", "False").lower() in (
      "y",
      "1",
  )
- engine = create_engine(get_pg_url(), echo=sqlalchemy_echo)
- SessionFactory = sessionmaker(autoflush=True, bind=engine)
- Base = declarative_base()
+ engine = create_async_engine(get_pg_url(), echo=sqlalchemy_echo)
+ SessionFactory = async_sessionmaker(autoflush=True, bind=engine)  # pylint: disable=invalid-name
+
+
+ class Base(DeclarativeBase):
+     """Declarative base class for all ORM models."""


- @contextmanager
- def transaction(commit: bool = False):
+ @asynccontextmanager
+ async def transaction(commit: bool = False):
      """
      Context manager for 'framing' a db transaction.

@@ -39,27 +41,30 @@ def transaction(commit: bool = False):
      """

      session = SessionFactory()
-     try:
-         yield session
-         if commit:
-             session.commit()
-     except Exception as ex:
-         logger.warning("Exception while working with database: %s", str(ex))
-         session.rollback()
-         raise
-     finally:
-         session.close()
-
-
- def init():
+     async with session:
+         try:
+             yield session
+             if commit:
+                 await session.commit()
+         except Exception as ex:
+             logger.warning("Exception while working with database: %s", str(ex))
+             await session.rollback()
+             raise
+         finally:
+             await session.close()
+
+
+ async def init():
      """Init db"""
-     Base.metadata.create_all(engine)
+     async with engine.begin() as conn:
+         await conn.run_sync(Base.metadata.create_all)
      logger.debug("Database initialized")


- def destroy():
+ async def destroy():
      """Destroy db"""
-     Base.metadata.drop_all(engine)
+     async with engine.begin() as conn:
+         await conn.run_sync(Base.metadata.drop_all)
      logger.warning("Database cleaned")

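Note: the module now builds an asyncpg-backed engine, an `async_sessionmaker`, and an `asynccontextmanager`-based `transaction()`, so every caller has to await session work. A hedged sketch of consuming the new API, assuming the `POSTGRESQL_*` environment variables are set; the `select(1)` probe is illustrative:

```python
import asyncio

from sqlalchemy import select

from logdetective.server.database.base import init, transaction


async def main() -> None:
    # init() now wraps Base.metadata.create_all in engine.begin()/run_sync.
    await init()
    async with transaction(commit=True) as session:
        # AsyncSession methods are awaited; commit and rollback inside
        # the context manager are awaited the same way.
        result = await session.execute(select(1))
        print(result.scalar_one())


asyncio.run(main())
```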
logdetective/server/database/models/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from logdetective.server.database.base import Base
  from logdetective.server.database.models.merge_request_jobs import (
      Forge,
      GitlabMergeRequestJobs,
@@ -18,8 +17,9 @@ from logdetective.server.database.models.exceptions import (
      KojiTaskAnalysisTimeoutError,
  )

+ # pylint: disable=undefined-all-variable
+
  __all__ = [
-     Base.__name__,
      GitlabMergeRequestJobs.__name__,
      Comments.__name__,
      Reactions.__name__,
logdetective/server/database/models/exceptions.py CHANGED
@@ -11,3 +11,7 @@ class KojiTaskNotAnalyzedError(Exception):

  class KojiTaskAnalysisTimeoutError(Exception):
      """Exception raised when a koji task analysis has timed out"""
+
+
+ class AnalyzeRequestMetricsNotFroundError(Exception):
+     """Exception raised when AnalyzeRequestMetrics is not found"""