logdetective 0.4.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. logdetective/constants.py +33 -12
  2. logdetective/extractors.py +137 -68
  3. logdetective/logdetective.py +102 -33
  4. logdetective/models.py +99 -0
  5. logdetective/prompts-summary-first.yml +20 -0
  6. logdetective/prompts-summary-only.yml +13 -0
  7. logdetective/prompts.yml +90 -0
  8. logdetective/remote_log.py +67 -0
  9. logdetective/server/compressors.py +186 -0
  10. logdetective/server/config.py +78 -0
  11. logdetective/server/database/base.py +34 -26
  12. logdetective/server/database/models/__init__.py +33 -0
  13. logdetective/server/database/models/exceptions.py +17 -0
  14. logdetective/server/database/models/koji.py +143 -0
  15. logdetective/server/database/models/merge_request_jobs.py +623 -0
  16. logdetective/server/database/models/metrics.py +427 -0
  17. logdetective/server/emoji.py +148 -0
  18. logdetective/server/exceptions.py +37 -0
  19. logdetective/server/gitlab.py +451 -0
  20. logdetective/server/koji.py +159 -0
  21. logdetective/server/llm.py +309 -0
  22. logdetective/server/metric.py +75 -30
  23. logdetective/server/models.py +426 -23
  24. logdetective/server/plot.py +432 -0
  25. logdetective/server/server.py +580 -468
  26. logdetective/server/templates/base_response.html.j2 +59 -0
  27. logdetective/server/templates/gitlab_full_comment.md.j2 +73 -0
  28. logdetective/server/templates/gitlab_short_comment.md.j2 +62 -0
  29. logdetective/server/utils.py +98 -32
  30. logdetective/skip_snippets.yml +12 -0
  31. logdetective/utils.py +187 -73
  32. logdetective-2.11.0.dist-info/METADATA +568 -0
  33. logdetective-2.11.0.dist-info/RECORD +40 -0
  34. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/WHEEL +1 -1
  35. logdetective/server/database/models.py +0 -88
  36. logdetective-0.4.0.dist-info/METADATA +0 -333
  37. logdetective-0.4.0.dist-info/RECORD +0 -19
  38. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/entry_points.txt +0 -0
  39. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info/licenses}/LICENSE +0 -0
logdetective/utils.py CHANGED
@@ -1,63 +1,80 @@
  import logging
  import os
+ import subprocess as sp
  from typing import Iterator, List, Dict, Tuple, Generator
  from urllib.parse import urlparse
+
+ import aiohttp
  import numpy as np
- import requests
+ import yaml

- from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
- from logdetective.constants import PROMPT_TEMPLATE, SNIPPET_DELIMITER
- from logdetective.server.models import AnalyzedSnippet
+ from llama_cpp import (
+     Llama,
+     CreateChatCompletionResponse,
+     CreateChatCompletionStreamResponse,
+ )
+ from logdetective.constants import SNIPPET_DELIMITER
+ from logdetective.models import PromptConfig, SkipSnippets
+ from logdetective.remote_log import RemoteLog

  LOG = logging.getLogger("logdetective")


- def chunk_continues(text: str, index: int) -> bool:
+ def new_message(text: str) -> bool:
      """Set of heuristics for determining whether or not
      does the current chunk of log text continue on next line.

      Following rules are checked, in order:
-     * is the next character is whitespace
-     * is the previous character backslash '\\'
-     * is the previous character colon ':'
-
+     * is the first character is whitespace
+     * is the first character backslash '|'
      """
      conditionals = [
-         lambda i, string: string[i + 1].isspace(),
-         lambda i, string: string[i - 1] == "\\",
-         lambda i, string: string[i - 1] == ":",
+         lambda string: string[0].isspace(),
+         lambda string: string[0] == "|",
      ]

      for c in conditionals:
-         y = c(index, text)
+         y = c(text)
          if y:
-             return True
+             return False

-     return False
+     return True


- def get_chunks(text: str) -> Generator[Tuple[int, str], None, None]:
+ def get_chunks(
+     text: str, max_chunk_len: int = 2000
+ ) -> Generator[Tuple[int, str], None, None]:
      """Split log into chunks according to heuristic
      based on whitespace and backslash presence.
      """
-     text_len = len(text)
-     i = 0
+     lines = text.splitlines()
+
+     # Chunk we will be yielding
      chunk = ""
-     # Keep track of the original and next line number
-     # every `\n` hit increases the next_line_number by one.
-     original_line_number = 0
-     next_line_number = 0
-     while i < text_len:
-         chunk += text[i]
-         if text[i] == "\n":
-             next_line_number += 1
-         if i + 1 < text_len and chunk_continues(text, i):
-             i += 1
-             continue
-         yield (original_line_number, chunk)
-         original_line_number = next_line_number + 1
-         chunk = ""
-         i += 1
+     # Number of line where the message started
+     original_line = 0
+     for i, line in enumerate(lines):
+         if len(line) == 0:
+             continue
+         if new_message(line):
+             # Yield chunk if we have it
+             if len(chunk) > 0:
+                 yield (original_line, chunk)
+             original_line = i
+             chunk = line
+         else:
+             chunk += "\n" + line
+         if len(chunk) > max_chunk_len:
+             # If the chunk is too long, keep splitting into smaller chunks
+             # until we reach manageable size
+             while len(chunk) > max_chunk_len:
+                 remainder = chunk[max_chunk_len:]
+                 chunk = chunk[:max_chunk_len]
+                 yield (original_line, chunk)
+                 chunk = remainder
+
+     # if we still have some text left over
+     yield (original_line, chunk)


  def initialize_model(
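The chunking heuristic was rewritten from character-by-character scanning to a line-based pass: `new_message()` decides whether a line starts a new log message (continuation lines begin with whitespace or `|`), and `get_chunks()` yields `(starting_line_number, chunk)` tuples, further splitting any chunk longer than `max_chunk_len`. A minimal usage sketch, with a made-up log excerpt:

```python
from logdetective.utils import get_chunks

# Hypothetical log excerpt: the indented line and the line starting with "|"
# are treated as continuations of the first message.
log_text = (
    "error: build failed\n"
    "    caused by: missing header foo.h\n"
    "| additional compiler output\n"
    "next message starts here"
)

for line_no, chunk in get_chunks(log_text, max_chunk_len=2000):
    print(f"--- chunk starting at line {line_no} ---")
    print(chunk)
# Expected: one chunk starting at line 0 and one starting at line 3.
```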
@@ -110,25 +127,43 @@ def compute_certainty(probs: List[Dict]) -> float:


  def process_log(
-     log: str, model: Llama, stream: bool
- ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]:
+     log: str, model: Llama, stream: bool, prompt_templates: PromptConfig, temperature: float
+ ) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]:
      """Processes a given log using the provided language model and returns its summary.

      Args:
          log (str): The input log to be processed.
          model (Llama): The language model used for processing the log.
-
+         stream (bool): Return output as Iterator.
+         prompt_template (str): Which prompt template to use.
+         temperature (float): Temperature parameter for model runtime.
      Returns:
          str: The summary of the given log generated by the language model.
      """
-     response = model(
-         prompt=PROMPT_TEMPLATE.format(log), stream=stream, max_tokens=0, logprobs=1
+     messages = [
+         {
+             "role": "system",
+             "content": prompt_templates.default_system_prompt
+         },
+         {
+             "role": "user",
+             "content": prompt_templates.prompt_template.format(log)
+         },
+     ]
+
+     response = model.create_chat_completion(
+         messages=messages,
+         stream=stream,
+         max_tokens=0,
+         logprobs=True,
+         top_logprobs=1,
+         temperature=temperature,
      )

      return response


- def retrieve_log_content(log_path: str) -> str:
+ async def retrieve_log_content(http: aiohttp.ClientSession, log_path: str) -> str:
      """Get content of the file on the log_path path.
      Path is assumed to be valid URL if it has a scheme.
      Otherwise it attempts to pull it from local filesystem."""
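Both functions in this hunk changed shape: `process_log()` now builds chat messages from a `PromptConfig` and calls `create_chat_completion()`, and `retrieve_log_content()` became a coroutine that takes an `aiohttp` session. A minimal end-to-end sketch under assumed values (the model path, URL, and temperature are placeholders; with `stream=False`, llama-cpp-python returns an OpenAI-style response dict):

```python
import asyncio

import aiohttp
from llama_cpp import Llama

from logdetective.utils import load_prompts, process_log, retrieve_log_content


async def summarize(url: str) -> str:
    # Fetch the log over HTTP via the new aiohttp-based code path.
    async with aiohttp.ClientSession() as http:
        log = await retrieve_log_content(http, url)

    # Placeholder model path; load_prompts(None) falls back to the built-in defaults.
    model = Llama(model_path="/path/to/model.gguf")
    prompts = load_prompts(None)

    response = process_log(
        log, model, stream=False, prompt_templates=prompts, temperature=0.8
    )
    # Non-streaming chat completions come back as an OpenAI-style dict.
    return response["choices"][0]["message"]["content"]


# print(asyncio.run(summarize("https://example.org/build.log")))
```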
@@ -143,7 +178,8 @@ def retrieve_log_content(log_path: str) -> str:
              log = f.read()

      else:
-         log = requests.get(log_path, timeout=60).text
+         remote_log = RemoteLog(log_path, http)
+         log = await remote_log.get_url_content()

      return log

@@ -156,46 +192,124 @@ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
      Line number must be first element in the tuple. Mixed format of snippets
      is permitted, but may have impact on inference.
      """
-     summary = ""
+     summary = "\n"
      for i, s in enumerate(snippets):
          if isinstance(s, tuple):
-             summary += f"""
-             Snippet No. {i} at line #{s[0]}:
-
-             {s[1]}
-             ================
-             """
+             line_number, snippet_content = s
+             header = f"Snippet No. {i} at line #{line_number}:"
          else:
-             summary += f"""
-             Snippet No. {i}:
-
-             {s[1]}
-             ================
-             """
+             header = f"Snippet No. {i}:"
+             snippet_content = s
+         summary += (
+             f"{header}\n"
+             "\n"
+             f"{snippet_content}\n"
+             f"{SNIPPET_DELIMITER}\n"
+             f"\n"
+         )
      return summary


- def format_analyzed_snippets(snippets: list[AnalyzedSnippet]) -> str:
-     """Format snippets for submission into staged prompt."""
-     summary = f"\n{SNIPPET_DELIMITER}\n".join(
-         [
-             f"[{e.text}] at line [{e.line_number}]: [{e.explanation.text}]"
-             for e in snippets
+ def load_prompts(path: str | None) -> PromptConfig:
+     """Load prompts from given yaml file if there is one.
+     Alternatively use defaults."""
+     if path:
+         try:
+             with open(path, "r") as file:
+                 return PromptConfig(yaml.safe_load(file))
+         except FileNotFoundError:
+             print("Prompt configuration file not found, reverting to defaults.")
+     return PromptConfig()
+
+
+ def prompt_to_messages(
+     user_message: str,
+     system_prompt: str | None = None,
+     system_role: str = "developer",
+     user_role: str = "user",
+ ) -> List[Dict[str, str]]:
+     """Turn prompt into list of message dictionaries.
+     If `system_role` and `user_role` are the same, only a single message is created,
+     as concatenation of `user_message` and `system_prompt`. This is useful for models which
+     do not have separate system role, such as mistral.
+     """
+
+     if system_role == user_role:
+         messages = [
+             {"role": system_role, "content": f"{system_prompt}\n{user_message}"}
+         ]
+     else:
+         messages = [
+             {"role": system_role, "content": system_prompt},
+             {
+                 "role": user_role,
+                 "content": user_message,
+             },
          ]
-     )
-     return summary

+     return messages

- def validate_url(url: str) -> bool:
-     """Validate incoming URL to be at least somewhat sensible for log files
-     Only http and https protocols permitted. No result, params or query fields allowed.
-     Either netloc or path must have non-zero length.
-     """
-     result = urlparse(url)
-     if result.scheme not in ["http", "https"]:
-         return False
-     if any([result.params, result.query, result.fragment]):
-         return False
-     if not (result.path or result.netloc):
+
+ def filter_snippet_patterns(snippet: str, skip_snippets: SkipSnippets) -> bool:
+     """Try to match snippet agains provided patterns to determine if we should
+     filter it out or not."""
+     for key, pattern in skip_snippets.snippet_patterns.items():
+         if pattern.match(snippet):
+             LOG.debug("Snippet `%s` has matched agains skip pattern %s", snippet, key)
+             return True
+
+     return False
+
+
+ def load_skip_snippet_patterns(path: str | None) -> SkipSnippets:
+     """Load dictionary of snippet patterns we want to skip."""
+     if path:
+         try:
+             with open(path, "r") as file:
+                 return SkipSnippets(yaml.safe_load(file))
+         except OSError as e:
+             LOG.error("Couldn't open file with snippet skip patterns `%s`", path)
+             raise e
+
+     return SkipSnippets({})
+
+
+ def check_csgrep() -> bool:
+     """Verifies presence of csgrep in path"""
+     try:
+         result = sp.run(
+             ["csgrep", "--version"],
+             text=True,
+             check=True,
+             shell=False,
+             capture_output=True,
+             timeout=1.0,
+         )
+     except (FileNotFoundError, sp.TimeoutExpired, sp.CalledProcessError) as ex:
+         LOG.error("Required binary `csgrep` was not found in path: %s", ex)
          return False
-     return True
+     if result.returncode == 0:
+         return True
+     LOG.error("Issue was encountered while calling `csgrep`: `%s`", result.stderr)
+
+     return False
+
+
+ def mine_logs(log: str, extractors: list) -> List[Tuple[int, str]]:
+     """Extract snippets from log text using extractors provided.
+     Each extractor is applied in turn on original log.
+     Depending on characteristics of extractors used, there may be
+     an overlap in snippets extracted."""
+
+     log_summary = []
+
+     LOG.info("Getting summary")
+
+     for extractor in extractors:
+         log_summary.extend(extractor(log))
+
+     ratio = len("\n".join([text for _, text in log_summary])) / len(log)
+     LOG.debug("Log summary: \n %s", log_summary)
+     LOG.info("Snippets: %s Compression ratio: %s", len(log_summary), ratio)
+
+     return log_summary
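Among the helpers added in this hunk, `prompt_to_messages()` documents a special case: when `system_role` equals `user_role`, the system prompt is folded into a single message instead of producing two. A short sketch of both shapes, based on the implementation above, with made-up prompt strings:

```python
from logdetective.utils import prompt_to_messages

# Default roles ("developer" / "user"): two separate messages are produced.
messages = prompt_to_messages(
    user_message="Explain this snippet: collect2: error: ld returned 1 exit status",
    system_prompt="You are a log analysis assistant.",
)
# -> [{"role": "developer", "content": "You are a log analysis assistant."},
#     {"role": "user", "content": "Explain this snippet: ..."}]

# Model without a separate system role (e.g. Mistral-style): pass the same role
# twice and the system prompt is prepended to the user message instead.
merged = prompt_to_messages(
    user_message="Explain this snippet: collect2: error: ld returned 1 exit status",
    system_prompt="You are a log analysis assistant.",
    system_role="user",
    user_role="user",
)
# -> [{"role": "user",
#      "content": "You are a log analysis assistant.\nExplain this snippet: ..."}]
```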