logdetective 0.2.14__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
logdetective/constants.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
1
  # pylint: disable=line-too-long
3
2
  DEFAULT_ADVISOR = "fedora-copr/Mistral-7B-Instruct-v0.2-GGUF"
4
3
 
@@ -32,7 +31,7 @@ Answer:
32
31
  """
33
32
 
34
33
  SNIPPET_PROMPT_TEMPLATE = """
35
- Analyse following RPM build log snippet. Decribe contents accurately, without speculation or suggestions for resolution.
34
+ Analyse following RPM build log snippet. Describe contents accurately, without speculation or suggestions for resolution.
36
35
 
37
36
  Snippet:
38
37
 
@@ -59,4 +58,4 @@ Analysis:
59
58
 
60
59
  """
61
60
 
62
- SNIPPET_DELIMITER = '================'
61
+ SNIPPET_DELIMITER = "================"
@@ -1,5 +1,6 @@
1
1
  import os
2
2
  import logging
3
+ from typing import Tuple
3
4
 
4
5
  import drain3
5
6
  from drain3.template_miner_config import TemplateMinerConfig
@@ -15,13 +16,17 @@ class LLMExtractor:
15
16
  """
16
17
  A class that extracts relevant information from logs using a language model.
17
18
  """
19
+
18
20
  def __init__(self, model: Llama, n_lines: int = 2):
19
21
  self.model = model
20
22
  self.n_lines = n_lines
21
23
  self.grammar = LlamaGrammar.from_string(
22
- "root ::= (\"Yes\" | \"No\")", verbose=False)
24
+ 'root ::= ("Yes" | "No")', verbose=False
25
+ )
23
26
 
24
- def __call__(self, log: str, n_lines: int = 2, neighbors: bool = False) -> list[str]:
27
+ def __call__(
28
+ self, log: str, n_lines: int = 2, neighbors: bool = False
29
+ ) -> list[str]:
25
30
  chunks = self.rate_chunks(log)
26
31
  out = self.create_extract(chunks, neighbors)
27
32
  return out
@@ -35,7 +40,7 @@ class LLMExtractor:
35
40
  log_lines = log.split("\n")
36
41
 
37
42
  for i in range(0, len(log_lines), self.n_lines):
38
- block = '\n'.join(log_lines[i:i + self.n_lines])
43
+ block = "\n".join(log_lines[i: i + self.n_lines])
39
44
  prompt = SUMMARIZE_PROMPT_TEMPLATE.format(log)
40
45
  out = self.model(prompt, max_tokens=7, grammar=self.grammar)
41
46
  out = f"{out['choices'][0]['text']}\n"
@@ -44,8 +49,7 @@ class LLMExtractor:
44
49
  return results
45
50
 
46
51
  def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> list[str]:
47
- """Extract interesting chunks from the model processing.
48
- """
52
+ """Extract interesting chunks from the model processing."""
49
53
  interesting = []
50
54
  summary = []
51
55
  # pylint: disable=consider-using-enumerate
@@ -64,8 +68,8 @@ class LLMExtractor:
64
68
 
65
69
 
66
70
  class DrainExtractor:
67
- """A class that extracts information from logs using a template miner algorithm.
68
- """
71
+ """A class that extracts information from logs using a template miner algorithm."""
72
+
69
73
  def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
70
74
  config = TemplateMinerConfig()
71
75
  config.load(f"{os.path.dirname(__file__)}/drain3.ini")
@@ -75,15 +79,21 @@ class DrainExtractor:
75
79
  self.verbose = verbose
76
80
  self.context = context
77
81
 
78
- def __call__(self, log: str) -> list[str]:
82
+ def __call__(self, log: str) -> list[Tuple[int, str]]:
79
83
  out = []
80
- for chunk in get_chunks(log):
81
- processed_line = self.miner.add_log_message(chunk)
82
- LOG.debug(processed_line)
83
- sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
84
- for chunk in get_chunks(log):
84
+ # First pass, create clusters
85
+ for _, chunk in get_chunks(log):
86
+ processed_chunk = self.miner.add_log_message(chunk)
87
+ LOG.debug(processed_chunk)
88
+ # Sort found clusters by size, descending order
89
+ sorted_clusters = sorted(
90
+ self.miner.drain.clusters, key=lambda it: it.size, reverse=True
91
+ )
92
+ # Second pass, only matching lines with clusters,
93
+ # to recover original text
94
+ for chunk_start, chunk in get_chunks(log):
85
95
  cluster = self.miner.match(chunk, "always")
86
96
  if cluster in sorted_clusters:
87
- out.append(chunk)
97
+ out.append((chunk_start, chunk))
88
98
  sorted_clusters.remove(cluster)
89
99
  return out
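The two-pass change above makes `DrainExtractor` return `(line_number, text)` tuples instead of bare strings, so callers can report where in the original log each snippet came from. A minimal sketch of the new call shape, assuming the `logdetective` package (and its Drain3 dependency) is installed; the sample log text is illustrative:

```python
from logdetective.extractors import DrainExtractor

# Illustrative log text; real input would be a full RPM build log.
log = "configure: error: no acceptable C compiler found\nmake: *** [all] Error 2\n"

extractor = DrainExtractor(verbose=True, context=True, max_clusters=8)

# Each snippet is now a (line_number, text) tuple rather than a bare string.
for line_number, snippet in extractor(log):
    print(f"line {line_number}: {snippet.strip()}")
```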
@@ -4,40 +4,71 @@ import sys
4
4
 
5
5
  from logdetective.constants import DEFAULT_ADVISOR
6
6
  from logdetective.utils import (
7
- process_log, initialize_model, retrieve_log_content, format_snippets, compute_certainty)
7
+ process_log,
8
+ initialize_model,
9
+ retrieve_log_content,
10
+ format_snippets,
11
+ compute_certainty,
12
+ )
8
13
  from logdetective.extractors import LLMExtractor, DrainExtractor
9
14
 
10
15
  LOG = logging.getLogger("logdetective")
11
16
 
12
17
 
13
18
  def setup_args():
14
- """ Setup argument parser and return arguments. """
19
+ """Setup argument parser and return arguments."""
15
20
  parser = argparse.ArgumentParser("logdetective")
16
- parser.add_argument("file", type=str,
17
- default="", help="The URL or path to the log file to be analyzed.")
18
- parser.add_argument("-M", "--model",
19
- help="The path or Hugging Face name of the language model for analysis.",
20
- type=str, default=DEFAULT_ADVISOR)
21
- parser.add_argument("-F", "--filename_suffix",
22
- help="Suffix of the model file name to be retrieved from Hugging Face.\
21
+ parser.add_argument(
22
+ "file",
23
+ type=str,
24
+ default="",
25
+ help="The URL or path to the log file to be analyzed.",
26
+ )
27
+ parser.add_argument(
28
+ "-M",
29
+ "--model",
30
+ help="The path or Hugging Face name of the language model for analysis.",
31
+ type=str,
32
+ default=DEFAULT_ADVISOR,
33
+ )
34
+ parser.add_argument(
35
+ "-F",
36
+ "--filename_suffix",
37
+ help="Suffix of the model file name to be retrieved from Hugging Face.\
23
38
  Makes sense only if the model is specified with Hugging Face name.",
24
- default="Q4_K_S.gguf")
25
- parser.add_argument("-n", "--no-stream", action='store_true')
26
- parser.add_argument("-S", "--summarizer", type=str, default="drain",
27
- help="Choose between LLM and Drain template miner as the log summarizer.\
28
- LLM must be specified as path to a model, URL or local file.")
29
- parser.add_argument("-N", "--n_lines", type=int,
30
- default=8, help="The number of lines per chunk for LLM analysis.\
31
- This only makes sense when you are summarizing with LLM.")
32
- parser.add_argument("-C", "--n_clusters", type=int, default=8,
33
- help="Number of clusters for Drain to organize log chunks into.\
34
- This only makes sense when you are summarizing with Drain")
35
- parser.add_argument("-v", "--verbose", action='count', default=0)
36
- parser.add_argument("-q", "--quiet", action='store_true')
39
+ default="Q4_K_S.gguf",
40
+ )
41
+ parser.add_argument("-n", "--no-stream", action="store_true")
42
+ parser.add_argument(
43
+ "-S",
44
+ "--summarizer",
45
+ type=str,
46
+ default="drain",
47
+ help="Choose between LLM and Drain template miner as the log summarizer.\
48
+ LLM must be specified as path to a model, URL or local file.",
49
+ )
50
+ parser.add_argument(
51
+ "-N",
52
+ "--n_lines",
53
+ type=int,
54
+ default=8,
55
+ help="The number of lines per chunk for LLM analysis.\
56
+ This only makes sense when you are summarizing with LLM.",
57
+ )
58
+ parser.add_argument(
59
+ "-C",
60
+ "--n_clusters",
61
+ type=int,
62
+ default=8,
63
+ help="Number of clusters for Drain to organize log chunks into.\
64
+ This only makes sense when you are summarizing with Drain",
65
+ )
66
+ parser.add_argument("-v", "--verbose", action="count", default=0)
67
+ parser.add_argument("-q", "--quiet", action="store_true")
37
68
  return parser.parse_args()
38
69
 
39
70
 
40
- def main():
71
+ def main(): # pylint: disable=too-many-statements
41
72
  """Main execution function."""
42
73
  args = setup_args()
43
74
 
@@ -57,8 +88,9 @@ def main():
57
88
 
58
89
  # Primary model initialization
59
90
  try:
60
- model = initialize_model(args.model, filename_suffix=args.filename_suffix,
61
- verbose=args.verbose > 2)
91
+ model = initialize_model(
92
+ args.model, filename_suffix=args.filename_suffix, verbose=args.verbose > 2
93
+ )
62
94
  except ValueError as e:
63
95
  LOG.error(e)
64
96
  LOG.error("You likely do not have enough memory to load the AI model")
@@ -66,7 +98,9 @@ def main():
66
98
 
67
99
  # Log file summarizer selection and initialization
68
100
  if args.summarizer == "drain":
69
- extractor = DrainExtractor(args.verbose > 1, context=True, max_clusters=args.n_clusters)
101
+ extractor = DrainExtractor(
102
+ args.verbose > 1, context=True, max_clusters=args.n_clusters
103
+ )
70
104
  else:
71
105
  summarizer_model = initialize_model(args.summarizer, verbose=args.verbose > 2)
72
106
  extractor = LLMExtractor(summarizer_model, args.verbose > 1)
@@ -81,7 +115,7 @@ def main():
81
115
  sys.exit(4)
82
116
  log_summary = extractor(log)
83
117
 
84
- ratio = len(log_summary) / len(log.split('\n'))
118
+ ratio = len(log_summary) / len(log.split("\n"))
85
119
 
86
120
  LOG.info("Compression ratio: %s", ratio)
87
121
 
@@ -103,15 +137,19 @@ def main():
103
137
 
104
138
  if args.no_stream:
105
139
  print(response["choices"][0]["text"])
106
- probs = [{'logprob': e} for e in response['choices'][0]['logprobs']['token_logprobs']]
140
+ probs = [
141
+ {"logprob": e} for e in response["choices"][0]["logprobs"]["token_logprobs"]
142
+ ]
107
143
 
108
144
  else:
109
145
  # Stream the output
110
146
  for chunk in response:
111
147
  if isinstance(chunk["choices"][0]["logprobs"], dict):
112
- probs.append({'logprob': chunk["choices"][0]["logprobs"]['token_logprobs'][0]})
113
- delta = chunk['choices'][0]['text']
114
- print(delta, end='', flush=True)
148
+ probs.append(
149
+ {"logprob": chunk["choices"][0]["logprobs"]["token_logprobs"][0]}
150
+ )
151
+ delta = chunk["choices"][0]["text"]
152
+ print(delta, end="", flush=True)
115
153
  certainty = compute_certainty(probs)
116
154
 
117
155
  print(f"\nResponse certainty: {certainty:.2f}%\n")
@@ -1,5 +1,6 @@
1
+ from logging import BASIC_FORMAT
1
2
  from typing import List, Dict, Optional
2
- from pydantic import BaseModel
3
+ from pydantic import BaseModel, Field
3
4
 
4
5
 
5
6
  class BuildLog(BaseModel):
@@ -8,6 +9,34 @@ class BuildLog(BaseModel):
8
9
  url: str
9
10
 
10
11
 
12
+ class JobHook(BaseModel):
13
+ """Model of Job Hook events sent from GitLab.
14
+ Full details of the specification are available at
15
+ https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
16
+ This model implements only the fields that we care about. The webhook
17
+ sends many more fields that we will ignore."""
18
+
19
+ # The unique job ID on this GitLab instance.
20
+ build_id: int
21
+
22
+ # The identifier of the job. We only care about 'build_rpm' and
23
+ # 'build_centos_stream_rpm' jobs.
24
+ build_name: str = Field(pattern=r"^build(_.*)?_rpm$")
25
+
26
+ # A string representing the job status. We only care about 'failed' jobs.
27
+ build_status: str = Field(pattern=r"^failed$")
28
+
29
+ # The kind of webhook message. We are only interested in 'build' messages
30
+ # which represent job tasks in a pipeline.
31
+ object_kind: str = Field(pattern=r"^build$")
32
+
33
+ # The unique ID of the enclosing pipeline on this GitLab instance.
34
+ pipeline_id: int
35
+
36
+ # The unique ID of the project triggering this event
37
+ project_id: int
38
+
39
+
11
40
  class Response(BaseModel):
12
41
  """Model of data returned by Log Detective API
13
42
 
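The new `JobHook` model validates only the handful of job-event fields the server cares about; pydantic ignores the rest of the webhook payload. A small sketch, assuming the package is installed with its server dependencies; all payload values below are illustrative:

```python
from logdetective.server.models import JobHook

# Trimmed job_events payload; real webhooks carry many more fields,
# which are ignored here. All values are illustrative.
payload = {
    "object_kind": "build",
    "build_id": 123456,
    "build_name": "build_centos_stream_rpm",
    "build_status": "failed",
    "pipeline_id": 7890,
    "project_id": 42,
}

job_hook = JobHook(**payload)
print(job_hook.build_name, job_hook.build_status)
```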
@@ -28,10 +57,13 @@ class StagedResponse(Response):
28
57
  https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.llama_types.CreateCompletionResponse
29
58
  response_certainty: float
30
59
  snippets:
31
- list of dictionaries { 'snippet' : '<original_text>, 'comment': CreateCompletionResponse }
60
+ list of dictionaries {
61
+ 'snippet' : '<original_text>,
62
+ 'comment': CreateCompletionResponse,
63
+ 'line_number': '<location_in_log>' }
32
64
  """
33
65
 
34
- snippets: List[Dict[str, str | Dict]]
66
+ snippets: List[Dict[str, str | Dict | int]]
35
67
 
36
68
 
37
69
  class InferenceConfig(BaseModel):
@@ -66,11 +98,67 @@ class ExtractorConfig(BaseModel):
66
98
  self.verbose = data.get("verbose", False)
67
99
 
68
100
 
101
+ class GitLabConfig(BaseModel):
102
+ """Model for GitLab configuration of logdetective server."""
103
+
104
+ url: str = None
105
+ api_url: str = None
106
+ api_token: str = None
107
+
108
+ # Maximum size of artifacts.zip in MiB. (default: 300 MiB)
109
+ max_artifact_size: int = 300
110
+
111
+ def __init__(self, data: Optional[dict] = None):
112
+ super().__init__()
113
+ if data is None:
114
+ return
115
+
116
+ self.url = data.get("url", "https://gitlab.com")
117
+ self.api_url = f"{self.url}/api/v4"
118
+ self.api_token = data.get("api_token", None)
119
+ self.max_artifact_size = int(data.get("max_artifact_size")) * 1024 * 1024
120
+
121
+
122
+ class LogConfig(BaseModel):
123
+ """Logging configuration"""
124
+
125
+ name: str = "logdetective"
126
+ level: str | int = "INFO"
127
+ path: str | None = None
128
+ format: str = BASIC_FORMAT
129
+
130
+ def __init__(self, data: Optional[dict] = None):
131
+ super().__init__()
132
+ if data is None:
133
+ return
134
+
135
+ self.name = data.get("name", "logdetective")
136
+ self.level = data.get("level", "INFO").upper()
137
+ self.path = data.get("path")
138
+ self.format = data.get("format", BASIC_FORMAT)
139
+
140
+
141
+ class GeneralConfig(BaseModel):
142
+ """General config options for Log Detective"""
143
+
144
+ packages: List[str] = None
145
+
146
+ def __init__(self, data: Optional[dict] = None):
147
+ super().__init__()
148
+ if data is None:
149
+ return
150
+
151
+ self.packages = data.get("packages", [])
152
+
153
+
69
154
  class Config(BaseModel):
70
155
  """Model for configuration of logdetective server."""
71
156
 
157
+ log: LogConfig = LogConfig()
72
158
  inference: InferenceConfig = InferenceConfig()
73
159
  extractor: ExtractorConfig = ExtractorConfig()
160
+ gitlab: GitLabConfig = GitLabConfig()
161
+ general: GeneralConfig = GeneralConfig()
74
162
 
75
163
  def __init__(self, data: Optional[dict] = None):
76
164
  super().__init__()
@@ -78,5 +166,8 @@ class Config(BaseModel):
78
166
  if data is None:
79
167
  return
80
168
 
169
+ self.log = LogConfig(data.get("log"))
81
170
  self.inference = InferenceConfig(data.get("inference"))
82
171
  self.extractor = ExtractorConfig(data.get("extractor"))
172
+ self.gitlab = GitLabConfig(data.get("gitlab"))
173
+ self.general = GeneralConfig(data.get("general"))
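The new `GitLabConfig`, `LogConfig`, and `GeneralConfig` sections are wired into `Config` above. A sketch of how a parsed server config file maps onto these models, assuming the server dependencies are installed; all section values are illustrative:

```python
from logdetective.server.models import Config

# Dict equivalent of a server config file; section and key names follow the
# models above, all values are illustrative.
config = Config(
    {
        "log": {"name": "logdetective", "level": "debug", "path": "/var/log/logdetective.log"},
        "gitlab": {
            "url": "https://gitlab.example.com",
            "api_token": "glpat-...",        # illustrative placeholder
            "max_artifact_size": 300,        # MiB, converted to bytes below
        },
        "general": {"packages": ["example-package"]},
    }
)

print(config.gitlab.api_url)            # -> https://gitlab.example.com/api/v4
print(config.log.level)                 # -> DEBUG (upper-cased by LogConfig)
print(config.gitlab.max_artifact_size)  # -> 300 * 1024 * 1024 bytes
```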
@@ -1,26 +1,35 @@
1
1
  import asyncio
2
2
  import json
3
- import logging
4
3
  import os
5
- from typing import List, Annotated
4
+ import re
5
+ import zipfile
6
+ from pathlib import PurePath
7
+ from tempfile import TemporaryFile
8
+ from typing import List, Annotated, Tuple
9
+
6
10
 
7
11
  from llama_cpp import CreateCompletionResponse
8
- from fastapi import FastAPI, HTTPException, Depends, Header
12
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, Header
9
13
  from fastapi.responses import StreamingResponse
14
+ from fastapi.responses import Response as BasicResponse
15
+ import gitlab
10
16
  import requests
11
17
 
12
18
  from logdetective.constants import (
13
19
  PROMPT_TEMPLATE,
14
20
  SNIPPET_PROMPT_TEMPLATE,
15
21
  PROMPT_TEMPLATE_STAGED,
16
- SNIPPET_DELIMITER,
17
22
  )
18
23
  from logdetective.extractors import DrainExtractor
19
- from logdetective.utils import validate_url, compute_certainty
20
- from logdetective.server.models import BuildLog, Response, StagedResponse
21
- from logdetective.server.utils import load_server_config
24
+ from logdetective.utils import (
25
+ validate_url,
26
+ compute_certainty,
27
+ format_snippets,
28
+ format_analyzed_snippets,
29
+ )
30
+ from logdetective.server.models import BuildLog, JobHook, Response, StagedResponse
31
+ from logdetective.server.utils import load_server_config, get_log
22
32
 
23
- LOG = logging.getLogger("logdetective")
24
33
 
25
34
  LLM_CPP_HOST = os.environ.get("LLAMA_CPP_HOST", "localhost")
26
35
  LLM_CPP_SERVER_ADDRESS = f"http://{LLM_CPP_HOST}"
@@ -33,6 +42,11 @@ LLM_API_TOKEN = os.environ.get("LLM_API_TOKEN", None)
33
42
 
34
43
  SERVER_CONFIG = load_server_config(SERVER_CONFIG_PATH)
35
44
 
45
+ MR_REGEX = re.compile(r"refs/merge-requests/(\d+)/merge")
46
+ FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")
47
+
48
+ LOG = get_log(SERVER_CONFIG)
49
+
36
50
 
37
51
  def requires_token_when_set(authentication: Annotated[str | None, Header()] = None):
38
52
  """
@@ -65,6 +79,9 @@ def requires_token_when_set(authentication: Annotated[str | None, Header()] = No
65
79
 
66
80
 
67
81
  app = FastAPI(dependencies=[Depends(requires_token_when_set)])
82
+ app.gitlab_conn = gitlab.Gitlab(
83
+ url=SERVER_CONFIG.gitlab.url, private_token=SERVER_CONFIG.gitlab.api_token
84
+ )
68
85
 
69
86
 
70
87
  def process_url(url: str) -> str:
@@ -90,7 +107,7 @@ def process_url(url: str) -> str:
90
107
  return log_request.text
91
108
 
92
109
 
93
- def mine_logs(log: str) -> List[str]:
110
+ def mine_logs(log: str) -> List[Tuple[int, str]]:
94
111
  """Extract snippets from log text"""
95
112
  extractor = DrainExtractor(
96
113
  verbose=True, context=True, max_clusters=SERVER_CONFIG.extractor.max_clusters
@@ -141,6 +158,7 @@ async def submit_text(
141
158
  stream=stream,
142
159
  )
143
160
  except requests.RequestException as ex:
161
+ LOG.error("Llama-cpp query failed: %s", ex)
144
162
  raise HTTPException(
145
163
  status_code=400, detail=f"Llama-cpp query failed: {ex}"
146
164
  ) from ex
@@ -175,6 +193,7 @@ async def analyze_log(build_log: BuildLog):
175
193
  """
176
194
  log_text = process_url(build_log.url)
177
195
  log_summary = mine_logs(log_text)
196
+ log_summary = format_snippets(log_summary)
178
197
  response = await submit_text(PROMPT_TEMPLATE.format(log_summary))
179
198
  certainty = 0
180
199
 
@@ -188,7 +207,7 @@ async def analyze_log(build_log: BuildLog):
188
207
  raise HTTPException(
189
208
  status_code=400,
190
209
  detail=f"Couldn't compute certainty with data:\n"
191
- f"{response["choices"][0]["logprobs"]["content"][0]["top_logprobs"]}",
210
+ f"{response['choices'][0]['logprobs']['content'][0]['top_logprobs']}",
192
211
  ) from ex
193
212
 
194
213
  return Response(explanation=response, response_certainty=certainty)
@@ -207,20 +226,15 @@ async def analyze_log_staged(build_log: BuildLog):
207
226
 
208
227
  # Process snippets asynchronously
209
228
  analyzed_snippets = await asyncio.gather(
210
- *[submit_text(SNIPPET_PROMPT_TEMPLATE.format(s)) for s in log_summary]
229
+ *[submit_text(SNIPPET_PROMPT_TEMPLATE.format(s[1])) for s in log_summary]
211
230
  )
212
231
 
213
232
  analyzed_snippets = [
214
- {"snippet": e[0], "comment": e[1]} for e in zip(log_summary, analyzed_snippets)
233
+ {"snippet": e[0][1], "line_number": e[0][0], "comment": e[1]}
234
+ for e in zip(log_summary, analyzed_snippets)
215
235
  ]
216
-
217
236
  final_prompt = PROMPT_TEMPLATE_STAGED.format(
218
- f"\n{SNIPPET_DELIMITER}\n".join(
219
- [
220
- f"[{e["snippet"]}] : [{e["comment"]["choices"][0]["text"]}]"
221
- for e in analyzed_snippets
222
- ]
223
- )
237
+ format_analyzed_snippets(analyzed_snippets)
224
238
  )
225
239
 
226
240
  final_analysis = await submit_text(final_prompt)
@@ -237,7 +251,7 @@ async def analyze_log_staged(build_log: BuildLog):
237
251
  raise HTTPException(
238
252
  status_code=400,
239
253
  detail=f"Couldn't compute certainty with data:\n"
240
- f"{final_analysis["choices"][0]["logprobs"]["content"][0]["top_logprobs"]}",
254
+ f"{final_analysis['choices'][0]['logprobs']['content'][0]['top_logprobs']}",
241
255
  ) from ex
242
256
 
243
257
  return StagedResponse(
@@ -257,6 +271,212 @@ async def analyze_log_stream(build_log: BuildLog):
257
271
  """
258
272
  log_text = process_url(build_log.url)
259
273
  log_summary = mine_logs(log_text)
274
+ log_summary = format_snippets(log_summary)
260
275
  stream = await submit_text(PROMPT_TEMPLATE.format(log_summary), stream=True)
261
276
 
262
277
  return StreamingResponse(stream)
278
+
279
+
280
+ @app.post("/webhook/gitlab/job_events")
281
+ async def receive_gitlab_job_event_webhook(
282
+ job_hook: JobHook, background_tasks: BackgroundTasks
283
+ ):
284
+ """Webhook endpoint for receiving job_events notifications from GitLab
285
+ https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
286
+ lists the full specification for the messages sent for job events."""
287
+
288
+ # Handle the message in the background so we can return 200 immediately
289
+ background_tasks.add_task(process_gitlab_job_event, job_hook)
290
+
291
+ # No return value or body is required for a webhook.
292
+ # 204: No Content
293
+ return BasicResponse(status_code=204)
294
+
295
+
296
+ async def process_gitlab_job_event(job_hook):
297
+ """Handle a received job_event webhook from GitLab"""
298
+ LOG.debug("Received webhook message:\n%s", job_hook)
299
+
300
+ # Look up the project this job belongs to
301
+ project = await asyncio.to_thread(app.gitlab_conn.projects.get, job_hook.project_id)
302
+
303
+ # check if this project is on the opt-in list
304
+ if project.name not in SERVER_CONFIG.general.packages:
305
+ LOG.info("Ignoring unrecognized package %s", project.name)
306
+ return
307
+ LOG.info("Processing failed job for %s", project.name)
308
+
309
+ # Retrieve data about the job from the GitLab API
310
+ job = await asyncio.to_thread(project.jobs.get, job_hook.build_id)
311
+
312
+ # Retrieve the pipeline that started this job
313
+ pipeline = await asyncio.to_thread(project.pipelines.get, job_hook.pipeline_id)
314
+
315
+ # Verify this is a merge request
316
+ if pipeline.source != "merge_request_event":
317
+ LOG.info("Not a merge request pipeline. Ignoring.")
318
+ return
319
+
320
+ # Extract the merge-request ID from the job
321
+ match = MR_REGEX.search(pipeline.ref)
322
+ if not match:
323
+ LOG.error(
324
+ "Pipeline source is merge_request_event but no merge request ID was provided."
325
+ )
326
+ return
327
+ merge_request_id = int(match.group(1))
328
+
329
+ LOG.debug("Retrieving log artifacts")
330
+ # Retrieve the build logs from the merge request artifacts and preprocess them
331
+ try:
332
+ preprocessed_log = await retrieve_and_preprocess_koji_logs(job)
333
+ except LogsTooLargeError:
334
+ LOG.error("Could not retrieve logs. Too large.")
335
+ raise
336
+
337
+ # Submit log to Log Detective and await the results.
338
+ response = await submit_log_to_llm(preprocessed_log)
339
+ preprocessed_log.close()
340
+
341
+ # Add the Log Detective response as a comment to the merge request
342
+ await comment_on_mr(merge_request_id, response)
343
+
344
+
345
+ class LogsTooLargeError(RuntimeError):
346
+ """The log archive exceeds the configured maximum size"""
347
+
348
+
349
+ async def retrieve_and_preprocess_koji_logs(job):
350
+ """Download logs from the merge request artifacts
351
+
352
+ This function will retrieve the build logs and do some minimal
353
+ preprocessing to determine which log is relevant for analysis.
354
+
355
+ returns: An open, file-like object containing the log contents to be sent
356
+ for processing by Log Detective. The calling function is responsible for
357
+ closing this object."""
358
+
359
+ # Make sure the file isn't too large to process.
360
+ if not await check_artifacts_file_size(job):
361
+ raise LogsTooLargeError(
362
+ f"Oversized logs for job {job.id} in project {job.project_id}"
363
+ )
364
+
365
+ # Create a temporary file to store the downloaded log zipfile.
366
+ # This will be automatically deleted when the last reference into it
367
+ # (returned by this function) is closed.
368
+ tempfile = TemporaryFile(mode="w+b")
369
+ await asyncio.to_thread(job.artifacts, streamed=True, action=tempfile.write)
370
+ tempfile.seek(0)
371
+
372
+ failed_arches = {}
373
+ artifacts_zip = zipfile.ZipFile(tempfile, mode="r")
374
+ for zipinfo in artifacts_zip.infolist():
375
+ if zipinfo.filename.endswith("task_failed.log"):
376
+ # The koji logs store this file in two places: 1) in the
377
+ # directory with the failed architecture and 2) in the parent
378
+ # directory. We actually want to ignore the one in the parent
379
+ # directory, since the rest of the information is in the
380
+ # specific task directory.
381
+ # The paths look like `kojilogs/noarch-XXXXXX/task_failed.log`
382
+ # or `kojilogs/noarch-XXXXXX/x86_64-XXXXXX/task_failed.log`
383
+ path = PurePath(zipinfo.filename)
384
+ if len(path.parts) <= 3:
385
+ continue
386
+
387
+ # Extract the architecture from the immediate parent path
388
+ architecture = path.parent.parts[-1].split("-")[0]
389
+
390
+ # Open this file and read which log failed.
391
+ # The string in this log has the format
392
+ # `see <log> for more information`.
393
+ # Note: it may sometimes say
394
+ # `see build.log or root.log for more information`, but in
395
+ # that situation, we only want to handle build.log (for now),
396
+ # which means accepting only the first match for the regular
397
+ # expression.
398
+ with artifacts_zip.open(zipinfo.filename) as task_failed_log:
399
+ contents = task_failed_log.read().decode("utf-8")
400
+ match = FAILURE_LOG_REGEX.search(contents)
401
+ if not match:
402
+ LOG.error(
403
+ "task_failed.log does not indicate which log contains the failure."
404
+ )
405
+ raise SyntaxError(
406
+ "task_failed.log does not indicate which log contains the failure."
407
+ )
408
+ failure_log_name = match.group(1)
409
+
410
+ failed_arches[architecture] = PurePath(path.parent, failure_log_name)
411
+
412
+ if not failed_arches:
413
+ # No failed task found?
414
+ raise FileNotFoundError("Could not detect failed architecture.")
415
+
416
+ # First check if we only found one failed architecture
417
+ if len(failed_arches) == 1:
418
+ failed_arch = list(failed_arches.keys())[0]
419
+
420
+ else:
421
+ # We only want to handle one arch, so we'll check them in order of
422
+ # "most to least likely for the maintainer to have access to hardware"
423
+ # This means: x86_64 > aarch64 > ppc64le > s390x
424
+ if "x86_64" in failed_arches:
425
+ failed_arch = "x86_64"
426
+ elif "aarch64" in failed_arches:
427
+ failed_arch = "aarch64"
428
+ elif "ppc64le" in failed_arches:
429
+ failed_arch = "ppc64le"
430
+ elif "s390x" in failed_arches:
431
+ failed_arch = "s390x"
432
+ else:
433
+ # It should be impossible for us to get "noarch" here, since
434
+ # the only way that should happen is for a single architecture
435
+ # build.
436
+ raise FileNotFoundError("No failed architecture detected.")
437
+
438
+ LOG.debug("Failed architecture: %s", failed_arch)
439
+
440
+ log_path = failed_arches[failed_arch]
441
+ LOG.debug("Returning contents of %s", log_path)
442
+
443
+ # Return the log as a file-like object with .read() function
444
+ return artifacts_zip.open(log_path.as_posix())
445
+
446
+
447
+ async def check_artifacts_file_size(job):
448
+ """Method to determine if the artifacts are too large to process"""
449
+ # First, make sure that the artifacts are of a reasonable size. The
450
+ # zipped artifact collection will be stored in memory below. The
451
+ # python-gitlab library doesn't expose a way to check this value directly,
452
+ # so we need to interact directly with the headers.
453
+ artifacts_url = f"{SERVER_CONFIG.gitlab.api_url}/projects/{job.project_id}/jobs/{job.id}/artifacts" # pylint: disable=line-too-long
454
+ header_resp = await asyncio.to_thread(
455
+ requests.head,
456
+ artifacts_url,
457
+ allow_redirects=True,
458
+ headers={"Authorization": f"Bearer {SERVER_CONFIG.gitlab.api_token}"},
459
+ timeout=(3.07, 5),
460
+ )
461
+ content_length = int(header_resp.headers.get("content-length"))
462
+ LOG.debug(
463
+ "URL: %s, content-length: %d, max length: %d",
464
+ artifacts_url,
465
+ content_length,
466
+ SERVER_CONFIG.gitlab.max_artifact_size,
467
+ )
468
+ return content_length <= SERVER_CONFIG.gitlab.max_artifact_size
469
+
470
+
471
+ async def submit_log_to_llm(log):
472
+ """Stream the log to the LLM for processing"""
473
+ # TODO: query the LLM with the log contents # pylint: disable=fixme
474
+ # This function will be implemented later; right now it does nothing.
475
+ LOG.debug("Log contents:\n%s", log.read())
476
+ return ""
477
+
478
+
479
+ async def comment_on_mr(merge_request_id: int, response: str): # pylint: disable=unused-argument
480
+ """Add the Log Detective response as a comment to the merge request"""
481
+ # TODO: Implement this # pylint: disable=fixme
482
+ pass # pylint: disable=unnecessary-pass
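The webhook handler above leans on two regular expressions and on koji's artifact layout to find the merge request, the failed architecture, and the log to analyze. A standalone sketch of those pieces, using the same patterns defined in server.py; the ref, archive path, and task_failed.log contents are illustrative:

```python
import re
from pathlib import PurePath

# Same patterns as defined near the top of server.py.
MR_REGEX = re.compile(r"refs/merge-requests/(\d+)/merge")
FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")

# Illustrative inputs: a merge-request pipeline ref, a koji artifact path,
# and the body of a task_failed.log file.
ref = "refs/merge-requests/128/merge"
zip_member = "kojilogs/noarch-12345/x86_64-67890/task_failed.log"
task_failed_contents = "see build.log for more information"

merge_request_id = int(MR_REGEX.search(ref).group(1))   # -> 128

path = PurePath(zip_member)
architecture = path.parent.parts[-1].split("-")[0]       # -> "x86_64"

# First .log mentioned in task_failed.log is the one submitted for analysis.
failure_log_name = FAILURE_LOG_REGEX.search(task_failed_contents).group(1)  # -> "build.log"
failure_log_path = PurePath(path.parent, failure_log_name)
print(merge_request_id, architecture, failure_log_path)
```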
@@ -1,3 +1,4 @@
1
+ import logging
1
2
  import yaml
2
3
  from logdetective.server.models import Config
3
4
 
@@ -13,3 +14,31 @@ def load_server_config(path: str | None) -> Config:
13
14
  except FileNotFoundError:
14
15
  pass
15
16
  return Config()
17
+
18
+
19
+ def get_log(config: Config):
20
+ """
21
+ Initialize a logger for this server
22
+ """
23
+ log = logging.getLogger(config.log.name)
24
+ if getattr(log, "initialized", False):
25
+ return log
26
+
27
+ log.setLevel(config.log.level)
28
+
29
+ # Drop the default handler, we will create it ourselves
30
+ log.handlers = []
31
+
32
+ # STDOUT
33
+ stream_handler = logging.StreamHandler()
34
+ stream_handler.setFormatter(logging.Formatter(config.log.format))
35
+ log.addHandler(stream_handler)
36
+
37
+ # Log to file
38
+ if config.log.path:
39
+ file_handler = logging.FileHandler(config.log.path)
40
+ file_handler.setFormatter(logging.Formatter(config.log.format))
41
+ log.addHandler(file_handler)
42
+
43
+ log.initialized = True
44
+ return log
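`get_log` builds the server logger from the `log` section of the config and marks it as initialized, so repeated calls reuse the same handlers. A minimal sketch, assuming the server dependencies are installed; the config values are illustrative:

```python
from logdetective.server.models import Config
from logdetective.server.utils import get_log

# Illustrative config; "path" is optional and only adds a FileHandler when set.
config = Config(
    {"log": {"name": "logdetective", "level": "debug",
             "format": "%(asctime)s %(levelname)s %(message)s"}}
)

log = get_log(config)
log.info("server logger initialized")

# A second call returns the same, already-initialized logger.
assert get_log(config) is log
```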
logdetective/utils.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import logging
2
2
  import os
3
- from typing import Iterator, List, Dict
3
+ from typing import Iterator, List, Dict, Tuple, Generator
4
4
  from urllib.parse import urlparse
5
5
  import numpy as np
6
6
  import requests
7
7
 
8
8
  from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
9
- from logdetective.constants import PROMPT_TEMPLATE
9
+ from logdetective.constants import PROMPT_TEMPLATE, SNIPPET_DELIMITER
10
10
 
11
11
 
12
12
  LOG = logging.getLogger("logdetective")
@@ -25,7 +25,7 @@ def chunk_continues(text: str, index: int) -> bool:
25
25
  conditionals = [
26
26
  lambda i, string: string[i + 1].isspace(),
27
27
  lambda i, string: string[i - 1] == "\\",
28
- lambda i, string: string[i - 1] == ":"
28
+ lambda i, string: string[i - 1] == ":",
29
29
  ]
30
30
 
31
31
  for c in conditionals:
@@ -36,25 +36,33 @@ def chunk_continues(text: str, index: int) -> bool:
36
36
  return False
37
37
 
38
38
 
39
- def get_chunks(text: str):
39
+ def get_chunks(text: str) -> Generator[Tuple[int, str], None, None]:
40
40
  """Split log into chunks according to heuristic
41
41
  based on whitespace and backslash presence.
42
42
  """
43
43
  text_len = len(text)
44
44
  i = 0
45
45
  chunk = ""
46
+ # Keep track of the original and next line number
47
+ # every `\n` hit increases the next_line_number by one.
48
+ original_line_number = 0
49
+ next_line_number = 0
46
50
  while i < text_len:
47
51
  chunk += text[i]
48
- if text[i] == '\n':
52
+ if text[i] == "\n":
53
+ next_line_number += 1
49
54
  if i + 1 < text_len and chunk_continues(text, i):
50
55
  i += 1
51
56
  continue
52
- yield chunk
57
+ yield (original_line_number, chunk)
58
+ original_line_number = next_line_number + 1
53
59
  chunk = ""
54
60
  i += 1
55
61
 
56
62
 
57
- def initialize_model(model_pth: str, filename_suffix: str = ".gguf", verbose: bool = False) -> Llama:
63
+ def initialize_model(
64
+ model_pth: str, filename_suffix: str = ".gguf", verbose: bool = False
65
+ ) -> Llama:
58
66
  """Initialize Llama class for inference.
59
67
  Args:
60
68
  model_pth (str): path to gguf model file or Hugging Face name
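`get_chunks` (above) now tracks line offsets and yields `(line_number, chunk)` tuples, which is what lets snippets carry their position downstream. A small sketch; the log text is illustrative, and the indented line is folded into the previous chunk by `chunk_continues`:

```python
from logdetective.utils import get_chunks

# Illustrative log text: the indented second line is folded into the first
# chunk by chunk_continues(); the last line starts a new chunk.
log = "error: build failed\n    caused by: missing dependency\nmake: *** Error 1\n"

for line_number, chunk in get_chunks(log):
    # Each chunk now comes with the line offset it starts at.
    print(line_number, repr(chunk))
```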
@@ -69,14 +77,16 @@ def initialize_model(model_pth: str, filename_suffix: str = ".gguf", verbose: bo
69
77
  model_path=model_pth,
70
78
  n_ctx=0, # Maximum context for the model
71
79
  verbose=verbose,
72
- logits_all=True)
80
+ logits_all=True,
81
+ )
73
82
  else:
74
83
  model = Llama.from_pretrained(
75
84
  model_pth,
76
85
  f"*{filename_suffix}",
77
86
  n_ctx=0, # Maximum context for the model
78
87
  verbose=verbose,
79
- logits_all=True)
88
+ logits_all=True,
89
+ )
80
90
 
81
91
  return model
82
92
 
@@ -91,8 +101,7 @@ def compute_certainty(probs: List[Dict]) -> float:
91
101
  This function is used in the server codebase.
92
102
  """
93
103
 
94
- top_logprobs = [
95
- np.exp(e["logprob"]) * 100 for e in probs]
104
+ top_logprobs = [np.exp(e["logprob"]) * 100 for e in probs]
96
105
 
97
106
  certainty = np.median(top_logprobs, axis=0)
98
107
  if np.isnan(certainty):
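For reference, `compute_certainty` turns per-token logprobs into a percentage by taking the median of `exp(logprob) * 100`. A tiny worked example with illustrative logprob values:

```python
import numpy as np

# Illustrative token logprobs in the shape compute_certainty() expects.
probs = [{"logprob": -0.05}, {"logprob": -0.7}, {"logprob": -2.3}]

top_probs = [np.exp(e["logprob"]) * 100 for e in probs]  # ~[95.1, 49.7, 10.0]
certainty = np.median(top_probs)                         # ~49.7 (median, in percent)
print(f"{certainty:.1f}%")
```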
@@ -100,8 +109,9 @@ def compute_certainty(probs: List[Dict]) -> float:
100
109
  return certainty
101
110
 
102
111
 
103
- def process_log(log: str, model: Llama, stream: bool) -> (
104
- CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]):
112
+ def process_log(
113
+ log: str, model: Llama, stream: bool
114
+ ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]:
105
115
  """Processes a given log using the provided language model and returns its summary.
106
116
 
107
117
  Args:
@@ -112,10 +122,8 @@ def process_log(log: str, model: Llama, stream: bool) -> (
112
122
  str: The summary of the given log generated by the language model.
113
123
  """
114
124
  response = model(
115
- prompt=PROMPT_TEMPLATE.format(log),
116
- stream=stream,
117
- max_tokens=0,
118
- logprobs=1)
125
+ prompt=PROMPT_TEMPLATE.format(log), stream=stream, max_tokens=0, logprobs=1
126
+ )
119
127
 
120
128
  return response
121
129
 
@@ -140,18 +148,41 @@ def retrieve_log_content(log_path: str) -> str:
140
148
  return log
141
149
 
142
150
 
143
- def format_snippets(snippets: list[str]) -> str:
151
+ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
144
152
  """Format snippets, giving them separator, id and finally
145
- concatenating them.
153
+ concatenating them. If snippets have line number attached,
154
+ include that in prompt.
155
+
156
+ Line number must be first element in the tuple. Mixed format of snippets
157
+ is permitted, but may have impact on inference.
146
158
  """
147
159
  summary = ""
148
160
  for i, s in enumerate(snippets):
149
- summary += f"""
150
- Snippet No. {i}:
161
+ if isinstance(s, tuple):
162
+ summary += f"""
163
+ Snippet No. {i} at line #{s[0]}:
164
+
165
+ {s[1]}
166
+ ================
167
+ """
168
+ else:
169
+ summary += f"""
170
+ Snippet No. {i}:
171
+
172
+ {s}
173
+ ================
174
+ """
175
+ return summary
176
+
151
177
 
152
- {s}
153
- ================
154
- """
178
+ def format_analyzed_snippets(snippets: list[Dict]) -> str:
179
+ """Format snippets for submission into staged prompt."""
180
+ summary = f"\n{SNIPPET_DELIMITER}\n".join(
181
+ [
182
+ f"[{e['snippet']}] at line [{e["line_number"]}]: [{e['comment']['choices'][0]['text']}]"
183
+ for e in snippets
184
+ ]
185
+ )
155
186
  return summary
156
187
 
157
188
 
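`format_snippets` now accepts `(line_number, text)` tuples and `format_analyzed_snippets` renders the per-snippet comments for the staged prompt. A short sketch of both, with illustrative snippet text and a trimmed, `CreateCompletionResponse`-like comment dict:

```python
from logdetective.utils import format_snippets, format_analyzed_snippets

# Snippets as produced by DrainExtractor: (line_number, text) tuples.
snippets = [
    (12, "configure: error: no acceptable C compiler found\n"),
    (240, "make: *** [all] Error 2\n"),
]
print(format_snippets(snippets))

# Shape consumed by format_analyzed_snippets; the "comment" dict mimics a
# llama-cpp CreateCompletionResponse, trimmed to the accessed field.
analyzed = [
    {
        "snippet": "make: *** [all] Error 2",
        "line_number": 240,
        "comment": {"choices": [{"text": "The make step failed."}]},
    }
]
print(format_analyzed_snippets(analyzed))
```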
@@ -161,7 +192,7 @@ def validate_url(url: str) -> bool:
161
192
  Either netloc or path must have non-zero length.
162
193
  """
163
194
  result = urlparse(url)
164
- if result.scheme not in ['http', 'https']:
195
+ if result.scheme not in ["http", "https"]:
165
196
  return False
166
197
  if any([result.params, result.query, result.fragment]):
167
198
  return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: logdetective
3
- Version: 0.2.14
3
+ Version: 0.3.2
4
4
  Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
5
5
  License: Apache-2.0
6
6
  Author: Jiri Podivin
@@ -18,10 +18,15 @@ Classifier: Programming Language :: Python :: 3.13
18
18
  Classifier: Topic :: Internet :: Log Analysis
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Classifier: Topic :: Software Development :: Debuggers
21
+ Provides-Extra: server
21
22
  Requires-Dist: drain3 (>=0.9.11,<0.10.0)
23
+ Requires-Dist: fastapi (>=0.111.1) ; extra == "server"
22
24
  Requires-Dist: huggingface-hub (>0.23.2)
23
25
  Requires-Dist: llama-cpp-python (>0.2.56,!=0.2.86)
24
26
  Requires-Dist: numpy (>=1.26.0)
27
+ Requires-Dist: pydantic (>=2.8.2,<3.0.0) ; extra == "server"
28
+ Requires-Dist: python-gitlab (>=4.4.0)
29
+ Requires-Dist: pyyaml (>=6.0.1,<7.0.0) ; extra == "server"
25
30
  Requires-Dist: requests (>0.2.31)
26
31
  Project-URL: homepage, https://github.com/fedora-copr/logdetective
27
32
  Project-URL: issues, https://github.com/fedora-copr/logdetective/issues
@@ -216,6 +221,30 @@ $ curl -L -o models/mistral-7b-instruct-v0.2.Q4_K_S.gguf https://huggingface.co/
216
221
  ```
217
222
 
218
223
 
224
+ Our production instance
225
+ -----------------------
226
+
227
+ Our FastAPI server and model inference server run through `podman-compose` on an
228
+ Amazon AWS instance. The VM is provisioned by an
229
+ [ansible playbook](https://pagure.io/fedora-infra/ansible/blob/main/f/roles/logdetective/tasks/main.yml).
230
+
231
+ You can control the server through:
232
+
233
+ ```
234
+ cd /root/logdetective
235
+ podman-compose -f docker-compose-prod.yaml ...
236
+ ```
237
+
238
+ The `/root` directory contains valuable data. If moving to a new instance,
239
+ please back up the whole directory and transfer it to the new instance.
240
+
241
+ For some reason, we need to manually run this command after every reboot:
242
+
243
+ ```
244
+ nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
245
+ ```
246
+
247
+
219
248
  License
220
249
  -------
221
250
 
@@ -0,0 +1,15 @@
1
+ logdetective/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ logdetective/constants.py,sha256=SPSs1Bq6zPms3RsFTmsADwgrnFTn4fefNHzrB-M3RAE,1383
3
+ logdetective/drain3.ini,sha256=ni91eCT1TwTznZwcqWoOVMQcGEnWhEDNCoTPF7cfGfY,1360
4
+ logdetective/extractors.py,sha256=cjxndfJaQur54GXksIQXL7YTxkOng8I8UnQZMN2t5_w,3388
5
+ logdetective/logdetective.py,sha256=KN0KASW63VAnrjVeXK5AO0ob-vSexutTyeg1fd4uj70,4884
6
+ logdetective/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
+ logdetective/server/models.py,sha256=9QURaw0u9yZKywXwHzv6_rS6XhRBA2UHV5u4b9xkWqc,5196
8
+ logdetective/server/server.py,sha256=o2s4ezQE-a1XY7RFK0vLDFQO_wj9ZgG58SEV0hErLd8,18237
9
+ logdetective/server/utils.py,sha256=osW5-VXxJAxRt7Wd3t1wF7PyW89FE9g4gSZLZCShlLc,1216
10
+ logdetective/utils.py,sha256=59jq7F45Wk8pldzDt4gkh47Hny0T3fy1ggJFjSXDSGo,6148
11
+ logdetective-0.3.2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
12
+ logdetective-0.3.2.dist-info/METADATA,sha256=vIn_AMoQZAHpsOB_6KXgR8wX1Z0tPEPe34044sj9mKY,10691
13
+ logdetective-0.3.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
14
+ logdetective-0.3.2.dist-info/entry_points.txt,sha256=3K_vXja6PmcA8sNdUi63WdImeiNhVZcEGPTaoJmltfA,63
15
+ logdetective-0.3.2.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- logdetective/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- logdetective/constants.py,sha256=6XekuU7sbkY1Pmu4NJajgFbJ0no8PQ3DxQm8NeLKtjE,1383
3
- logdetective/drain3.ini,sha256=ni91eCT1TwTznZwcqWoOVMQcGEnWhEDNCoTPF7cfGfY,1360
4
- logdetective/extractors.py,sha256=xfan_dbGCrLH4cguJ2F6W6UkxXMz24Vob39r5-GsNV8,3102
5
- logdetective/logdetective.py,sha256=03dDCZOx0PRl8KQ5axq5YE90erjoFtcn1tjTuggItco,4684
6
- logdetective/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- logdetective/server/models.py,sha256=vFFOWg7HoI7_6XCty3Fa5AQPbK6g-HuRCEnaqlKXnWw,2333
8
- logdetective/server/server.py,sha256=3HOwIXsnas5GvyRCm3Y3-ogxa8g_IomOpfxX-KG_yM8,9240
9
- logdetective/server/utils.py,sha256=-SB49orES2zU83XJODU_1O9pVQg3CtEisaIm3oEiALA,469
10
- logdetective/utils.py,sha256=j3u_JruoM57q_7dX3enV04t6WGEg3YNWbu5wmEGmP-I,5019
11
- logdetective-0.2.14.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
12
- logdetective-0.2.14.dist-info/METADATA,sha256=COm3Y0ToL6WAWzvY5HHAV9T8BezNTDoOrLqsV5UoKZk,9768
13
- logdetective-0.2.14.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
14
- logdetective-0.2.14.dist-info/entry_points.txt,sha256=3K_vXja6PmcA8sNdUi63WdImeiNhVZcEGPTaoJmltfA,63
15
- logdetective-0.2.14.dist-info/RECORD,,