logdetective 0.2.14__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logdetective/constants.py +2 -3
- logdetective/extractors.py +24 -14
- logdetective/logdetective.py +69 -31
- logdetective/server/models.py +94 -3
- logdetective/server/server.py +240 -20
- logdetective/server/utils.py +29 -0
- logdetective/utils.py +56 -25
- {logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/METADATA +30 -1
- logdetective-0.3.2.dist-info/RECORD +15 -0
- logdetective-0.2.14.dist-info/RECORD +0 -15
- {logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/LICENSE +0 -0
- {logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/WHEEL +0 -0
- {logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/entry_points.txt +0 -0
logdetective/constants.py
CHANGED
@@ -1,4 +1,3 @@
-
 # pylint: disable=line-too-long
 DEFAULT_ADVISOR = "fedora-copr/Mistral-7B-Instruct-v0.2-GGUF"

@@ -32,7 +31,7 @@ Answer:
 """

 SNIPPET_PROMPT_TEMPLATE = """
-Analyse following RPM build log snippet.
+Analyse following RPM build log snippet. Describe contents accurately, without speculation or suggestions for resolution.

 Snippet:

@@ -59,4 +58,4 @@ Analysis:

 """

-SNIPPET_DELIMITER =
+SNIPPET_DELIMITER = "================"
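The tightened snippet prompt and the now-explicit delimiter are consumed by the staged analysis path added in this release: one prompt is built per snippet, and the per-snippet answers are later joined with the delimiter. A minimal sketch of that usage, with an invented snippet:

```
from logdetective.constants import SNIPPET_PROMPT_TEMPLATE, SNIPPET_DELIMITER

# Hypothetical extracted snippet; real ones come from DrainExtractor.
snippet = "configure: error: no acceptable C compiler found in $PATH"

# One prompt per snippet, as analyze_log_staged() in the server does.
prompt = SNIPPET_PROMPT_TEMPLATE.format(snippet)

# Per-snippet analyses are then joined with the delimiter for the final prompt.
analyses = ["Compiler is missing.", "Build tool exited with an error."]
combined = f"\n{SNIPPET_DELIMITER}\n".join(analyses)
print(prompt, combined, sep="\n")
```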
logdetective/extractors.py
CHANGED
@@ -1,5 +1,6 @@
 import os
 import logging
+from typing import Tuple

 import drain3
 from drain3.template_miner_config import TemplateMinerConfig
@@ -15,13 +16,17 @@ class LLMExtractor:
     """
     A class that extracts relevant information from logs using a language model.
     """
+
     def __init__(self, model: Llama, n_lines: int = 2):
         self.model = model
         self.n_lines = n_lines
         self.grammar = LlamaGrammar.from_string(
-
+            'root ::= ("Yes" | "No")', verbose=False
+        )

-    def __call__(
+    def __call__(
+        self, log: str, n_lines: int = 2, neighbors: bool = False
+    ) -> list[str]:
         chunks = self.rate_chunks(log)
         out = self.create_extract(chunks, neighbors)
         return out
@@ -35,7 +40,7 @@ class LLMExtractor:
         log_lines = log.split("\n")

         for i in range(0, len(log_lines), self.n_lines):
-            block =
+            block = "\n".join(log_lines[i: i + self.n_lines])
             prompt = SUMMARIZE_PROMPT_TEMPLATE.format(log)
             out = self.model(prompt, max_tokens=7, grammar=self.grammar)
             out = f"{out['choices'][0]['text']}\n"
@@ -44,8 +49,7 @@
         return results

     def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> list[str]:
-        """Extract interesting chunks from the model processing.
-        """
+        """Extract interesting chunks from the model processing."""
         interesting = []
         summary = []
         # pylint: disable=consider-using-enumerate
@@ -64,8 +68,8 @@ class LLMExtractor:


 class DrainExtractor:
-    """A class that extracts information from logs using a template miner algorithm.
-
+    """A class that extracts information from logs using a template miner algorithm."""
+
     def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
         config = TemplateMinerConfig()
         config.load(f"{os.path.dirname(__file__)}/drain3.ini")
@@ -75,15 +79,21 @@ class DrainExtractor:
         self.verbose = verbose
         self.context = context

-    def __call__(self, log: str) -> list[str]:
+    def __call__(self, log: str) -> list[Tuple[int, str]]:
         out = []
-
-
-
-
-
+        # First pass create clusters
+        for _, chunk in get_chunks(log):
+            processed_chunk = self.miner.add_log_message(chunk)
+            LOG.debug(processed_chunk)
+        # Sort found clusters by size, descending order
+        sorted_clusters = sorted(
+            self.miner.drain.clusters, key=lambda it: it.size, reverse=True
+        )
+        # Second pass, only matching lines with clusters,
+        # to recover original text
+        for chunk_start, chunk in get_chunks(log):
             cluster = self.miner.match(chunk, "always")
             if cluster in sorted_clusters:
-                out.append(chunk)
+                out.append((chunk_start, chunk))
                 sorted_clusters.remove(cluster)
         return out
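The user-visible change in `DrainExtractor` is its return type: `__call__` now returns `(line_number, text)` tuples instead of bare strings. A minimal sketch of calling it, assuming a local log file and the `drain3.ini` that ships with the package:

```
from logdetective.extractors import DrainExtractor

extractor = DrainExtractor(verbose=True, context=True, max_clusters=8)
with open("build.log", encoding="utf-8") as f:  # hypothetical local log
    snippets = extractor(f.read())

# Each element is now a (line_number, chunk) tuple rather than a plain string.
for line_number, chunk in snippets:
    print(line_number, chunk.splitlines()[0])
```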
logdetective/logdetective.py
CHANGED
@@ -4,40 +4,71 @@ import sys

 from logdetective.constants import DEFAULT_ADVISOR
 from logdetective.utils import (
-    process_log,
+    process_log,
+    initialize_model,
+    retrieve_log_content,
+    format_snippets,
+    compute_certainty,
+)
 from logdetective.extractors import LLMExtractor, DrainExtractor

 LOG = logging.getLogger("logdetective")


 def setup_args():
-    """
+    """Setup argument parser and return arguments."""
     parser = argparse.ArgumentParser("logdetective")
-    parser.add_argument(
-
-
-
-
-
-
+    parser.add_argument(
+        "file",
+        type=str,
+        default="",
+        help="The URL or path to the log file to be analyzed.",
+    )
+    parser.add_argument(
+        "-M",
+        "--model",
+        help="The path or Hugging Face name of the language model for analysis.",
+        type=str,
+        default=DEFAULT_ADVISOR,
+    )
+    parser.add_argument(
+        "-F",
+        "--filename_suffix",
+        help="Suffix of the model file name to be retrieved from Hugging Face.\
             Makes sense only if the model is specified with Hugging Face name.",
-
-
-    parser.add_argument("-
-
-
-
-
-
-
-
-
-    parser.add_argument(
-
+        default="Q4_K_S.gguf",
+    )
+    parser.add_argument("-n", "--no-stream", action="store_true")
+    parser.add_argument(
+        "-S",
+        "--summarizer",
+        type=str,
+        default="drain",
+        help="Choose between LLM and Drain template miner as the log summarizer.\
+            LLM must be specified as path to a model, URL or local file.",
+    )
+    parser.add_argument(
+        "-N",
+        "--n_lines",
+        type=int,
+        default=8,
+        help="The number of lines per chunk for LLM analysis.\
+            This only makes sense when you are summarizing with LLM.",
+    )
+    parser.add_argument(
+        "-C",
+        "--n_clusters",
+        type=int,
+        default=8,
+        help="Number of clusters for Drain to organize log chunks into.\
+            This only makes sense when you are summarizing with Drain",
+    )
+    parser.add_argument("-v", "--verbose", action="count", default=0)
+    parser.add_argument("-q", "--quiet", action="store_true")
     return parser.parse_args()


-def main():
+def main(): # pylint: disable=too-many-statements
     """Main execution function."""
     args = setup_args()

@@ -57,8 +88,9 @@ def main():

     # Primary model initialization
     try:
-        model = initialize_model(
-
+        model = initialize_model(
+            args.model, filename_suffix=args.filename_suffix, verbose=args.verbose > 2
+        )
     except ValueError as e:
         LOG.error(e)
         LOG.error("You likely do not have enough memory to load the AI model")
@@ -66,7 +98,9 @@ def main():

     # Log file summarizer selection and initialization
     if args.summarizer == "drain":
-        extractor = DrainExtractor(
+        extractor = DrainExtractor(
+            args.verbose > 1, context=True, max_clusters=args.n_clusters
+        )
     else:
         summarizer_model = initialize_model(args.summarizer, verbose=args.verbose > 2)
         extractor = LLMExtractor(summarizer_model, args.verbose > 1)
@@ -81,7 +115,7 @@ def main():
         sys.exit(4)
     log_summary = extractor(log)

-    ratio = len(log_summary) / len(log.split(
+    ratio = len(log_summary) / len(log.split("\n"))

     LOG.info("Compression ratio: %s", ratio)

@@ -103,15 +137,19 @@ def main():

     if args.no_stream:
         print(response["choices"][0]["text"])
-        probs = [
+        probs = [
+            {"logprob": e} for e in response["choices"][0]["logprobs"]["token_logprobs"]
+        ]

     else:
         # Stream the output
         for chunk in response:
             if isinstance(chunk["choices"][0]["logprobs"], dict):
-                probs.append(
-
-
+                probs.append(
+                    {"logprob": chunk["choices"][0]["logprobs"]["token_logprobs"][0]}
+                )
+            delta = chunk["choices"][0]["text"]
+            print(delta, end="", flush=True)
     certainty = compute_certainty(probs)

     print(f"\nResponse certainty: {certainty:.2f}%\n")
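For context, this is the shape of the `probs` list the CLI now hands to `compute_certainty()`; the response dict below is a stand-in for a llama-cpp-python completion, not captured output:

```
# Non-streamed case: all token logprobs are available at once.
response = {
    "choices": [
        {
            "text": "The build failed because of a missing dependency.",
            "logprobs": {"token_logprobs": [-0.05, -0.30, -0.10]},
        }
    ]
}
probs = [
    {"logprob": e} for e in response["choices"][0]["logprobs"]["token_logprobs"]
]
# Streamed case: one entry is appended per chunk instead, e.g.
# probs.append({"logprob": chunk["choices"][0]["logprobs"]["token_logprobs"][0]})
```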
logdetective/server/models.py
CHANGED
@@ -1,5 +1,6 @@
+from logging import BASIC_FORMAT
 from typing import List, Dict, Optional
-from pydantic import BaseModel
+from pydantic import BaseModel, Field


 class BuildLog(BaseModel):
@@ -8,6 +9,34 @@ class BuildLog(BaseModel):
     url: str


+class JobHook(BaseModel):
+    """Model of Job Hook events sent from GitLab.
+    Full details of the specification are available at
+    https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
+    This model implements only the fields that we care about. The webhook
+    sends many more fields that we will ignore."""
+
+    # The unique job ID on this GitLab instance.
+    build_id: int
+
+    # The identifier of the job. We only care about 'build_rpm' and
+    # 'build_centos_stream_rpm' jobs.
+    build_name: str = Field(pattern=r"^build(_.*)?_rpm$")
+
+    # A string representing the job status. We only care about 'failed' jobs.
+    build_status: str = Field(pattern=r"^failed$")
+
+    # The kind of webhook message. We are only interested in 'build' messages
+    # which represents job tasks in a pipeline.
+    object_kind: str = Field(pattern=r"^build$")
+
+    # The unique ID of the enclosing pipeline on this GitLab instance.
+    pipeline_id: int
+
+    # The unique ID of the project triggering this event
+    project_id: int
+
+
 class Response(BaseModel):
     """Model of data returned by Log Detective API

@@ -28,10 +57,13 @@ class StagedResponse(Response):
     https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.llama_types.CreateCompletionResponse
     response_certainty: float
     snippets:
-        list of dictionaries {
+        list of dictionaries {
+            'snippet' : '<original_text>,
+            'comment': CreateCompletionResponse,
+            'line_number': '<location_in_log>' }
     """

-    snippets: List[Dict[str, str | Dict]]
+    snippets: List[Dict[str, str | Dict | int]]


 class InferenceConfig(BaseModel):
@@ -66,11 +98,67 @@ class ExtractorConfig(BaseModel):
         self.verbose = data.get("verbose", False)


+class GitLabConfig(BaseModel):
+    """Model for GitLab configuration of logdetective server."""
+
+    url: str = None
+    api_url: str = None
+    api_token: str = None
+
+    # Maximum size of artifacts.zip in MiB. (default: 300 MiB)
+    max_artifact_size: int = 300
+
+    def __init__(self, data: Optional[dict] = None):
+        super().__init__()
+        if data is None:
+            return
+
+        self.url = data.get("url", "https://gitlab.com")
+        self.api_url = f"{self.url}/api/v4"
+        self.api_token = data.get("api_token", None)
+        self.max_artifact_size = int(data.get("max_artifact_size")) * 1024 * 1024
+
+
+class LogConfig(BaseModel):
+    """Logging configuration"""
+
+    name: str = "logdetective"
+    level: str | int = "INFO"
+    path: str | None = None
+    format: str = BASIC_FORMAT
+
+    def __init__(self, data: Optional[dict] = None):
+        super().__init__()
+        if data is None:
+            return
+
+        self.name = data.get("name", "logdetective")
+        self.level = data.get("level", "INFO").upper()
+        self.path = data.get("path")
+        self.format = data.get("format", BASIC_FORMAT)
+
+
+class GeneralConfig(BaseModel):
+    """General config options for Log Detective"""
+
+    packages: List[str] = None
+
+    def __init__(self, data: Optional[dict] = None):
+        super().__init__()
+        if data is None:
+            return
+
+        self.packages = data.get("packages", [])
+
+
 class Config(BaseModel):
     """Model for configuration of logdetective server."""

+    log: LogConfig = LogConfig()
     inference: InferenceConfig = InferenceConfig()
     extractor: ExtractorConfig = ExtractorConfig()
+    gitlab: GitLabConfig = GitLabConfig()
+    general: GeneralConfig = GeneralConfig()

     def __init__(self, data: Optional[dict] = None):
         super().__init__()
@@ -78,5 +166,8 @@ class Config(BaseModel):
         if data is None:
             return

+        self.log = LogConfig(data.get("log"))
         self.inference = InferenceConfig(data.get("inference"))
         self.extractor = ExtractorConfig(data.get("extractor"))
+        self.gitlab = GitLabConfig(data.get("gitlab"))
+        self.general = GeneralConfig(data.get("general"))
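A short sketch of how the new `JobHook` model filters GitLab job-event payloads; the values are invented, and the many extra keys GitLab actually sends are ignored by pydantic's default configuration:

```
from pydantic import ValidationError
from logdetective.server.models import JobHook

payload = {
    "object_kind": "build",
    "build_id": 123456,
    "build_name": "build_centos_stream_rpm",
    "build_status": "failed",
    "pipeline_id": 7890,
    "project_id": 42,
}
hook = JobHook(**payload)  # accepted: all Field patterns match

try:
    JobHook(**{**payload, "build_status": "success"})
except ValidationError:
    print("non-failed jobs are rejected by the model")
```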
logdetective/server/server.py
CHANGED
@@ -1,26 +1,35 @@
 import asyncio
 import json
-import logging
 import os
-
+import re
+import zipfile
+from pathlib import PurePath
+from tempfile import TemporaryFile
+from typing import List, Annotated, Tuple
+

 from llama_cpp import CreateCompletionResponse
-from fastapi import FastAPI, HTTPException, Depends, Header
+from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, Header
 from fastapi.responses import StreamingResponse
+from fastapi.responses import Response as BasicResponse
+import gitlab
 import requests

 from logdetective.constants import (
     PROMPT_TEMPLATE,
     SNIPPET_PROMPT_TEMPLATE,
     PROMPT_TEMPLATE_STAGED,
-    SNIPPET_DELIMITER,
 )
 from logdetective.extractors import DrainExtractor
-from logdetective.utils import
-
-
+from logdetective.utils import (
+    validate_url,
+    compute_certainty,
+    format_snippets,
+    format_analyzed_snippets,
+)
+from logdetective.server.models import BuildLog, JobHook, Response, StagedResponse
+from logdetective.server.utils import load_server_config, get_log

-LOG = logging.getLogger("logdetective")

 LLM_CPP_HOST = os.environ.get("LLAMA_CPP_HOST", "localhost")
 LLM_CPP_SERVER_ADDRESS = f"http://{LLM_CPP_HOST}"
@@ -33,6 +42,11 @@ LLM_API_TOKEN = os.environ.get("LLM_API_TOKEN", None)

 SERVER_CONFIG = load_server_config(SERVER_CONFIG_PATH)

+MR_REGEX = re.compile(r"refs/merge-requests/(\d+)/merge")
+FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")
+
+LOG = get_log(SERVER_CONFIG)
+

 def requires_token_when_set(authentication: Annotated[str | None, Header()] = None):
     """
@@ -65,6 +79,9 @@ def requires_token_when_set(authentication: Annotated[str | None, Header()] = No


 app = FastAPI(dependencies=[Depends(requires_token_when_set)])
+app.gitlab_conn = gitlab.Gitlab(
+    url=SERVER_CONFIG.gitlab.url, private_token=SERVER_CONFIG.gitlab.api_token
+)


 def process_url(url: str) -> str:
@@ -90,7 +107,7 @@ def process_url(url: str) -> str:
     return log_request.text


-def mine_logs(log: str) -> List[str]:
+def mine_logs(log: str) -> List[Tuple[int, str]]:
     """Extract snippets from log text"""
     extractor = DrainExtractor(
         verbose=True, context=True, max_clusters=SERVER_CONFIG.extractor.max_clusters
@@ -141,6 +158,7 @@ async def submit_text(
             stream=stream,
         )
     except requests.RequestException as ex:
+        LOG.error("Llama-cpp query failed: %s", ex)
         raise HTTPException(
             status_code=400, detail=f"Llama-cpp query failed: {ex}"
         ) from ex
@@ -175,6 +193,7 @@ async def analyze_log(build_log: BuildLog):
     """
     log_text = process_url(build_log.url)
     log_summary = mine_logs(log_text)
+    log_summary = format_snippets(log_summary)
     response = await submit_text(PROMPT_TEMPLATE.format(log_summary))
     certainty = 0

@@ -188,7 +207,7 @@ async def analyze_log(build_log: BuildLog):
         raise HTTPException(
             status_code=400,
             detail=f"Couldn't compute certainty with data:\n"
-            f"{response[
+            f"{response['choices'][0]['logprobs']['content'][0]['top_logprobs']}",
         ) from ex

     return Response(explanation=response, response_certainty=certainty)
@@ -207,20 +226,15 @@ async def analyze_log_staged(build_log: BuildLog):

     # Process snippets asynchronously
     analyzed_snippets = await asyncio.gather(
-        *[submit_text(SNIPPET_PROMPT_TEMPLATE.format(s)) for s in log_summary]
+        *[submit_text(SNIPPET_PROMPT_TEMPLATE.format(s[1])) for s in log_summary]
     )

     analyzed_snippets = [
-        {"snippet": e[0], "comment": e[1]}
+        {"snippet": e[0][1], "line_number": e[0][0], "comment": e[1]}
+        for e in zip(log_summary, analyzed_snippets)
     ]
-
     final_prompt = PROMPT_TEMPLATE_STAGED.format(
-
-            [
-                f"[{e["snippet"]}] : [{e["comment"]["choices"][0]["text"]}]"
-                for e in analyzed_snippets
-            ]
-        )
+        format_analyzed_snippets(analyzed_snippets)
     )

     final_analysis = await submit_text(final_prompt)
@@ -237,7 +251,7 @@ async def analyze_log_staged(build_log: BuildLog):
         raise HTTPException(
             status_code=400,
             detail=f"Couldn't compute certainty with data:\n"
-            f"{final_analysis[
+            f"{final_analysis['choices'][0]['logprobs']['content'][0]['top_logprobs']}",
         ) from ex

     return StagedResponse(
@@ -257,6 +271,212 @@ async def analyze_log_stream(build_log: BuildLog):
     """
     log_text = process_url(build_log.url)
     log_summary = mine_logs(log_text)
+    log_summary = format_snippets(log_summary)
     stream = await submit_text(PROMPT_TEMPLATE.format(log_summary), stream=True)

     return StreamingResponse(stream)
+
+
+@app.post("/webhook/gitlab/job_events")
+async def receive_gitlab_job_event_webhook(
+    job_hook: JobHook, background_tasks: BackgroundTasks
+):
+    """Webhook endpoint for receiving job_events notifications from GitLab
+    https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
+    lists the full specification for the messages sent for job events."""
+
+    # Handle the message in the background so we can return 200 immediately
+    background_tasks.add_task(process_gitlab_job_event, job_hook)
+
+    # No return value or body is required for a webhook.
+    # 204: No Content
+    return BasicResponse(status_code=204)
+
+
+async def process_gitlab_job_event(job_hook):
+    """Handle a received job_event webhook from GitLab"""
+    LOG.debug("Received webhook message:\n%s", job_hook)
+
+    # Look up the project this job belongs to
+    project = await asyncio.to_thread(app.gitlab_conn.projects.get, job_hook.project_id)
+
+    # check if this project is on the opt-in list
+    if project.name not in SERVER_CONFIG.general.packages:
+        LOG.info("Ignoring unrecognized package %s", project.name)
+        return
+    LOG.info("Processing failed job for %s", project.name)
+
+    # Retrieve data about the job from the GitLab API
+    job = await asyncio.to_thread(project.jobs.get, job_hook.build_id)
+
+    # Retrieve the pipeline that started this job
+    pipeline = await asyncio.to_thread(project.pipelines.get, job_hook.pipeline_id)
+
+    # Verify this is a merge request
+    if pipeline.source != "merge_request_event":
+        LOG.info("Not a merge request pipeline. Ignoring.")
+        return
+
+    # Extract the merge-request ID from the job
+    match = MR_REGEX.search(pipeline.ref)
+    if not match:
+        LOG.error(
+            "Pipeline source is merge_request_event but no merge request ID was provided."
+        )
+        return
+    merge_request_id = int(match.group(1))
+
+    LOG.debug("Retrieving log artifacts")
+    # Retrieve the build logs from the merge request artifacts and preprocess them
+    try:
+        preprocessed_log = await retrieve_and_preprocess_koji_logs(job)
+    except LogsTooLargeError:
+        LOG.error("Could not retrieve logs. Too large.")
+        raise
+
+    # Submit log to Log Detective and await the results.
+    response = await submit_log_to_llm(preprocessed_log)
+    preprocessed_log.close()
+
+    # Add the Log Detective response as a comment to the merge request
+    await comment_on_mr(merge_request_id, response)
+
+
+class LogsTooLargeError(RuntimeError):
+    """The log archive exceeds the configured maximum size"""
+
+
+async def retrieve_and_preprocess_koji_logs(job):
+    """Download logs from the merge request artifacts
+
+    This function will retrieve the build logs and do some minimal
+    preprocessing to determine which log is relevant for analysis.
+
+    returns: An open, file-like object containing the log contents to be sent
+    for processing by Log Detective. The calling function is responsible for
+    closing this object."""
+
+    # Make sure the file isn't too large to process.
+    if not await check_artifacts_file_size(job):
+        raise LogsTooLargeError(
+            f"Oversized logs for job {job.id} in project {job.project_id}"
+        )

+    # Create a temporary file to store the downloaded log zipfile.
+    # This will be automatically deleted when the last reference into it
+    # (returned by this function) is closed.
+    tempfile = TemporaryFile(mode="w+b")
+    await asyncio.to_thread(job.artifacts, streamed=True, action=tempfile.write)
+    tempfile.seek(0)
+
+    failed_arches = {}
+    artifacts_zip = zipfile.ZipFile(tempfile, mode="r")
+    for zipinfo in artifacts_zip.infolist():
+        if zipinfo.filename.endswith("task_failed.log"):
+            # The koji logs store this file in two places: 1) in the
+            # directory with the failed architecture and 2) in the parent
+            # directory. We actually want to ignore the one in the parent
+            # directory, since the rest of the information is in the
+            # specific task directory.
+            # The paths look like `kojilogs/noarch-XXXXXX/task_failed.log`
+            # or `kojilogs/noarch-XXXXXX/x86_64-XXXXXX/task_failed.log`
+            path = PurePath(zipinfo.filename)
+            if len(path.parts) <= 3:
+                continue
+
+            # Extract the architecture from the immediate parent path
+            architecture = path.parent.parts[-1].split("-")[0]
+
+            # Open this file and read which log failed.
+            # The string in this log has the format
+            # `see <log> for more information`.
+            # Note: it may sometimes say
+            # `see build.log or root.log for more information`, but in
+            # that situation, we only want to handle build.log (for now),
+            # which means accepting only the first match for the regular
+            # expression.
+            with artifacts_zip.open(zipinfo.filename) as task_failed_log:
+                contents = task_failed_log.read().decode("utf-8")
+                match = FAILURE_LOG_REGEX.search(contents)
+                if not match:
+                    LOG.error(
+                        "task_failed.log does not indicate which log contains the failure."
+                    )
+                    raise SyntaxError(
+                        "task_failed.log does not indicate which log contains the failure."
+                    )
+                failure_log_name = match.group(1)
+
+            failed_arches[architecture] = PurePath(path.parent, failure_log_name)
+
+    if not failed_arches:
+        # No failed task found?
+        raise FileNotFoundError("Could not detect failed architecture.")
+
+    # First check if we only found one failed architecture
+    if len(failed_arches) == 1:
+        failed_arch = list(failed_arches.keys())[0]
+
+    else:
+        # We only want to handle one arch, so we'll check them in order of
+        # "most to least likely for the maintainer to have access to hardware"
+        # This means: x86_64 > aarch64 > ppc64le > s390x
+        if "x86_64" in failed_arches:
+            failed_arch = "x86_64"
+        elif "aarch64" in failed_arches:
+            failed_arch = "aarch64"
+        elif "ppc64le" in failed_arches:
+            failed_arch = "ppc64le"
+        elif "s390x" in failed_arches:
+            failed_arch = "s390x"
+        else:
+            # It should be impossible for us to get "noarch" here, since
+            # the only way that should happen is for a single architecture
+            # build.
+            raise FileNotFoundError("No failed architecture detected.")
+
+    LOG.debug("Failed architecture: %s", failed_arch)
+
+    log_path = failed_arches[failed_arch]
+    LOG.debug("Returning contents of %s", log_path)
+
+    # Return the log as a file-like object with .read() function
+    return artifacts_zip.open(log_path.as_posix())
+
+
+async def check_artifacts_file_size(job):
+    """Method to determine if the artifacts are too large to process"""
+    # First, make sure that the artifacts are of a reasonable size. The
+    # zipped artifact collection will be stored in memory below. The
+    # python-gitlab library doesn't expose a way to check this value directly,
+    # so we need to interact with directly with the headers.
+    artifacts_url = f"{SERVER_CONFIG.gitlab.api_url}/projects/{job.project_id}/jobs/{job.id}/artifacts" # pylint: disable=line-too-long
+    header_resp = await asyncio.to_thread(
+        requests.head,
+        artifacts_url,
+        allow_redirects=True,
+        headers={"Authorization": f"Bearer {SERVER_CONFIG.gitlab.api_token}"},
+        timeout=(3.07, 5),
+    )
+    content_length = int(header_resp.headers.get("content-length"))
+    LOG.debug(
+        "URL: %s, content-length: %d, max length: %d",
+        artifacts_url,
+        content_length,
+        SERVER_CONFIG.gitlab.max_artifact_size,
+    )
+    return content_length <= SERVER_CONFIG.gitlab.max_artifact_size
+
+
+async def submit_log_to_llm(log):
+    """Stream the log to the LLM for processing"""
+    # TODO: query the LLM with the log contents # pylint: disable=fixme
+    # This function will be implemented later; right now it does nothing.
+    LOG.debug("Log contents:\n%s", log.read())
+    return ""
+
+
+async def comment_on_mr(merge_request_id: int, response: str): # pylint: disable=unused-argument
+    """Add the Log Detective response as a comment to the merge request"""
+    # TODO: Implement this # pylint: disable=fixme
+    pass # pylint: disable=unnecessary-pass
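The two module-level regular expressions drive the webhook flow: `MR_REGEX` extracts the merge-request ID from `pipeline.ref`, and `FAILURE_LOG_REGEX` picks the first log name mentioned in `task_failed.log`. A quick, self-contained illustration with invented inputs:

```
import re

MR_REGEX = re.compile(r"refs/merge-requests/(\d+)/merge")
FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")

ref = "refs/merge-requests/1234/merge"
print(int(MR_REGEX.search(ref).group(1)))           # 1234

contents = "see build.log or root.log for more information"
print(FAILURE_LOG_REGEX.search(contents).group(1))  # build.log (first match wins)
```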
logdetective/server/utils.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 import yaml
 from logdetective.server.models import Config

@@ -13,3 +14,31 @@ def load_server_config(path: str | None) -> Config:
     except FileNotFoundError:
         pass
     return Config()
+
+
+def get_log(config: Config):
+    """
+    Initialize a logger for this server
+    """
+    log = logging.getLogger(config.log.name)
+    if getattr(log, "initialized", False):
+        return log
+
+    log.setLevel(config.log.level)
+
+    # Drop the default handler, we will create it ourselves
+    log.handlers = []
+
+    # STDOUT
+    stream_handler = logging.StreamHandler()
+    stream_handler.setFormatter(logging.Formatter(config.log.format))
+    log.addHandler(stream_handler)
+
+    # Log to file
+    if config.log.path:
+        file_handler = logging.FileHandler(config.log.path)
+        file_handler.setFormatter(logging.Formatter(config.log.format))
+        log.addHandler(file_handler)
+
+    log.initialized = True
+    return log
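A minimal sketch of wiring the new configuration models into `get_log()`; the dict mirrors what `load_server_config()` would read from YAML, and every value below is illustrative:

```
from logdetective.server.models import Config
from logdetective.server.utils import get_log

config = Config(
    {
        "log": {"name": "logdetective", "level": "debug", "path": "/tmp/logdetective.log"},
        "gitlab": {
            "url": "https://gitlab.example.com",  # hypothetical instance
            "api_token": "glpat-xxxxx",           # placeholder token
            "max_artifact_size": 300,             # MiB; stored internally in bytes
        },
        "general": {"packages": ["example-package"]},
    }
)

log = get_log(config)          # StreamHandler plus a FileHandler because 'path' is set
assert get_log(config) is log  # the 'initialized' flag makes repeat calls idempotent
print(config.gitlab.api_url)            # https://gitlab.example.com/api/v4
print(config.gitlab.max_artifact_size)  # 314572800
```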
logdetective/utils.py
CHANGED
@@ -1,12 +1,12 @@
 import logging
 import os
-from typing import Iterator, List, Dict
+from typing import Iterator, List, Dict, Tuple, Generator
 from urllib.parse import urlparse
 import numpy as np
 import requests

 from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
-from logdetective.constants import PROMPT_TEMPLATE
+from logdetective.constants import PROMPT_TEMPLATE, SNIPPET_DELIMITER


 LOG = logging.getLogger("logdetective")
@@ -25,7 +25,7 @@ def chunk_continues(text: str, index: int) -> bool:
     conditionals = [
         lambda i, string: string[i + 1].isspace(),
         lambda i, string: string[i - 1] == "\\",
-        lambda i, string: string[i - 1] == ":"
+        lambda i, string: string[i - 1] == ":",
     ]

     for c in conditionals:
@@ -36,25 +36,33 @@ def chunk_continues(text: str, index: int) -> bool:
     return False


-def get_chunks(text: str):
+def get_chunks(text: str) -> Generator[Tuple[int, str], None, None]:
     """Split log into chunks according to heuristic
     based on whitespace and backslash presence.
     """
     text_len = len(text)
     i = 0
     chunk = ""
+    # Keep track of the original and next line number
+    # every `\n` hit increases the next_line_number by one.
+    original_line_number = 0
+    next_line_number = 0
     while i < text_len:
         chunk += text[i]
-        if text[i] ==
+        if text[i] == "\n":
+            next_line_number += 1
             if i + 1 < text_len and chunk_continues(text, i):
                 i += 1
                 continue
-            yield chunk
+            yield (original_line_number, chunk)
+            original_line_number = next_line_number + 1
            chunk = ""
         i += 1


-def initialize_model(
+def initialize_model(
+    model_pth: str, filename_suffix: str = ".gguf", verbose: bool = False
+) -> Llama:
     """Initialize Llama class for inference.
     Args:
         model_pth (str): path to gguf model file or Hugging Face name
@@ -69,14 +77,16 @@ def initialize_model(model_pth: str, filename_suffix: str = ".gguf", verbose: bo
             model_path=model_pth,
             n_ctx=0, # Maximum context for the model
             verbose=verbose,
-            logits_all=True
+            logits_all=True,
+        )
     else:
         model = Llama.from_pretrained(
             model_pth,
             f"*{filename_suffix}",
             n_ctx=0, # Maximum context for the model
             verbose=verbose,
-            logits_all=True
+            logits_all=True,
+        )

     return model

@@ -91,8 +101,7 @@ def compute_certainty(probs: List[Dict]) -> float:
     This function is used in the server codebase.
     """

-    top_logprobs = [
-        np.exp(e["logprob"]) * 100 for e in probs]
+    top_logprobs = [np.exp(e["logprob"]) * 100 for e in probs]

     certainty = np.median(top_logprobs, axis=0)
     if np.isnan(certainty):
@@ -100,8 +109,9 @@ def compute_certainty(probs: List[Dict]) -> float:
         return certainty


-def process_log(
-
+def process_log(
+    log: str, model: Llama, stream: bool
+) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]:
     """Processes a given log using the provided language model and returns its summary.

     Args:
@@ -112,10 +122,8 @@ def process_log(log: str, model: Llama, stream: bool) -> (
         str: The summary of the given log generated by the language model.
     """
     response = model(
-        prompt=PROMPT_TEMPLATE.format(log),
-
-        max_tokens=0,
-        logprobs=1)
+        prompt=PROMPT_TEMPLATE.format(log), stream=stream, max_tokens=0, logprobs=1
+    )

     return response

@@ -140,18 +148,41 @@ def retrieve_log_content(log_path: str) -> str:
     return log


-def format_snippets(snippets: list[str]) -> str:
+def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
     """Format snippets, giving them separator, id and finally
-    concatenating them.
+    concatenating them. If snippets have line number attached,
+    include that in prompt.
+
+    Line number must be first element in the tuple. Mixed format of snippets
+    is permitted, but may have impact on inference.
     """
     summary = ""
     for i, s in enumerate(snippets):
-
-
+        if isinstance(s, tuple):
+            summary += f"""
+            Snippet No. {i} at line #{s[0]}:
+
+            {s[1]}
+            ================
+            """
+        else:
+            summary += f"""
+            Snippet No. {i}:
+
+            {s[1]}
+            ================
+            """
+    return summary
+

-
-
-
+def format_analyzed_snippets(snippets: list[Dict]) -> str:
+    """Format snippets for submission into staged prompt."""
+    summary = f"\n{SNIPPET_DELIMITER}\n".join(
+        [
+            f"[{e['snippet']}] at line [{e["line_number"]}]: [{e['comment']['choices'][0]['text']}]"
+            for e in snippets
+        ]
+    )
     return summary


@@ -161,7 +192,7 @@ def validate_url(url: str) -> bool:
     Either netloc or path must have non-zero length.
     """
     result = urlparse(url)
-    if result.scheme not in [
+    if result.scheme not in ["http", "https"]:
         return False
     if any([result.params, result.query, result.fragment]):
         return False
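A small sketch of the new formatting helpers and of `compute_certainty`, using hand-made snippet tuples and logprobs rather than real model output:

```
from logdetective.utils import format_snippets, format_analyzed_snippets, compute_certainty

snippets = [
    (12, "configure: error: no acceptable C compiler found\n"),
    (348, "make: *** [all] Error 1\n"),
]
print(format_snippets(snippets))  # "Snippet No. 0 at line #12:", "Snippet No. 1 at line #348:", ...

analyzed = [
    {
        "snippet": text,
        "line_number": line,
        "comment": {"choices": [{"text": "The toolchain is missing."}]},
    }
    for line, text in snippets
]
print(format_analyzed_snippets(analyzed))  # entries joined with SNIPPET_DELIMITER

print(compute_certainty([{"logprob": -0.1}, {"logprob": -0.2}]))  # ~86.2
```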
{logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: logdetective
-Version: 0.2.14
+Version: 0.3.2
 Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
 License: Apache-2.0
 Author: Jiri Podivin
@@ -18,10 +18,15 @@ Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: Log Analysis
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Debuggers
+Provides-Extra: server
 Requires-Dist: drain3 (>=0.9.11,<0.10.0)
+Requires-Dist: fastapi (>=0.111.1) ; extra == "server"
 Requires-Dist: huggingface-hub (>0.23.2)
 Requires-Dist: llama-cpp-python (>0.2.56,!=0.2.86)
 Requires-Dist: numpy (>=1.26.0)
+Requires-Dist: pydantic (>=2.8.2,<3.0.0) ; extra == "server"
+Requires-Dist: python-gitlab (>=4.4.0)
+Requires-Dist: pyyaml (>=6.0.1,<7.0.0) ; extra == "server"
 Requires-Dist: requests (>0.2.31)
 Project-URL: homepage, https://github.com/fedora-copr/logdetective
 Project-URL: issues, https://github.com/fedora-copr/logdetective/issues
@@ -216,6 +221,30 @@ $ curl -L -o models/mistral-7b-instruct-v0.2.Q4_K_S.gguf https://huggingface.co/
 ```


+Our production instance
+-----------------------
+
+Our FastAPI server and model inference server run through `podman-compose` on an
+Amazon AWS intance. The VM is provisioned by an
+[ansible playbook](https://pagure.io/fedora-infra/ansible/blob/main/f/roles/logdetective/tasks/main.yml).
+
+You can control the server through:
+
+```
+cd /root/logdetective
+podman-compose -f docker-compose-prod.yaml ...
+```
+
+The `/root` directory contains valuable data. If moving to a new instance,
+please backup the whole directory and transfer it to the new instance.
+
+Fore some reason, we need to manually run this command after every reboot:
+
+```
+nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml
+```
+
+
 License
 -------

logdetective-0.3.2.dist-info/RECORD
@@ -0,0 +1,15 @@
+logdetective/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+logdetective/constants.py,sha256=SPSs1Bq6zPms3RsFTmsADwgrnFTn4fefNHzrB-M3RAE,1383
+logdetective/drain3.ini,sha256=ni91eCT1TwTznZwcqWoOVMQcGEnWhEDNCoTPF7cfGfY,1360
+logdetective/extractors.py,sha256=cjxndfJaQur54GXksIQXL7YTxkOng8I8UnQZMN2t5_w,3388
+logdetective/logdetective.py,sha256=KN0KASW63VAnrjVeXK5AO0ob-vSexutTyeg1fd4uj70,4884
+logdetective/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+logdetective/server/models.py,sha256=9QURaw0u9yZKywXwHzv6_rS6XhRBA2UHV5u4b9xkWqc,5196
+logdetective/server/server.py,sha256=o2s4ezQE-a1XY7RFK0vLDFQO_wj9ZgG58SEV0hErLd8,18237
+logdetective/server/utils.py,sha256=osW5-VXxJAxRt7Wd3t1wF7PyW89FE9g4gSZLZCShlLc,1216
+logdetective/utils.py,sha256=59jq7F45Wk8pldzDt4gkh47Hny0T3fy1ggJFjSXDSGo,6148
+logdetective-0.3.2.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+logdetective-0.3.2.dist-info/METADATA,sha256=vIn_AMoQZAHpsOB_6KXgR8wX1Z0tPEPe34044sj9mKY,10691
+logdetective-0.3.2.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+logdetective-0.3.2.dist-info/entry_points.txt,sha256=3K_vXja6PmcA8sNdUi63WdImeiNhVZcEGPTaoJmltfA,63
+logdetective-0.3.2.dist-info/RECORD,,

logdetective-0.2.14.dist-info/RECORD
@@ -1,15 +0,0 @@
-logdetective/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-logdetective/constants.py,sha256=6XekuU7sbkY1Pmu4NJajgFbJ0no8PQ3DxQm8NeLKtjE,1383
-logdetective/drain3.ini,sha256=ni91eCT1TwTznZwcqWoOVMQcGEnWhEDNCoTPF7cfGfY,1360
-logdetective/extractors.py,sha256=xfan_dbGCrLH4cguJ2F6W6UkxXMz24Vob39r5-GsNV8,3102
-logdetective/logdetective.py,sha256=03dDCZOx0PRl8KQ5axq5YE90erjoFtcn1tjTuggItco,4684
-logdetective/server/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-logdetective/server/models.py,sha256=vFFOWg7HoI7_6XCty3Fa5AQPbK6g-HuRCEnaqlKXnWw,2333
-logdetective/server/server.py,sha256=3HOwIXsnas5GvyRCm3Y3-ogxa8g_IomOpfxX-KG_yM8,9240
-logdetective/server/utils.py,sha256=-SB49orES2zU83XJODU_1O9pVQg3CtEisaIm3oEiALA,469
-logdetective/utils.py,sha256=j3u_JruoM57q_7dX3enV04t6WGEg3YNWbu5wmEGmP-I,5019
-logdetective-0.2.14.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
-logdetective-0.2.14.dist-info/METADATA,sha256=COm3Y0ToL6WAWzvY5HHAV9T8BezNTDoOrLqsV5UoKZk,9768
-logdetective-0.2.14.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-logdetective-0.2.14.dist-info/entry_points.txt,sha256=3K_vXja6PmcA8sNdUi63WdImeiNhVZcEGPTaoJmltfA,63
-logdetective-0.2.14.dist-info/RECORD,,

{logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/LICENSE
File without changes

{logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/WHEEL
File without changes

{logdetective-0.2.14.dist-info → logdetective-0.3.2.dist-info}/entry_points.txt
File without changes