logdetective 0.4.0__py3-none-any.whl → 2.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logdetective/constants.py +33 -12
- logdetective/extractors.py +137 -68
- logdetective/logdetective.py +102 -33
- logdetective/models.py +99 -0
- logdetective/prompts-summary-first.yml +20 -0
- logdetective/prompts-summary-only.yml +13 -0
- logdetective/prompts.yml +90 -0
- logdetective/remote_log.py +67 -0
- logdetective/server/compressors.py +186 -0
- logdetective/server/config.py +78 -0
- logdetective/server/database/base.py +34 -26
- logdetective/server/database/models/__init__.py +33 -0
- logdetective/server/database/models/exceptions.py +17 -0
- logdetective/server/database/models/koji.py +143 -0
- logdetective/server/database/models/merge_request_jobs.py +623 -0
- logdetective/server/database/models/metrics.py +427 -0
- logdetective/server/emoji.py +148 -0
- logdetective/server/exceptions.py +37 -0
- logdetective/server/gitlab.py +451 -0
- logdetective/server/koji.py +159 -0
- logdetective/server/llm.py +309 -0
- logdetective/server/metric.py +75 -30
- logdetective/server/models.py +426 -23
- logdetective/server/plot.py +432 -0
- logdetective/server/server.py +580 -468
- logdetective/server/templates/base_response.html.j2 +59 -0
- logdetective/server/templates/gitlab_full_comment.md.j2 +73 -0
- logdetective/server/templates/gitlab_short_comment.md.j2 +62 -0
- logdetective/server/utils.py +98 -32
- logdetective/skip_snippets.yml +12 -0
- logdetective/utils.py +187 -73
- logdetective-2.11.0.dist-info/METADATA +568 -0
- logdetective-2.11.0.dist-info/RECORD +40 -0
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/WHEEL +1 -1
- logdetective/server/database/models.py +0 -88
- logdetective-0.4.0.dist-info/METADATA +0 -333
- logdetective-0.4.0.dist-info/RECORD +0 -19
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/entry_points.txt +0 -0
- {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info/licenses}/LICENSE +0 -0
logdetective/utils.py
CHANGED
@@ -1,63 +1,80 @@
 import logging
 import os
+import subprocess as sp
 from typing import Iterator, List, Dict, Tuple, Generator
 from urllib.parse import urlparse
+
+import aiohttp
 import numpy as np
-import
+import yaml
 
-from llama_cpp import
-
-
+from llama_cpp import (
+    Llama,
+    CreateChatCompletionResponse,
+    CreateChatCompletionStreamResponse,
+)
+from logdetective.constants import SNIPPET_DELIMITER
+from logdetective.models import PromptConfig, SkipSnippets
+from logdetective.remote_log import RemoteLog
 
 LOG = logging.getLogger("logdetective")
 
 
-def
+def new_message(text: str) -> bool:
     """Set of heuristics for determining whether or not
     does the current chunk of log text continue on next line.
 
     Following rules are checked, in order:
-    * is the
-    * is the
-    * is the previous character colon ':'
-
+    * is the first character is whitespace
+    * is the first character backslash '|'
     """
     conditionals = [
-        lambda
-        lambda
-        lambda i, string: string[i - 1] == ":",
+        lambda string: string[0].isspace(),
+        lambda string: string[0] == "|",
     ]
 
     for c in conditionals:
-        y = c(
+        y = c(text)
         if y:
-            return
+            return False
 
-    return
+    return True
 
 
-def get_chunks(
+def get_chunks(
+    text: str, max_chunk_len: int = 2000
+) -> Generator[Tuple[int, str], None, None]:
     """Split log into chunks according to heuristic
     based on whitespace and backslash presence.
     """
-
-
+    lines = text.splitlines()
+
+    # Chunk we will be yielding
     chunk = ""
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Number of line where the message started
+    original_line = 0
+    for i, line in enumerate(lines):
+        if len(line) == 0:
+            continue
+        if new_message(line):
+            # Yield chunk if we have it
+            if len(chunk) > 0:
+                yield (original_line, chunk)
+            original_line = i
+            chunk = line
+        else:
+            chunk += "\n" + line
+        if len(chunk) > max_chunk_len:
+            # If the chunk is too long, keep splitting into smaller chunks
+            # until we reach manageable size
+            while len(chunk) > max_chunk_len:
+                remainder = chunk[max_chunk_len:]
+                chunk = chunk[:max_chunk_len]
+                yield (original_line, chunk)
+                chunk = remainder
+
+    # if we still have some text left over
+    yield (original_line, chunk)
 
 
 def initialize_model(
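
The rewritten get_chunks walks the log line by line and uses new_message to decide whether a line starts a new chunk or continues the previous one, yielding each chunk together with the line number where it began. A minimal usage sketch, assuming logdetective 2.11.0 is installed; the sample log text is made up:

from logdetective.utils import get_chunks

log_text = (
    "error: build failed\n"
    "    caused by: missing dependency\n"  # starts with whitespace, so it continues the chunk
    "make: *** [all] Error 1\n"
)

# Each yielded item is (line number where the chunk started, chunk text).
for line_no, chunk in get_chunks(log_text, max_chunk_len=2000):
    print(line_no, repr(chunk))
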
@@ -110,25 +127,43 @@ def compute_certainty(probs: List[Dict]) -> float:
 
 
 def process_log(
-    log: str, model: Llama, stream: bool
-) ->
+    log: str, model: Llama, stream: bool, prompt_templates: PromptConfig, temperature: float
+) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]:
     """Processes a given log using the provided language model and returns its summary.
 
     Args:
         log (str): The input log to be processed.
         model (Llama): The language model used for processing the log.
-
+        stream (bool): Return output as Iterator.
+        prompt_template (str): Which prompt template to use.
+        temperature (float): Temperature parameter for model runtime.
     Returns:
         str: The summary of the given log generated by the language model.
     """
-
-
+    messages = [
+        {
+            "role": "system",
+            "content": prompt_templates.default_system_prompt
+        },
+        {
+            "role": "user",
+            "content": prompt_templates.prompt_template.format(log)
+        },
+    ]
+
+    response = model.create_chat_completion(
+        messages=messages,
+        stream=stream,
+        max_tokens=0,
+        logprobs=True,
+        top_logprobs=1,
+        temperature=temperature,
     )
 
     return response
 
 
-def retrieve_log_content(log_path: str) -> str:
+async def retrieve_log_content(http: aiohttp.ClientSession, log_path: str) -> str:
     """Get content of the file on the log_path path.
     Path is assumed to be valid URL if it has a scheme.
     Otherwise it attempts to pull it from local filesystem."""
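
process_log now builds a system/user message pair from the PromptConfig object and forwards the sampling temperature to llama-cpp-python. A rough usage sketch, assuming a locally downloaded GGUF model (the path below is a placeholder) and the packaged default prompts; initialize_model's exact arguments are not visible in this hunk, so the model is constructed directly:

from llama_cpp import Llama

from logdetective.utils import load_prompts, process_log

# Placeholder model path; logits_all is enabled because process_log asks for logprobs.
model = Llama(model_path="./model.gguf", logits_all=True)
prompts = load_prompts(None)  # None falls back to the built-in prompt templates

response = process_log(
    log="collect2: error: ld returned 1 exit status",
    model=model,
    stream=False,
    prompt_templates=prompts,
    temperature=0.8,
)
print(response["choices"][0]["message"]["content"])
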
@@ -143,7 +178,8 @@ def retrieve_log_content(log_path: str) -> str:
             log = f.read()
 
     else:
-
+        remote_log = RemoteLog(log_path, http)
+        log = await remote_log.get_url_content()
 
     return log
 
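
retrieve_log_content is now a coroutine that takes the aiohttp session from the caller, so one session can be shared across many downloads. A minimal sketch of driving it from synchronous code; the URL is a placeholder:

import asyncio

import aiohttp

from logdetective.utils import retrieve_log_content


async def main() -> None:
    async with aiohttp.ClientSession() as http:
        # A local filesystem path would also work; a remote URL is used here.
        log = await retrieve_log_content(http, "https://example.com/build.log")
        print(log[:200])


asyncio.run(main())
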
@@ -156,46 +192,124 @@ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
     Line number must be first element in the tuple. Mixed format of snippets
     is permitted, but may have impact on inference.
     """
-    summary = ""
+    summary = "\n"
     for i, s in enumerate(snippets):
         if isinstance(s, tuple):
-
-            Snippet No. {i} at line #{
-
-            {s[1]}
-            ================
-            """
+            line_number, snippet_content = s
+            header = f"Snippet No. {i} at line #{line_number}:"
         else:
-
-
-
-            {
-
-            ""
+            header = f"Snippet No. {i}:"
+            snippet_content = s
+        summary += (
+            f"{header}\n"
+            "\n"
+            f"{snippet_content}\n"
+            f"{SNIPPET_DELIMITER}\n"
+            f"\n"
+        )
     return summary
 
 
-def
-    """
-
-
-
-
+def load_prompts(path: str | None) -> PromptConfig:
+    """Load prompts from given yaml file if there is one.
+    Alternatively use defaults."""
+    if path:
+        try:
+            with open(path, "r") as file:
+                return PromptConfig(yaml.safe_load(file))
+        except FileNotFoundError:
+            print("Prompt configuration file not found, reverting to defaults.")
+    return PromptConfig()
+
+
+def prompt_to_messages(
+    user_message: str,
+    system_prompt: str | None = None,
+    system_role: str = "developer",
+    user_role: str = "user",
+) -> List[Dict[str, str]]:
+    """Turn prompt into list of message dictionaries.
+    If `system_role` and `user_role` are the same, only a single message is created,
+    as concatenation of `user_message` and `system_prompt`. This is useful for models which
+    do not have separate system role, such as mistral.
+    """
+
+    if system_role == user_role:
+        messages = [
+            {"role": system_role, "content": f"{system_prompt}\n{user_message}"}
+        ]
+    else:
+        messages = [
+            {"role": system_role, "content": system_prompt},
+            {
+                "role": user_role,
+                "content": user_message,
+            },
         ]
-    )
-    return summary
 
+    return messages
 
-
-
-
-
-
-
-
-
-
-
+
+def filter_snippet_patterns(snippet: str, skip_snippets: SkipSnippets) -> bool:
+    """Try to match snippet agains provided patterns to determine if we should
+    filter it out or not."""
+    for key, pattern in skip_snippets.snippet_patterns.items():
+        if pattern.match(snippet):
+            LOG.debug("Snippet `%s` has matched agains skip pattern %s", snippet, key)
+            return True
+
+    return False
+
+
+def load_skip_snippet_patterns(path: str | None) -> SkipSnippets:
+    """Load dictionary of snippet patterns we want to skip."""
+    if path:
+        try:
+            with open(path, "r") as file:
+                return SkipSnippets(yaml.safe_load(file))
+        except OSError as e:
+            LOG.error("Couldn't open file with snippet skip patterns `%s`", path)
+            raise e
+
+    return SkipSnippets({})
+
+
+def check_csgrep() -> bool:
+    """Verifies presence of csgrep in path"""
+    try:
+        result = sp.run(
+            ["csgrep", "--version"],
+            text=True,
+            check=True,
+            shell=False,
+            capture_output=True,
+            timeout=1.0,
+        )
+    except (FileNotFoundError, sp.TimeoutExpired, sp.CalledProcessError) as ex:
+        LOG.error("Required binary `csgrep` was not found in path: %s", ex)
         return False
-
+    if result.returncode == 0:
+        return True
+    LOG.error("Issue was encountered while calling `csgrep`: `%s`", result.stderr)
+
+    return False
+
+
+def mine_logs(log: str, extractors: list) -> List[Tuple[int, str]]:
+    """Extract snippets from log text using extractors provided.
+    Each extractor is applied in turn on original log.
+    Depending on characteristics of extractors used, there may be
+    an overlap in snippets extracted."""
+
+    log_summary = []
+
+    LOG.info("Getting summary")
+
+    for extractor in extractors:
+        log_summary.extend(extractor(log))
+
+    ratio = len("\n".join([text for _, text in log_summary])) / len(log)
+    LOG.debug("Log summary: \n %s", log_summary)
+    LOG.info("Snippets: %s Compression ratio: %s", len(log_summary), ratio)
+
+    return log_summary
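
mine_logs and format_snippets form a small pipeline: each extractor turns raw log text into (line number, snippet) pairs, and the formatter joins them with SNIPPET_DELIMITER for use in a prompt. A sketch with a toy extractor; the real extractors live in logdetective/extractors.py, and the function below is only a stand-in:

from logdetective.utils import format_snippets, mine_logs

log = (
    "configure: ok\n"
    "gcc -c main.c\n"
    "main.c:3:1: error: unknown type name 'strng'\n"
)

# Toy extractor: keep lines mentioning "error", paired with their line numbers.
def error_lines(text: str) -> list[tuple[int, str]]:
    return [(i, line) for i, line in enumerate(text.splitlines()) if "error" in line]

snippets = mine_logs(log, [error_lines])
print(format_snippets(snippets))
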