logdetective 0.4.0__py3-none-any.whl → 2.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. logdetective/constants.py +33 -12
  2. logdetective/extractors.py +137 -68
  3. logdetective/logdetective.py +102 -33
  4. logdetective/models.py +99 -0
  5. logdetective/prompts-summary-first.yml +20 -0
  6. logdetective/prompts-summary-only.yml +13 -0
  7. logdetective/prompts.yml +90 -0
  8. logdetective/remote_log.py +67 -0
  9. logdetective/server/compressors.py +186 -0
  10. logdetective/server/config.py +78 -0
  11. logdetective/server/database/base.py +34 -26
  12. logdetective/server/database/models/__init__.py +33 -0
  13. logdetective/server/database/models/exceptions.py +17 -0
  14. logdetective/server/database/models/koji.py +143 -0
  15. logdetective/server/database/models/merge_request_jobs.py +623 -0
  16. logdetective/server/database/models/metrics.py +427 -0
  17. logdetective/server/emoji.py +148 -0
  18. logdetective/server/exceptions.py +37 -0
  19. logdetective/server/gitlab.py +451 -0
  20. logdetective/server/koji.py +159 -0
  21. logdetective/server/llm.py +309 -0
  22. logdetective/server/metric.py +75 -30
  23. logdetective/server/models.py +426 -23
  24. logdetective/server/plot.py +432 -0
  25. logdetective/server/server.py +580 -468
  26. logdetective/server/templates/base_response.html.j2 +59 -0
  27. logdetective/server/templates/gitlab_full_comment.md.j2 +73 -0
  28. logdetective/server/templates/gitlab_short_comment.md.j2 +62 -0
  29. logdetective/server/utils.py +98 -32
  30. logdetective/skip_snippets.yml +12 -0
  31. logdetective/utils.py +187 -73
  32. logdetective-2.11.0.dist-info/METADATA +568 -0
  33. logdetective-2.11.0.dist-info/RECORD +40 -0
  34. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/WHEEL +1 -1
  35. logdetective/server/database/models.py +0 -88
  36. logdetective-0.4.0.dist-info/METADATA +0 -333
  37. logdetective-0.4.0.dist-info/RECORD +0 -19
  38. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info}/entry_points.txt +0 -0
  39. {logdetective-0.4.0.dist-info → logdetective-2.11.0.dist-info/licenses}/LICENSE +0 -0
logdetective/utils.py CHANGED
@@ -1,63 +1,80 @@
  import logging
  import os
+ import subprocess as sp
  from typing import Iterator, List, Dict, Tuple, Generator
  from urllib.parse import urlparse
+
+ import aiohttp
  import numpy as np
- import requests
+ import yaml

- from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
- from logdetective.constants import PROMPT_TEMPLATE, SNIPPET_DELIMITER
- from logdetective.server.models import AnalyzedSnippet
+ from llama_cpp import (
+     Llama,
+     CreateChatCompletionResponse,
+     CreateChatCompletionStreamResponse,
+ )
+ from logdetective.constants import SNIPPET_DELIMITER
+ from logdetective.models import PromptConfig, SkipSnippets
+ from logdetective.remote_log import RemoteLog

  LOG = logging.getLogger("logdetective")


- def chunk_continues(text: str, index: int) -> bool:
+ def new_message(text: str) -> bool:
      """Set of heuristics for determining whether or not
      does the current chunk of log text continue on next line.

      Following rules are checked, in order:
-     * is the next character is whitespace
-     * is the previous character backslash '\\'
-     * is the previous character colon ':'
-
+     * is the first character is whitespace
+     * is the first character backslash '|'
      """
      conditionals = [
-         lambda i, string: string[i + 1].isspace(),
-         lambda i, string: string[i - 1] == "\\",
-         lambda i, string: string[i - 1] == ":",
+         lambda string: string[0].isspace(),
+         lambda string: string[0] == "|",
      ]

      for c in conditionals:
-         y = c(index, text)
+         y = c(text)
          if y:
-             return True
+             return False

-     return False
+     return True


- def get_chunks(text: str) -> Generator[Tuple[int, str], None, None]:
+ def get_chunks(
+     text: str, max_chunk_len: int = 2000
+ ) -> Generator[Tuple[int, str], None, None]:
      """Split log into chunks according to heuristic
      based on whitespace and backslash presence.
      """
-     text_len = len(text)
-     i = 0
+     lines = text.splitlines()
+
+     # Chunk we will be yielding
      chunk = ""
-     # Keep track of the original and next line number
-     # every `\n` hit increases the next_line_number by one.
-     original_line_number = 0
-     next_line_number = 0
-     while i < text_len:
-         chunk += text[i]
-         if text[i] == "\n":
-             next_line_number += 1
-         if i + 1 < text_len and chunk_continues(text, i):
-             i += 1
-             continue
-         yield (original_line_number, chunk)
-         original_line_number = next_line_number + 1
-         chunk = ""
-         i += 1
+     # Number of line where the message started
+     original_line = 0
+     for i, line in enumerate(lines):
+         if len(line) == 0:
+             continue
+         if new_message(line):
+             # Yield chunk if we have it
+             if len(chunk) > 0:
+                 yield (original_line, chunk)
+             original_line = i
+             chunk = line
+         else:
+             chunk += "\n" + line
+         if len(chunk) > max_chunk_len:
+             # If the chunk is too long, keep splitting into smaller chunks
+             # until we reach manageable size
+             while len(chunk) > max_chunk_len:
+                 remainder = chunk[max_chunk_len:]
+                 chunk = chunk[:max_chunk_len]
+                 yield (original_line, chunk)
+                 chunk = remainder
+
+     # if we still have some text left over
+     yield (original_line, chunk)


  def initialize_model(
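The chunking heuristic was rewritten from character-by-character scanning to a line-based pass: `new_message()` decides whether a line starts a new log message (continuation lines begin with whitespace or `|`), and `get_chunks()` yields `(starting_line_number, chunk)` tuples, further splitting any chunk longer than `max_chunk_len`. A minimal usage sketch, with a made-up log excerpt:

```python
from logdetective.utils import get_chunks

# Hypothetical log excerpt: the indented line and the line starting with "|"
# are treated as continuations of the first message.
log_text = (
    "error: build failed\n"
    "    caused by: missing header foo.h\n"
    "| additional compiler output\n"
    "next message starts here"
)

for line_no, chunk in get_chunks(log_text, max_chunk_len=2000):
    print(f"--- chunk starting at line {line_no} ---")
    print(chunk)
# Expected: one chunk starting at line 0 and one starting at line 3.
```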
@@ -110,25 +127,43 @@ def compute_certainty(probs: List[Dict]) -> float:


  def process_log(
-     log: str, model: Llama, stream: bool
- ) -> CreateCompletionResponse | Iterator[CreateCompletionStreamResponse]:
+     log: str, model: Llama, stream: bool, prompt_templates: PromptConfig, temperature: float
+ ) -> CreateChatCompletionResponse | Iterator[CreateChatCompletionStreamResponse]:
      """Processes a given log using the provided language model and returns its summary.

      Args:
          log (str): The input log to be processed.
          model (Llama): The language model used for processing the log.
-
+         stream (bool): Return output as Iterator.
+         prompt_template (str): Which prompt template to use.
+         temperature (float): Temperature parameter for model runtime.
      Returns:
          str: The summary of the given log generated by the language model.
      """
-     response = model(
-         prompt=PROMPT_TEMPLATE.format(log), stream=stream, max_tokens=0, logprobs=1
+     messages = [
+         {
+             "role": "system",
+             "content": prompt_templates.default_system_prompt
+         },
+         {
+             "role": "user",
+             "content": prompt_templates.prompt_template.format(log)
+         },
+     ]
+
+     response = model.create_chat_completion(
+         messages=messages,
+         stream=stream,
+         max_tokens=0,
+         logprobs=True,
+         top_logprobs=1,
+         temperature=temperature,
      )

      return response


- def retrieve_log_content(log_path: str) -> str:
+ async def retrieve_log_content(http: aiohttp.ClientSession, log_path: str) -> str:
      """Get content of the file on the log_path path.
      Path is assumed to be valid URL if it has a scheme.
      Otherwise it attempts to pull it from local filesystem."""
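Both functions in this hunk changed shape: `process_log()` now builds chat messages from a `PromptConfig` and calls `create_chat_completion()`, and `retrieve_log_content()` became a coroutine that takes an `aiohttp` session. A minimal end-to-end sketch under assumed values (the model path, URL, and temperature are placeholders; with `stream=False`, llama-cpp-python returns an OpenAI-style response dict):

```python
import asyncio

import aiohttp
from llama_cpp import Llama

from logdetective.utils import load_prompts, process_log, retrieve_log_content


async def summarize(url: str) -> str:
    # Fetch the log over HTTP via the new aiohttp-based code path.
    async with aiohttp.ClientSession() as http:
        log = await retrieve_log_content(http, url)

    # Placeholder model path; load_prompts(None) falls back to the built-in defaults.
    model = Llama(model_path="/path/to/model.gguf")
    prompts = load_prompts(None)

    response = process_log(
        log, model, stream=False, prompt_templates=prompts, temperature=0.8
    )
    # Non-streaming chat completions come back as an OpenAI-style dict.
    return response["choices"][0]["message"]["content"]


# print(asyncio.run(summarize("https://example.org/build.log")))
```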
@@ -143,7 +178,8 @@ def retrieve_log_content(log_path: str) -> str:
              log = f.read()

      else:
-         log = requests.get(log_path, timeout=60).text
+         remote_log = RemoteLog(log_path, http)
+         log = await remote_log.get_url_content()

      return log

@@ -156,46 +192,124 @@ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
      Line number must be first element in the tuple. Mixed format of snippets
      is permitted, but may have impact on inference.
      """
-     summary = ""
+     summary = "\n"
      for i, s in enumerate(snippets):
          if isinstance(s, tuple):
-             summary += f"""
-             Snippet No. {i} at line #{s[0]}:
-
-             {s[1]}
-             ================
-             """
+             line_number, snippet_content = s
+             header = f"Snippet No. {i} at line #{line_number}:"
          else:
-             summary += f"""
-             Snippet No. {i}:
-
-             {s[1]}
-             ================
-             """
+             header = f"Snippet No. {i}:"
+             snippet_content = s
+         summary += (
+             f"{header}\n"
+             "\n"
+             f"{snippet_content}\n"
+             f"{SNIPPET_DELIMITER}\n"
+             f"\n"
+         )
      return summary


- def format_analyzed_snippets(snippets: list[AnalyzedSnippet]) -> str:
-     """Format snippets for submission into staged prompt."""
-     summary = f"\n{SNIPPET_DELIMITER}\n".join(
-         [
-             f"[{e.text}] at line [{e.line_number}]: [{e.explanation.text}]"
-             for e in snippets
+ def load_prompts(path: str | None) -> PromptConfig:
+     """Load prompts from given yaml file if there is one.
+     Alternatively use defaults."""
+     if path:
+         try:
+             with open(path, "r") as file:
+                 return PromptConfig(yaml.safe_load(file))
+         except FileNotFoundError:
+             print("Prompt configuration file not found, reverting to defaults.")
+     return PromptConfig()
+
+
+ def prompt_to_messages(
+     user_message: str,
+     system_prompt: str | None = None,
+     system_role: str = "developer",
+     user_role: str = "user",
+ ) -> List[Dict[str, str]]:
+     """Turn prompt into list of message dictionaries.
+     If `system_role` and `user_role` are the same, only a single message is created,
+     as concatenation of `user_message` and `system_prompt`. This is useful for models which
+     do not have separate system role, such as mistral.
+     """
+
+     if system_role == user_role:
+         messages = [
+             {"role": system_role, "content": f"{system_prompt}\n{user_message}"}
+         ]
+     else:
+         messages = [
+             {"role": system_role, "content": system_prompt},
+             {
+                 "role": user_role,
+                 "content": user_message,
+             },
          ]
-     )
-     return summary

+     return messages

- def validate_url(url: str) -> bool:
-     """Validate incoming URL to be at least somewhat sensible for log files
-     Only http and https protocols permitted. No result, params or query fields allowed.
-     Either netloc or path must have non-zero length.
-     """
-     result = urlparse(url)
-     if result.scheme not in ["http", "https"]:
-         return False
-     if any([result.params, result.query, result.fragment]):
-         return False
-     if not (result.path or result.netloc):
+
+ def filter_snippet_patterns(snippet: str, skip_snippets: SkipSnippets) -> bool:
+     """Try to match snippet agains provided patterns to determine if we should
+     filter it out or not."""
+     for key, pattern in skip_snippets.snippet_patterns.items():
+         if pattern.match(snippet):
+             LOG.debug("Snippet `%s` has matched agains skip pattern %s", snippet, key)
+             return True
+
+     return False
+
+
+ def load_skip_snippet_patterns(path: str | None) -> SkipSnippets:
+     """Load dictionary of snippet patterns we want to skip."""
+     if path:
+         try:
+             with open(path, "r") as file:
+                 return SkipSnippets(yaml.safe_load(file))
+         except OSError as e:
+             LOG.error("Couldn't open file with snippet skip patterns `%s`", path)
+             raise e
+
+     return SkipSnippets({})
+
+
+ def check_csgrep() -> bool:
+     """Verifies presence of csgrep in path"""
+     try:
+         result = sp.run(
+             ["csgrep", "--version"],
+             text=True,
+             check=True,
+             shell=False,
+             capture_output=True,
+             timeout=1.0,
+         )
+     except (FileNotFoundError, sp.TimeoutExpired, sp.CalledProcessError) as ex:
+         LOG.error("Required binary `csgrep` was not found in path: %s", ex)
          return False
-     return True
+     if result.returncode == 0:
+         return True
+     LOG.error("Issue was encountered while calling `csgrep`: `%s`", result.stderr)
+
+     return False
+
+
+ def mine_logs(log: str, extractors: list) -> List[Tuple[int, str]]:
+     """Extract snippets from log text using extractors provided.
+     Each extractor is applied in turn on original log.
+     Depending on characteristics of extractors used, there may be
+     an overlap in snippets extracted."""
+
+     log_summary = []
+
+     LOG.info("Getting summary")
+
+     for extractor in extractors:
+         log_summary.extend(extractor(log))
+
+     ratio = len("\n".join([text for _, text in log_summary])) / len(log)
+     LOG.debug("Log summary: \n %s", log_summary)
+     LOG.info("Snippets: %s Compression ratio: %s", len(log_summary), ratio)
+
+     return log_summary
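Among the helpers added in this hunk, `prompt_to_messages()` documents a special case: when `system_role` equals `user_role`, the system prompt is folded into a single message instead of producing two. A short sketch of both shapes, based on the implementation above, with made-up prompt strings:

```python
from logdetective.utils import prompt_to_messages

# Default roles ("developer" / "user"): two separate messages are produced.
messages = prompt_to_messages(
    user_message="Explain this snippet: collect2: error: ld returned 1 exit status",
    system_prompt="You are a log analysis assistant.",
)
# -> [{"role": "developer", "content": "You are a log analysis assistant."},
#     {"role": "user", "content": "Explain this snippet: ..."}]

# Model without a separate system role (e.g. Mistral-style): pass the same role
# twice and the system prompt is prepended to the user message instead.
merged = prompt_to_messages(
    user_message="Explain this snippet: collect2: error: ld returned 1 exit status",
    system_prompt="You are a log analysis assistant.",
    system_role="user",
    user_role="user",
)
# -> [{"role": "user",
#      "content": "You are a log analysis assistant.\nExplain this snippet: ..."}]
```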