logdetective 1.4.0__tar.gz → 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {logdetective-1.4.0 → logdetective-1.6.0}/PKG-INFO +30 -4
  2. {logdetective-1.4.0 → logdetective-1.6.0}/README.md +28 -2
  3. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/constants.py +0 -11
  4. logdetective-1.6.0/logdetective/extractors.py +55 -0
  5. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/logdetective.py +35 -22
  6. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/models.py +32 -6
  7. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/prompts.yml +0 -11
  8. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/remote_log.py +2 -4
  9. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/config.py +12 -5
  10. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/emoji.py +3 -1
  11. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/llm.py +11 -2
  12. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/plot.py +36 -35
  13. logdetective-1.6.0/logdetective/skip_snippets.yml +12 -0
  14. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/utils.py +34 -14
  15. {logdetective-1.4.0 → logdetective-1.6.0}/pyproject.toml +2 -2
  16. logdetective-1.4.0/logdetective/extractors.py +0 -105
  17. {logdetective-1.4.0 → logdetective-1.6.0}/LICENSE +0 -0
  18. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/__init__.py +0 -0
  19. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/drain3.ini +0 -0
  20. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/prompts-summary-first.yml +0 -0
  21. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/prompts-summary-only.yml +0 -0
  22. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/__init__.py +0 -0
  23. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/compressors.py +0 -0
  24. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/database/__init__.py +0 -0
  25. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/database/base.py +0 -0
  26. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/database/models/__init__.py +0 -0
  27. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/database/models/merge_request_jobs.py +0 -0
  28. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/database/models/metrics.py +0 -0
  29. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/gitlab.py +0 -0
  30. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/metric.py +0 -0
  31. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/models.py +0 -0
  32. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/server.py +0 -0
  33. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/templates/gitlab_full_comment.md.j2 +0 -0
  34. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective/server/templates/gitlab_short_comment.md.j2 +0 -0
  35. {logdetective-1.4.0 → logdetective-1.6.0}/logdetective.1.asciidoc +0 -0
@@ -1,12 +1,12 @@
  Metadata-Version: 2.3
  Name: logdetective
- Version: 1.4.0
+ Version: 1.6.0
  Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
  License: Apache-2.0
  Author: Jiri Podivin
  Author-email: jpodivin@gmail.com
  Requires-Python: >=3.11,<4.0
- Classifier: Development Status :: 4 - Beta
+ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Environment :: Console
  Classifier: Intended Audience :: Developers
  Classifier: License :: OSI Approved :: Apache Software License
@@ -87,9 +87,10 @@ Usage
  To analyze a log file, run the script with the following command line arguments:
  - `url` (required): The URL of the log file to be analyzed.
  - `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis. As we are using LLama.cpp we want this to be in the `gguf` format. You can include the download link to the model here. If the model is already on your machine it will skip the download.
- - `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
- - `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--summarizer` (DISABLED, default: "drain"): The LLM summarization option was removed; the argument is kept for backward compatibility only.
+ - `--n_lines` (DISABLED): The LLM summarization option was removed; the argument is kept for backward compatibility only.
  - `--n_clusters` (optional, default 8): Number of clusters for Drain to organize log chunks into. This only makes sense when you are summarizing with Drain
+ - `--skip_snippets` (optional): Path to patterns for skipping snippets.

  Example usage:

@@ -376,6 +377,9 @@ HTTPS certificate generated through:
  certbot certonly --standalone -d logdetective01.fedorainfracloud.org
  ```

+ Certificates need to be placed into the location specified by the `LOGDETECTIVE_CERTDIR`
+ env var, and the service should be restarted.
+
  Querying statistics
  -------------------

@@ -435,6 +439,28 @@ with spaces, or replacement fields marked with curly braces, `{}` left for inser
  Number of replacement fields in new prompts, must be the same as in originals.
  Although their position may be different.

+
+ Skip Snippets
+ -------------
+
+ Certain log chunks may not contribute to the analysis of the problem under any circumstances.
+ Users can specify regular expressions matching such log chunks, along with a simple description,
+ using the Skip Snippets feature.
+
+ Patterns to be skipped must be defined in a YAML file as a dictionary, where each key is a description
+ and each value is a regular expression. For example:
+
+ ```
+ child_exit_code_zero: "Child return code was: 0"
+ ```
+
+ Special care must be taken not to write a regular expression which may match
+ too many chunks, or which may be evaluated as a data structure by the YAML parser.
+
+ An example of a valid pattern definition file, `logdetective/skip_snippets.yml`,
+ can be used as a starting point and is used as the default if no other definition is provided.
+
+
  License
  -------

@@ -43,9 +43,10 @@ Usage
  To analyze a log file, run the script with the following command line arguments:
  - `url` (required): The URL of the log file to be analyzed.
  - `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis. As we are using LLama.cpp we want this to be in the `gguf` format. You can include the download link to the model here. If the model is already on your machine it will skip the download.
- - `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
- - `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.
+ - `--summarizer` (DISABLED, default: "drain"): The LLM summarization option was removed; the argument is kept for backward compatibility only.
+ - `--n_lines` (DISABLED): The LLM summarization option was removed; the argument is kept for backward compatibility only.
  - `--n_clusters` (optional, default 8): Number of clusters for Drain to organize log chunks into. This only makes sense when you are summarizing with Drain
+ - `--skip_snippets` (optional): Path to patterns for skipping snippets.

  Example usage:

@@ -332,6 +333,9 @@ HTTPS certificate generated through:
  certbot certonly --standalone -d logdetective01.fedorainfracloud.org
  ```

+ Certificates need to be placed into the location specified by the `LOGDETECTIVE_CERTDIR`
+ env var, and the service should be restarted.
+
  Querying statistics
  -------------------

@@ -391,6 +395,28 @@ with spaces, or replacement fields marked with curly braces, `{}` left for inser
  Number of replacement fields in new prompts, must be the same as in originals.
  Although their position may be different.

+
+ Skip Snippets
+ -------------
+
+ Certain log chunks may not contribute to the analysis of the problem under any circumstances.
+ Users can specify regular expressions matching such log chunks, along with a simple description,
+ using the Skip Snippets feature.
+
+ Patterns to be skipped must be defined in a YAML file as a dictionary, where each key is a description
+ and each value is a regular expression. For example:
+
+ ```
+ child_exit_code_zero: "Child return code was: 0"
+ ```
+
+ Special care must be taken not to write a regular expression which may match
+ too many chunks, or which may be evaluated as a data structure by the YAML parser.
+
+ An example of a valid pattern definition file, `logdetective/skip_snippets.yml`,
+ can be used as a starting point and is used as the default if no other definition is provided.
+
+
  License
  -------

@@ -26,17 +26,6 @@ Analysis:

  """

- SUMMARIZATION_PROMPT_TEMPLATE = """
- Does following log contain error or issue?
-
- Log:
-
- {}
-
- Answer:
-
- """
-
  SNIPPET_PROMPT_TEMPLATE = """
  Analyse following RPM build log snippet. Describe contents accurately, without speculation or suggestions for resolution.

@@ -0,0 +1,55 @@
+ import os
+ import logging
+ from typing import Tuple
+
+ import drain3
+ from drain3.template_miner_config import TemplateMinerConfig
+
+ from logdetective.utils import get_chunks, filter_snippet_patterns
+ from logdetective.models import SkipSnippets
+
+ LOG = logging.getLogger("logdetective")
+
+
+ class DrainExtractor:
+     """A class that extracts information from logs using a template miner algorithm."""
+
+     def __init__(
+         self,
+         verbose: bool = False,
+         context: bool = False,
+         max_clusters=8,
+         skip_snippets: SkipSnippets = SkipSnippets({}),
+     ):
+         config = TemplateMinerConfig()
+         config.load(f"{os.path.dirname(__file__)}/drain3.ini")
+         config.profiling_enabled = verbose
+         config.drain_max_clusters = max_clusters
+         self.miner = drain3.TemplateMiner(config=config)
+         self.verbose = verbose
+         self.context = context
+         self.skip_snippets = skip_snippets
+
+     def __call__(self, log: str) -> list[Tuple[int, str]]:
+         out = []
+         # Create chunks
+         chunks = list(get_chunks(log))
+         # Keep only chunks that don't match any of the excluded patterns
+         chunks = [
+             (_, chunk)
+             for _, chunk in chunks
+             if not filter_snippet_patterns(chunk, self.skip_snippets)
+         ]
+         # First pass create clusters
+         for _, chunk in chunks:
+             processed_chunk = self.miner.add_log_message(chunk)
+             LOG.debug(processed_chunk)
+         clusters = list(self.miner.drain.clusters)
+         # Second pass, only matching lines with clusters,
+         # to recover original text
+         for chunk_start, chunk in chunks:
+             cluster = self.miner.match(chunk, "always")
+             if cluster in clusters:
+                 out.append((chunk_start, chunk))
+                 clusters.remove(cluster)
+         return out
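For orientation, a minimal usage sketch of the reworked extractor. The log content and the skip pattern below are invented for illustration; the constructor signature matches the code above.

```python
# A minimal sketch, assuming the package layout shown in this diff.
# The sample log line and skip pattern are illustrative, not from the source.
from logdetective.extractors import DrainExtractor
from logdetective.models import SkipSnippets

skip = SkipSnippets({"child_exit_code_zero": "Child return code was: 0"})
extractor = DrainExtractor(context=True, max_clusters=8, skip_snippets=skip)

log = "Child return code was: 0\nerror: linker command failed\n"
for chunk_start, chunk in extractor(log):
    print(chunk_start, chunk)  # chunks matching a skip pattern never appear
```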
@@ -14,8 +14,9 @@ from logdetective.utils import (
      format_snippets,
      compute_certainty,
      load_prompts,
+     load_skip_snippet_patterns,
  )
- from logdetective.extractors import LLMExtractor, DrainExtractor
+ from logdetective.extractors import DrainExtractor

  LOG = logging.getLogger("logdetective")

@@ -49,16 +50,16 @@ def setup_args():
          "--summarizer",
          type=str,
          default="drain",
-         help="Choose between LLM and Drain template miner as the log summarizer.\
-             LLM must be specified as path to a model, URL or local file.",
+         help="DISABLED: LLM summarization option was removed. \
+             Argument is kept for backward compatibility only.",
      )
      parser.add_argument(
          "-N",
          "--n_lines",
          type=int,
-         default=8,
-         help="The number of lines per chunk for LLM analysis.\
-             This only makes sense when you are summarizing with LLM.",
+         default=None,
+         help="DISABLED: LLM summarization option was removed. \
+             Argument is kept for backward compatibility only.",
      )
      parser.add_argument(
          "-C",
@@ -74,13 +75,19 @@ def setup_args():
          "--prompts",
          type=str,
          default=f"{os.path.dirname(__file__)}/prompts.yml",
-         help="Path to prompt configuration file."
+         help="Path to prompt configuration file.",
      )
      parser.add_argument(
          "--temperature",
          type=float,
          default=DEFAULT_TEMPERATURE,
-         help="Temperature for inference."
+         help="Temperature for inference.",
+     )
+     parser.add_argument(
+         "--skip_snippets",
+         type=str,
+         default=f"{os.path.dirname(__file__)}/skip_snippets.yml",
+         help="Path to patterns for skipping snippets.",
      )
      return parser.parse_args()

@@ -93,6 +100,10 @@ async def run(): # pylint: disable=too-many-statements,too-many-locals
          sys.stderr.write("Error: --quiet and --verbose is mutually exclusive.\n")
          sys.exit(2)

+     # Emit warning about use of discontinued args
+     if args.n_lines or args.summarizer != "drain":
+         LOG.warning("LLM based summarization was removed. Drain will be used instead.")
+
      # Logging facility setup
      log_level = logging.INFO
      if args.verbose >= 1:
@@ -116,18 +127,19 @@ async def run(): # pylint: disable=too-many-statements,too-many-locals
          LOG.error("You likely do not have enough memory to load the AI model")
          sys.exit(3)

-     # Log file summarizer selection and initialization
-     if args.summarizer == "drain":
-         extractor = DrainExtractor(
-             args.verbose > 1, context=True, max_clusters=args.n_clusters
-         )
-     else:
-         summarizer_model = initialize_model(args.summarizer, verbose=args.verbose > 2)
-         extractor = LLMExtractor(
-             summarizer_model,
-             args.verbose > 1,
-             prompts_configuration.summarization_prompt_template,
-         )
+     try:
+         skip_snippets = load_skip_snippet_patterns(args.skip_snippets)
+     except OSError as e:
+         LOG.error(e)
+         sys.exit(5)
+
+     # Log file summarizer initialization
+     extractor = DrainExtractor(
+         args.verbose > 1,
+         context=True,
+         max_clusters=args.n_clusters,
+         skip_snippets=skip_snippets,
+     )

      LOG.info("Getting summary")

@@ -151,7 +163,8 @@ async def run(): # pylint: disable=too-many-statements,too-many-locals

      prompt = (
          f"{prompts_configuration.default_system_prompt}\n"
-         f"{prompts_configuration.prompt_template}")
+         f"{prompts_configuration.prompt_template}"
+     )

      stream = True
      if args.no_stream:
@@ -191,7 +204,7 @@ async def run(): # pylint: disable=too-many-statements,too-many-locals


  def main():
-     """ Evaluate logdetective program and wait for it to finish """
+     """Evaluate logdetective program and wait for it to finish"""
      asyncio.run(run())

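The new failure mode introduced above is worth noting: if the patterns file cannot be read, the CLI exits with status 5. A sketch of that path, using a deliberately nonexistent path:

```python
# Sketch of the CLI's new failure mode; the path below is deliberately bogus.
import sys
from logdetective.utils import load_skip_snippet_patterns

try:
    skip_snippets = load_skip_snippet_patterns("/nonexistent/skip_snippets.yml")
except OSError as e:
    print(f"error: {e}", file=sys.stderr)
    sys.exit(5)  # mirrors run()'s handling shown above
```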
@@ -1,10 +1,10 @@
+ import re
  from typing import Optional
- from pydantic import BaseModel
+ from pydantic import BaseModel, model_validator

  from logdetective.constants import (
      PROMPT_TEMPLATE,
      PROMPT_TEMPLATE_STAGED,
-     SUMMARIZATION_PROMPT_TEMPLATE,
      SNIPPET_PROMPT_TEMPLATE,
      DEFAULT_SYSTEM_PROMPT,
  )
@@ -14,7 +14,6 @@ class PromptConfig(BaseModel):
      """Configuration for basic log detective prompts."""

      prompt_template: str = PROMPT_TEMPLATE
-     summarization_prompt_template: str = SUMMARIZATION_PROMPT_TEMPLATE
      snippet_prompt_template: str = SNIPPET_PROMPT_TEMPLATE
      prompt_template_staged: str = PROMPT_TEMPLATE_STAGED

@@ -27,9 +26,6 @@ class PromptConfig(BaseModel):
          if data is None:
              return
          self.prompt_template = data.get("prompt_template", PROMPT_TEMPLATE)
-         self.summarization_prompt_template = data.get(
-             "summarization_prompt_template", SUMMARIZATION_PROMPT_TEMPLATE
-         )
          self.snippet_prompt_template = data.get(
              "snippet_prompt_template", SNIPPET_PROMPT_TEMPLATE
          )
@@ -45,3 +41,33 @@ class PromptConfig(BaseModel):
          self.staged_system_prompt = data.get(
              "staged_system_prompt", DEFAULT_SYSTEM_PROMPT
          )
+
+
+ class SkipSnippets(BaseModel):
+     """Regular expressions defining snippets we should not analyze"""
+
+     snippet_patterns: dict[str, re.Pattern] = {}
+
+     def __init__(self, data: Optional[dict] = None):
+         super().__init__(data=data)
+         if data is None:
+             return
+         self.snippet_patterns = {
+             key: re.compile(pattern) for key, pattern in data.items()
+         }
+
+     @model_validator(mode="before")
+     @classmethod
+     def check_patterns(cls, data: dict):
+         """Check if all supplied patterns are valid regular expressions.
+         Technically replicating what is done in __init__ but with a nicer error message."""
+         patterns = data["data"]
+         for key, pattern in patterns.items():
+             try:
+                 re.compile(pattern=pattern)
+             except (TypeError, re.error) as ex:
+                 raise ValueError(
+                     f"Invalid pattern `{pattern}` with name `{key}` supplied for skipping in logs."
+                 ) from ex
+
+         return data
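A short behavioral sketch of the validator. The pattern strings are invented; it assumes pydantic v2, where errors raised inside validators surface as `ValueError` subclasses:

```python
# Illustrative only; the pattern values below are invented.
from logdetective.models import SkipSnippets

ok = SkipSnippets({"child_exit_code_zero": "Child return code was: 0"})
print(sorted(ok.snippet_patterns))  # ['child_exit_code_zero']

try:
    SkipSnippets({"broken": "(unclosed"})  # invalid regex: unbalanced paren
except ValueError as err:
    print(err)  # "Invalid pattern `(unclosed` with name `broken` supplied ..."
```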
@@ -21,17 +21,6 @@ prompt_template: |

    Analysis:

-
- summarization_prompt_template: |
-   Does following log contain error or issue?
-
-   Log:
-
-   {}
-
-   Answer:
-
-
  snippet_prompt_template: |
    Analyse following RPM build log snippet. Describe contents accurately, without speculation or suggestions for resolution.

@@ -53,7 +53,7 @@ class RemoteLog:
          LOG.debug("process url %s", self.url)
          try:
              response = await self._http_session.get(self.url, raise_for_status=True)
-         except aiohttp.ClientResponseError as ex:
+         except (aiohttp.ClientResponseError, aiohttp.ClientConnectorError) as ex:
              raise RuntimeError(f"We couldn't obtain the logs: {ex}") from ex
          return await response.text()
      LOG.error("Invalid URL received ")
@@ -64,6 +64,4 @@ class RemoteLog:
      try:
          return await self.get_url_content()
      except RuntimeError as ex:
-         raise HTTPBadRequest(
-             reason=f"We couldn't obtain the logs: {ex}"
-         ) from ex
+         raise HTTPBadRequest(reason=f"We couldn't obtain the logs: {ex}") from ex
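The widened `except` now also converts connection-level failures (refused connections, DNS errors) into the same `RuntimeError`. A self-contained sketch of the pattern, independent of `RemoteLog`:

```python
# Standalone sketch of the widened error handling; not package code.
import asyncio
import aiohttp

async def fetch(url: str) -> str:
    async with aiohttp.ClientSession() as session:
        try:
            response = await session.get(url, raise_for_status=True)
        except (aiohttp.ClientResponseError, aiohttp.ClientConnectorError) as ex:
            # HTTP error statuses *and* connection failures now take this path
            raise RuntimeError(f"We couldn't obtain the logs: {ex}") from ex
        return await response.text()

# A refused connection surfaces as RuntimeError, not a raw aiohttp error:
try:
    asyncio.run(fetch("http://127.0.0.1:9/build.log"))  # port 9: likely refused
except RuntimeError as err:
    print(err)
```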
@@ -3,8 +3,9 @@ import logging
  import yaml
  from openai import AsyncOpenAI

- from logdetective.utils import load_prompts
+ from logdetective.utils import load_prompts, load_skip_snippet_patterns
  from logdetective.server.models import Config, InferenceConfig
+ import logdetective


  def load_server_config(path: str | None) -> Config:
@@ -52,18 +53,24 @@ def get_log(config: Config):


  def get_openai_api_client(ineference_config: InferenceConfig):
-     """Set up AsyncOpenAI client with default configuration.
-     """
+     """Set up AsyncOpenAI client with default configuration."""
      return AsyncOpenAI(
-         api_key=ineference_config.api_token,
-         base_url=ineference_config.url)
+         api_key=ineference_config.api_token, base_url=ineference_config.url
+     )


  SERVER_CONFIG_PATH = os.environ.get("LOGDETECTIVE_SERVER_CONF", None)
  SERVER_PROMPT_PATH = os.environ.get("LOGDETECTIVE_PROMPTS", None)
+ # The default location for skip patterns is in the same directory
+ # as logdetective __init__.py file.
+ SERVER_SKIP_PATTERNS_PATH = os.environ.get(
+     "LOGDETECIVE_SKIP_PATTERNS",
+     f"{os.path.dirname(logdetective.__file__)}/skip_snippets.yml",
+ )

  SERVER_CONFIG = load_server_config(SERVER_CONFIG_PATH)
  PROMPT_CONFIG = load_prompts(SERVER_PROMPT_PATH)
+ SKIP_SNIPPETS_CONFIG = load_skip_snippet_patterns(SERVER_SKIP_PATTERNS_PATH)

  LOG = get_log(SERVER_CONFIG)

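For deployments, the skip-pattern file can be overridden through the environment before the config module is imported. A hypothetical override, assuming the rest of the server configuration can load in your environment (the variable name is taken verbatim from `config.py` above, including its spelling):

```python
# Hypothetical override; the path is an example, not a documented location.
import os
os.environ["LOGDETECIVE_SKIP_PATTERNS"] = "/etc/logdetective/skip_snippets.yml"

# Importing the module after setting the variable picks up the custom file.
from logdetective.server.config import SKIP_SNIPPETS_CONFIG
print(sorted(SKIP_SNIPPETS_CONFIG.snippet_patterns))
```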
@@ -51,7 +51,9 @@ async def _handle_gitlab_operation(func: Callable, *args):
      else:
          LOG.exception(log_msg)
  except Exception as e:  # pylint: disable=broad-exception-caught
-     LOG.exception("Unexpected error during GitLab operation %s(%s): %s", func, args, e)
+     LOG.exception(
+         "Unexpected error during GitLab operation %s(%s): %s", func, args, e
+     )


  async def collect_emojis_in_comments(  # pylint: disable=too-many-locals
@@ -16,7 +16,13 @@ from logdetective.utils import (
      compute_certainty,
      prompt_to_messages,
  )
- from logdetective.server.config import LOG, SERVER_CONFIG, PROMPT_CONFIG, CLIENT
+ from logdetective.server.config import (
+     LOG,
+     SERVER_CONFIG,
+     PROMPT_CONFIG,
+     CLIENT,
+     SKIP_SNIPPETS_CONFIG,
+ )
  from logdetective.server.models import (
      AnalyzedSnippet,
      InferenceConfig,
@@ -42,7 +48,10 @@ def format_analyzed_snippets(snippets: list[AnalyzedSnippet]) -> str:
  def mine_logs(log: str) -> List[Tuple[int, str]]:
      """Extract snippets from log text"""
      extractor = DrainExtractor(
-         verbose=True, context=True, max_clusters=SERVER_CONFIG.extractor.max_clusters
+         verbose=True,
+         context=True,
+         max_clusters=SERVER_CONFIG.extractor.max_clusters,
+         skip_snippets=SKIP_SNIPPETS_CONFIG,
      )

      LOG.info("Getting summary")
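A sketch of the server-side entry point after this change, assuming the server configuration (imported transitively through `logdetective.server.config`) can be loaded in the current environment; the log line is invented:

```python
# Hypothetical call; requires a loadable server configuration.
from logdetective.server.llm import mine_logs

log = "configure: error: no acceptable C compiler found in $PATH\n"
for line_number, snippet in mine_logs(log):
    print(line_number, snippet)
```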
@@ -2,12 +2,10 @@ import datetime
  from typing import Optional, Union, Dict

  import numpy
- import matplotlib
- import matplotlib.figure
- import matplotlib.pyplot
+ from numpy.typing import ArrayLike
+ from matplotlib import dates, colormaps, axes, pyplot, figure

- from matplotlib.pyplot import cm
- from logdetective.server import models
+ from logdetective.server.models import TimePeriod
  from logdetective.server.database.models import (
      AnalyzeRequestMetrics,
      EndpointType,
@@ -18,25 +16,25 @@ from logdetective.server.database.models import (
  class Definition:
      """Define plot details, given a time period."""

-     def __init__(self, time_period: models.TimePeriod):
+     def __init__(self, time_period: TimePeriod):
          self.time_period = time_period
          self.days_diff = time_period.get_time_period().days
          if self.time_period.hours:
              self._freq = "H"
              self._time_format = "%Y-%m-%d %H"
-             self._locator = matplotlib.dates.HourLocator(interval=2)
+             self._locator = dates.HourLocator(interval=2)
              self._time_unit = "hour"
              self._time_delta = datetime.timedelta(hours=1)
          elif self.time_period.days:
              self._freq = "D"
              self._time_format = "%Y-%m-%d"
-             self._locator = matplotlib.dates.DayLocator(interval=1)
+             self._locator = dates.DayLocator(interval=1)
              self._time_unit = "day"
              self._time_delta = datetime.timedelta(days=1)
          elif self.time_period.weeks:
              self._freq = "W"
              self._time_format = "%Y-%m-%d"
-             self._locator = matplotlib.dates.WeekdayLocator(interval=1)
+             self._locator = dates.WeekdayLocator(interval=1)
              self._time_unit = "week"
              self._time_delta = datetime.timedelta(weeks=1)

@@ -120,10 +118,10 @@ def create_time_series_arrays(


  def _add_bar_chart(
-     ax: matplotlib.figure.Axes,
+     ax: axes.Axes,
      plot_def: Definition,
-     timestamps: numpy.array,
-     values: numpy.array,
+     timestamps: ArrayLike,
+     values: ArrayLike,
      label: str,
  ) -> None:
      """Add a blue bar chart"""
@@ -142,18 +140,18 @@ def _add_bar_chart(
      ax.set_ylabel(label, color="blue")
      ax.tick_params(axis="y", labelcolor="blue")

-     ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter(plot_def.time_format))
+     ax.xaxis.set_major_formatter(dates.DateFormatter(plot_def.time_format))
      ax.xaxis.set_major_locator(plot_def.locator)

-     matplotlib.pyplot.xticks(rotation=45)
+     pyplot.xticks(rotation=45)

      ax.grid(True, alpha=0.3)


  def _add_line_chart(  # pylint: disable=too-many-arguments disable=too-many-positional-arguments
-     ax: matplotlib.figure.Axes,
-     timestamps: numpy.array,
-     values: numpy.array,
+     ax: axes.Axes,
+     timestamps: ArrayLike,
+     values: ArrayLike,
      label: str,
      color: str = "red",
      set_label: bool = True,
@@ -166,10 +164,10 @@ def _add_line_chart(  # pylint: disable=too-many-arguments disable=too-many-posi


  def requests_per_time(
-     period_of_time: models.TimePeriod,
+     period_of_time: TimePeriod,
      endpoint: EndpointType = EndpointType.ANALYZE,
      end_time: Optional[datetime.datetime] = None,
- ) -> matplotlib.figure.Figure:
+ ) -> figure.Figure:
      """
      Generate a visualization of request counts over a specified time period.

@@ -200,13 +198,13 @@ def requests_per_time(
          requests_counts, plot_def, start_time, end_time
      )

-     fig, ax1 = matplotlib.pyplot.subplots(figsize=(12, 6))
+     fig, ax1 = pyplot.subplots(figsize=(12, 6))
      _add_bar_chart(ax1, plot_def, timestamps, counts, "Requests")

      ax2 = ax1.twinx()
      _add_line_chart(ax2, timestamps, numpy.cumsum(counts), "Cumulative Requests")

-     matplotlib.pyplot.title(
+     pyplot.title(
          f"Requests received for API {endpoint} ({start_time.strftime(plot_def.time_format)} "
          f"to {end_time.strftime(plot_def.time_format)})"
      )
@@ -215,16 +213,16 @@ def requests_per_time(
      lines2, labels2 = ax2.get_legend_handles_labels()
      ax1.legend(lines1 + lines2, labels1 + labels2, loc="center")

-     matplotlib.pyplot.tight_layout()
+     pyplot.tight_layout()

      return fig


  def average_time_per_responses(  # pylint: disable=too-many-locals
-     period_of_time: models.TimePeriod,
+     period_of_time: TimePeriod,
      endpoint: EndpointType = EndpointType.ANALYZE,
      end_time: Optional[datetime.datetime] = None,
- ) -> matplotlib.figure.Figure:
+ ) -> figure.Figure:
      """
      Generate a visualization of average response time and length over a specified time period.

@@ -259,7 +257,7 @@ def average_time_per_responses(  # pylint: disable=too-many-locals
          float,
      )

-     fig, ax1 = matplotlib.pyplot.subplots(figsize=(12, 6))
+     fig, ax1 = pyplot.subplots(figsize=(12, 6))
      _add_bar_chart(
          ax1, plot_def, timestamps, average_time, "average response time (seconds)"
      )
@@ -280,7 +278,7 @@ def average_time_per_responses(  # pylint: disable=too-many-locals
      ax2 = ax1.twinx()
      _add_line_chart(ax2, timestamps, average_length, "average response length (chars)")

-     matplotlib.pyplot.title(
+     pyplot.title(
          f"average response time for API {endpoint} ({start_time.strftime(plot_def.time_format)} "
          f"to {end_time.strftime(plot_def.time_format)})"
      )
@@ -289,7 +287,7 @@ def average_time_per_responses(  # pylint: disable=too-many-locals
      lines2, labels2 = ax2.get_legend_handles_labels()
      ax1.legend(lines1 + lines2, labels1 + labels2, loc="center")

-     matplotlib.pyplot.tight_layout()
+     pyplot.tight_layout()

      return fig

@@ -322,7 +320,7 @@ def _collect_emoji_data(


  def _plot_emoji_data(  # pylint: disable=too-many-locals
-     ax: matplotlib.figure.Axes,
+     ax: axes.Axes,
      reactions_values_dict: Dict[str, Dict[datetime.datetime, int]],
      plot_def: Definition,
      start_time: datetime.datetime,
@@ -340,7 +338,10 @@ def _plot_emoji_data(  # pylint: disable=too-many-locals
      )
      all_counts.extend(counts)

-     colors = [cm.viridis(i) for i in numpy.linspace(0, 1, len(reactions_values_dict))]  # pylint: disable=no-member
+     colors = [
+         colormaps["viridis"](i)
+         for i in numpy.linspace(0, 1, len(reactions_values_dict))
+     ]

      first_emoji = True
      for i, (emoji, dict_counts) in enumerate(reactions_values_dict.items()):
@@ -369,9 +370,9 @@ def _plot_emoji_data(  # pylint: disable=too-many-locals


  def emojis_per_time(
-     period_of_time: models.TimePeriod,
+     period_of_time: TimePeriod,
      end_time: Optional[datetime.datetime] = None,
- ) -> matplotlib.figure.Figure:
+ ) -> figure.Figure:
      """
      Generate a visualization of overall emoji feedback
      over a specified time period.
@@ -396,13 +397,13 @@ def emojis_per_time(
      start_time = period_of_time.get_period_start_time(end_time)
      reactions_values_dict = _collect_emoji_data(start_time, plot_def)

-     fig, ax = matplotlib.pyplot.subplots(figsize=(12, 6))
+     fig, ax = pyplot.subplots(figsize=(12, 6))

      emoji_lines, emoji_labels = _plot_emoji_data(
          ax, reactions_values_dict, plot_def, start_time, end_time
      )

-     matplotlib.pyplot.title(
+     pyplot.title(
          f"Emoji feedback ({start_time.strftime(plot_def.time_format)} "
          f"to {end_time.strftime(plot_def.time_format)})"
      )
@@ -419,11 +420,11 @@ def emojis_per_time(
      ax.set_ylabel("Count")

      # Format x-axis
-     ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter(plot_def.time_format))
+     ax.xaxis.set_major_formatter(dates.DateFormatter(plot_def.time_format))
      ax.xaxis.set_major_locator(plot_def.locator)
      ax.tick_params(axis="x", labelrotation=45)
      ax.grid(True, alpha=0.3)

-     matplotlib.pyplot.tight_layout()
+     pyplot.tight_layout()

      return fig
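One detail worth calling out in the plotting changes: the `cm.viridis` shortcut is replaced by the colormap registry lookup, which also removes the need for the `pylint: disable=no-member` comment. A standalone sketch of the equivalent lookup:

```python
# Standalone illustration of the registry-based colormap lookup.
import numpy
from matplotlib import colormaps

# colormaps["viridis"] returns a Colormap; calling it with values in [0, 1]
# yields RGBA tuples, just like the old cm.viridis shortcut.
colors = [colormaps["viridis"](i) for i in numpy.linspace(0, 1, 4)]
print(colors[0])
```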
@@ -0,0 +1,12 @@
+ # This file holds patterns you want to skip during log parsing.
+ # By default, no patterns are supplied.
+ # Patterns are to be specified as values of dictionary,
+ # with each key being a descriptive name of the pattern.
+ # Patterns themselves are evaluated as a regular expression.
+ # Make sure to avoid regular expressions that may be interpreted
+ # as yaml syntax.
+ # Example:
+
+ # contains_capital_a: "^.*A.*"
+ # starts_with_numeric: "^[0-9].*"
+ child_exit_code_zero: "Child return code was: 0"
@@ -8,7 +8,7 @@ import numpy as np
  import yaml

  from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
- from logdetective.models import PromptConfig
+ from logdetective.models import PromptConfig, SkipSnippets
  from logdetective.remote_log import RemoteLog


@@ -179,7 +179,7 @@ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
          summary += f"""
  Snippet No. {i}:

- {s[1]}
+ {s}
  ================
  """
      return summary
@@ -198,8 +198,11 @@ def load_prompts(path: str | None) -> PromptConfig:


  def prompt_to_messages(
-     user_message: str, system_prompt: str | None = None,
-     system_role: str = "developer", user_role: str = "user") -> List[Dict[str, str]]:
+     user_message: str,
+     system_prompt: str | None = None,
+     system_role: str = "developer",
+     user_role: str = "user",
+ ) -> List[Dict[str, str]]:
      """Turn prompt into list of message dictionaries.
      If `system_role` and `user_role` are the same, only a single message is created,
      as concatenation of `user_message` and `system_prompt`. This is useful for models which
@@ -208,22 +211,39 @@ def prompt_to_messages(

      if system_role == user_role:
          messages = [
-             {
-                 "role": system_role,
-                 "content": f"{system_prompt}\n{user_message}"
-             }
+             {"role": system_role, "content": f"{system_prompt}\n{user_message}"}
          ]
      else:
-
          messages = [
-             {
-                 "role": system_role,
-                 "content": system_prompt
-             },
+             {"role": system_role, "content": system_prompt},
              {
                  "role": user_role,
                  "content": user_message,
-             }
+             },
          ]

      return messages
+
+
+ def filter_snippet_patterns(snippet: str, skip_snippets: SkipSnippets) -> bool:
+     """Try to match the snippet against provided patterns to determine if we should
+     filter it out or not."""
+     for key, pattern in skip_snippets.snippet_patterns.items():
+         if pattern.match(snippet):
+             LOG.debug("Snippet `%s` has matched against skip pattern %s", snippet, key)
+             return True
+
+     return False
+
+
+ def load_skip_snippet_patterns(path: str | None) -> SkipSnippets:
+     """Load dictionary of snippet patterns we want to skip."""
+     if path:
+         try:
+             with open(path, "r") as file:
+                 return SkipSnippets(yaml.safe_load(file))
+         except OSError as e:
+             LOG.error("Couldn't open file with snippet skip patterns `%s`", path)
+             raise e
+
+     return SkipSnippets({})
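A minimal sketch of the two new helpers together, loading the default pattern file shipped in this release (the path construction mirrors `server/config.py` above; the second sample line is invented):

```python
# Minimal sketch; assumes the default skip_snippets.yml shipped with the
# package, whose only active pattern is child_exit_code_zero.
import os
import logdetective
from logdetective.utils import load_skip_snippet_patterns, filter_snippet_patterns

path = f"{os.path.dirname(logdetective.__file__)}/skip_snippets.yml"
skip = load_skip_snippet_patterns(path)

print(filter_snippet_patterns("Child return code was: 0", skip))    # True: skipped
print(filter_snippet_patterns("error: undefined reference", skip))  # False: kept
```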
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "logdetective"
- version = "1.4.0"
+ version = "1.6.0"
  description = "Log using LLM AI to search for build/test failures and provide ideas for fixing these."
  authors = ["Jiri Podivin <jpodivin@gmail.com>"]
  license = "Apache-2.0"
@@ -15,7 +15,7 @@ packages = [
      { include = "logdetective" }
  ]
  classifiers = [
-     "Development Status :: 4 - Beta",
+     "Development Status :: 5 - Production/Stable",
      "Environment :: Console",
      "Intended Audience :: Developers",
      "License :: OSI Approved :: Apache Software License",
@@ -1,105 +0,0 @@
- import os
- import logging
- from typing import Tuple
-
- import drain3
- from drain3.template_miner_config import TemplateMinerConfig
- from llama_cpp import Llama, LlamaGrammar
-
- from logdetective.constants import SUMMARIZATION_PROMPT_TEMPLATE
- from logdetective.utils import get_chunks
-
- LOG = logging.getLogger("logdetective")
-
-
- class LLMExtractor:
-     """
-     A class that extracts relevant information from logs using a language model.
-     """
-
-     def __init__(
-         self,
-         model: Llama,
-         n_lines: int = 2,
-         prompt: str = SUMMARIZATION_PROMPT_TEMPLATE,
-     ):
-         self.model = model
-         self.n_lines = n_lines
-         self.grammar = LlamaGrammar.from_string(
-             'root ::= ("Yes" | "No")', verbose=False
-         )
-         self.prompt = prompt
-
-     def __call__(
-         self, log: str, n_lines: int = 2, neighbors: bool = False
-     ) -> list[str]:
-         chunks = self.rate_chunks(log)
-         out = self.create_extract(chunks, neighbors)
-         return out
-
-     def rate_chunks(self, log: str) -> list[tuple]:
-         """Scan log by the model and store results.
-
-         :param log: log file content
-         """
-         results = []
-         log_lines = log.split("\n")
-
-         for i in range(0, len(log_lines), self.n_lines):
-             block = "\n".join(log_lines[i: i + self.n_lines])
-             prompt = self.prompt.format(log)
-             out = self.model(prompt, max_tokens=7, grammar=self.grammar)
-             out = f"{out['choices'][0]['text']}\n"
-             results.append((block, out))
-
-         return results
-
-     def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> list[str]:
-         """Extract interesting chunks from the model processing."""
-         interesting = []
-         summary = []
-         # pylint: disable=consider-using-enumerate
-         for i in range(len(chunks)):
-             if chunks[i][1].startswith("Yes"):
-                 interesting.append(i)
-                 if neighbors:
-                     interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])
-
-         interesting = set(interesting)
-
-         for i in interesting:
-             summary.append(chunks[i][0])
-
-         return summary
-
-
- class DrainExtractor:
-     """A class that extracts information from logs using a template miner algorithm."""
-
-     def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
-         config = TemplateMinerConfig()
-         config.load(f"{os.path.dirname(__file__)}/drain3.ini")
-         config.profiling_enabled = verbose
-         config.drain_max_clusters = max_clusters
-         self.miner = drain3.TemplateMiner(config=config)
-         self.verbose = verbose
-         self.context = context
-
-     def __call__(self, log: str) -> list[Tuple[int, str]]:
-         out = []
-         # First pass create clusters
-         for _, chunk in get_chunks(log):
-             processed_chunk = self.miner.add_log_message(chunk)
-             LOG.debug(processed_chunk)
-         # Sort found clusters by size, descending order
-         sorted_clusters = sorted(
-             self.miner.drain.clusters, key=lambda it: it.size, reverse=True
-         )
-         # Second pass, only matching lines with clusters,
-         # to recover original text
-         for chunk_start, chunk in get_chunks(log):
-             cluster = self.miner.match(chunk, "always")
-             if cluster in sorted_clusters:
-                 out.append((chunk_start, chunk))
-                 sorted_clusters.remove(cluster)
-         return out