logdetective 2.0.2__py3-none-any.whl → 2.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logdetective/extractors.py +131 -23
- logdetective/logdetective.py +25 -12
- logdetective/models.py +26 -0
- logdetective/server/llm.py +19 -10
- logdetective/server/models.py +46 -1
- logdetective/server/utils.py +9 -27
- logdetective/utils.py +48 -4
- {logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/METADATA +28 -1
- {logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/RECORD +12 -12
- {logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/LICENSE +0 -0
- {logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/WHEEL +0 -0
- {logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/entry_points.txt +0 -0
logdetective/extractors.py
CHANGED
@@ -1,57 +1,165 @@
 import os
 import logging
+import subprocess as sp
 from typing import Tuple
 
 import drain3
 from drain3.template_miner_config import TemplateMinerConfig
+from pydantic import ValidationError
 
 from logdetective.utils import get_chunks, filter_snippet_patterns
-from logdetective.models import SkipSnippets
+from logdetective.models import SkipSnippets, CSGrepOutput
 
 LOG = logging.getLogger("logdetective")
 
 
-class
-    """
+class Extractor:
+    """Base extractor class."""
 
     def __init__(
         self,
         verbose: bool = False,
-        context: bool = False,
-        max_clusters=8,
         skip_snippets: SkipSnippets = SkipSnippets({}),
-        max_snippet_len: int = 2000
-    ):
-        config = TemplateMinerConfig()
-        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
-        config.profiling_enabled = verbose
-        config.drain_max_clusters = max_clusters
-        self.miner = drain3.TemplateMiner(config=config)
+        max_snippet_len: int = 2000,
+    ):
         self.verbose = verbose
-        self.context = context
         self.skip_snippets = skip_snippets
         self.max_snippet_len = max_snippet_len
 
     def __call__(self, log: str) -> list[Tuple[int, str]]:
-
-
-
-
+        raise NotImplementedError
+
+    def filter_snippet_patterns(
+        self, chunks: list[tuple[int, str]]
+    ) -> list[tuple[int, str]]:
+        """Keep only chunks that don't match any of the excluded patterns"""
         chunks = [
             (_, chunk)
             for _, chunk in chunks
             if not filter_snippet_patterns(chunk, self.skip_snippets)
         ]
-
+        return chunks
+
+
+class DrainExtractor(Extractor):
+    """A class that extracts information from logs using a template miner algorithm."""
+
+    _clusters: list
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        skip_snippets: SkipSnippets = SkipSnippets({}),
+        max_snippet_len: int = 2000,
+        max_clusters: int = 8,
+    ):
+        super().__init__(verbose, skip_snippets, max_snippet_len)
+        config = TemplateMinerConfig()
+        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
+        config.profiling_enabled = verbose
+        config.drain_max_clusters = max_clusters
+        self.miner = drain3.TemplateMiner(config=config)
+
+    def __call__(self, log: str) -> list[Tuple[int, str]]:
+        # Create chunks
+        chunks = list(get_chunks(log, self.max_snippet_len))
+
+        chunks = self.filter_snippet_patterns(chunks)
+
+        # First pass to create clusters
+        self._create_clusters(chunks=chunks)
+
+        # Second pass, only matching lines with clusters,
+        # to recover original text
+        snippets = self._extract_messages(chunks=chunks)
+        return snippets
+
+    def _create_clusters(self, chunks: list[tuple[int, str]]):
+        """First pass to create clusters"""
         for _, chunk in chunks:
             processed_chunk = self.miner.add_log_message(chunk)
             LOG.debug(processed_chunk)
-
-
-
+        self._clusters = list(self.miner.drain.clusters)
+
+    def _extract_messages(self, chunks: list[tuple[int, str]]) -> list[tuple[int, str]]:
+        """Second pass with drain using patterns from the first,
+        to extract matching lines and their numbers."""
+        out = []
+
         for chunk_start, chunk in chunks:
             cluster = self.miner.match(chunk, "always")
-            if cluster in
+            if cluster in self._clusters:
                 out.append((chunk_start, chunk))
-
+                self._clusters.remove(cluster)
         return out
+
+
+class CSGrepExtractor(DrainExtractor):
+    """Extract messages using csgrep
+    This extractor is only effective at retrieving messages from GCC
+    compiler and associated utilities, it is not capable of safely
+    extracting other messages from the logs. Therefore, it must only
+    be used together with the Drain based extractor."""
+
+    def __init__(
+        self,
+        verbose: bool = False,
+        skip_snippets: SkipSnippets = SkipSnippets({}),
+        max_snippet_len: int = 2000,
+        max_clusters: int = 8,
+    ):
+        super().__init__(verbose, skip_snippets, max_snippet_len, max_clusters)
+
+    def __call__(self, log: str) -> list[Tuple[int, str]]:
+        """Extract error messages from log using csgrep"""
+        chunks = []
+        try:
+            # We are not running binary in check mode, since csgrep
+            # can produce many errors due to log file syntax
+            result = sp.run(
+                [
+                    "csgrep",
+                    "--event=error",
+                    "--remove-duplicates",
+                    "--mode=json",
+                    "--quiet",
+                ],
+                input=log,
+                shell=False,
+                check=False,
+                capture_output=True,
+                text=True,
+                timeout=1.0,
+            )
+        except sp.TimeoutExpired as ex:
+            LOG.exception("Exception encountered while parsing log with csgrep %s", ex)
+            raise ex
+        if result.returncode != 0:
+            # This can happen even if `csgrep` managed to extract useful info.
+            # Most commonly, when it encountered unexpected syntax in the log.
+            LOG.warning("csgrep call resulted in an error")
+            LOG.debug("csgrep error: `%s`", result.stderr)
+        if not result.stdout:
+            return []
+
+        # Parse JSON output from csgrep
+        try:
+            report = CSGrepOutput.model_validate_json(result.stdout)
+        except ValidationError as ex:
+            LOG.exception("Exception encountered while parsing csgrpe output %s", ex)
+            raise ex
+        for defect in report.defects:
+            # Single original error message can be split across multiple events
+            # before returning, we will turn them back into single string.
+            # We must also extract the original line number.
+            # Line number is NOT location of message in the log, but location of
+            # the issue in source, we can't really mix the two, so we'll set it to `0`.
+
+            chunks.append((0, "\n".join([event.message for event in defect.events])))
+
+        chunks = self.filter_snippet_patterns(chunks)
+        LOG.info("Total %d messages extracted with csgrep", len(chunks))
+        self._create_clusters(chunks=chunks)
+        snippets = self._extract_messages(chunks=chunks)
+
+        return snippets
logdetective/logdetective.py
CHANGED
@@ -15,8 +15,10 @@ from logdetective.utils import (
     compute_certainty,
     load_prompts,
     load_skip_snippet_patterns,
+    check_csgrep,
+    mine_logs,
 )
-from logdetective.extractors import DrainExtractor
+from logdetective.extractors import DrainExtractor, CSGrepExtractor
 
 LOG = logging.getLogger("logdetective")
 
@@ -89,10 +91,13 @@ def setup_args():
         default=f"{os.path.dirname(__file__)}/skip_snippets.yml",
         help="Path to patterns for skipping snippets.",
     )
+    parser.add_argument(
+        "--csgrep", action="store_true", help="Use csgrep to process the log."
+    )
     return parser.parse_args()
 
 
-async def run():  # pylint: disable=too-many-statements,too-many-locals
+async def run():  # pylint: disable=too-many-statements,too-many-locals,too-many-branches
     """Main execution function."""
     args = setup_args()
 
@@ -134,13 +139,25 @@ async def run():  # pylint: disable=too-many-statements,too-many-locals
         sys.exit(5)
 
     # Log file summarizer initialization
-
-
-
-
-
+    extractors = []
+    extractors.append(
+        DrainExtractor(
+            args.verbose > 1,
+            max_clusters=args.n_clusters,
+            skip_snippets=skip_snippets,
+        )
     )
 
+    if args.csgrep:
+        if not check_csgrep():
+            LOG.error(
+                "You have requested use of `csgrep` when it isn't available on your system."
+            )
+            sys.exit(6)
+        extractors.append(
+            CSGrepExtractor(args.verbose > 1, skip_snippets=skip_snippets)
+        )
+
     LOG.info("Getting summary")
 
     async with aiohttp.ClientSession() as http:
@@ -150,12 +167,8 @@ async def run():  # pylint: disable=too-many-statements,too-many-locals
             # file does not exist
            LOG.error(e)
            sys.exit(4)
-        log_summary = extractor(log)
-
-        ratio = len(log_summary) / len(log.split("\n"))
-
-        LOG.info("Compression ratio: %s", ratio)
 
+        log_summary = mine_logs(log=log, extractors=extractors)
         LOG.info("Analyzing the text")
 
         log_summary = format_snippets(log_summary)
logdetective/models.py
CHANGED
@@ -71,3 +71,29 @@ class SkipSnippets(BaseModel):
                 ) from ex
 
         return data
+
+
+class CSGrepEvent(BaseModel):
+    """`csgrep` splits error and warning messages into individual events."""
+
+    file_name: str
+    line: int
+    event: str
+    message: str
+    verbosity_level: int
+
+
+class CSGrepDefect(BaseModel):
+    """Defects detected by `csgrep`"""
+
+    checker: str
+    language: str
+    tool: str
+    key_event_idx: int
+    events: list[CSGrepEvent]
+
+
+class CSGrepOutput(BaseModel):
+    """Parsed output of `gsgrep`"""
+
+    defects: list[CSGrepDefect]
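
The three models mirror the JSON that `csgrep --mode=json` emits. A small validation sketch; the JSON literal below is hand-written to match the declared fields and is not captured `csgrep` output:

```
from logdetective.models import CSGrepOutput

SAMPLE = """
{
  "defects": [
    {
      "checker": "COMPILER_ERROR",
      "language": "c/c++",
      "tool": "gcc",
      "key_event_idx": 0,
      "events": [
        {
          "file_name": "src/main.c",
          "line": 42,
          "event": "error",
          "message": "expected ';' before 'return'",
          "verbosity_level": 0
        }
      ]
    }
  ]
}
"""

report = CSGrepOutput.model_validate_json(SAMPLE)
print(report.defects[0].events[0].message)
```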
logdetective/server/llm.py
CHANGED
@@ -1,6 +1,7 @@
 import os
 import asyncio
 import random
+import time
 from typing import List, Tuple, Dict
 
 import backoff
@@ -15,6 +16,7 @@ from logdetective.utils import (
     compute_certainty,
     prompt_to_messages,
     format_snippets,
+    mine_logs,
 )
 from logdetective.server.config import (
     LOG,
@@ -33,10 +35,10 @@ from logdetective.server.models import (
 )
 from logdetective.server.utils import (
     format_analyzed_snippets,
-    mine_logs,
     should_we_giveup,
     we_give_up,
     filter_snippets,
+    construct_final_prompt,
 )
 
 
@@ -184,10 +186,13 @@ async def analyze_snippets(
 
 async def perfrom_analysis(log_text: str) -> Response:
     """Sumbit log file snippets in aggregate to LLM and retrieve results"""
-    log_summary = mine_logs(log_text)
+    log_summary = mine_logs(log_text, SERVER_CONFIG.extractor.get_extractors())
     log_summary = format_snippets(log_summary)
+
+    final_prompt = construct_final_prompt(log_summary, PROMPT_CONFIG.prompt_template)
+
     messages = prompt_to_messages(
-
+        final_prompt,
         PROMPT_CONFIG.default_system_prompt,
         SERVER_CONFIG.inference.system_role,
         SERVER_CONFIG.inference.user_role,
@@ -213,10 +218,13 @@ async def perfrom_analysis(log_text: str) -> Response:
 
 async def perform_analyis_stream(log_text: str) -> AsyncStream:
     """Submit log file snippets in aggregate and return a stream of tokens"""
-    log_summary = mine_logs(log_text)
+    log_summary = mine_logs(log_text, SERVER_CONFIG.extractor.get_extractors())
     log_summary = format_snippets(log_summary)
+
+    final_prompt = construct_final_prompt(log_summary, PROMPT_CONFIG.prompt_template)
+
     messages = prompt_to_messages(
-
+        final_prompt,
         PROMPT_CONFIG.default_system_prompt,
         SERVER_CONFIG.inference.system_role,
         SERVER_CONFIG.inference.user_role,
@@ -235,8 +243,8 @@ async def perform_analyis_stream(log_text: str) -> AsyncStream:
 
 async def perform_staged_analysis(log_text: str) -> StagedResponse:
     """Submit the log file snippets to the LLM and retrieve their results"""
-    log_summary = mine_logs(log_text)
-
+    log_summary = mine_logs(log_text, SERVER_CONFIG.extractor.get_extractors())
+    start = time.time()
     if SERVER_CONFIG.general.top_k_snippets:
         rated_snippets = await analyze_snippets(
             log_summary=log_summary,
@@ -265,10 +273,11 @@ async def perform_staged_analysis(log_text: str) -> StagedResponse:
         AnalyzedSnippet(line_number=e[0][0], text=e[0][1], explanation=e[1])
         for e in zip(log_summary, processed_snippets)
     ]
+    delta = time.time() - start
+    LOG.info("Snippet analysis performed in %f s", delta)
+    log_summary = format_analyzed_snippets(processed_snippets)
+    final_prompt = construct_final_prompt(log_summary, PROMPT_CONFIG.prompt_template_staged)
 
-    final_prompt = PROMPT_CONFIG.prompt_template_staged.format(
-        format_analyzed_snippets(processed_snippets)
-    )
     messages = prompt_to_messages(
         final_prompt,
         PROMPT_CONFIG.staged_system_prompt,
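
The handlers above now share the same preprocessing steps. A condensed sketch of that common pipeline, assuming `server_config` and `prompt_config` stand in for the loaded `SERVER_CONFIG` and `PROMPT_CONFIG` objects:

```
from logdetective.utils import mine_logs, format_snippets
from logdetective.server.utils import construct_final_prompt

def summarize_for_llm(log_text: str, server_config, prompt_config) -> str:
    # Snippets now come from every configured extractor,
    # not from a DrainExtractor hard-coded in server/utils.py.
    log_summary = mine_logs(log_text, server_config.extractor.get_extractors())
    formatted = format_snippets(log_summary)
    # Prompt construction is centralised in construct_final_prompt.
    return construct_final_prompt(formatted, prompt_config.prompt_template)
```

The resulting prompt is then passed to `prompt_to_messages` together with the system prompt and role settings, as in the handlers above.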
logdetective/server/models.py
CHANGED
@@ -26,6 +26,9 @@ from logdetective.constants import (
     USER_ROLE_DEFAULT,
 )
 
+from logdetective.extractors import Extractor, DrainExtractor, CSGrepExtractor
+from logdetective.utils import check_csgrep
+
 
 class BuildLog(BaseModel):
     """Model of data submitted to API."""
@@ -247,15 +250,56 @@ class ExtractorConfig(BaseModel):
     max_clusters: int = 8
     verbose: bool = False
     max_snippet_len: int = 2000
+    csgrep: bool = False
+
+    _extractors: List[Extractor] = []
+
+    def _setup_extractors(self):
+        """Initialize extractors with common settings."""
+        self._extractors = [
+            DrainExtractor(
+                verbose=self.verbose,
+                max_snippet_len=self.max_snippet_len,
+                max_clusters=self.max_clusters,
+            )
+        ]
+
+        if self.csgrep:
+            self._extractors.append(
+                CSGrepExtractor(
+                    verbose=self.verbose,
+                    max_snippet_len=self.max_snippet_len,
+                )
+            )
 
     def __init__(self, data: Optional[dict] = None):
-        super().__init__()
+        super().__init__(data=data)
+
         if data is None:
+            self._setup_extractors()
             return
 
         self.max_clusters = data.get("max_clusters", 8)
         self.verbose = data.get("verbose", False)
         self.max_snippet_len = data.get("max_snippet_len", 2000)
+        self.csgrep = data.get("csgrep", False)
+
+        self._setup_extractors()
+
+    def get_extractors(self) -> List[Extractor]:
+        """Return list of initialized extractors, each will be applied in turn
+        on original log text to retrieve snippets."""
+        return self._extractors
+
+    @field_validator("csgrep", mode="after")
+    @classmethod
+    def validate_csgrep(cls, value: bool) -> bool:
+        """Verify that csgrep is available if requested."""
+        if not check_csgrep():
+            raise ValueError(
+                "Requested csgrep extractor but `csgrep` binary is not in the PATH"
+            )
+        return value
 
 
 class GitLabInstanceConfig(BaseModel):  # pylint: disable=too-many-instance-attributes
@@ -481,6 +525,7 @@ class Config(BaseModel):
     log: LogConfig = LogConfig()
     inference: InferenceConfig = InferenceConfig()
     snippet_inference: InferenceConfig = InferenceConfig()
+    # TODO(jpodivin): Extend to work with multiple extractor configs
     extractor: ExtractorConfig = ExtractorConfig()
     gitlab: GitLabConfig = GitLabConfig()
     koji: KojiConfig = KojiConfig()
logdetective/server/utils.py
CHANGED
@@ -1,15 +1,10 @@
-from typing import List
+from typing import List
 
 import aiohttp
 from fastapi import HTTPException
 
 from logdetective.constants import SNIPPET_DELIMITER
-from logdetective.
-from logdetective.server.config import (
-    LOG,
-    SERVER_CONFIG,
-    SKIP_SNIPPETS_CONFIG,
-)
+from logdetective.server.config import LOG
 from logdetective.server.exceptions import LogDetectiveConnectionError
 from logdetective.server.models import AnalyzedSnippet, RatedSnippetAnalysis
 
@@ -22,26 +17,6 @@ def format_analyzed_snippets(snippets: list[AnalyzedSnippet]) -> str:
     return summary
 
 
-def mine_logs(log: str) -> List[Tuple[int, str]]:
-    """Extract snippets from log text"""
-    extractor = DrainExtractor(
-        verbose=True,
-        context=True,
-        max_clusters=SERVER_CONFIG.extractor.max_clusters,
-        skip_snippets=SKIP_SNIPPETS_CONFIG,
-        max_snippet_len=SERVER_CONFIG.extractor.max_snippet_len
-    )
-
-    LOG.info("Getting summary")
-    log_summary = extractor(log)
-
-    ratio = len(log_summary) / len(log.split("\n"))
-    LOG.debug("Log summary: \n %s", log_summary)
-    LOG.info("Compression ratio: %s", ratio)
-
-    return log_summary
-
-
 def connection_error_giveup(details: dict) -> None:
     """Too many connection errors, give up.
     """
@@ -120,3 +95,10 @@ def filter_snippets(
     processed_snippets = sorted(processed_snippets, key=select_line_number)
 
     return processed_snippets
+
+
+def construct_final_prompt(formatted_snippets: str, prompt_template: str) -> str:
+    """Create final prompt from processed snippets and csgrep output, if it is available."""
+
+    final_prompt = prompt_template.format(formatted_snippets)
+    return final_prompt
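
`construct_final_prompt` is a thin wrapper around `str.format`, so the template is expected to contain a single positional placeholder for the formatted snippets. A trivial illustration with a made-up template:

```
from logdetective.server.utils import construct_final_prompt

template = "Explain the following build log snippets:\n{}"  # illustrative template
prompt = construct_final_prompt("Snippet No. 0: ...", template)
print(prompt)
```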
logdetective/utils.py
CHANGED
@@ -1,5 +1,6 @@
 import logging
 import os
+import subprocess as sp
 from typing import Iterator, List, Dict, Tuple, Generator
 from urllib.parse import urlparse
 
@@ -8,10 +9,10 @@ import numpy as np
 import yaml
 
 from llama_cpp import Llama, CreateCompletionResponse, CreateCompletionStreamResponse
+from logdetective.constants import SNIPPET_DELIMITER
 from logdetective.models import PromptConfig, SkipSnippets
 from logdetective.remote_log import RemoteLog
 
-
 LOG = logging.getLogger("logdetective")
 
 
@@ -39,7 +40,9 @@ def chunk_continues(text: str, index: int) -> bool:
     return False
 
 
-def get_chunks(
+def get_chunks(
+    text: str, max_len: int = 2000
+) -> Generator[Tuple[int, str], None, None]:
     """Split log into chunks according to heuristic
     based on whitespace and backslash presence.
     """
@@ -173,14 +176,14 @@ def format_snippets(snippets: list[str] | list[Tuple[int, str]]) -> str:
             Snippet No. {i} at line #{s[0]}:
 
             {s[1]}
-
+            {SNIPPET_DELIMITER}
             """
         else:
             summary += f"""
             Snippet No. {i}:
 
             {s}
-
+            {SNIPPET_DELIMITER}
             """
     return summary
 
@@ -247,3 +250,44 @@ def load_skip_snippet_patterns(path: str | None) -> SkipSnippets:
         raise e
 
     return SkipSnippets({})
+
+
+def check_csgrep() -> bool:
+    """Verifies presence of csgrep in path"""
+    try:
+        result = sp.run(
+            ["csgrep", "--version"],
+            text=True,
+            check=True,
+            shell=False,
+            capture_output=True,
+            timeout=1.0,
+        )
+    except (FileNotFoundError, sp.TimeoutExpired, sp.CalledProcessError) as ex:
+        LOG.error("Required binary `csgrep` was not found in path: %s", ex)
+        return False
+    if result.returncode == 0:
+        return True
+    LOG.error("Issue was encountered while calling `csgrep`: `%s`", result.stderr)
+
+    return False
+
+
+def mine_logs(log: str, extractors: list) -> List[Tuple[int, str]]:
+    """Extract snippets from log text using extractors provided.
+    Each extractor is applied in turn on original log.
+    Depending on characteristics of extractors used, there may be
+    an overlap in snippets extracted."""
+
+    log_summary = []
+
+    LOG.info("Getting summary")
+
+    for extractor in extractors:
+        log_summary.extend(extractor(log))
+
+    ratio = len("\n".join([text for _, text in log_summary])) / len(log)
+    LOG.debug("Log summary: \n %s", log_summary)
+    LOG.info("Snippets: %s Compression ratio: %s", len(log_summary), ratio)
+
+    return log_summary
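
`check_csgrep` is the guard used both by the CLI and by the server-side validator before a `CSGrepExtractor` is constructed. A short sketch of that gating pattern; silently skipping the extractor, rather than exiting, is an illustrative choice and not taken from the package:

```
from logdetective.extractors import DrainExtractor, CSGrepExtractor
from logdetective.utils import check_csgrep, mine_logs

extractors = [DrainExtractor()]
if check_csgrep():
    # Only add the csgrep-based extractor when the binary answers --version.
    extractors.append(CSGrepExtractor())

snippets = mine_logs(log="...log text...", extractors=extractors)
```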
{logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: logdetective
-Version: 2.0
+Version: 2.2.0
 Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
 License: Apache-2.0
 Author: Jiri Podivin
@@ -494,6 +494,33 @@ Example of a valid pattern definition file: `logdetective/skip_patterns.yml`,
 can be used as a starting point and is used as a default if no other definition is provided.
 
 
+Extracting snippets with csgrep
+-------------------------------
+
+When working with logs containing messages from GCC, it can be beneficial to employ
+additional extractor based on `csgrep` tool, to ensure that the messages are kept intact.
+Since `csgrep` is not available as a python package, it must be installed separately,
+with a package manager or from [source](https://github.com/csutils/csdiff).
+
+The binary is available as part of `csdiff` package on Fedora.
+
+```
+dnf install csdiff
+```
+
+When working with CLI Log Detective, the csgrep extractor can be activated using option `--csgrep`.
+While in server mode, the `csgrep` field in `extractor` config needs to be set to `true`.
+
+```
+csgrep: true
+```
+
+Both options are disabled by default and error will be produced if the option is used,
+but `csgrep` is not present in the $PATH.
+
+The container images are built with `csdiff` installed.
+
+
 License
 -------
 
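
For the server, the `csgrep` switch sits next to the other `ExtractorConfig` fields. A sketch of the relevant fragment, assuming a YAML server configuration whose `extractor` section maps directly onto `ExtractorConfig`:

```
extractor:
  max_clusters: 8
  max_snippet_len: 2000
  verbose: false
  csgrep: true
```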
{logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
 logdetective/__init__.py,sha256=VqRngDcuFT7JWms8Qc_MsOvajoXVOKPr-S1kqY3Pqhc,59
 logdetective/constants.py,sha256=aCwrkBrDdS_kbNESK-Z-ewg--DSzodV2OMgwEq3UE38,2456
 logdetective/drain3.ini,sha256=ni91eCT1TwTznZwcqWoOVMQcGEnWhEDNCoTPF7cfGfY,1360
-logdetective/extractors.py,sha256=
-logdetective/logdetective.py,sha256=
-logdetective/models.py,sha256=
+logdetective/extractors.py,sha256=Nh5wMcLXtcYLFAHwr6naCPPOtWzLUCW2iF__UKfckUY,5927
+logdetective/logdetective.py,sha256=Ck7TL3YvdQG8zniudM8bM51LfTyVW6Ea3BarTjzjWHo,6606
+logdetective/models.py,sha256=uczmQtWFgSp_ZGssngdTM4qzPF1o64dCy0469GoSbjQ,2937
 logdetective/prompts-summary-first.yml,sha256=3Zfp4NNOfaFYq5xBlBjeQa5PdjYfS4v17OtJqQ-DRpU,821
 logdetective/prompts-summary-only.yml,sha256=8U9AMJV8ePW-0CoXOXlQoO92DAJDeutIT8ntSkkm6W0,470
 logdetective/prompts.yml,sha256=Mq8RdWgJxxhrQYgammojJkXULJNpzSLU0N_BryOxKgc,3906
@@ -22,18 +22,18 @@ logdetective/server/emoji.py,sha256=hV4O0yfL0l1a3kWLImvBsY4AJQauKs7okYOGBEtYVz0,
 logdetective/server/exceptions.py,sha256=piV7wVKc-rw_pHrThbZbUjtmjuO5qUbjVNFwjdfcP3Q,864
 logdetective/server/gitlab.py,sha256=MrAprXLTN6Q15qBC_Y2y42iKdtmIfed_pfjEt0gABvc,16422
 logdetective/server/koji.py,sha256=LG1pRiKUFvYFRKzgQoUG3pUHfcEwMoaMNjUSMKw_pBA,5640
-logdetective/server/llm.py,sha256=
+logdetective/server/llm.py,sha256=bmA6LsV80OdO60q4WLoKuehuVDEYq-HhBAYcZeLfrv8,10150
 logdetective/server/metric.py,sha256=QrrX1FmMa7sc57av0P9UFOiCIFYVLs1opOWV3ObYo0s,4086
-logdetective/server/models.py,sha256=
+logdetective/server/models.py,sha256=rsdEf3lw0fvjWKhC9evaSsfZQR-H2mg0uig4KA6ho0c,20762
 logdetective/server/plot.py,sha256=C98U9prGoPkp8_t4v2dovdZuwOhSbxXSeB_K9Q2r3NE,14607
 logdetective/server/server.py,sha256=zap8Mz3NTFvaDJMNQDATbPYk6MhQ9o1J9gJECnGWvuQ,24694
 logdetective/server/templates/gitlab_full_comment.md.j2,sha256=H4NPjm3l8X5d0TNtfyZZZj_gHY1Y7hWEqY6RaVA8qt0,1947
 logdetective/server/templates/gitlab_short_comment.md.j2,sha256=vPisU1c98LPKEwlKtMrtlqnEOlbykPZK96MpHAf-o88,1758
-logdetective/server/utils.py,sha256=
+logdetective/server/utils.py,sha256=7ub-Nz7LUP_idwi2_nEC4FBuY9otSBUVy9nw86-sjYc,3861
 logdetective/skip_snippets.yml,sha256=reGlhPPCo06nNUJWiC2LY-OJOoPdcyOB7QBTSMeh0eg,487
-logdetective/utils.py,sha256=
-logdetective-2.0.
-logdetective-2.0.
-logdetective-2.0.
-logdetective-2.0.
-logdetective-2.0.
+logdetective/utils.py,sha256=9EyHKGNxtS1ObSepL-T3M43rKIxQJkFDA5yllLbS5Bs,9178
+logdetective-2.2.0.dist-info/LICENSE,sha256=z8d0m5b2O9McPEK1xHG_dWgUBT6EfBDz6wA0F7xSPTA,11358
+logdetective-2.2.0.dist-info/METADATA,sha256=9bIs4_L1PPIilOBBsfoLULfTTtZ8RYLCmq_0XpxJHXQ,21455
+logdetective-2.2.0.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+logdetective-2.2.0.dist-info/entry_points.txt,sha256=3K_vXja6PmcA8sNdUi63WdImeiNhVZcEGPTaoJmltfA,63
+logdetective-2.2.0.dist-info/RECORD,,
{logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/LICENSE
File without changes

{logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/WHEEL
File without changes

{logdetective-2.0.2.dist-info → logdetective-2.2.0.dist-info}/entry_points.txt
File without changes