logdetective 0.9.1__py3-none-any.whl → 0.11.1__py3-none-any.whl
This diff compares publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- logdetective/constants.py +4 -0
- logdetective/{server/remote_log.py → remote_log.py} +3 -43
- logdetective/server/compressors.py +49 -4
- logdetective/server/{utils.py → config.py} +12 -13
- logdetective/server/database/models/merge_request_jobs.py +95 -7
- logdetective/server/emoji.py +104 -0
- logdetective/server/gitlab.py +413 -0
- logdetective/server/llm.py +284 -0
- logdetective/server/metric.py +27 -9
- logdetective/server/models.py +78 -6
- logdetective/server/plot.py +157 -9
- logdetective/server/server.py +181 -639
- logdetective/utils.py +1 -1
- {logdetective-0.9.1.dist-info → logdetective-0.11.1.dist-info}/METADATA +5 -3
- logdetective-0.11.1.dist-info/RECORD +31 -0
- logdetective-0.9.1.dist-info/RECORD +0 -28
- {logdetective-0.9.1.dist-info → logdetective-0.11.1.dist-info}/LICENSE +0 -0
- {logdetective-0.9.1.dist-info → logdetective-0.11.1.dist-info}/WHEEL +0 -0
- {logdetective-0.9.1.dist-info → logdetective-0.11.1.dist-info}/entry_points.txt +0 -0
logdetective/server/gitlab.py (new file)

@@ -0,0 +1,413 @@
+import re
+import asyncio
+import zipfile
+from pathlib import Path, PurePath
+from tempfile import TemporaryFile
+
+from fastapi import HTTPException
+
+import gitlab
+import gitlab.v4
+import gitlab.v4.objects
+import jinja2
+import aiohttp
+import sqlalchemy
+
+from logdetective.server.config import SERVER_CONFIG, LOG
+from logdetective.server.llm import perform_staged_analysis
+from logdetective.server.metric import add_new_metrics, update_metrics
+from logdetective.server.models import (
+    GitLabInstanceConfig,
+    JobHook,
+    StagedResponse,
+)
+from logdetective.server.database.models import (
+    Comments,
+    EndpointType,
+    Forge,
+    AnalyzeRequestMetrics,
+)
+from logdetective.server.compressors import RemoteLogCompressor
+
+MR_REGEX = re.compile(r"refs/merge-requests/(\d+)/.*$")
+FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")
+
+
+async def process_gitlab_job_event(
+    http: aiohttp.ClientSession,
+    gitlab_cfg: GitLabInstanceConfig,
+    forge: Forge,
+    job_hook: JobHook,
+):  # pylint: disable=too-many-locals
+    """Handle a received job_event webhook from GitLab"""
+    LOG.debug("Received webhook message from %s:\n%s", forge.value, job_hook)
+
+    # Look up the project this job belongs to
+    gitlab_conn = gitlab_cfg.get_connection()
+    project = await asyncio.to_thread(gitlab_conn.projects.get, job_hook.project_id)
+    LOG.info("Processing failed job for %s", project.name)
+
+    # Retrieve data about the job from the GitLab API
+    job = await asyncio.to_thread(project.jobs.get, job_hook.build_id)
+
+    # For easy retrieval later, we'll add project_name and project_url to the
+    # job object
+    job.project_name = project.name
+    job.project_url = project.web_url
+
+    # Retrieve the pipeline that started this job
+    pipeline = await asyncio.to_thread(project.pipelines.get, job_hook.pipeline_id)
+
+    # Verify this is a merge request
+    if pipeline.source != "merge_request_event":
+        LOG.info("Not a merge request pipeline. Ignoring.")
+        return
+
+    # Extract the merge-request ID from the job
+    match = MR_REGEX.search(pipeline.ref)
+    if not match:
+        LOG.error(
+            "Pipeline source is merge_request_event but no merge request ID was provided."
+        )
+        return
+    merge_request_iid = int(match.group(1))
+
+    LOG.debug("Retrieving log artifacts")
+    # Retrieve the build logs from the merge request artifacts and preprocess them
+    try:
+        log_url, preprocessed_log = await retrieve_and_preprocess_koji_logs(gitlab_cfg, http, job)
+    except LogsTooLargeError:
+        LOG.error("Could not retrieve logs. Too large.")
+        raise
+
+    # Submit log to Log Detective and await the results.
+    log_text = preprocessed_log.read().decode(encoding="utf-8")
+    metrics_id = await add_new_metrics(
+        api_name=EndpointType.ANALYZE_GITLAB_JOB,
+        url=log_url,
+        http_session=http,
+        compressed_log_content=RemoteLogCompressor.zip_text(log_text),
+    )
+    staged_response = await perform_staged_analysis(http, log_text=log_text)
+    update_metrics(metrics_id, staged_response)
+    preprocessed_log.close()
+
+    # check if this project is on the opt-in list for posting comments.
+    if project.name not in SERVER_CONFIG.general.packages:
+        LOG.info("Not publishing comment for unrecognized package %s", project.name)
+        return
+
+    # Add the Log Detective response as a comment to the merge request
+    await comment_on_mr(
+        forge,
+        project,
+        merge_request_iid,
+        job,
+        log_url,
+        staged_response,
+        metrics_id,
+    )
+
+    return staged_response
+
+
+class LogsTooLargeError(RuntimeError):
+    """The log archive exceeds the configured maximum size"""
+
+
+async def retrieve_and_preprocess_koji_logs(
+    gitlab_cfg: GitLabInstanceConfig,
+    http: aiohttp.ClientSession,
+    job: gitlab.v4.objects.ProjectJob
+):  # pylint: disable=too-many-branches,too-many-locals
+    """Download logs from the merge request artifacts
+
+    This function will retrieve the build logs and do some minimal
+    preprocessing to determine which log is relevant for analysis.
+
+    returns: The URL pointing to the selected log file and an open, file-like
+    object containing the log contents to be sent for processing by Log
+    Detective. The calling function is responsible for closing this object."""
+
+    # Make sure the file isn't too large to process.
+    if not await check_artifacts_file_size(gitlab_cfg, http, job):
+        raise LogsTooLargeError(
+            f"Oversized logs for job {job.id} in project {job.project_id}"
+        )
+
+    # Create a temporary file to store the downloaded log zipfile.
+    # This will be automatically deleted when the last reference into it
+    # (returned by this function) is closed.
+    tempfile = TemporaryFile(mode="w+b")
+    await asyncio.to_thread(job.artifacts, streamed=True, action=tempfile.write)
+    tempfile.seek(0)
+
+    failed_arches = {}
+    artifacts_zip = zipfile.ZipFile(tempfile, mode="r")  # pylint: disable=consider-using-with
+    for zipinfo in artifacts_zip.infolist():
+        if zipinfo.filename.endswith("task_failed.log"):
+            # The koji logs store this file in two places: 1) in the
+            # directory with the failed architecture and 2) in the parent
+            # directory. Most of the time, we want to ignore the one in the
+            # parent directory, since the rest of the information is in the
+            # specific task directory. However, there are some situations
+            # where non-build failures (such as "Target build already exists")
+            # may be presented only at the top level.
+            # The paths look like `kojilogs/noarch-XXXXXX/task_failed.log`
+            # or `kojilogs/noarch-XXXXXX/x86_64-XXXXXX/task_failed.log`
+            path = PurePath(zipinfo.filename)
+            if len(path.parts) <= 3:
+                failed_arches["toplevel"] = path
+                continue
+
+            # Extract the architecture from the immediate parent path
+            architecture = path.parent.parts[-1].split("-")[0]
+
+            # Open this file and read which log failed.
+            # The string in this log has the format
+            # `see <log> for more information`.
+            # Note: it may sometimes say
+            # `see build.log or root.log for more information`, but in
+            # that situation, we only want to handle build.log (for now),
+            # which means accepting only the first match for the regular
+            # expression.
+            with artifacts_zip.open(zipinfo.filename) as task_failed_log:
+                contents = task_failed_log.read().decode("utf-8")
+                match = FAILURE_LOG_REGEX.search(contents)
+                if not match:
+                    LOG.error(
+                        "task_failed.log does not indicate which log contains the failure."
+                    )
+                    raise SyntaxError(
+                        "task_failed.log does not indicate which log contains the failure."
+                    )
+                failure_log_name = match.group(1)
+
+            failed_arches[architecture] = PurePath(path.parent, failure_log_name)
+
+    if not failed_arches:
+        # No failed task found in the sub-tasks.
+        raise FileNotFoundError("Could not detect failed architecture.")
+
+    # We only want to handle one arch, so we'll check them in order of
+    # "most to least likely for the maintainer to have access to hardware"
+    # This means: x86_64 > aarch64 > riscv > ppc64le > s390x
+    if "x86_64" in failed_arches:
+        failed_arch = "x86_64"
+    elif "aarch64" in failed_arches:
+        failed_arch = "aarch64"
+    elif "riscv" in failed_arches:
+        failed_arch = "riscv"
+    elif "ppc64le" in failed_arches:
+        failed_arch = "ppc64le"
+    elif "s390x" in failed_arches:
+        failed_arch = "s390x"
+    elif "noarch" in failed_arches:
+        # May have failed during BuildSRPMFromSCM phase
+        failed_arch = "noarch"
+    elif "toplevel" in failed_arches:
+        # Probably a Koji-specific error, not a build error
+        failed_arch = "toplevel"
+    else:
+        # We have one or more architectures that we don't know about? Just
+        # pick the first alphabetically.
+        failed_arch = sorted(list(failed_arches.keys()))[0]
+
+    LOG.debug("Failed architecture: %s", failed_arch)
+
+    log_path = failed_arches[failed_arch].as_posix()
+
+    log_url = f"{gitlab_cfg.api_url}/projects/{job.project_id}/jobs/{job.id}/artifacts/{log_path}"  # pylint: disable=line-too-long
+    LOG.debug("Returning contents of %s", log_url)
+
+    # Return the log as a file-like object with .read() function
+    return log_url, artifacts_zip.open(log_path)
+
+
+async def check_artifacts_file_size(
+    gitlab_cfg: GitLabInstanceConfig,
+    http: aiohttp.ClientSession,
+    job: gitlab.v4.objects.ProjectJob,
+):
+    """Method to determine if the artifacts are too large to process"""
+    # First, make sure that the artifacts are of a reasonable size. The
+    # zipped artifact collection will be stored in memory below. The
+    # python-gitlab library doesn't expose a way to check this value directly,
+    # so we need to interact with directly with the headers.
+    artifacts_url = f"{gitlab_cfg.api_url}/projects/{job.project_id}/jobs/{job.id}/artifacts"  # pylint: disable=line-too-long
+    LOG.debug("checking artifact URL %s", artifacts_url)
+    try:
+        head_response = await http.head(
+            artifacts_url,
+            allow_redirects=True,
+            headers={"Authorization": f"Bearer {gitlab_cfg.api_token}"},
+            timeout=5,
+            raise_for_status=True,
+        )
+    except aiohttp.ClientResponseError as ex:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Unable to check artifact URL: [{ex.status}] {ex.message}",
+        ) from ex
+    content_length = int(head_response.headers.get("content-length"))
+    LOG.debug(
+        "URL: %s, content-length: %d, max length: %d",
+        artifacts_url,
+        content_length,
+        gitlab_cfg.max_artifact_size,
+    )
+    return content_length <= gitlab_cfg.max_artifact_size
+
+
+async def comment_on_mr(  # pylint: disable=too-many-arguments disable=too-many-positional-arguments
+    forge: Forge,
+    project: gitlab.v4.objects.Project,
+    merge_request_iid: int,
+    job: gitlab.v4.objects.ProjectJob,
+    log_url: str,
+    response: StagedResponse,
+    metrics_id: int,
+):
+    """Add the Log Detective response as a comment to the merge request"""
+    LOG.debug(
+        "Primary Explanation for %s MR %d: %s",
+        project.name,
+        merge_request_iid,
+        response.explanation.text,
+    )
+
+    # First, we'll see if there's an existing comment on this Merge Request
+    # and wrap it in <details></details> to reduce noise.
+    await suppress_latest_comment(forge, project, merge_request_iid)
+
+    # Get the formatted short comment.
+    short_comment = await generate_mr_comment(job, log_url, response, full=False)
+
+    # Look up the merge request
+    merge_request = await asyncio.to_thread(
+        project.mergerequests.get, merge_request_iid
+    )
+
+    # Submit a new comment to the Merge Request using the Gitlab API
+    discussion = await asyncio.to_thread(
+        merge_request.discussions.create, {"body": short_comment}
+    )
+
+    # Get the ID of the first note
+    note_id = discussion.attributes["notes"][0]["id"]
+    note = discussion.notes.get(note_id)
+
+    # Update the comment with the full details
+    # We do this in a second step so we don't bombard the user's email
+    # notifications with a massive message. Gitlab doesn't send email for
+    # comment edits.
+    full_comment = await generate_mr_comment(job, log_url, response, full=True)
+    note.body = full_comment
+
+    # Pause for five seconds before sending the snippet data, otherwise
+    # Gitlab may bundle the edited message together with the creation
+    # message in email.
+    await asyncio.sleep(5)
+    await asyncio.to_thread(note.save)
+
+    # Save the new comment to the database
+    try:
+        metrics = AnalyzeRequestMetrics.get_metric_by_id(metrics_id)
+        Comments.create(
+            forge,
+            project.id,
+            merge_request_iid,
+            job.id,
+            discussion.id,
+            metrics,
+        )
+    except sqlalchemy.exc.IntegrityError:
+        # We most likely attempted to save a new comment for the same
+        # build job. This is somewhat common during development when we're
+        # submitting requests manually. It shouldn't really happen in
+        # production.
+        if not SERVER_CONFIG.general.devmode:
+            raise
+
+
+async def suppress_latest_comment(
+    gitlab_instance: str,
+    project: gitlab.v4.objects.Project,
+    merge_request_iid: int,
+) -> None:
+    """Look up the latest comment on this Merge Request, if any, and wrap it
+    in a <details></details> block with a comment indicating that it has been
+    superseded by a new push."""
+
+    # Ask the database for the last known comment for this MR
+    previous_comment = Comments.get_latest_comment(
+        gitlab_instance, project.id, merge_request_iid
+    )
+
+    if previous_comment is None:
+        # No existing comment, so nothing to do.
+        return
+
+    # Retrieve its content from the Gitlab API
+
+    # Look up the merge request
+    merge_request = await asyncio.to_thread(
+        project.mergerequests.get, merge_request_iid
+    )
+
+    # Find the discussion matching the latest comment ID
+    discussion = await asyncio.to_thread(
+        merge_request.discussions.get, previous_comment.comment_id
+    )
+
+    # Get the ID of the first note
+    note_id = discussion.attributes["notes"][0]["id"]
+    note = discussion.notes.get(note_id)
+
+    # Wrap the note in <details>, indicating why.
+    note.body = (
+        "This comment has been superseded by a newer "
+        f"Log Detective analysis.\n<details>\n{note.body}\n</details>"
+    )
+    await asyncio.to_thread(note.save)
+
+
+async def generate_mr_comment(
+    job: gitlab.v4.objects.ProjectJob,
+    log_url: str,
+    response: StagedResponse,
+    full: bool = True,
+) -> str:
+    """Use a template to generate a comment string to submit to Gitlab"""
+
+    # Locate and load the comment template
+    script_path = Path(__file__).resolve().parent
+    template_path = Path(script_path, "templates")
+    jinja_env = jinja2.Environment(loader=jinja2.FileSystemLoader(template_path))
+
+    if full:
+        tpl = jinja_env.get_template("gitlab_full_comment.md.j2")
+    else:
+        tpl = jinja_env.get_template("gitlab_short_comment.md.j2")
+
+    artifacts_url = f"{job.project_url}/-/jobs/{job.id}/artifacts/download"
+
+    if response.response_certainty >= 90:
+        emoji_face = ":slight_smile:"
+    elif response.response_certainty >= 70:
+        emoji_face = ":neutral_face:"
+    else:
+        emoji_face = ":frowning2:"
+
+    # Generate the comment from the template
+    content = tpl.render(
+        package=job.project_name,
+        explanation=response.explanation.text,
+        certainty=f"{response.response_certainty:.2f}",
+        emoji_face=emoji_face,
+        snippets=response.snippets,
+        log_url=log_url,
+        artifacts_url=artifacts_url,
+    )
+
+    return content
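For orientation, here is a minimal sketch of how the new `process_gitlab_job_event` entry point from `logdetective/server/gitlab.py` could be driven once a webhook payload has been parsed. The wrapper function and its name are illustrative assumptions, as is the idea that the `GitLabInstanceConfig` and `Forge` values come pre-resolved from server configuration; the actual route wiring lives in `logdetective/server/server.py` and may differ.

```python
# Hypothetical glue code, not part of the package: it only illustrates the
# call shape of process_gitlab_job_event() introduced in this release.
import aiohttp

from logdetective.server.database.models import Forge
from logdetective.server.gitlab import process_gitlab_job_event
from logdetective.server.models import GitLabInstanceConfig, JobHook


async def handle_job_hook(
    gitlab_cfg: GitLabInstanceConfig, forge: Forge, job_hook: JobHook
):
    """Run Log Detective against a single failed-job webhook (sketch)."""
    async with aiohttp.ClientSession() as http:
        # Downloads the koji log artifacts, runs the staged LLM analysis and,
        # for opt-in packages, posts or updates a comment on the merge request.
        return await process_gitlab_job_event(http, gitlab_cfg, forge, job_hook)
```

The returned value, when the project is on the opt-in list, is the same `StagedResponse` that is rendered into the merge-request comment.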
logdetective/server/llm.py (new file)

@@ -0,0 +1,284 @@
+import os
+import asyncio
+import json
+from typing import List, Tuple, Dict, Any, Union
+
+import backoff
+from aiohttp import StreamReader
+from fastapi import HTTPException
+
+import aiohttp
+
+from logdetective.constants import SNIPPET_DELIMITER
+from logdetective.extractors import DrainExtractor
+from logdetective.utils import (
+    compute_certainty,
+)
+from logdetective.server.config import LOG, SERVER_CONFIG, PROMPT_CONFIG
+from logdetective.server.models import (
+    StagedResponse,
+    Explanation,
+    AnalyzedSnippet,
+)
+
+
+LLM_CPP_SERVER_TIMEOUT = os.environ.get("LLAMA_CPP_SERVER_TIMEOUT", 600)
+
+
+def format_analyzed_snippets(snippets: list[AnalyzedSnippet]) -> str:
+    """Format snippets for submission into staged prompt."""
+    summary = f"\n{SNIPPET_DELIMITER}\n".join(
+        [
+            f"[{e.text}] at line [{e.line_number}]: [{e.explanation.text}]"
+            for e in snippets
+        ]
+    )
+    return summary
+
+
+def mine_logs(log: str) -> List[Tuple[int, str]]:
+    """Extract snippets from log text"""
+    extractor = DrainExtractor(
+        verbose=True, context=True, max_clusters=SERVER_CONFIG.extractor.max_clusters
+    )
+
+    LOG.info("Getting summary")
+    log_summary = extractor(log)
+
+    ratio = len(log_summary) / len(log.split("\n"))
+    LOG.debug("Log summary: \n %s", log_summary)
+    LOG.info("Compression ratio: %s", ratio)
+
+    return log_summary
+
+
+async def submit_to_llm_endpoint(
+    http: aiohttp.ClientSession,
+    url: str,
+    data: Dict[str, Any],
+    headers: Dict[str, str],
+    stream: bool,
+) -> Any:
+    """Send request to selected API endpoint. Verifying successful request unless
+    the using the stream response.
+
+    url:
+    data:
+    headers:
+    stream:
+    """
+    async with SERVER_CONFIG.inference.get_limiter():
+        LOG.debug("async request %s headers=%s data=%s", url, headers, data)
+        response = await http.post(
+            url,
+            headers=headers,
+            # we need to use the `json=` parameter here and let aiohttp
+            # handle the json-encoding
+            json=data,
+            timeout=int(LLM_CPP_SERVER_TIMEOUT),
+            # Docs says chunked takes int, but:
+            # DeprecationWarning: Chunk size is deprecated #1615
+            # So let's make sure we either put True or None here
+            chunked=True if stream else None,
+            raise_for_status=True,
+        )
+        if stream:
+            return response
+        try:
+            return json.loads(await response.text())
+        except UnicodeDecodeError as ex:
+            LOG.error("Error encountered while parsing llama server response: %s", ex)
+            raise HTTPException(
+                status_code=400,
+                detail=f"Couldn't parse the response.\nError: {ex}\nData: {response.text}",
+            ) from ex
+
+
+def should_we_giveup(exc: aiohttp.ClientResponseError) -> bool:
+    """
+    From backoff's docs:
+
+    > a function which accepts the exception and returns
+    > a truthy value if the exception should not be retried
+    """
+    LOG.info("Should we give up on retrying error %s", exc)
+    return exc.status < 500
+
+
+def we_give_up(details: backoff._typing.Details):
+    """
+    retries didn't work (or we got a different exc)
+    we give up and raise proper 500 for our API endpoint
+    """
+    LOG.error("Inference error: %s", details["args"])
+    raise HTTPException(500, "Request to the inference API failed")
+
+
+@backoff.on_exception(
+    lambda: backoff.constant([10, 30, 120]),
+    aiohttp.ClientResponseError,
+    max_tries=3,
+    giveup=should_we_giveup,
+    raise_on_giveup=False,
+    on_giveup=we_give_up,
+)
+async def submit_text(  # pylint: disable=R0913,R0917
+    http: aiohttp.ClientSession,
+    text: str,
+    max_tokens: int = -1,
+    log_probs: int = 1,
+    stream: bool = False,
+    model: str = "default-model",
+) -> Explanation:
+    """Submit prompt to LLM using a selected endpoint.
+    max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
+    log_probs: number of token choices to produce log probs for
+    """
+    LOG.info("Analyzing the text")
+
+    headers = {"Content-Type": "application/json"}
+
+    if SERVER_CONFIG.inference.api_token:
+        headers["Authorization"] = f"Bearer {SERVER_CONFIG.inference.api_token}"
+
+    if SERVER_CONFIG.inference.api_endpoint == "/chat/completions":
+        return await submit_text_chat_completions(
+            http, text, headers, max_tokens, log_probs > 0, stream, model
+        )
+    return await submit_text_completions(
+        http, text, headers, max_tokens, log_probs, stream, model
+    )
+
+
+async def submit_text_completions(  # pylint: disable=R0913,R0917
+    http: aiohttp.ClientSession,
+    text: str,
+    headers: dict,
+    max_tokens: int = -1,
+    log_probs: int = 1,
+    stream: bool = False,
+    model: str = "default-model",
+) -> Explanation:
+    """Submit prompt to OpenAI API completions endpoint.
+    max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
+    log_probs: number of token choices to produce log probs for
+    """
+    LOG.info("Submitting to /v1/completions endpoint")
+    data = {
+        "prompt": text,
+        "max_tokens": max_tokens,
+        "logprobs": log_probs,
+        "stream": stream,
+        "model": model,
+        "temperature": SERVER_CONFIG.inference.temperature,
+    }
+
+    response = await submit_to_llm_endpoint(
+        http,
+        f"{SERVER_CONFIG.inference.url}/v1/completions",
+        data,
+        headers,
+        stream,
+    )
+
+    return Explanation(
+        text=response["choices"][0]["text"], logprobs=response["choices"][0]["logprobs"]
+    )
+
+
+async def submit_text_chat_completions(  # pylint: disable=R0913,R0917
+    http: aiohttp.ClientSession,
+    text: str,
+    headers: dict,
+    max_tokens: int = -1,
+    log_probs: int = 1,
+    stream: bool = False,
+    model: str = "default-model",
+) -> Union[Explanation, StreamReader]:
+    """Submit prompt to OpenAI API /chat/completions endpoint.
+    max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
+    log_probs: number of token choices to produce log probs for
+    """
+    LOG.info("Submitting to /v1/chat/completions endpoint")
+
+    data = {
+        "messages": [
+            {
+                "role": "user",
+                "content": text,
+            }
+        ],
+        "max_tokens": max_tokens,
+        "logprobs": log_probs,
+        "stream": stream,
+        "model": model,
+        "temperature": SERVER_CONFIG.inference.temperature,
+    }
+
+    response = await submit_to_llm_endpoint(
+        http,
+        f"{SERVER_CONFIG.inference.url}/v1/chat/completions",
+        data,
+        headers,
+        stream,
+    )
+
+    if stream:
+        return response
+    return Explanation(
+        text=response["choices"][0]["message"]["content"],
+        logprobs=response["choices"][0]["logprobs"]["content"],
+    )
+
+
+async def perform_staged_analysis(
+    http: aiohttp.ClientSession, log_text: str
+) -> StagedResponse:
+    """Submit the log file snippets to the LLM and retrieve their results"""
+    log_summary = mine_logs(log_text)
+
+    # Process snippets asynchronously
+    awaitables = [
+        submit_text(
+            http,
+            PROMPT_CONFIG.snippet_prompt_template.format(s),
+            model=SERVER_CONFIG.inference.model,
+            max_tokens=SERVER_CONFIG.inference.max_tokens,
+        )
+        for s in log_summary
+    ]
+    analyzed_snippets = await asyncio.gather(*awaitables)
+
+    analyzed_snippets = [
+        AnalyzedSnippet(line_number=e[0][0], text=e[0][1], explanation=e[1])
+        for e in zip(log_summary, analyzed_snippets)
+    ]
+    final_prompt = PROMPT_CONFIG.prompt_template_staged.format(
+        format_analyzed_snippets(analyzed_snippets)
+    )
+
+    final_analysis = await submit_text(
+        http,
+        final_prompt,
+        model=SERVER_CONFIG.inference.model,
+        max_tokens=SERVER_CONFIG.inference.max_tokens,
+    )
+
+    certainty = 0
+
+    if final_analysis.logprobs:
+        try:
+            certainty = compute_certainty(final_analysis.logprobs)
+        except ValueError as ex:
+            LOG.error("Error encountered while computing certainty: %s", ex)
+            raise HTTPException(
+                status_code=400,
+                detail=f"Couldn't compute certainty with data:\n"
+                f"{final_analysis.logprobs}",
+            ) from ex
+
+    return StagedResponse(
+        explanation=final_analysis,
+        snippets=analyzed_snippets,
+        response_certainty=certainty,
+    )
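The staged analysis in `logdetective/server/llm.py` can also be exercised directly on raw log text, which is essentially what `process_gitlab_job_event` does after downloading an artifact. The snippet below is a usage sketch rather than code from the package; it assumes an inference backend reachable via `SERVER_CONFIG.inference` and uses a placeholder local file name.

```python
# Usage sketch only: feed raw log text through the new staged pipeline.
import asyncio

import aiohttp

from logdetective.server.llm import perform_staged_analysis


async def analyze_file(path: str):
    with open(path, encoding="utf-8") as f:
        log_text = f.read()
    async with aiohttp.ClientSession() as http:
        # mine_logs() extracts snippets via DrainExtractor, each snippet gets
        # its own LLM explanation, then a final staged prompt summarizes them.
        staged = await perform_staged_analysis(http, log_text=log_text)
    print(f"certainty: {staged.response_certainty:.2f}")
    print(staged.explanation.text)
    return staged


if __name__ == "__main__":
    asyncio.run(analyze_file("build.log"))  # "build.log" is a placeholder path
```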