logdetective 0.5.11__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,15 +3,18 @@ import json
  import os
  import re
  import zipfile
+ from enum import Enum
+ from contextlib import asynccontextmanager
  from pathlib import Path, PurePath
  from tempfile import TemporaryFile
- from typing import List, Annotated, Tuple, Dict, Any
+ from typing import List, Annotated, Tuple, Dict, Any, Union
  from io import BytesIO

-
+ import backoff
  import matplotlib
  import matplotlib.pyplot
- from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, Header
+ from aiohttp import StreamReader
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, Depends, Header, Request

  from fastapi.responses import StreamingResponse
  from fastapi.responses import Response as BasicResponse
@@ -19,11 +22,14 @@ import gitlab
  import gitlab.v4
  import gitlab.v4.objects
  import jinja2
- import requests
+ import aiohttp
+ import sqlalchemy
+ import sentry_sdk
+
+ import logdetective.server.database.base

  from logdetective.extractors import DrainExtractor
  from logdetective.utils import (
-     validate_url,
      compute_certainty,
      format_snippets,
      load_prompts,
@@ -33,7 +39,7 @@ from logdetective.server.utils import (
      get_log,
      format_analyzed_snippets,
  )
- from logdetective.server.metric import track_request
+ from logdetective.server.metric import track_request, add_new_metrics, update_metrics
  from logdetective.server.models import (
      BuildLog,
      JobHook,
@@ -43,8 +49,14 @@ from logdetective.server.models import (
      AnalyzedSnippet,
      TimePeriod,
  )
- from logdetective.server import plot
- from logdetective.server.database.models import EndpointType
+ from logdetective.server import plot as plot_engine
+ from logdetective.server.remote_log import RemoteLog
+ from logdetective.server.database.models import (
+     Comments,
+     EndpointType,
+     Forge,
+ )
+ from logdetective.server.database.models import AnalyzeRequestMetrics

  LLM_CPP_SERVER_TIMEOUT = os.environ.get("LLAMA_CPP_SERVER_TIMEOUT", 600)
  LOG_SOURCE_REQUEST_TIMEOUT = os.environ.get("LOG_SOURCE_REQUEST_TIMEOUT", 60)
@@ -60,6 +72,34 @@ FAILURE_LOG_REGEX = re.compile(r"(\w*\.log)")

  LOG = get_log(SERVER_CONFIG)

+ if sentry_dsn := SERVER_CONFIG.general.sentry_dsn:
+     sentry_sdk.init(dsn=str(sentry_dsn), traces_sample_rate=1.0)
+
+
+ @asynccontextmanager
+ async def lifespan(fapp: FastAPI):
+     """
+     Establish one HTTP session
+     """
+     fapp.http = aiohttp.ClientSession(
+         timeout=aiohttp.ClientTimeout(
+             total=int(LOG_SOURCE_REQUEST_TIMEOUT), connect=3.07
+         )
+     )
+
+     # Ensure that the database is initialized.
+     logdetective.server.database.base.init()
+
+     yield
+     await fapp.http.close()
+
+
+ async def get_http_session(request: Request) -> aiohttp.ClientSession:
+     """
+     Return the single aiohttp ClientSession for this app
+     """
+     return request.app.http
+

  def requires_token_when_set(authentication: Annotated[str | None, Header()] = None):
      """
@@ -91,35 +131,12 @@ def requires_token_when_set(authentication: Annotated[str | None, Header()] = No
      raise HTTPException(status_code=401, detail=f"Token {token} not valid.")


- app = FastAPI(dependencies=[Depends(requires_token_when_set)])
+ app = FastAPI(dependencies=[Depends(requires_token_when_set)], lifespan=lifespan)
  app.gitlab_conn = gitlab.Gitlab(
      url=SERVER_CONFIG.gitlab.url, private_token=SERVER_CONFIG.gitlab.api_token
  )


- def process_url(url: str) -> str:
-     """Validate log URL and return log text."""
-     if validate_url(url=url):
-         try:
-             log_request = requests.get(url, timeout=int(LOG_SOURCE_REQUEST_TIMEOUT))
-         except requests.RequestException as ex:
-             raise HTTPException(
-                 status_code=400, detail=f"We couldn't obtain the logs: {ex}"
-             ) from ex
-
-         if not log_request.ok:
-             raise HTTPException(
-                 status_code=400,
-                 detail="Something went wrong while getting the logs: "
-                 f"[{log_request.status_code}] {log_request.text}",
-             )
-     else:
-         LOG.error("Invalid URL received ")
-         raise HTTPException(status_code=400, detail=f"Invalid log URL: {url}")
-
-     return log_request.text
-
-
  def mine_logs(log: str) -> List[Tuple[int, str]]:
      """Extract snippets from log text"""
      extractor = DrainExtractor(
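The synchronous process_url helper removed above is superseded by the new RemoteLog class imported earlier in this diff; RemoteLog's internals are not shown here. Purely as an illustration, an async equivalent of the removed logic built on the shared aiohttp session could look like the following (the function name and timeout are assumptions, not the package's API):

    import aiohttp
    from fastapi import HTTPException


    async def fetch_log_text(http: aiohttp.ClientSession, url: str) -> str:
        # Async counterpart of the removed helper: fetch the log and surface
        # failures as HTTP 400 for the API caller.
        try:
            async with http.get(url, timeout=aiohttp.ClientTimeout(total=60)) as resp:
                if resp.status >= 400:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Something went wrong while getting the logs: [{resp.status}]",
                    )
                return await resp.text()
        except aiohttp.ClientError as ex:
            raise HTTPException(
                status_code=400, detail=f"We couldn't obtain the logs: {ex}"
            ) from ex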
@@ -137,7 +154,11 @@ def mine_logs(log: str) -> List[Tuple[int, str]]:


  async def submit_to_llm_endpoint(
-     url: str, data: Dict[str, Any], headers: Dict[str, str], stream: bool
+     http: aiohttp.ClientSession,
+     url: str,
+     data: Dict[str, Any],
+     headers: Dict[str, str],
+     stream: bool,
  ) -> Any:
      """Send request to selected API endpoint. Verifying successful request unless
      the using the stream response.
@@ -147,40 +168,62 @@ async def submit_to_llm_endpoint(
      headers:
      stream:
      """
+     LOG.debug("async request %s headers=%s data=%s", url, headers, data)
+     response = await http.post(
+         url,
+         headers=headers,
+         # we need to use the `json=` parameter here and let aiohttp
+         # handle the json-encoding
+         json=data,
+         timeout=int(LLM_CPP_SERVER_TIMEOUT),
+         # Docs says chunked takes int, but:
+         # DeprecationWarning: Chunk size is deprecated #1615
+         # So let's make sure we either put True or None here
+         chunked=True if stream else None,
+         raise_for_status=True,
+     )
+     if stream:
+         return response
      try:
-         # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
-         response = requests.post(
-             url,
-             headers=headers,
-             data=json.dumps(data),
-             timeout=int(LLM_CPP_SERVER_TIMEOUT),
-             stream=stream,
-         )
-     except requests.RequestException as ex:
-         LOG.error("Llama-cpp query failed: %s", ex)
+         return json.loads(await response.text())
+     except UnicodeDecodeError as ex:
+         LOG.error("Error encountered while parsing llama server response: %s", ex)
          raise HTTPException(
-             status_code=400, detail=f"Llama-cpp query failed: {ex}"
+             status_code=400,
+             detail=f"Couldn't parse the response.\nError: {ex}\nData: {response.text}",
          ) from ex
-     if not stream:
-         if not response.ok:
-             raise HTTPException(
-                 status_code=400,
-                 detail="Something went wrong while getting a response from the llama server: "
-                 f"[{response.status_code}] {response.text}",
-             )
-         try:
-             response = json.loads(response.text)
-         except UnicodeDecodeError as ex:
-             LOG.error("Error encountered while parsing llama server response: %s", ex)
-             raise HTTPException(
-                 status_code=400,
-                 detail=f"Couldn't parse the response.\nError: {ex}\nData: {response.text}",
-             ) from ex

-     return response
+
+ def should_we_giveup(exc: aiohttp.ClientResponseError) -> bool:
+     """
+     From backoff's docs:
+
+     > a function which accepts the exception and returns
+     > a truthy value if the exception should not be retried
+     """
+     LOG.info("Should we give up on retrying error %s", exc)
+     return exc.status < 500
+
+
+ def we_give_up(details: backoff._typing.Details):
+     """
+     retries didn't work (or we got a different exc)
+     we give up and raise proper 500 for our API endpoint
+     """
+     LOG.error("Inference error: %s", details["args"])
+     raise HTTPException(500, "Request to the inference API failed")


+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=should_we_giveup,
+     raise_on_giveup=False,
+     on_giveup=we_give_up,
+ )
  async def submit_text(  # pylint: disable=R0913,R0917
+     http: aiohttp.ClientSession,
      text: str,
      max_tokens: int = -1,
      log_probs: int = 1,
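The backoff decorator added above retries the inference call with exponential backoff on aiohttp.ClientResponseError, refuses to retry 4xx responses via the giveup predicate, and turns exhausted retries into an HTTP 500. A condensed, self-contained sketch of that retry wiring (endpoint URL and payload are placeholders):

    import aiohttp
    import backoff
    from fastapi import HTTPException


    def giveup_on_client_errors(exc: aiohttp.ClientResponseError) -> bool:
        # A truthy return value means "do not retry" -- 4xx errors won't improve.
        return exc.status < 500


    def raise_api_error(details: backoff._typing.Details):
        # Called once retries are exhausted; surface a 500 to our own callers.
        raise HTTPException(500, "Request to the inference API failed")


    @backoff.on_exception(
        backoff.expo,
        aiohttp.ClientResponseError,
        max_tries=3,
        giveup=giveup_on_client_errors,
        raise_on_giveup=False,
        on_giveup=raise_api_error,
    )
    async def query_inference(http: aiohttp.ClientSession) -> dict:
        # raise_for_status=True turns 5xx answers into ClientResponseError,
        # which is what triggers the retry above.
        async with http.post(
            "http://localhost:8000/v1/completions",  # placeholder URL
            json={"prompt": "hello"},
            raise_for_status=True,
        ) as resp:
            return await resp.json()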
@@ -200,14 +243,15 @@ async def submit_text( # pylint: disable=R0913,R0917

      if SERVER_CONFIG.inference.api_endpoint == "/chat/completions":
          return await submit_text_chat_completions(
-             text, headers, max_tokens, log_probs > 0, stream, model
+             http, text, headers, max_tokens, log_probs > 0, stream, model
          )
      return await submit_text_completions(
-         text, headers, max_tokens, log_probs, stream, model
+         http, text, headers, max_tokens, log_probs, stream, model
      )


  async def submit_text_completions(  # pylint: disable=R0913,R0917
+     http: aiohttp.ClientSession,
      text: str,
      headers: dict,
      max_tokens: int = -1,
@@ -230,6 +274,7 @@ async def submit_text_completions( # pylint: disable=R0913,R0917
      }

      response = await submit_to_llm_endpoint(
+         http,
          f"{SERVER_CONFIG.inference.url}/v1/completions",
          data,
          headers,
@@ -242,13 +287,14 @@ async def submit_text_completions( # pylint: disable=R0913,R0917


  async def submit_text_chat_completions(  # pylint: disable=R0913,R0917
+     http: aiohttp.ClientSession,
      text: str,
      headers: dict,
      max_tokens: int = -1,
      log_probs: int = 1,
      stream: bool = False,
      model: str = "default-model",
- ) -> Explanation:
+ ) -> Union[Explanation, StreamReader]:
      """Submit prompt to OpenAI API /chat/completions endpoint.
      max_tokens: number of tokens to be produces, 0 indicates run until encountering EOS
      log_probs: number of token choices to produce log probs for
@@ -270,6 +316,7 @@ async def submit_text_chat_completions( # pylint: disable=R0913,R0917
      }

      response = await submit_to_llm_endpoint(
+         http,
          f"{SERVER_CONFIG.inference.url}/v1/chat/completions",
          data,
          headers,
@@ -277,10 +324,7 @@ async def submit_text_chat_completions( # pylint: disable=R0913,R0917
      )

      if stream:
-         return Explanation(
-             text=response["choices"][0]["delta"]["content"],
-             logprobs=response["choices"][0]["logprobs"]["content"],
-         )
+         return response
      return Explanation(
          text=response["choices"][0]["message"]["content"],
          logprobs=response["choices"][0]["logprobs"]["content"],
@@ -289,17 +333,21 @@ async def submit_text_chat_completions( # pylint: disable=R0913,R0917

  @app.post("/analyze", response_model=Response)
  @track_request()
- async def analyze_log(build_log: BuildLog):
+ async def analyze_log(
+     build_log: BuildLog, http_session: aiohttp.ClientSession = Depends(get_http_session)
+ ):
      """Provide endpoint for log file submission and analysis.
      Request must be in form {"url":"<YOUR_URL_HERE>"}.
      URL must be valid for the request to be passed to the LLM server.
      Meaning that it must contain appropriate scheme, path and netloc,
      while lacking result, params or query fields.
      """
-     log_text = process_url(build_log.url)
+     remote_log = RemoteLog(build_log.url, http_session)
+     log_text = await remote_log.process_url()
      log_summary = mine_logs(log_text)
      log_summary = format_snippets(log_summary)
      response = await submit_text(
+         http_session,
          PROMPT_CONFIG.prompt_template.format(log_summary),
          model=SERVER_CONFIG.inference.model,
          max_tokens=SERVER_CONFIG.inference.max_tokens,
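The /analyze request contract is unchanged by this refactor: clients still POST a JSON body with a url field and get back an explanation plus a response_certainty. A minimal client sketch, assuming a server on localhost:8080 and no API token configured (both assumptions):

    import asyncio

    import aiohttp


    async def main():
        # POST a log URL to the /analyze endpoint and print the result.
        async with aiohttp.ClientSession() as session:
            async with session.post(
                "http://localhost:8080/analyze",  # assumed local server address
                json={"url": "https://example.org/build.log"},  # any reachable log URL
            ) as resp:
                body = await resp.json()
                print(body["response_certainty"], body["explanation"]["text"])


    asyncio.run(main())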
@@ -319,21 +367,26 @@ async def analyze_log(build_log: BuildLog):
      return Response(explanation=response, response_certainty=certainty)


- @app.post("/analyze/staged", response_model=StagedResponse)
  @track_request()
- async def analyze_log_staged(build_log: BuildLog):
+ @app.post("/analyze/staged", response_model=StagedResponse)
+ async def analyze_log_staged(
+     build_log: BuildLog, http_session: aiohttp.ClientSession = Depends(get_http_session)
+ ):
      """Provide endpoint for log file submission and analysis.
      Request must be in form {"url":"<YOUR_URL_HERE>"}.
      URL must be valid for the request to be passed to the LLM server.
      Meaning that it must contain appropriate scheme, path and netloc,
      while lacking result, params or query fields.
      """
-     log_text = process_url(build_log.url)
+     remote_log = RemoteLog(build_log.url, http_session)
+     log_text = await remote_log.process_url()

-     return await perform_staged_analysis(log_text=log_text)
+     return await perform_staged_analysis(http_session, log_text=log_text)


- async def perform_staged_analysis(log_text: str) -> StagedResponse:
+ async def perform_staged_analysis(
+     http: aiohttp.ClientSession, log_text: str
+ ) -> StagedResponse:
      """Submit the log file snippets to the LLM and retrieve their results"""
      log_summary = mine_logs(log_text)

@@ -341,6 +394,7 @@ async def perform_staged_analysis(log_text: str) -> StagedResponse:
      analyzed_snippets = await asyncio.gather(
          *[
              submit_text(
+                 http,
                  PROMPT_CONFIG.snippet_prompt_template.format(s),
                  model=SERVER_CONFIG.inference.model,
                  max_tokens=SERVER_CONFIG.inference.max_tokens,
@@ -358,6 +412,7 @@ async def perform_staged_analysis(log_text: str) -> StagedResponse:
      )

      final_analysis = await submit_text(
+         http,
          final_prompt,
          model=SERVER_CONFIG.inference.model,
          max_tokens=SERVER_CONFIG.inference.max_tokens,
@@ -385,14 +440,17 @@ async def perform_staged_analysis(log_text: str) -> StagedResponse:

  @app.post("/analyze/stream", response_class=StreamingResponse)
  @track_request()
- async def analyze_log_stream(build_log: BuildLog):
+ async def analyze_log_stream(
+     build_log: BuildLog, http_session: aiohttp.ClientSession = Depends(get_http_session)
+ ):
      """Stream response endpoint for Logdetective.
      Request must be in form {"url":"<YOUR_URL_HERE>"}.
      URL must be valid for the request to be passed to the LLM server.
      Meaning that it must contain appropriate scheme, path and netloc,
      while lacking result, params or query fields.
      """
-     log_text = process_url(build_log.url)
+     remote_log = RemoteLog(build_log.url, http_session)
+     log_text = await remote_log.process_url()
      log_summary = mine_logs(log_text)
      log_summary = format_snippets(log_summary)
      headers = {"Content-Type": "application/json"}
@@ -400,42 +458,63 @@ async def analyze_log_stream(build_log: BuildLog):
      if SERVER_CONFIG.inference.api_token:
          headers["Authorization"] = f"Bearer {SERVER_CONFIG.inference.api_token}"

-     stream = await submit_text_chat_completions(
-         PROMPT_CONFIG.prompt_template.format(log_summary), stream=True, headers=headers,
-         model=SERVER_CONFIG.inference.model,
-         max_tokens=SERVER_CONFIG.inference.max_tokens,
-     )
+     try:
+         stream = await submit_text_chat_completions(
+             http_session,
+             PROMPT_CONFIG.prompt_template.format(log_summary),
+             stream=True,
+             headers=headers,
+             model=SERVER_CONFIG.inference.model,
+             max_tokens=SERVER_CONFIG.inference.max_tokens,
+         )
+     except aiohttp.ClientResponseError as ex:
+         raise HTTPException(
+             status_code=400,
+             detail="HTTP Error while getting response from inference server "
+             f"[{ex.status}] {ex.message}",
+         ) from ex

+     # we need to figure out a better response here, this is how it looks rn:
+     # b'data: {"choices":[{"finish_reason":"stop","index":0,"delta":{}}],
+     # "created":1744818071,"id":"chatcmpl-c9geTxNcQO7M9wR...
      return StreamingResponse(stream)


  @app.post("/webhook/gitlab/job_events")
  async def receive_gitlab_job_event_webhook(
-     job_hook: JobHook, background_tasks: BackgroundTasks
+     x_gitlab_instance: Annotated[str | None, Header()],
+     job_hook: JobHook,
+     background_tasks: BackgroundTasks,
+     http: aiohttp.ClientSession = Depends(get_http_session),
  ):
      """Webhook endpoint for receiving job_events notifications from GitLab
      https://docs.gitlab.com/user/project/integrations/webhook_events/#job-events
      lists the full specification for the messages sent for job events."""

+     try:
+         forge = Forge(x_gitlab_instance)
+     except ValueError:
+         LOG.critical("%s is not a recognized forge. Ignoring.", x_gitlab_instance)
+         return BasicResponse(status_code=400)
+
      # Handle the message in the background so we can return 200 immediately
-     background_tasks.add_task(process_gitlab_job_event, job_hook)
+     background_tasks.add_task(process_gitlab_job_event, http, forge, job_hook)

      # No return value or body is required for a webhook.
      # 204: No Content
      return BasicResponse(status_code=204)


- async def process_gitlab_job_event(job_hook):
+ async def process_gitlab_job_event(
+     http: aiohttp.ClientSession,
+     forge: Forge,
+     job_hook: JobHook,
+ ):
      """Handle a received job_event webhook from GitLab"""
-     LOG.debug("Received webhook message:\n%s", job_hook)
+     LOG.debug("Received webhook message from %s:\n%s", forge.value, job_hook)

      # Look up the project this job belongs to
      project = await asyncio.to_thread(app.gitlab_conn.projects.get, job_hook.project_id)
-
-     # check if this project is on the opt-in list
-     if project.name not in SERVER_CONFIG.general.packages:
-         LOG.info("Ignoring unrecognized package %s", project.name)
-         return
      LOG.info("Processing failed job for %s", project.name)

      # Retrieve data about the job from the GitLab API
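The webhook now derives a Forge value from the X-Gitlab-Instance header and answers unknown instances with a 400. The actual Forge members live in logdetective.server.database.models and are not shown in this diff; the sketch below only illustrates the enum-by-value lookup pattern with a made-up member:

    from enum import Enum


    class Forge(str, Enum):
        # Hypothetical value -- the real members are defined in
        # logdetective.server.database.models and may differ.
        GITLAB_COM = "https://gitlab.com"


    def resolve_forge(x_gitlab_instance: str | None) -> Forge | None:
        # Enum lookup by value raises ValueError for unknown instances,
        # which the webhook turns into a 400 response.
        try:
            return Forge(x_gitlab_instance)
        except ValueError:
            return None


    print(resolve_forge("https://gitlab.com"))    # Forge.GITLAB_COM
    print(resolve_forge("https://evil.example"))  # None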
@@ -466,25 +545,49 @@ async def process_gitlab_job_event(job_hook):
      LOG.debug("Retrieving log artifacts")
      # Retrieve the build logs from the merge request artifacts and preprocess them
      try:
-         log_url, preprocessed_log = await retrieve_and_preprocess_koji_logs(job)
+         log_url, preprocessed_log = await retrieve_and_preprocess_koji_logs(http, job)
      except LogsTooLargeError:
          LOG.error("Could not retrieve logs. Too large.")
          raise

      # Submit log to Log Detective and await the results.
      log_text = preprocessed_log.read().decode(encoding="utf-8")
-     staged_response = await perform_staged_analysis(log_text=log_text)
+     metrics_id = await add_new_metrics(
+         api_name=EndpointType.ANALYZE_GITLAB_JOB,
+         url=log_url,
+         http_session=http,
+         compressed_log_content=RemoteLog.zip_text(log_text),
+     )
+     staged_response = await perform_staged_analysis(http, log_text=log_text)
+     update_metrics(metrics_id, staged_response)
      preprocessed_log.close()

+     # check if this project is on the opt-in list for posting comments.
+     if project.name not in SERVER_CONFIG.general.packages:
+         LOG.info("Not publishing comment for unrecognized package %s", project.name)
+         return
+
      # Add the Log Detective response as a comment to the merge request
-     await comment_on_mr(project, merge_request_iid, job, log_url, staged_response)
+     await comment_on_mr(
+         forge,
+         project,
+         merge_request_iid,
+         job,
+         log_url,
+         staged_response,
+         metrics_id,
+     )
+
+     return staged_response


  class LogsTooLargeError(RuntimeError):
      """The log archive exceeds the configured maximum size"""


- async def retrieve_and_preprocess_koji_logs(job: gitlab.v4.objects.ProjectJob):
+ async def retrieve_and_preprocess_koji_logs(
+     http: aiohttp.ClientSession, job: gitlab.v4.objects.ProjectJob
+ ):  # pylint: disable=too-many-branches
      """Download logs from the merge request artifacts

      This function will retrieve the build logs and do some minimal
@@ -495,7 +598,7 @@ async def retrieve_and_preprocess_koji_logs(job: gitlab.v4.objects.ProjectJob):
      Detective. The calling function is responsible for closing this object."""

      # Make sure the file isn't too large to process.
-     if not await check_artifacts_file_size(job):
+     if not await check_artifacts_file_size(http, job):
          raise LogsTooLargeError(
              f"Oversized logs for job {job.id} in project {job.project_id}"
          )
@@ -513,13 +616,16 @@ async def retrieve_and_preprocess_koji_logs(job: gitlab.v4.objects.ProjectJob):
          if zipinfo.filename.endswith("task_failed.log"):
              # The koji logs store this file in two places: 1) in the
              # directory with the failed architecture and 2) in the parent
-             # directory. We actually want to ignore the one in the parent
-             # directory, since the rest of the information is in the
-             # specific task directory.
+             # directory. Most of the time, we want to ignore the one in the
+             # parent directory, since the rest of the information is in the
+             # specific task directory. However, there are some situations
+             # where non-build failures (such as "Target build already exists")
+             # may be presented only at the top level.
              # The paths look like `kojilogs/noarch-XXXXXX/task_failed.log`
              # or `kojilogs/noarch-XXXXXX/x86_64-XXXXXX/task_failed.log`
              path = PurePath(zipinfo.filename)
              if len(path.parts) <= 3:
+                 failed_arches["toplevel"] = path
                  continue

              # Extract the architecture from the immediate parent path
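The hunk below replaces the old single-failed-architecture shortcut with an explicit preference order over the failed_arches keys, including the "toplevel" pseudo-architecture recorded above. The same selection can be written compactly with a priority list; a sketch of that equivalent logic:

    # Order reflects how likely a maintainer is to have access to the hardware.
    ARCH_PRIORITY = ["x86_64", "aarch64", "riscv", "ppc64le", "s390x", "noarch", "toplevel"]


    def pick_failed_arch(failed_arches: dict) -> str:
        # Take the first known architecture in priority order; fall back to the
        # alphabetically first unknown key, mirroring the diff's else branch.
        if not failed_arches:
            raise FileNotFoundError("Could not detect failed architecture.")
        return next(
            (arch for arch in ARCH_PRIORITY if arch in failed_arches),
            sorted(failed_arches)[0],
        )


    print(pick_failed_arch({"s390x": "a", "aarch64": "b"}))  # aarch64
    print(pick_failed_arch({"loongarch64": "c"}))            # loongarch64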
@@ -548,30 +654,32 @@ async def retrieve_and_preprocess_koji_logs(job: gitlab.v4.objects.ProjectJob):
              failed_arches[architecture] = PurePath(path.parent, failure_log_name)

      if not failed_arches:
-         # No failed task found?
+         # No failed task found in the sub-tasks.
          raise FileNotFoundError("Could not detect failed architecture.")

-     # First check if we only found one failed architecture
-     if len(failed_arches) == 1:
-         failed_arch = list(failed_arches.keys())[0]
-
+     # We only want to handle one arch, so we'll check them in order of
+     # "most to least likely for the maintainer to have access to hardware"
+     # This means: x86_64 > aarch64 > riscv > ppc64le > s390x
+     if "x86_64" in failed_arches:
+         failed_arch = "x86_64"
+     elif "aarch64" in failed_arches:
+         failed_arch = "aarch64"
+     elif "riscv" in failed_arches:
+         failed_arch = "riscv"
+     elif "ppc64le" in failed_arches:
+         failed_arch = "ppc64le"
+     elif "s390x" in failed_arches:
+         failed_arch = "s390x"
+     elif "noarch" in failed_arches:
+         # May have failed during BuildSRPMFromSCM phase
+         failed_arch = "noarch"
+     elif "toplevel" in failed_arches:
+         # Probably a Koji-specific error, not a build error
+         failed_arch = "toplevel"
      else:
-         # We only want to handle one arch, so we'll check them in order of
-         # "most to least likely for the maintainer to have access to hardware"
-         # This means: x86_64 > aarch64 > ppc64le > s390x
-         if "x86_64" in failed_arches:
-             failed_arch = "x86_64"
-         elif "aarch64" in failed_arches:
-             failed_arch = "aarch64"
-         elif "ppc64le" in failed_arches:
-             failed_arch = "ppc64le"
-         elif "s390x" in failed_arches:
-             failed_arch = "s390x"
-         else:
-             # It should be impossible for us to get "noarch" here, since
-             # the only way that should happen is for a single architecture
-             # build.
-             raise FileNotFoundError("No failed architecture detected.")
+         # We have one or more architectures that we don't know about? Just
+         # pick the first alphabetically.
+         failed_arch = sorted(list(failed_arches.keys()))[0]

      LOG.debug("Failed architecture: %s", failed_arch)

@@ -584,21 +692,31 @@ async def retrieve_and_preprocess_koji_logs(job: gitlab.v4.objects.ProjectJob):
      return log_url, artifacts_zip.open(log_path)


- async def check_artifacts_file_size(job):
+ async def check_artifacts_file_size(
+     http: aiohttp.ClientSession,
+     job: gitlab.v4.objects.ProjectJob,
+ ):
      """Method to determine if the artifacts are too large to process"""
      # First, make sure that the artifacts are of a reasonable size. The
      # zipped artifact collection will be stored in memory below. The
      # python-gitlab library doesn't expose a way to check this value directly,
      # so we need to interact with directly with the headers.
      artifacts_url = f"{SERVER_CONFIG.gitlab.api_url}/projects/{job.project_id}/jobs/{job.id}/artifacts"  # pylint: disable=line-too-long
-     header_resp = await asyncio.to_thread(
-         requests.head,
-         artifacts_url,
-         allow_redirects=True,
-         headers={"Authorization": f"Bearer {SERVER_CONFIG.gitlab.api_token}"},
-         timeout=(3.07, 5),
-     )
-     content_length = int(header_resp.headers.get("content-length"))
+     LOG.debug("checking artifact URL %s", artifacts_url)
+     try:
+         head_response = await http.head(
+             artifacts_url,
+             allow_redirects=True,
+             headers={"Authorization": f"Bearer {SERVER_CONFIG.gitlab.api_token}"},
+             timeout=5,
+             raise_for_status=True,
+         )
+     except aiohttp.ClientResponseError as ex:
+         raise HTTPException(
+             status_code=400,
+             detail=f"Unable to check artifact URL: [{ex.status}] {ex.message}",
+         ) from ex
+     content_length = int(head_response.headers.get("content-length"))
      LOG.debug(
          "URL: %s, content-length: %d, max length: %d",
          artifacts_url,
@@ -608,12 +726,14 @@ async def check_artifacts_file_size(job):
      return content_length <= SERVER_CONFIG.gitlab.max_artifact_size


- async def comment_on_mr(
+ async def comment_on_mr(  # pylint: disable=too-many-arguments disable=too-many-positional-arguments
+     forge: Forge,
      project: gitlab.v4.objects.Project,
      merge_request_iid: int,
      job: gitlab.v4.objects.ProjectJob,
      log_url: str,
      response: StagedResponse,
+     metrics_id: int,
  ):
      """Add the Log Detective response as a comment to the merge request"""
      LOG.debug(
@@ -623,6 +743,10 @@ async def comment_on_mr(
          response.explanation.text,
      )

+     # First, we'll see if there's an existing comment on this Merge Request
+     # and wrap it in <details></details> to reduce noise.
+     await suppress_latest_comment(forge, project, merge_request_iid)
+
      # Get the formatted short comment.
      short_comment = await generate_mr_comment(job, log_url, response, full=False)

@@ -653,6 +777,67 @@ async def comment_on_mr(
      await asyncio.sleep(5)
      await asyncio.to_thread(note.save)

+     # Save the new comment to the database
+     try:
+         metrics = AnalyzeRequestMetrics.get_metric_by_id(metrics_id)
+         Comments.create(
+             forge,
+             project.id,
+             merge_request_iid,
+             job.id,
+             discussion.id,
+             metrics,
+         )
+     except sqlalchemy.exc.IntegrityError:
+         # We most likely attempted to save a new comment for the same
+         # build job. This is somewhat common during development when we're
+         # submitting requests manually. It shouldn't really happen in
+         # production.
+         if not SERVER_CONFIG.general.devmode:
+             raise
+
+
+ async def suppress_latest_comment(
+     gitlab_instance: str,
+     project: gitlab.v4.objects.Project,
+     merge_request_iid: int,
+ ) -> None:
+     """Look up the latest comment on this Merge Request, if any, and wrap it
+     in a <details></details> block with a comment indicating that it has been
+     superseded by a new push."""
+
+     # Ask the database for the last known comment for this MR
+     previous_comment = Comments.get_latest_comment(
+         gitlab_instance, project.id, merge_request_iid
+     )
+
+     if previous_comment is None:
+         # No existing comment, so nothing to do.
+         return
+
+     # Retrieve its content from the Gitlab API
+
+     # Look up the merge request
+     merge_request = await asyncio.to_thread(
+         project.mergerequests.get, merge_request_iid
+     )
+
+     # Find the discussion matching the latest comment ID
+     discussion = await asyncio.to_thread(
+         merge_request.discussions.get, previous_comment.comment_id
+     )
+
+     # Get the ID of the first note
+     note_id = discussion.attributes["notes"][0]["id"]
+     note = discussion.notes.get(note_id)
+
+     # Wrap the note in <details>, indicating why.
+     note.body = (
+         "This comment has been superseded by a newer "
+         f"Log Detective analysis.\n<details>\n{note.body}\n</details>"
+     )
+     await asyncio.to_thread(note.save)
+

  async def generate_mr_comment(
      job: gitlab.v4.objects.ProjectJob,
@@ -727,61 +912,70 @@ def _multiple_svg_figures_response(figures: list[matplotlib.figure.Figure]):
      return BasicResponse(content=html_content, media_type="text/html")


- @app.get("/metrics/analyze", response_class=StreamingResponse)
- async def show_analyze_metrics(period_since_now: TimePeriod = Depends(TimePeriod)):
-     """Show statistics for requests and responses in the given period of time
-     for the /analyze API endpoint."""
-     fig_requests = plot.requests_per_time(period_since_now, EndpointType.ANALYZE)
-     fig_responses = plot.average_time_per_responses(
-         period_since_now, EndpointType.ANALYZE
-     )
-     return _multiple_svg_figures_response([fig_requests, fig_responses])
+ class MetricRoute(str, Enum):
+     """Routes for metrics"""

+     ANALYZE = "analyze"
+     ANALYZE_STAGED = "analyze-staged"
+     ANALYZE_GITLAB_JOB = "analyze-gitlab"

- @app.get("/metrics/analyze/requests", response_class=StreamingResponse)
- async def show_analyze_requests(period_since_now: TimePeriod = Depends(TimePeriod)):
-     """Show statistics for the requests received in the given period of time
-     for the /analyze API endpoint."""
-     fig = plot.requests_per_time(period_since_now, EndpointType.ANALYZE)
-     return _svg_figure_response(fig)

+ class Plot(str, Enum):
+     """Type of served plots"""

- @app.get("/metrics/analyze/responses", response_class=StreamingResponse)
- async def show_analyze_responses(period_since_now: TimePeriod = Depends(TimePeriod)):
-     """Show statistics for responses given in the specified period of time
-     for the /analyze API endpoint."""
-     fig = plot.average_time_per_responses(period_since_now, EndpointType.ANALYZE)
-     return _svg_figure_response(fig)
+     REQUESTS = "requests"
+     RESPONSES = "responses"
+     BOTH = ""


- @app.get("/metrics/analyze/staged", response_class=StreamingResponse)
- async def show_analyze_staged_metrics(
-     period_since_now: TimePeriod = Depends(TimePeriod),
- ):
-     """Show statistics for requests and responses in the given period of time
-     for the /analyze/staged API endpoint."""
-     fig_requests = plot.requests_per_time(period_since_now, EndpointType.ANALYZE_STAGED)
-     fig_responses = plot.average_time_per_responses(
-         period_since_now, EndpointType.ANALYZE_STAGED
-     )
-     return _multiple_svg_figures_response([fig_requests, fig_responses])
+ ROUTE_TO_ENDPOINT_TYPES = {
+     MetricRoute.ANALYZE: EndpointType.ANALYZE,
+     MetricRoute.ANALYZE_STAGED: EndpointType.ANALYZE_STAGED,
+     MetricRoute.ANALYZE_GITLAB_JOB: EndpointType.ANALYZE_GITLAB_JOB,
+ }


- @app.get("/metrics/analyze/staged/requests", response_class=StreamingResponse)
- async def show_analyze_staged_requests(
+ @app.get("/metrics/{route}/", response_class=StreamingResponse)
+ @app.get("/metrics/{route}/{plot}", response_class=StreamingResponse)
+ async def get_metrics(
+     route: MetricRoute,
+     plot: Plot = Plot.BOTH,
      period_since_now: TimePeriod = Depends(TimePeriod),
  ):
-     """Show statistics for the requests received in the given period of time
-     for the /analyze/staged API endpoint."""
-     fig = plot.requests_per_time(period_since_now, EndpointType.ANALYZE_STAGED)
-     return _svg_figure_response(fig)
-
+     """Get an handler for visualize statistics for the specified endpoint and plot."""
+     endpoint_type = ROUTE_TO_ENDPOINT_TYPES[route]
+
+     async def handler():
+         """Show statistics for the specified endpoint and plot."""
+         if plot == Plot.REQUESTS:
+             fig = plot_engine.requests_per_time(period_since_now, endpoint_type)
+             return _svg_figure_response(fig)
+         if plot == Plot.RESPONSES:
+             fig = plot_engine.average_time_per_responses(
+                 period_since_now, endpoint_type
+             )
+             return _svg_figure_response(fig)
+         # BOTH
+         fig_requests = plot_engine.requests_per_time(period_since_now, endpoint_type)
+         fig_responses = plot_engine.average_time_per_responses(
+             period_since_now, endpoint_type
+         )
+         return _multiple_svg_figures_response([fig_requests, fig_responses])
+
+     descriptions = {
+         Plot.REQUESTS: (
+             "Show statistics for the requests received in the given period of time "
+             f"for the /{endpoint_type.value} API endpoint."
+         ),
+         Plot.RESPONSES: (
+             "Show statistics for responses given in the specified period of time "
+             f"for the /{endpoint_type.value} API endpoint."
+         ),
+         Plot.BOTH: (
+             "Show statistics for requests and responses in the given period of time "
+             f"for the /{endpoint_type.value} API endpoint."
+         ),
+     }
+     handler.__doc__ = descriptions[plot]

- @app.get("/metrics/analyze/staged/responses", response_class=StreamingResponse)
- async def show_analyze_staged_responses(
-     period_since_now: TimePeriod = Depends(TimePeriod),
- ):
-     """Show statistics for responses given in the specified period of time
-     for the /analyze/staged API endpoint."""
-     fig = plot.average_time_per_responses(period_since_now, EndpointType.ANALYZE_STAGED)
-     return _svg_figure_response(fig)
+     return await handler()
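The consolidated endpoint serves the same statistics as the removed per-endpoint handlers, with the route and plot type expressed as path parameters (an empty plot segment selects the combined view). A small client sketch fetching two of the plots (the base URL is illustrative):

    import asyncio

    import aiohttp

    BASE = "http://localhost:8080"  # assumed address of a local Log Detective server


    async def fetch_metric_plots():
        async with aiohttp.ClientSession() as session:
            # Same SVG the old /metrics/analyze/requests endpoint returned.
            async with session.get(f"{BASE}/metrics/analyze/requests") as resp:
                with open("analyze-requests.svg", "wb") as out:
                    out.write(await resp.read())
            # Combined requests + responses view for the GitLab job analyses (HTML).
            async with session.get(f"{BASE}/metrics/analyze-gitlab/") as resp:
                with open("analyze-gitlab-both.html", "wb") as out:
                    out.write(await resp.read())


    asyncio.run(fetch_metric_plots())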