buildgrid 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,19 @@ from contextlib import ExitStack
22
22
  from dataclasses import dataclass
23
23
  from datetime import datetime, timedelta
24
24
  from time import time
25
- from typing import Any, Callable, Generator, Iterable, NamedTuple, Required, Sequence, Tuple, TypedDict, TypeVar, cast
25
+ from typing import (
26
+ Any,
27
+ Callable,
28
+ Generator,
29
+ Iterable,
30
+ NamedTuple,
31
+ Required,
32
+ Sequence,
33
+ Tuple,
34
+ TypedDict,
35
+ TypeVar,
36
+ cast,
37
+ )
26
38
 
27
39
  from buildgrid_metering.client import SyncMeteringServiceClient
28
40
  from buildgrid_metering.models.dataclasses import ComputingUsage, Identity, Usage
@@ -30,7 +42,18 @@ from google.protobuf.any_pb2 import Any as ProtoAny
30
42
  from google.protobuf.internal.containers import RepeatedCompositeFieldContainer
31
43
  from google.protobuf.timestamp_pb2 import Timestamp
32
44
  from grpc import Channel
33
- from sqlalchemy import ColumnExpressionArgument, CursorResult, and_, delete, func, insert, or_, select, text, update
45
+ from sqlalchemy import (
46
+ ColumnExpressionArgument,
47
+ CursorResult,
48
+ and_,
49
+ delete,
50
+ func,
51
+ insert,
52
+ or_,
53
+ select,
54
+ text,
55
+ update,
56
+ )
34
57
  from sqlalchemy.dialects import postgresql
35
58
  from sqlalchemy.exc import IntegrityError
36
59
  from sqlalchemy.orm import Session, joinedload
@@ -51,7 +74,9 @@ from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 impo
51
74
  from buildgrid._protos.build.buildbox.execution_stats_pb2 import ExecutionStatistics
52
75
  from buildgrid._protos.build.buildgrid.identity_pb2 import ClientIdentity
53
76
  from buildgrid._protos.build.buildgrid.introspection_pb2 import JobEvent
54
- from buildgrid._protos.build.buildgrid.quota_pb2 import InstanceQuota as InstanceQuotaProto
77
+ from buildgrid._protos.build.buildgrid.quota_pb2 import (
78
+ InstanceQuota as InstanceQuotaProto,
79
+ )
55
80
  from buildgrid._protos.build.buildgrid.scheduling_pb2 import SchedulingMetadata
56
81
  from buildgrid._protos.google.devtools.remoteworkers.v1test2.bots_pb2 import Lease
57
82
  from buildgrid._protos.google.longrunning import operations_pb2
@@ -62,7 +87,11 @@ from buildgrid.server.actioncache.caches.action_cache_abc import ActionCacheABC
62
87
  from buildgrid.server.cas.storage.storage_abc import StorageABC
63
88
  from buildgrid.server.client.asset import AssetClient
64
89
  from buildgrid.server.client.logstream import logstream_client
65
- from buildgrid.server.context import current_instance, instance_context, try_current_instance
90
+ from buildgrid.server.context import (
91
+ current_instance,
92
+ instance_context,
93
+ try_current_instance,
94
+ )
66
95
  from buildgrid.server.decorators import timed
67
96
  from buildgrid.server.enums import (
68
97
  BotStatus,
@@ -85,11 +114,22 @@ from buildgrid.server.exceptions import (
85
114
  )
86
115
  from buildgrid.server.logging import Tags, buildgrid_logger
87
116
  from buildgrid.server.metrics_names import METRIC
88
- from buildgrid.server.metrics_utils import publish_counter_metric, publish_timer_metric, timer
89
- from buildgrid.server.operations.filtering import DEFAULT_SORT_KEYS, OperationFilter, SortKey
117
+ from buildgrid.server.metrics_utils import (
118
+ publish_counter_metric,
119
+ publish_timer_metric,
120
+ timer,
121
+ )
122
+ from buildgrid.server.operations.filtering import (
123
+ DEFAULT_SORT_KEYS,
124
+ OperationFilter,
125
+ SortKey,
126
+ )
90
127
  from buildgrid.server.scheduler import events
91
128
  from buildgrid.server.scheduler.cohorts import CohortSet
92
- from buildgrid.server.settings import DEFAULT_MAX_EXECUTION_TIMEOUT, SQL_SCHEDULER_METRICS_PUBLISH_INTERVAL_SECONDS
129
+ from buildgrid.server.settings import (
130
+ DEFAULT_MAX_EXECUTION_TIMEOUT,
131
+ SQL_SCHEDULER_METRICS_PUBLISH_INTERVAL_SECONDS,
132
+ )
93
133
  from buildgrid.server.sql.models import Base as OrmBase
94
134
  from buildgrid.server.sql.models import (
95
135
  BotEntry,
@@ -180,9 +220,9 @@ class AgedJobHandlerOptions(NamedTuple):
180
220
  )
181
221
 
182
222
  return AgedJobHandlerOptions(
183
- job_max_age=_dict_to_timedelta(job_max_age_cfg) if job_max_age_cfg else timedelta(days=30),
184
- handling_period=_dict_to_timedelta(handling_period_cfg) if handling_period_cfg else timedelta(minutes=5),
185
- max_handling_window=max_handling_window_cfg if max_handling_window_cfg else 10000,
223
+ job_max_age=(_dict_to_timedelta(job_max_age_cfg) if job_max_age_cfg else timedelta(days=30)),
224
+ handling_period=(_dict_to_timedelta(handling_period_cfg) if handling_period_cfg else timedelta(minutes=5)),
225
+ max_handling_window=(max_handling_window_cfg if max_handling_window_cfg else 10000),
186
226
  )
187
227
 
188
228
 
@@ -196,7 +236,15 @@ BotAssignmentFn = Callable[[Session, JobEntry], Tuple[BotEntry, str] | None]
196
236
 
197
237
  # See `_match_job_to_bot` for parameters
198
238
  MatchJobToBotFn = Callable[
199
- [Session, JobEntry, float, BotAssignmentFn, str | None, ColumnExpressionArgument[bool] | None], None
239
+ [
240
+ Session,
241
+ JobEntry,
242
+ float,
243
+ BotAssignmentFn,
244
+ str | None,
245
+ ColumnExpressionArgument[bool] | None,
246
+ ],
247
+ None,
200
248
  ]
201
249
 
202
250
 
@@ -409,11 +457,17 @@ class Scheduler:
409
457
  except NotFoundError:
410
458
  pass
411
459
  except Exception:
412
- LOGGER.exception("Checking ActionCache for action failed.", tags=dict(digest=action_digest))
460
+ LOGGER.exception(
461
+ "Checking ActionCache for action failed.",
462
+ tags=dict(digest=action_digest),
463
+ )
413
464
 
414
465
  # Extend retention for action
415
466
  self._update_action_retention(
416
- action, action_digest, self.queued_action_retention_hours, instance_name=current_instance()
467
+ action,
468
+ action_digest,
469
+ self.queued_action_retention_hours,
470
+ instance_name=current_instance(),
417
471
  )
418
472
 
419
473
  return self.create_operation_for_new_job(
@@ -456,7 +510,10 @@ class Scheduler:
456
510
 
457
511
  # Reschedule if priority is now greater, and we're still waiting on it to start.
458
512
  if priority < job.priority and job.stage == OperationStage.QUEUED.value:
459
- LOGGER.info("Job assigned a new priority.", tags=dict(job_name=job.name, priority=priority))
513
+ LOGGER.info(
514
+ "Job assigned a new priority.",
515
+ tags=dict(job_name=job.name, priority=priority),
516
+ )
460
517
  job.priority = priority
461
518
  job.assigned = False
462
519
 
@@ -672,7 +729,12 @@ class Scheduler:
672
729
  if job:
673
730
  LOGGER.debug(
674
731
  "Loaded job from db.",
675
- tags=dict(job_name=job_name, job_stage=job.stage, result=job.result, instance_name=job.instance_name),
732
+ tags=dict(
733
+ job_name=job_name,
734
+ job_stage=job.stage,
735
+ result=job.result,
736
+ instance_name=job.instance_name,
737
+ ),
676
738
  )
677
739
 
678
740
  return job
@@ -983,7 +1045,10 @@ class Scheduler:
983
1045
  for platform_filter in platform_filters:
984
1046
  key, value = platform_filter.value.split(":", 1)
985
1047
  platform_clauses.append(
986
- and_(PlatformEntry.key == key, platform_filter.operator(PlatformEntry.value, value))
1048
+ and_(
1049
+ PlatformEntry.key == key,
1050
+ platform_filter.operator(PlatformEntry.value, value),
1051
+ )
987
1052
  )
988
1053
 
989
1054
  job_name_subquery = (
@@ -1078,7 +1143,10 @@ class Scheduler:
1078
1143
  JobEntry.property_label.label("property_label"),
1079
1144
  func.count(JobEntry.name).label("job_count"),
1080
1145
  )
1081
- .where(JobEntry.stage < OperationStage.COMPLETED.value, JobEntry.instance_name == instance_name)
1146
+ .where(
1147
+ JobEntry.stage < OperationStage.COMPLETED.value,
1148
+ JobEntry.instance_name == instance_name,
1149
+ )
1082
1150
  .group_by(JobEntry.stage, JobEntry.property_label),
1083
1151
  ).all()
1084
1152
 
@@ -1112,7 +1180,13 @@ class Scheduler:
1112
1180
  )
1113
1181
  )
1114
1182
 
1115
- def _assign_job_to_bot(self, session: Session, job: JobEntry, bot: BotEntry, assignment_strategy: str = "") -> None:
1183
+ def _assign_job_to_bot(
1184
+ self,
1185
+ session: Session,
1186
+ job: JobEntry,
1187
+ bot: BotEntry,
1188
+ assignment_strategy: str = "",
1189
+ ) -> None:
1116
1190
  """Assigns a job to a bot, updating both the job and bot entries in the database.
1117
1191
  `job` and `bot` ORM objects must be from `session`.
1118
1192
  """
@@ -1174,14 +1248,22 @@ class Scheduler:
1174
1248
  ).scalar_one_or_none():
1175
1249
  LOGGER.debug(
1176
1250
  "Matched bot by sampling.",
1177
- tags={"bot_name": bot.name, "attempt": attempt + 1, "bot_capacity": bot.capacity},
1251
+ tags={
1252
+ "bot_name": bot.name,
1253
+ "attempt": attempt + 1,
1254
+ "bot_capacity": bot.capacity,
1255
+ },
1178
1256
  )
1179
1257
  return bot
1180
1258
  LOGGER.debug("No bot matched by sampling after all attempts.")
1181
1259
  return None
1182
1260
 
1183
1261
  def match_bot_by_capacity(
1184
- self, session: Session, job: JobEntry, sampling: SamplingConfig | None = None, bot_cohort: str | None = None
1262
+ self,
1263
+ session: Session,
1264
+ job: JobEntry,
1265
+ sampling: SamplingConfig | None = None,
1266
+ bot_cohort: str | None = None,
1185
1267
  ) -> Tuple[BotEntry, str] | None:
1186
1268
  """Select a bot for a job by capacity."""
1187
1269
  query = (
@@ -1311,7 +1393,8 @@ class Scheduler:
1311
1393
  )
1312
1394
  .order_by(JobEntry.priority.desc(), JobEntry.queued_timestamp.desc())
1313
1395
  .limit(1)
1314
- .with_for_update(skip_locked=True, of=[BotEntry, JobEntry]) # type: ignore
1396
+ # ignore typing as older version of SQLAlchemy .with_for_update() doesn't understand typing for tuples see https://github.com/sqlalchemy/sqlalchemy/issues/12730
1397
+ .with_for_update(skip_locked=True, of=[BotEntry, JobEntry]) # type: ignore[list-item, unused-ignore]
1315
1398
  .execution_options(populate_existing=True)
1316
1399
  )
1317
1400
  if bot_evicted_job := session.execute(eviction_query).one_or_none():
@@ -1332,7 +1415,11 @@ class Scheduler:
1332
1415
  bot.capacity += 1 # Restore capacity from evicted job
1333
1416
 
1334
1417
  session.add(
1335
- JobHistoryEntry(event_type=JobHistoryEvent.EVICTED.value, job_name=evicted_job.name, payload=None)
1418
+ JobHistoryEntry(
1419
+ event_type=JobHistoryEvent.EVICTED.value,
1420
+ job_name=evicted_job.name,
1421
+ payload=None,
1422
+ )
1336
1423
  )
1337
1424
 
1338
1425
  assignment = (bot, JobAssignmentStrategy.PREEMPTION.value)
@@ -1380,7 +1467,8 @@ class Scheduler:
1380
1467
  # The caller didn't check the usage, we apply a minimum check here against max_quota
1381
1468
  instance_quota = session.execute(
1382
1469
  select(InstanceQuota).where(
1383
- InstanceQuota.bot_cohort == bot.cohort, InstanceQuota.instance_name == job.instance_name
1470
+ InstanceQuota.bot_cohort == bot.cohort,
1471
+ InstanceQuota.instance_name == job.instance_name,
1384
1472
  )
1385
1473
  ).scalar_one_or_none()
1386
1474
  if instance_quota is not None and instance_quota.current_usage >= instance_quota.max_quota:
@@ -1439,7 +1527,9 @@ class Scheduler:
1439
1527
  bot_assignment_fn = bot_assignment_fn or self.match_bot_by_capacity
1440
1528
 
1441
1529
  def assign_with_guard(
1442
- session: Session, match_fn: MatchJobToBotFn, guard: ColumnExpressionArgument[bool]
1530
+ session: Session,
1531
+ match_fn: MatchJobToBotFn,
1532
+ guard: ColumnExpressionArgument[bool],
1443
1533
  ) -> bool:
1444
1534
  instance_names_query = (
1445
1535
  select(InstanceQuota.instance_name).where(InstanceQuota.bot_cohort == cohort).where(guard)
@@ -1458,7 +1548,14 @@ class Scheduler:
1458
1548
 
1459
1549
  job = session.execute(job_statement).scalar_one_or_none()
1460
1550
  if job is not None:
1461
- match_fn(session, job, failure_backoff, bot_assignment_fn, assigner_name, guard)
1551
+ match_fn(
1552
+ session,
1553
+ job,
1554
+ failure_backoff,
1555
+ bot_assignment_fn,
1556
+ assigner_name,
1557
+ guard,
1558
+ )
1462
1559
  return True
1463
1560
 
1464
1561
  return False
@@ -1487,12 +1584,16 @@ class Scheduler:
1487
1584
 
1488
1585
  # First, prioritize instances which are below their minimum quota
1489
1586
  updated = assign_with_guard(
1490
- session, match_with_preemption, InstanceQuota.current_usage < InstanceQuota.min_quota
1587
+ session,
1588
+ match_with_preemption,
1589
+ InstanceQuota.current_usage < InstanceQuota.min_quota,
1491
1590
  )
1492
1591
  # Next, consider instances which are below their maximum quota
1493
1592
  if not updated:
1494
1593
  updated = assign_with_guard(
1495
- session, self._match_job_to_bot, InstanceQuota.current_usage < InstanceQuota.max_quota
1594
+ session,
1595
+ self._match_job_to_bot,
1596
+ InstanceQuota.current_usage < InstanceQuota.max_quota,
1496
1597
  )
1497
1598
 
1498
1599
  return 1 if updated else 0
@@ -1635,7 +1736,9 @@ class Scheduler:
1635
1736
  .with_for_update(skip_locked=True)
1636
1737
  )
1637
1738
  return self._batch_timeout_jobs(
1638
- jobs_to_timeout_stmt, code_pb2.UNAVAILABLE, "Operation has been queued for too long"
1739
+ jobs_to_timeout_stmt,
1740
+ code_pb2.UNAVAILABLE,
1741
+ "Operation has been queued for too long",
1639
1742
  )
1640
1743
 
1641
1744
  def prune_timer_loop(self, shutdown_requested: threading.Event) -> None:
@@ -1684,7 +1787,8 @@ class Scheduler:
1684
1787
  with self._sql.session() as session:
1685
1788
  options = {"synchronize_session": "fetch"}
1686
1789
  num_rows_deleted: int = cast(
1687
- CursorResult[Any], session.execute(delete_stmt, execution_options=options)
1790
+ CursorResult[Any],
1791
+ session.execute(delete_stmt, execution_options=options),
1688
1792
  ).rowcount
1689
1793
 
1690
1794
  if num_rows_deleted:
@@ -1855,7 +1959,11 @@ class Scheduler:
1855
1959
  }
1856
1960
  LOGGER.debug("Closing bot session.", tags=log_tags)
1857
1961
  for job in self._get_incomplete_jobs_for_bot(bot.bot_id, session, with_for_update=True):
1858
- lease_tags = {**log_tags, "db.lease_id": job.name, "db.lease_state": job.lease_state()}
1962
+ lease_tags = {
1963
+ **log_tags,
1964
+ "db.lease_id": job.name,
1965
+ "db.lease_state": job.lease_state(),
1966
+ }
1859
1967
  LOGGER.debug("Reassigning job for bot session.", tags=lease_tags)
1860
1968
  self._retry_job(session, job)
1861
1969
  self._notify_job_updated(job.name, session)
@@ -1866,7 +1974,10 @@ class Scheduler:
1866
1974
  self._batch_update_instance_quota_usage(session, usage_diff)
1867
1975
 
1868
1976
  def session_expiry_timer_loop(self, shutdown_requested: threading.Event) -> None:
1869
- LOGGER.info("Starting BotSession reaper.", tags=dict(keepalive_timeout=self.bot_session_keepalive_timeout))
1977
+ LOGGER.info(
1978
+ "Starting BotSession reaper.",
1979
+ tags=dict(keepalive_timeout=self.bot_session_keepalive_timeout),
1980
+ )
1870
1981
  while not shutdown_requested.is_set():
1871
1982
  try:
1872
1983
  while self.reap_expired_sessions():
@@ -1896,7 +2007,10 @@ class Scheduler:
1896
2007
  LOGGER.warning(
1897
2008
  "BotSession has expired.",
1898
2009
  tags=dict(
1899
- name=bot.name, bot_id=bot.bot_id, instance_name=bot.instance_name, deadline=bot.expiry_time
2010
+ name=bot.name,
2011
+ bot_id=bot.bot_id,
2012
+ instance_name=bot.instance_name,
2013
+ deadline=bot.expiry_time,
1900
2014
  ),
1901
2015
  )
1902
2016
  bots_by_instance[bot.instance_name].append(bot)
@@ -1930,7 +2044,11 @@ class Scheduler:
1930
2044
 
1931
2045
  @timed(METRIC.SCHEDULER.ASSIGNMENT_DURATION)
1932
2046
  def _fetch_job_for_bot(
1933
- self, session: Session, bot: BotEntry, usage_diffs: InstanceQuotaUsageDiffs, log_tags: Tags
2047
+ self,
2048
+ session: Session,
2049
+ bot: BotEntry,
2050
+ usage_diffs: InstanceQuotaUsageDiffs,
2051
+ log_tags: Tags,
1934
2052
  ) -> JobEntry | None:
1935
2053
  # Attempt to fetch a new job for a bot to work on.
1936
2054
  # This can help if there are usually more jobs available than bots.
@@ -1961,7 +2079,10 @@ class Scheduler:
1961
2079
  if next_job := session.execute(job_statement).scalar_one_or_none():
1962
2080
  log_tags["db.next_job_name"] = next_job.name
1963
2081
  self._assign_job_to_bot(
1964
- session, next_job, bot, assignment_strategy=JobAssignmentStrategy.PROACTIVE.value
2082
+ session,
2083
+ next_job,
2084
+ bot,
2085
+ assignment_strategy=JobAssignmentStrategy.PROACTIVE.value,
1965
2086
  )
1966
2087
  start_timestamp = Timestamp()
1967
2088
  start_timestamp.FromDatetime(next_job.queued_timestamp)
@@ -2010,7 +2131,6 @@ class Scheduler:
2010
2131
  "request.bot_id": bot_id,
2011
2132
  "request.bot_status": bot_status,
2012
2133
  "request.bot_name": bot_name,
2013
- "request.leases": {lease.id: lease.state for lease in bot_session_leases},
2014
2134
  "request.capacity": max_capacity,
2015
2135
  }
2016
2136
 
@@ -2112,7 +2232,7 @@ class Scheduler:
2112
2232
  if db_bot_version == bot_version:
2113
2233
  return active_leases, bot_version
2114
2234
 
2115
- bot_jobs_stmt = select(JobEntry).where(
2235
+ bot_job_names_stmt = select(JobEntry.name).where(
2116
2236
  JobEntry.worker_name == bot_id,
2117
2237
  JobEntry.stage >= OperationStage.QUEUED.value,
2118
2238
  JobEntry.stage < OperationStage.COMPLETED.value,
@@ -2120,20 +2240,25 @@ class Scheduler:
2120
2240
 
2121
2241
  # If this bot is instance-restricted, only look for jobs in the current instance.
2122
2242
  if instance_restricted_bot:
2123
- bot_jobs_stmt = bot_jobs_stmt.where(self._job_in_instance_pool())
2243
+ bot_job_names_stmt = bot_job_names_stmt.where(self._job_in_instance_pool())
2124
2244
 
2125
- jobs = {job.name: job for job in session.execute(bot_jobs_stmt).scalars().all()}
2126
- db_lease_ids = set(jobs.keys())
2127
- log_tags["db.leases"] = {job.name: job.lease_state() for job in jobs.values()}
2245
+ db_lease_ids = set(session.execute(bot_job_names_stmt).scalars().all())
2128
2246
 
2129
2247
  for lease in active_leases:
2130
2248
  # Set specific tags in log lines for the lease currently being synchronized.
2131
2249
  # This can help to identify a problematic lease in logs for a bot with multiple leases assigned.
2132
- lease_tags = {**log_tags, "request.lease_id": lease.id, "request.lease_state": lease.state}
2250
+ lease_tags = {
2251
+ **log_tags,
2252
+ "request.lease_id": lease.id,
2253
+ "request.lease_state": lease.state,
2254
+ }
2133
2255
 
2134
2256
  # If the database has no lease, but the work is completed, we probably timed out the last call.
2135
2257
  if lease.id not in db_lease_ids and lease.state == LeaseState.COMPLETED.value:
2136
- LOGGER.debug("No lease in database, but session lease is completed. Skipping.", tags=lease_tags)
2258
+ LOGGER.debug(
2259
+ "No lease in database, but session lease is completed. Skipping.",
2260
+ tags=lease_tags,
2261
+ )
2137
2262
  continue
2138
2263
 
2139
2264
  # Remove this lease ID from db_lease_ids if present, now that we know we're handling it.
@@ -2143,7 +2268,10 @@ class Scheduler:
2143
2268
 
2144
2269
  job = self._get_job(lease.id, session)
2145
2270
  if not job or job.worker_name != bot_id:
2146
- LOGGER.info("Lease is deleted or assigned to another bot. Skipping.", tags=lease_tags)
2271
+ LOGGER.info(
2272
+ "Lease is deleted or assigned to another bot. Skipping.",
2273
+ tags=lease_tags,
2274
+ )
2147
2275
  continue
2148
2276
 
2149
2277
  lease_tags["db.lease_id"] = job.name
@@ -2201,7 +2329,10 @@ class Scheduler:
2201
2329
  if lease_state == LeaseState.PENDING.value:
2202
2330
  # Need another iteration to flip the state to ACTIVE
2203
2331
  # See also `_activate_bot_pending_leases`
2204
- LOGGER.debug("Lease was assigned by an old scheduler during synchronization.", tags=log_tags)
2332
+ LOGGER.debug(
2333
+ "Lease was assigned by an old scheduler during synchronization.",
2334
+ tags=log_tags,
2335
+ )
2205
2336
  continue
2206
2337
 
2207
2338
  # Assign:
@@ -2263,13 +2394,20 @@ class Scheduler:
2263
2394
  raise InvalidArgumentError(f"Bot does not exist while reporting completed leases. {log_tags}")
2264
2395
 
2265
2396
  for lease in completed_leases:
2266
- lease_tags = {**log_tags, "request.lease_id": lease.id, "request.lease_state": lease.state}
2397
+ lease_tags = {
2398
+ **log_tags,
2399
+ "request.lease_id": lease.id,
2400
+ "request.lease_state": lease.state,
2401
+ }
2267
2402
  job = self._get_job(lease.id, session, with_for_update=True)
2268
2403
 
2269
2404
  if not job or job.worker_name != bot.bot_id or job.stage != OperationStage.EXECUTING.value:
2270
2405
  if job:
2271
2406
  lease_tags["job.stage"] = job.stage
2272
- LOGGER.warning("Completed lease points to non-existent or invalid job. Skipping.", tags=lease_tags)
2407
+ LOGGER.warning(
2408
+ "Completed lease points to non-existent or invalid job. Skipping.",
2409
+ tags=lease_tags,
2410
+ )
2273
2411
  continue
2274
2412
 
2275
2413
  completion_tags = {
@@ -2324,7 +2462,8 @@ class Scheduler:
2324
2462
 
2325
2463
  if job.n_tries >= self.max_job_attempts:
2326
2464
  status = status_pb2.Status(
2327
- code=code_pb2.ABORTED, message=f"Job was retried {job.n_tries} unsuccessfully. Aborting."
2465
+ code=code_pb2.ABORTED,
2466
+ message=f"Job was retried {job.n_tries} unsuccessfully. Aborting.",
2328
2467
  )
2329
2468
  self._complete_job(session, job, status=status)
2330
2469
  return
@@ -2486,14 +2625,22 @@ class Scheduler:
2486
2625
  try:
2487
2626
  LOGGER.debug(
2488
2627
  "Recording bot locality hint.",
2489
- tags=dict(job_name=job.name, bot_name=bot_name, locality_hint=job.locality_hint),
2628
+ tags=dict(
2629
+ job_name=job.name,
2630
+ bot_name=bot_name,
2631
+ locality_hint=job.locality_hint,
2632
+ ),
2490
2633
  )
2491
2634
  self._record_bot_locality_hint(session, bot_name, job.locality_hint)
2492
2635
  except Exception:
2493
2636
  # Don't fail job completion if locality hint recording fails
2494
2637
  LOGGER.warning(
2495
2638
  "Failed to record bot locality hint.",
2496
- tags=dict(job_name=job.name, bot_name=bot_name, locality_hint=job.locality_hint),
2639
+ tags=dict(
2640
+ job_name=job.name,
2641
+ bot_name=bot_name,
2642
+ locality_hint=job.locality_hint,
2643
+ ),
2497
2644
  exc_info=True,
2498
2645
  )
2499
2646
 
@@ -2506,7 +2653,9 @@ class Scheduler:
2506
2653
  )
2507
2654
  if action_result.ByteSize() > 0:
2508
2655
  self._update_action_result_retention(
2509
- action_result, retention_hours=self.action_result_retention_hours, instance_name=job.instance_name
2656
+ action_result,
2657
+ retention_hours=self.action_result_retention_hours,
2658
+ instance_name=job.instance_name,
2510
2659
  )
2511
2660
 
2512
2661
  worker_duration = None
@@ -2554,7 +2703,11 @@ class Scheduler:
2554
2703
 
2555
2704
  # bot count by status for each property label
2556
2705
  query_per_label = (
2557
- session.query(BotEntry.bot_status, PropertyLabelEntry.property_label, func.count(BotEntry.bot_status))
2706
+ session.query(
2707
+ BotEntry.bot_status,
2708
+ PropertyLabelEntry.property_label,
2709
+ func.count(BotEntry.bot_status),
2710
+ )
2558
2711
  .join(BotEntry, BotEntry.name == PropertyLabelEntry.bot_name)
2559
2712
  .group_by(BotEntry.bot_status, PropertyLabelEntry.property_label)
2560
2713
  .filter(self._bot_in_instance_pool())
@@ -2571,7 +2724,11 @@ class Scheduler:
2571
2724
  metrics["available_capacity_total"][BotStatus(status)] = cast(int, capacity)
2572
2725
 
2573
2726
  capacity_per_label_stmt = (
2574
- select(BotEntry.bot_status, PropertyLabelEntry.property_label, func.sum(BotEntry.capacity))
2727
+ select(
2728
+ BotEntry.bot_status,
2729
+ PropertyLabelEntry.property_label,
2730
+ func.sum(BotEntry.capacity),
2731
+ )
2575
2732
  .join(BotEntry, BotEntry.name == PropertyLabelEntry.bot_name)
2576
2733
  .group_by(BotEntry.bot_status, PropertyLabelEntry.property_label)
2577
2734
  .where(self._bot_in_instance_pool())
@@ -2595,7 +2752,11 @@ class Scheduler:
2595
2752
 
2596
2753
  locate_bot_stmt = (
2597
2754
  select(BotEntry)
2598
- .where(BotEntry.name == bot_name, BotEntry.bot_id == bot_id, self._bot_in_instance_pool())
2755
+ .where(
2756
+ BotEntry.name == bot_name,
2757
+ BotEntry.bot_id == bot_id,
2758
+ self._bot_in_instance_pool(),
2759
+ )
2599
2760
  .with_for_update()
2600
2761
  )
2601
2762
  with self._sql.session() as session:
@@ -2630,7 +2791,12 @@ class Scheduler:
2630
2791
  stdout_stream_name=job.stdout_stream_write_name or "",
2631
2792
  partial_execution_metadata=self.get_execute_action_metadata(job),
2632
2793
  )
2633
- metadata.append(("executeoperationmetadata-bin", job_metadata.SerializeToString()))
2794
+ metadata.append(
2795
+ (
2796
+ "executeoperationmetadata-bin",
2797
+ job_metadata.SerializeToString(),
2798
+ )
2799
+ )
2634
2800
 
2635
2801
  return metadata
2636
2802
 
@@ -2647,9 +2813,15 @@ class Scheduler:
2647
2813
  assign_timestamp(metadata.worker_start_timestamp, job.worker_start_timestamp)
2648
2814
  assign_timestamp(metadata.worker_completed_timestamp, job.worker_completed_timestamp)
2649
2815
  assign_timestamp(metadata.input_fetch_start_timestamp, job.input_fetch_start_timestamp)
2650
- assign_timestamp(metadata.input_fetch_completed_timestamp, job.input_fetch_completed_timestamp)
2816
+ assign_timestamp(
2817
+ metadata.input_fetch_completed_timestamp,
2818
+ job.input_fetch_completed_timestamp,
2819
+ )
2651
2820
  assign_timestamp(metadata.output_upload_start_timestamp, job.output_upload_start_timestamp)
2652
- assign_timestamp(metadata.output_upload_completed_timestamp, job.output_upload_completed_timestamp)
2821
+ assign_timestamp(
2822
+ metadata.output_upload_completed_timestamp,
2823
+ job.output_upload_completed_timestamp,
2824
+ )
2653
2825
  assign_timestamp(metadata.execution_start_timestamp, job.execution_start_timestamp)
2654
2826
  assign_timestamp(metadata.execution_completed_timestamp, job.execution_completed_timestamp)
2655
2827
 
@@ -2697,7 +2869,12 @@ class Scheduler:
2697
2869
  ) -> None:
2698
2870
  with self._sql_ro.session(expire_on_commit=False) as session:
2699
2871
  self._publish_execution_stats(
2700
- session, job_name, instance_name, execution_metadata, property_label, assigner_name
2872
+ session,
2873
+ job_name,
2874
+ instance_name,
2875
+ execution_metadata,
2876
+ property_label,
2877
+ assigner_name,
2701
2878
  )
2702
2879
 
2703
2880
  def _publish_execution_stats(
@@ -2720,17 +2897,46 @@ class Scheduler:
2720
2897
  upload_start = execution_metadata.output_upload_start_timestamp
2721
2898
  upload_completed = execution_metadata.output_upload_completed_timestamp
2722
2899
 
2723
- self._publish_job_duration(instance_name, queued, worker_completed, "Total", property_label, assigner_name)
2900
+ self._publish_job_duration(
2901
+ instance_name,
2902
+ queued,
2903
+ worker_completed,
2904
+ "Total",
2905
+ property_label,
2906
+ assigner_name,
2907
+ )
2724
2908
  # The Queued time is missing here as it's posted as soon as worker has accepted the job.
2725
2909
  self._publish_job_duration(
2726
- instance_name, worker_start, worker_completed, "Worker", property_label, assigner_name
2910
+ instance_name,
2911
+ worker_start,
2912
+ worker_completed,
2913
+ "Worker",
2914
+ property_label,
2915
+ assigner_name,
2727
2916
  )
2728
- self._publish_job_duration(instance_name, fetch_start, fetch_completed, "Fetch", property_label, assigner_name)
2729
2917
  self._publish_job_duration(
2730
- instance_name, execution_start, execution_completed, "Execution", property_label, assigner_name
2918
+ instance_name,
2919
+ fetch_start,
2920
+ fetch_completed,
2921
+ "Fetch",
2922
+ property_label,
2923
+ assigner_name,
2731
2924
  )
2732
2925
  self._publish_job_duration(
2733
- instance_name, upload_start, upload_completed, "Upload", property_label, assigner_name
2926
+ instance_name,
2927
+ execution_start,
2928
+ execution_completed,
2929
+ "Execution",
2930
+ property_label,
2931
+ assigner_name,
2932
+ )
2933
+ self._publish_job_duration(
2934
+ instance_name,
2935
+ upload_start,
2936
+ upload_completed,
2937
+ "Upload",
2938
+ property_label,
2939
+ assigner_name,
2734
2940
  )
2735
2941
 
2736
2942
  if self.metering_client is None or len(execution_metadata.auxiliary_metadata) == 0:
@@ -2767,10 +2973,18 @@ class Scheduler:
2767
2973
  )
2768
2974
  self.metering_client.put_usage(identity=client_id, operation_name=op.name, usage=usage)
2769
2975
  except Exception as exc:
2770
- LOGGER.exception("Cannot publish resource usage.", tags=dict(job_name=job_name), exc_info=exc)
2976
+ LOGGER.exception(
2977
+ "Cannot publish resource usage.",
2978
+ tags=dict(job_name=job_name),
2979
+ exc_info=exc,
2980
+ )
2771
2981
 
2772
2982
  def _update_action_retention(
2773
- self, action: Action, action_digest: Digest, retention_hours: float | None, instance_name: str
2983
+ self,
2984
+ action: Action,
2985
+ action_digest: Digest,
2986
+ retention_hours: float | None,
2987
+ instance_name: str,
2774
2988
  ) -> None:
2775
2989
  if not self.asset_client or not retention_hours:
2776
2990
  return
@@ -2791,14 +3005,18 @@ class Scheduler:
2791
3005
  instance_name=instance_name,
2792
3006
  )
2793
3007
  LOGGER.debug(
2794
- "Extended the retention of action.", tags=dict(digest=action_digest, retention_hours=retention_hours)
3008
+ "Extended the retention of action.",
3009
+ tags=dict(digest=action_digest, retention_hours=retention_hours),
2795
3010
  )
2796
3011
  except Exception:
2797
3012
  LOGGER.exception("Failed to push action as an asset.", tags=dict(digest=action_digest))
2798
3013
  # Not a fatal path, don't reraise here
2799
3014
 
2800
3015
  def _update_action_result_retention(
2801
- self, action_result: ActionResult, retention_hours: float | None, instance_name: str
3016
+ self,
3017
+ action_result: ActionResult,
3018
+ retention_hours: float | None,
3019
+ instance_name: str,
2802
3020
  ) -> None:
2803
3021
  if not self.asset_client or not retention_hours:
2804
3022
  return
@@ -2842,11 +3060,16 @@ class Scheduler:
2842
3060
  instance_name=instance_name,
2843
3061
  )
2844
3062
  LOGGER.debug(
2845
- "Extended the retention of action result.", tags=dict(digest=digest, retention_hours=retention_hours)
3063
+ "Extended the retention of action result.",
3064
+ tags=dict(digest=digest, retention_hours=retention_hours),
2846
3065
  )
2847
3066
 
2848
3067
  except Exception as e:
2849
- LOGGER.exception("Failed to push action_result as an asset.", tags=dict(digest=digest), exc_info=e)
3068
+ LOGGER.exception(
3069
+ "Failed to push action_result as an asset.",
3070
+ tags=dict(digest=digest),
3071
+ exc_info=e,
3072
+ )
2850
3073
  # Not a fatal path, don't reraise here
2851
3074
 
2852
3075
  def _record_bot_locality_hint(self, session: Session, bot_name: str, locality_hint: str) -> None:
@@ -2885,7 +3108,8 @@ class Scheduler:
2885
3108
  # Delete all hints older than the K-th most recent
2886
3109
  session.execute(
2887
3110
  delete(BotLocalityHintEntry).where(
2888
- BotLocalityHintEntry.bot_name == bot_name, BotLocalityHintEntry.sequence_number < k_th_seq
3111
+ BotLocalityHintEntry.bot_name == bot_name,
3112
+ BotLocalityHintEntry.sequence_number < k_th_seq,
2889
3113
  )
2890
3114
  )
2891
3115
 
@@ -2983,7 +3207,11 @@ class Scheduler:
2983
3207
 
2984
3208
  LOGGER.warning(
2985
3209
  "Instance usage not updated.",
2986
- tags={"cohort": bot_cohort, "instance_name": instance_name, "delta": delta},
3210
+ tags={
3211
+ "cohort": bot_cohort,
3212
+ "instance_name": instance_name,
3213
+ "delta": delta,
3214
+ },
2987
3215
  )
2988
3216
  return False
2989
3217
  return True