buildgrid 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- buildgrid/server/app/cli.py +4 -4
- buildgrid/server/app/settings/parser.py +46 -17
- buildgrid/server/app/settings/schema.yml +16 -2
- buildgrid/server/bots/service.py +4 -10
- buildgrid/server/cas/storage/index/index_abc.py +0 -7
- buildgrid/server/cas/storage/index/sql.py +91 -215
- buildgrid/server/cas/storage/redis_fmb_cache.py +220 -0
- buildgrid/server/enums.py +5 -0
- buildgrid/server/metrics_names.py +0 -2
- buildgrid/server/scheduler/impl.py +298 -70
- buildgrid/server/sql/alembic/versions/3737630fc9cf_remove_deleted_column_from_sql_cas_index.py +43 -0
- buildgrid/server/sql/models.py +0 -2
- buildgrid/server/sql/utils.py +3 -3
- buildgrid/server/utils/bots.py +1 -1
- buildgrid/server/version.py +1 -1
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/METADATA +2 -2
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/RECORD +21 -19
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/WHEEL +1 -1
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/entry_points.txt +0 -0
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {buildgrid-0.3.5.dist-info → buildgrid-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -22,7 +22,19 @@ from contextlib import ExitStack
|
|
|
22
22
|
from dataclasses import dataclass
|
|
23
23
|
from datetime import datetime, timedelta
|
|
24
24
|
from time import time
|
|
25
|
-
from typing import
|
|
25
|
+
from typing import (
|
|
26
|
+
Any,
|
|
27
|
+
Callable,
|
|
28
|
+
Generator,
|
|
29
|
+
Iterable,
|
|
30
|
+
NamedTuple,
|
|
31
|
+
Required,
|
|
32
|
+
Sequence,
|
|
33
|
+
Tuple,
|
|
34
|
+
TypedDict,
|
|
35
|
+
TypeVar,
|
|
36
|
+
cast,
|
|
37
|
+
)
|
|
26
38
|
|
|
27
39
|
from buildgrid_metering.client import SyncMeteringServiceClient
|
|
28
40
|
from buildgrid_metering.models.dataclasses import ComputingUsage, Identity, Usage
|
|
@@ -30,7 +42,18 @@ from google.protobuf.any_pb2 import Any as ProtoAny
|
|
|
30
42
|
from google.protobuf.internal.containers import RepeatedCompositeFieldContainer
|
|
31
43
|
from google.protobuf.timestamp_pb2 import Timestamp
|
|
32
44
|
from grpc import Channel
|
|
33
|
-
from sqlalchemy import
|
|
45
|
+
from sqlalchemy import (
|
|
46
|
+
ColumnExpressionArgument,
|
|
47
|
+
CursorResult,
|
|
48
|
+
and_,
|
|
49
|
+
delete,
|
|
50
|
+
func,
|
|
51
|
+
insert,
|
|
52
|
+
or_,
|
|
53
|
+
select,
|
|
54
|
+
text,
|
|
55
|
+
update,
|
|
56
|
+
)
|
|
34
57
|
from sqlalchemy.dialects import postgresql
|
|
35
58
|
from sqlalchemy.exc import IntegrityError
|
|
36
59
|
from sqlalchemy.orm import Session, joinedload
|
|
@@ -51,7 +74,9 @@ from buildgrid._protos.build.bazel.remote.execution.v2.remote_execution_pb2 impo
|
|
|
51
74
|
from buildgrid._protos.build.buildbox.execution_stats_pb2 import ExecutionStatistics
|
|
52
75
|
from buildgrid._protos.build.buildgrid.identity_pb2 import ClientIdentity
|
|
53
76
|
from buildgrid._protos.build.buildgrid.introspection_pb2 import JobEvent
|
|
54
|
-
from buildgrid._protos.build.buildgrid.quota_pb2 import
|
|
77
|
+
from buildgrid._protos.build.buildgrid.quota_pb2 import (
|
|
78
|
+
InstanceQuota as InstanceQuotaProto,
|
|
79
|
+
)
|
|
55
80
|
from buildgrid._protos.build.buildgrid.scheduling_pb2 import SchedulingMetadata
|
|
56
81
|
from buildgrid._protos.google.devtools.remoteworkers.v1test2.bots_pb2 import Lease
|
|
57
82
|
from buildgrid._protos.google.longrunning import operations_pb2
|
|
@@ -62,7 +87,11 @@ from buildgrid.server.actioncache.caches.action_cache_abc import ActionCacheABC
|
|
|
62
87
|
from buildgrid.server.cas.storage.storage_abc import StorageABC
|
|
63
88
|
from buildgrid.server.client.asset import AssetClient
|
|
64
89
|
from buildgrid.server.client.logstream import logstream_client
|
|
65
|
-
from buildgrid.server.context import
|
|
90
|
+
from buildgrid.server.context import (
|
|
91
|
+
current_instance,
|
|
92
|
+
instance_context,
|
|
93
|
+
try_current_instance,
|
|
94
|
+
)
|
|
66
95
|
from buildgrid.server.decorators import timed
|
|
67
96
|
from buildgrid.server.enums import (
|
|
68
97
|
BotStatus,
|
|
@@ -85,11 +114,22 @@ from buildgrid.server.exceptions import (
|
|
|
85
114
|
)
|
|
86
115
|
from buildgrid.server.logging import Tags, buildgrid_logger
|
|
87
116
|
from buildgrid.server.metrics_names import METRIC
|
|
88
|
-
from buildgrid.server.metrics_utils import
|
|
89
|
-
|
|
117
|
+
from buildgrid.server.metrics_utils import (
|
|
118
|
+
publish_counter_metric,
|
|
119
|
+
publish_timer_metric,
|
|
120
|
+
timer,
|
|
121
|
+
)
|
|
122
|
+
from buildgrid.server.operations.filtering import (
|
|
123
|
+
DEFAULT_SORT_KEYS,
|
|
124
|
+
OperationFilter,
|
|
125
|
+
SortKey,
|
|
126
|
+
)
|
|
90
127
|
from buildgrid.server.scheduler import events
|
|
91
128
|
from buildgrid.server.scheduler.cohorts import CohortSet
|
|
92
|
-
from buildgrid.server.settings import
|
|
129
|
+
from buildgrid.server.settings import (
|
|
130
|
+
DEFAULT_MAX_EXECUTION_TIMEOUT,
|
|
131
|
+
SQL_SCHEDULER_METRICS_PUBLISH_INTERVAL_SECONDS,
|
|
132
|
+
)
|
|
93
133
|
from buildgrid.server.sql.models import Base as OrmBase
|
|
94
134
|
from buildgrid.server.sql.models import (
|
|
95
135
|
BotEntry,
|
|
@@ -180,9 +220,9 @@ class AgedJobHandlerOptions(NamedTuple):
|
|
|
180
220
|
)
|
|
181
221
|
|
|
182
222
|
return AgedJobHandlerOptions(
|
|
183
|
-
job_max_age=_dict_to_timedelta(job_max_age_cfg) if job_max_age_cfg else timedelta(days=30),
|
|
184
|
-
handling_period=_dict_to_timedelta(handling_period_cfg) if handling_period_cfg else timedelta(minutes=5),
|
|
185
|
-
max_handling_window=max_handling_window_cfg if max_handling_window_cfg else 10000,
|
|
223
|
+
job_max_age=(_dict_to_timedelta(job_max_age_cfg) if job_max_age_cfg else timedelta(days=30)),
|
|
224
|
+
handling_period=(_dict_to_timedelta(handling_period_cfg) if handling_period_cfg else timedelta(minutes=5)),
|
|
225
|
+
max_handling_window=(max_handling_window_cfg if max_handling_window_cfg else 10000),
|
|
186
226
|
)
|
|
187
227
|
|
|
188
228
|
|
|
@@ -196,7 +236,15 @@ BotAssignmentFn = Callable[[Session, JobEntry], Tuple[BotEntry, str] | None]
|
|
|
196
236
|
|
|
197
237
|
# See `_match_job_to_bot` for parameters
|
|
198
238
|
MatchJobToBotFn = Callable[
|
|
199
|
-
[
|
|
239
|
+
[
|
|
240
|
+
Session,
|
|
241
|
+
JobEntry,
|
|
242
|
+
float,
|
|
243
|
+
BotAssignmentFn,
|
|
244
|
+
str | None,
|
|
245
|
+
ColumnExpressionArgument[bool] | None,
|
|
246
|
+
],
|
|
247
|
+
None,
|
|
200
248
|
]
|
|
201
249
|
|
|
202
250
|
|
|
@@ -409,11 +457,17 @@ class Scheduler:
|
|
|
409
457
|
except NotFoundError:
|
|
410
458
|
pass
|
|
411
459
|
except Exception:
|
|
412
|
-
LOGGER.exception(
|
|
460
|
+
LOGGER.exception(
|
|
461
|
+
"Checking ActionCache for action failed.",
|
|
462
|
+
tags=dict(digest=action_digest),
|
|
463
|
+
)
|
|
413
464
|
|
|
414
465
|
# Extend retention for action
|
|
415
466
|
self._update_action_retention(
|
|
416
|
-
action,
|
|
467
|
+
action,
|
|
468
|
+
action_digest,
|
|
469
|
+
self.queued_action_retention_hours,
|
|
470
|
+
instance_name=current_instance(),
|
|
417
471
|
)
|
|
418
472
|
|
|
419
473
|
return self.create_operation_for_new_job(
|
|
@@ -456,7 +510,10 @@ class Scheduler:
|
|
|
456
510
|
|
|
457
511
|
# Reschedule if priority is now greater, and we're still waiting on it to start.
|
|
458
512
|
if priority < job.priority and job.stage == OperationStage.QUEUED.value:
|
|
459
|
-
LOGGER.info(
|
|
513
|
+
LOGGER.info(
|
|
514
|
+
"Job assigned a new priority.",
|
|
515
|
+
tags=dict(job_name=job.name, priority=priority),
|
|
516
|
+
)
|
|
460
517
|
job.priority = priority
|
|
461
518
|
job.assigned = False
|
|
462
519
|
|
|
@@ -672,7 +729,12 @@ class Scheduler:
|
|
|
672
729
|
if job:
|
|
673
730
|
LOGGER.debug(
|
|
674
731
|
"Loaded job from db.",
|
|
675
|
-
tags=dict(
|
|
732
|
+
tags=dict(
|
|
733
|
+
job_name=job_name,
|
|
734
|
+
job_stage=job.stage,
|
|
735
|
+
result=job.result,
|
|
736
|
+
instance_name=job.instance_name,
|
|
737
|
+
),
|
|
676
738
|
)
|
|
677
739
|
|
|
678
740
|
return job
|
|
@@ -983,7 +1045,10 @@ class Scheduler:
|
|
|
983
1045
|
for platform_filter in platform_filters:
|
|
984
1046
|
key, value = platform_filter.value.split(":", 1)
|
|
985
1047
|
platform_clauses.append(
|
|
986
|
-
and_(
|
|
1048
|
+
and_(
|
|
1049
|
+
PlatformEntry.key == key,
|
|
1050
|
+
platform_filter.operator(PlatformEntry.value, value),
|
|
1051
|
+
)
|
|
987
1052
|
)
|
|
988
1053
|
|
|
989
1054
|
job_name_subquery = (
|
|
@@ -1078,7 +1143,10 @@ class Scheduler:
|
|
|
1078
1143
|
JobEntry.property_label.label("property_label"),
|
|
1079
1144
|
func.count(JobEntry.name).label("job_count"),
|
|
1080
1145
|
)
|
|
1081
|
-
.where(
|
|
1146
|
+
.where(
|
|
1147
|
+
JobEntry.stage < OperationStage.COMPLETED.value,
|
|
1148
|
+
JobEntry.instance_name == instance_name,
|
|
1149
|
+
)
|
|
1082
1150
|
.group_by(JobEntry.stage, JobEntry.property_label),
|
|
1083
1151
|
).all()
|
|
1084
1152
|
|
|
@@ -1112,7 +1180,13 @@ class Scheduler:
|
|
|
1112
1180
|
)
|
|
1113
1181
|
)
|
|
1114
1182
|
|
|
1115
|
-
def _assign_job_to_bot(
|
|
1183
|
+
def _assign_job_to_bot(
|
|
1184
|
+
self,
|
|
1185
|
+
session: Session,
|
|
1186
|
+
job: JobEntry,
|
|
1187
|
+
bot: BotEntry,
|
|
1188
|
+
assignment_strategy: str = "",
|
|
1189
|
+
) -> None:
|
|
1116
1190
|
"""Assigns a job to a bot, updating both the job and bot entries in the database.
|
|
1117
1191
|
`job` and `bot` ORM objects must be from `session`.
|
|
1118
1192
|
"""
|
|
@@ -1174,14 +1248,22 @@ class Scheduler:
|
|
|
1174
1248
|
).scalar_one_or_none():
|
|
1175
1249
|
LOGGER.debug(
|
|
1176
1250
|
"Matched bot by sampling.",
|
|
1177
|
-
tags={
|
|
1251
|
+
tags={
|
|
1252
|
+
"bot_name": bot.name,
|
|
1253
|
+
"attempt": attempt + 1,
|
|
1254
|
+
"bot_capacity": bot.capacity,
|
|
1255
|
+
},
|
|
1178
1256
|
)
|
|
1179
1257
|
return bot
|
|
1180
1258
|
LOGGER.debug("No bot matched by sampling after all attempts.")
|
|
1181
1259
|
return None
|
|
1182
1260
|
|
|
1183
1261
|
def match_bot_by_capacity(
|
|
1184
|
-
self,
|
|
1262
|
+
self,
|
|
1263
|
+
session: Session,
|
|
1264
|
+
job: JobEntry,
|
|
1265
|
+
sampling: SamplingConfig | None = None,
|
|
1266
|
+
bot_cohort: str | None = None,
|
|
1185
1267
|
) -> Tuple[BotEntry, str] | None:
|
|
1186
1268
|
"""Select a bot for a job by capacity."""
|
|
1187
1269
|
query = (
|
|
@@ -1311,7 +1393,8 @@ class Scheduler:
|
|
|
1311
1393
|
)
|
|
1312
1394
|
.order_by(JobEntry.priority.desc(), JobEntry.queued_timestamp.desc())
|
|
1313
1395
|
.limit(1)
|
|
1314
|
-
.with_for_update(
|
|
1396
|
+
# ignore typing as older version of SQLAlchemy .with_for_update() doesn't understand typing for tuples see https://github.com/sqlalchemy/sqlalchemy/issues/12730
|
|
1397
|
+
.with_for_update(skip_locked=True, of=[BotEntry, JobEntry]) # type: ignore[list-item, unused-ignore]
|
|
1315
1398
|
.execution_options(populate_existing=True)
|
|
1316
1399
|
)
|
|
1317
1400
|
if bot_evicted_job := session.execute(eviction_query).one_or_none():
|
|
@@ -1332,7 +1415,11 @@ class Scheduler:
|
|
|
1332
1415
|
bot.capacity += 1 # Restore capacity from evicted job
|
|
1333
1416
|
|
|
1334
1417
|
session.add(
|
|
1335
|
-
JobHistoryEntry(
|
|
1418
|
+
JobHistoryEntry(
|
|
1419
|
+
event_type=JobHistoryEvent.EVICTED.value,
|
|
1420
|
+
job_name=evicted_job.name,
|
|
1421
|
+
payload=None,
|
|
1422
|
+
)
|
|
1336
1423
|
)
|
|
1337
1424
|
|
|
1338
1425
|
assignment = (bot, JobAssignmentStrategy.PREEMPTION.value)
|
|
@@ -1380,7 +1467,8 @@ class Scheduler:
|
|
|
1380
1467
|
# The caller didn't check the usage, we apply a minimum check here against max_quota
|
|
1381
1468
|
instance_quota = session.execute(
|
|
1382
1469
|
select(InstanceQuota).where(
|
|
1383
|
-
InstanceQuota.bot_cohort == bot.cohort,
|
|
1470
|
+
InstanceQuota.bot_cohort == bot.cohort,
|
|
1471
|
+
InstanceQuota.instance_name == job.instance_name,
|
|
1384
1472
|
)
|
|
1385
1473
|
).scalar_one_or_none()
|
|
1386
1474
|
if instance_quota is not None and instance_quota.current_usage >= instance_quota.max_quota:
|
|
@@ -1439,7 +1527,9 @@ class Scheduler:
|
|
|
1439
1527
|
bot_assignment_fn = bot_assignment_fn or self.match_bot_by_capacity
|
|
1440
1528
|
|
|
1441
1529
|
def assign_with_guard(
|
|
1442
|
-
session: Session,
|
|
1530
|
+
session: Session,
|
|
1531
|
+
match_fn: MatchJobToBotFn,
|
|
1532
|
+
guard: ColumnExpressionArgument[bool],
|
|
1443
1533
|
) -> bool:
|
|
1444
1534
|
instance_names_query = (
|
|
1445
1535
|
select(InstanceQuota.instance_name).where(InstanceQuota.bot_cohort == cohort).where(guard)
|
|
@@ -1458,7 +1548,14 @@ class Scheduler:
|
|
|
1458
1548
|
|
|
1459
1549
|
job = session.execute(job_statement).scalar_one_or_none()
|
|
1460
1550
|
if job is not None:
|
|
1461
|
-
match_fn(
|
|
1551
|
+
match_fn(
|
|
1552
|
+
session,
|
|
1553
|
+
job,
|
|
1554
|
+
failure_backoff,
|
|
1555
|
+
bot_assignment_fn,
|
|
1556
|
+
assigner_name,
|
|
1557
|
+
guard,
|
|
1558
|
+
)
|
|
1462
1559
|
return True
|
|
1463
1560
|
|
|
1464
1561
|
return False
|
|
@@ -1487,12 +1584,16 @@ class Scheduler:
|
|
|
1487
1584
|
|
|
1488
1585
|
# First, prioritize instances which are below their minimum quota
|
|
1489
1586
|
updated = assign_with_guard(
|
|
1490
|
-
session,
|
|
1587
|
+
session,
|
|
1588
|
+
match_with_preemption,
|
|
1589
|
+
InstanceQuota.current_usage < InstanceQuota.min_quota,
|
|
1491
1590
|
)
|
|
1492
1591
|
# Next, consider instances which are below their maximum quota
|
|
1493
1592
|
if not updated:
|
|
1494
1593
|
updated = assign_with_guard(
|
|
1495
|
-
session,
|
|
1594
|
+
session,
|
|
1595
|
+
self._match_job_to_bot,
|
|
1596
|
+
InstanceQuota.current_usage < InstanceQuota.max_quota,
|
|
1496
1597
|
)
|
|
1497
1598
|
|
|
1498
1599
|
return 1 if updated else 0
|
|
@@ -1635,7 +1736,9 @@ class Scheduler:
|
|
|
1635
1736
|
.with_for_update(skip_locked=True)
|
|
1636
1737
|
)
|
|
1637
1738
|
return self._batch_timeout_jobs(
|
|
1638
|
-
jobs_to_timeout_stmt,
|
|
1739
|
+
jobs_to_timeout_stmt,
|
|
1740
|
+
code_pb2.UNAVAILABLE,
|
|
1741
|
+
"Operation has been queued for too long",
|
|
1639
1742
|
)
|
|
1640
1743
|
|
|
1641
1744
|
def prune_timer_loop(self, shutdown_requested: threading.Event) -> None:
|
|
@@ -1684,7 +1787,8 @@ class Scheduler:
|
|
|
1684
1787
|
with self._sql.session() as session:
|
|
1685
1788
|
options = {"synchronize_session": "fetch"}
|
|
1686
1789
|
num_rows_deleted: int = cast(
|
|
1687
|
-
CursorResult[Any],
|
|
1790
|
+
CursorResult[Any],
|
|
1791
|
+
session.execute(delete_stmt, execution_options=options),
|
|
1688
1792
|
).rowcount
|
|
1689
1793
|
|
|
1690
1794
|
if num_rows_deleted:
|
|
@@ -1855,7 +1959,11 @@ class Scheduler:
|
|
|
1855
1959
|
}
|
|
1856
1960
|
LOGGER.debug("Closing bot session.", tags=log_tags)
|
|
1857
1961
|
for job in self._get_incomplete_jobs_for_bot(bot.bot_id, session, with_for_update=True):
|
|
1858
|
-
lease_tags = {
|
|
1962
|
+
lease_tags = {
|
|
1963
|
+
**log_tags,
|
|
1964
|
+
"db.lease_id": job.name,
|
|
1965
|
+
"db.lease_state": job.lease_state(),
|
|
1966
|
+
}
|
|
1859
1967
|
LOGGER.debug("Reassigning job for bot session.", tags=lease_tags)
|
|
1860
1968
|
self._retry_job(session, job)
|
|
1861
1969
|
self._notify_job_updated(job.name, session)
|
|
@@ -1866,7 +1974,10 @@ class Scheduler:
|
|
|
1866
1974
|
self._batch_update_instance_quota_usage(session, usage_diff)
|
|
1867
1975
|
|
|
1868
1976
|
def session_expiry_timer_loop(self, shutdown_requested: threading.Event) -> None:
|
|
1869
|
-
LOGGER.info(
|
|
1977
|
+
LOGGER.info(
|
|
1978
|
+
"Starting BotSession reaper.",
|
|
1979
|
+
tags=dict(keepalive_timeout=self.bot_session_keepalive_timeout),
|
|
1980
|
+
)
|
|
1870
1981
|
while not shutdown_requested.is_set():
|
|
1871
1982
|
try:
|
|
1872
1983
|
while self.reap_expired_sessions():
|
|
@@ -1896,7 +2007,10 @@ class Scheduler:
|
|
|
1896
2007
|
LOGGER.warning(
|
|
1897
2008
|
"BotSession has expired.",
|
|
1898
2009
|
tags=dict(
|
|
1899
|
-
name=bot.name,
|
|
2010
|
+
name=bot.name,
|
|
2011
|
+
bot_id=bot.bot_id,
|
|
2012
|
+
instance_name=bot.instance_name,
|
|
2013
|
+
deadline=bot.expiry_time,
|
|
1900
2014
|
),
|
|
1901
2015
|
)
|
|
1902
2016
|
bots_by_instance[bot.instance_name].append(bot)
|
|
@@ -1930,7 +2044,11 @@ class Scheduler:
|
|
|
1930
2044
|
|
|
1931
2045
|
@timed(METRIC.SCHEDULER.ASSIGNMENT_DURATION)
|
|
1932
2046
|
def _fetch_job_for_bot(
|
|
1933
|
-
self,
|
|
2047
|
+
self,
|
|
2048
|
+
session: Session,
|
|
2049
|
+
bot: BotEntry,
|
|
2050
|
+
usage_diffs: InstanceQuotaUsageDiffs,
|
|
2051
|
+
log_tags: Tags,
|
|
1934
2052
|
) -> JobEntry | None:
|
|
1935
2053
|
# Attempt to fetch a new job for a bot to work on.
|
|
1936
2054
|
# This can help if there are usually more jobs available than bots.
|
|
@@ -1961,7 +2079,10 @@ class Scheduler:
|
|
|
1961
2079
|
if next_job := session.execute(job_statement).scalar_one_or_none():
|
|
1962
2080
|
log_tags["db.next_job_name"] = next_job.name
|
|
1963
2081
|
self._assign_job_to_bot(
|
|
1964
|
-
session,
|
|
2082
|
+
session,
|
|
2083
|
+
next_job,
|
|
2084
|
+
bot,
|
|
2085
|
+
assignment_strategy=JobAssignmentStrategy.PROACTIVE.value,
|
|
1965
2086
|
)
|
|
1966
2087
|
start_timestamp = Timestamp()
|
|
1967
2088
|
start_timestamp.FromDatetime(next_job.queued_timestamp)
|
|
@@ -2010,7 +2131,6 @@ class Scheduler:
|
|
|
2010
2131
|
"request.bot_id": bot_id,
|
|
2011
2132
|
"request.bot_status": bot_status,
|
|
2012
2133
|
"request.bot_name": bot_name,
|
|
2013
|
-
"request.leases": {lease.id: lease.state for lease in bot_session_leases},
|
|
2014
2134
|
"request.capacity": max_capacity,
|
|
2015
2135
|
}
|
|
2016
2136
|
|
|
@@ -2112,7 +2232,7 @@ class Scheduler:
|
|
|
2112
2232
|
if db_bot_version == bot_version:
|
|
2113
2233
|
return active_leases, bot_version
|
|
2114
2234
|
|
|
2115
|
-
|
|
2235
|
+
bot_job_names_stmt = select(JobEntry.name).where(
|
|
2116
2236
|
JobEntry.worker_name == bot_id,
|
|
2117
2237
|
JobEntry.stage >= OperationStage.QUEUED.value,
|
|
2118
2238
|
JobEntry.stage < OperationStage.COMPLETED.value,
|
|
@@ -2120,20 +2240,25 @@ class Scheduler:
|
|
|
2120
2240
|
|
|
2121
2241
|
# If this bot is instance-restricted, only look for jobs in the current instance.
|
|
2122
2242
|
if instance_restricted_bot:
|
|
2123
|
-
|
|
2243
|
+
bot_job_names_stmt = bot_job_names_stmt.where(self._job_in_instance_pool())
|
|
2124
2244
|
|
|
2125
|
-
|
|
2126
|
-
db_lease_ids = set(jobs.keys())
|
|
2127
|
-
log_tags["db.leases"] = {job.name: job.lease_state() for job in jobs.values()}
|
|
2245
|
+
db_lease_ids = set(session.execute(bot_job_names_stmt).scalars().all())
|
|
2128
2246
|
|
|
2129
2247
|
for lease in active_leases:
|
|
2130
2248
|
# Set specific tags in log lines for the lease currently being synchronized.
|
|
2131
2249
|
# This can help to identify a problematic lease in logs for a bot with multiple leases assigned.
|
|
2132
|
-
lease_tags = {
|
|
2250
|
+
lease_tags = {
|
|
2251
|
+
**log_tags,
|
|
2252
|
+
"request.lease_id": lease.id,
|
|
2253
|
+
"request.lease_state": lease.state,
|
|
2254
|
+
}
|
|
2133
2255
|
|
|
2134
2256
|
# If the database has no lease, but the work is completed, we probably timed out the last call.
|
|
2135
2257
|
if lease.id not in db_lease_ids and lease.state == LeaseState.COMPLETED.value:
|
|
2136
|
-
LOGGER.debug(
|
|
2258
|
+
LOGGER.debug(
|
|
2259
|
+
"No lease in database, but session lease is completed. Skipping.",
|
|
2260
|
+
tags=lease_tags,
|
|
2261
|
+
)
|
|
2137
2262
|
continue
|
|
2138
2263
|
|
|
2139
2264
|
# Remove this lease ID from db_lease_ids if present, now that we know we're handling it.
|
|
@@ -2143,7 +2268,10 @@ class Scheduler:
|
|
|
2143
2268
|
|
|
2144
2269
|
job = self._get_job(lease.id, session)
|
|
2145
2270
|
if not job or job.worker_name != bot_id:
|
|
2146
|
-
LOGGER.info(
|
|
2271
|
+
LOGGER.info(
|
|
2272
|
+
"Lease is deleted or assigned to another bot. Skipping.",
|
|
2273
|
+
tags=lease_tags,
|
|
2274
|
+
)
|
|
2147
2275
|
continue
|
|
2148
2276
|
|
|
2149
2277
|
lease_tags["db.lease_id"] = job.name
|
|
@@ -2201,7 +2329,10 @@ class Scheduler:
|
|
|
2201
2329
|
if lease_state == LeaseState.PENDING.value:
|
|
2202
2330
|
# Need another iteration to flip the state to ACTIVE
|
|
2203
2331
|
# See also `_activate_bot_pending_leases`
|
|
2204
|
-
LOGGER.debug(
|
|
2332
|
+
LOGGER.debug(
|
|
2333
|
+
"Lease was assigned by an old scheduler during synchronization.",
|
|
2334
|
+
tags=log_tags,
|
|
2335
|
+
)
|
|
2205
2336
|
continue
|
|
2206
2337
|
|
|
2207
2338
|
# Assign:
|
|
@@ -2263,13 +2394,20 @@ class Scheduler:
|
|
|
2263
2394
|
raise InvalidArgumentError(f"Bot does not exist while reporting completed leases. {log_tags}")
|
|
2264
2395
|
|
|
2265
2396
|
for lease in completed_leases:
|
|
2266
|
-
lease_tags = {
|
|
2397
|
+
lease_tags = {
|
|
2398
|
+
**log_tags,
|
|
2399
|
+
"request.lease_id": lease.id,
|
|
2400
|
+
"request.lease_state": lease.state,
|
|
2401
|
+
}
|
|
2267
2402
|
job = self._get_job(lease.id, session, with_for_update=True)
|
|
2268
2403
|
|
|
2269
2404
|
if not job or job.worker_name != bot.bot_id or job.stage != OperationStage.EXECUTING.value:
|
|
2270
2405
|
if job:
|
|
2271
2406
|
lease_tags["job.stage"] = job.stage
|
|
2272
|
-
LOGGER.warning(
|
|
2407
|
+
LOGGER.warning(
|
|
2408
|
+
"Completed lease points to non-existent or invalid job. Skipping.",
|
|
2409
|
+
tags=lease_tags,
|
|
2410
|
+
)
|
|
2273
2411
|
continue
|
|
2274
2412
|
|
|
2275
2413
|
completion_tags = {
|
|
@@ -2324,7 +2462,8 @@ class Scheduler:
|
|
|
2324
2462
|
|
|
2325
2463
|
if job.n_tries >= self.max_job_attempts:
|
|
2326
2464
|
status = status_pb2.Status(
|
|
2327
|
-
code=code_pb2.ABORTED,
|
|
2465
|
+
code=code_pb2.ABORTED,
|
|
2466
|
+
message=f"Job was retried {job.n_tries} unsuccessfully. Aborting.",
|
|
2328
2467
|
)
|
|
2329
2468
|
self._complete_job(session, job, status=status)
|
|
2330
2469
|
return
|
|
@@ -2486,14 +2625,22 @@ class Scheduler:
|
|
|
2486
2625
|
try:
|
|
2487
2626
|
LOGGER.debug(
|
|
2488
2627
|
"Recording bot locality hint.",
|
|
2489
|
-
tags=dict(
|
|
2628
|
+
tags=dict(
|
|
2629
|
+
job_name=job.name,
|
|
2630
|
+
bot_name=bot_name,
|
|
2631
|
+
locality_hint=job.locality_hint,
|
|
2632
|
+
),
|
|
2490
2633
|
)
|
|
2491
2634
|
self._record_bot_locality_hint(session, bot_name, job.locality_hint)
|
|
2492
2635
|
except Exception:
|
|
2493
2636
|
# Don't fail job completion if locality hint recording fails
|
|
2494
2637
|
LOGGER.warning(
|
|
2495
2638
|
"Failed to record bot locality hint.",
|
|
2496
|
-
tags=dict(
|
|
2639
|
+
tags=dict(
|
|
2640
|
+
job_name=job.name,
|
|
2641
|
+
bot_name=bot_name,
|
|
2642
|
+
locality_hint=job.locality_hint,
|
|
2643
|
+
),
|
|
2497
2644
|
exc_info=True,
|
|
2498
2645
|
)
|
|
2499
2646
|
|
|
@@ -2506,7 +2653,9 @@ class Scheduler:
|
|
|
2506
2653
|
)
|
|
2507
2654
|
if action_result.ByteSize() > 0:
|
|
2508
2655
|
self._update_action_result_retention(
|
|
2509
|
-
action_result,
|
|
2656
|
+
action_result,
|
|
2657
|
+
retention_hours=self.action_result_retention_hours,
|
|
2658
|
+
instance_name=job.instance_name,
|
|
2510
2659
|
)
|
|
2511
2660
|
|
|
2512
2661
|
worker_duration = None
|
|
@@ -2554,7 +2703,11 @@ class Scheduler:
|
|
|
2554
2703
|
|
|
2555
2704
|
# bot count by status for each property label
|
|
2556
2705
|
query_per_label = (
|
|
2557
|
-
session.query(
|
|
2706
|
+
session.query(
|
|
2707
|
+
BotEntry.bot_status,
|
|
2708
|
+
PropertyLabelEntry.property_label,
|
|
2709
|
+
func.count(BotEntry.bot_status),
|
|
2710
|
+
)
|
|
2558
2711
|
.join(BotEntry, BotEntry.name == PropertyLabelEntry.bot_name)
|
|
2559
2712
|
.group_by(BotEntry.bot_status, PropertyLabelEntry.property_label)
|
|
2560
2713
|
.filter(self._bot_in_instance_pool())
|
|
@@ -2571,7 +2724,11 @@ class Scheduler:
|
|
|
2571
2724
|
metrics["available_capacity_total"][BotStatus(status)] = cast(int, capacity)
|
|
2572
2725
|
|
|
2573
2726
|
capacity_per_label_stmt = (
|
|
2574
|
-
select(
|
|
2727
|
+
select(
|
|
2728
|
+
BotEntry.bot_status,
|
|
2729
|
+
PropertyLabelEntry.property_label,
|
|
2730
|
+
func.sum(BotEntry.capacity),
|
|
2731
|
+
)
|
|
2575
2732
|
.join(BotEntry, BotEntry.name == PropertyLabelEntry.bot_name)
|
|
2576
2733
|
.group_by(BotEntry.bot_status, PropertyLabelEntry.property_label)
|
|
2577
2734
|
.where(self._bot_in_instance_pool())
|
|
@@ -2595,7 +2752,11 @@ class Scheduler:
|
|
|
2595
2752
|
|
|
2596
2753
|
locate_bot_stmt = (
|
|
2597
2754
|
select(BotEntry)
|
|
2598
|
-
.where(
|
|
2755
|
+
.where(
|
|
2756
|
+
BotEntry.name == bot_name,
|
|
2757
|
+
BotEntry.bot_id == bot_id,
|
|
2758
|
+
self._bot_in_instance_pool(),
|
|
2759
|
+
)
|
|
2599
2760
|
.with_for_update()
|
|
2600
2761
|
)
|
|
2601
2762
|
with self._sql.session() as session:
|
|
@@ -2630,7 +2791,12 @@ class Scheduler:
|
|
|
2630
2791
|
stdout_stream_name=job.stdout_stream_write_name or "",
|
|
2631
2792
|
partial_execution_metadata=self.get_execute_action_metadata(job),
|
|
2632
2793
|
)
|
|
2633
|
-
metadata.append(
|
|
2794
|
+
metadata.append(
|
|
2795
|
+
(
|
|
2796
|
+
"executeoperationmetadata-bin",
|
|
2797
|
+
job_metadata.SerializeToString(),
|
|
2798
|
+
)
|
|
2799
|
+
)
|
|
2634
2800
|
|
|
2635
2801
|
return metadata
|
|
2636
2802
|
|
|
@@ -2647,9 +2813,15 @@ class Scheduler:
|
|
|
2647
2813
|
assign_timestamp(metadata.worker_start_timestamp, job.worker_start_timestamp)
|
|
2648
2814
|
assign_timestamp(metadata.worker_completed_timestamp, job.worker_completed_timestamp)
|
|
2649
2815
|
assign_timestamp(metadata.input_fetch_start_timestamp, job.input_fetch_start_timestamp)
|
|
2650
|
-
assign_timestamp(
|
|
2816
|
+
assign_timestamp(
|
|
2817
|
+
metadata.input_fetch_completed_timestamp,
|
|
2818
|
+
job.input_fetch_completed_timestamp,
|
|
2819
|
+
)
|
|
2651
2820
|
assign_timestamp(metadata.output_upload_start_timestamp, job.output_upload_start_timestamp)
|
|
2652
|
-
assign_timestamp(
|
|
2821
|
+
assign_timestamp(
|
|
2822
|
+
metadata.output_upload_completed_timestamp,
|
|
2823
|
+
job.output_upload_completed_timestamp,
|
|
2824
|
+
)
|
|
2653
2825
|
assign_timestamp(metadata.execution_start_timestamp, job.execution_start_timestamp)
|
|
2654
2826
|
assign_timestamp(metadata.execution_completed_timestamp, job.execution_completed_timestamp)
|
|
2655
2827
|
|
|
@@ -2697,7 +2869,12 @@ class Scheduler:
|
|
|
2697
2869
|
) -> None:
|
|
2698
2870
|
with self._sql_ro.session(expire_on_commit=False) as session:
|
|
2699
2871
|
self._publish_execution_stats(
|
|
2700
|
-
session,
|
|
2872
|
+
session,
|
|
2873
|
+
job_name,
|
|
2874
|
+
instance_name,
|
|
2875
|
+
execution_metadata,
|
|
2876
|
+
property_label,
|
|
2877
|
+
assigner_name,
|
|
2701
2878
|
)
|
|
2702
2879
|
|
|
2703
2880
|
def _publish_execution_stats(
|
|
@@ -2720,17 +2897,46 @@ class Scheduler:
|
|
|
2720
2897
|
upload_start = execution_metadata.output_upload_start_timestamp
|
|
2721
2898
|
upload_completed = execution_metadata.output_upload_completed_timestamp
|
|
2722
2899
|
|
|
2723
|
-
self._publish_job_duration(
|
|
2900
|
+
self._publish_job_duration(
|
|
2901
|
+
instance_name,
|
|
2902
|
+
queued,
|
|
2903
|
+
worker_completed,
|
|
2904
|
+
"Total",
|
|
2905
|
+
property_label,
|
|
2906
|
+
assigner_name,
|
|
2907
|
+
)
|
|
2724
2908
|
# The Queued time is missing here as it's posted as soon as worker has accepted the job.
|
|
2725
2909
|
self._publish_job_duration(
|
|
2726
|
-
instance_name,
|
|
2910
|
+
instance_name,
|
|
2911
|
+
worker_start,
|
|
2912
|
+
worker_completed,
|
|
2913
|
+
"Worker",
|
|
2914
|
+
property_label,
|
|
2915
|
+
assigner_name,
|
|
2727
2916
|
)
|
|
2728
|
-
self._publish_job_duration(instance_name, fetch_start, fetch_completed, "Fetch", property_label, assigner_name)
|
|
2729
2917
|
self._publish_job_duration(
|
|
2730
|
-
instance_name,
|
|
2918
|
+
instance_name,
|
|
2919
|
+
fetch_start,
|
|
2920
|
+
fetch_completed,
|
|
2921
|
+
"Fetch",
|
|
2922
|
+
property_label,
|
|
2923
|
+
assigner_name,
|
|
2731
2924
|
)
|
|
2732
2925
|
self._publish_job_duration(
|
|
2733
|
-
instance_name,
|
|
2926
|
+
instance_name,
|
|
2927
|
+
execution_start,
|
|
2928
|
+
execution_completed,
|
|
2929
|
+
"Execution",
|
|
2930
|
+
property_label,
|
|
2931
|
+
assigner_name,
|
|
2932
|
+
)
|
|
2933
|
+
self._publish_job_duration(
|
|
2934
|
+
instance_name,
|
|
2935
|
+
upload_start,
|
|
2936
|
+
upload_completed,
|
|
2937
|
+
"Upload",
|
|
2938
|
+
property_label,
|
|
2939
|
+
assigner_name,
|
|
2734
2940
|
)
|
|
2735
2941
|
|
|
2736
2942
|
if self.metering_client is None or len(execution_metadata.auxiliary_metadata) == 0:
|
|
@@ -2767,10 +2973,18 @@ class Scheduler:
|
|
|
2767
2973
|
)
|
|
2768
2974
|
self.metering_client.put_usage(identity=client_id, operation_name=op.name, usage=usage)
|
|
2769
2975
|
except Exception as exc:
|
|
2770
|
-
LOGGER.exception(
|
|
2976
|
+
LOGGER.exception(
|
|
2977
|
+
"Cannot publish resource usage.",
|
|
2978
|
+
tags=dict(job_name=job_name),
|
|
2979
|
+
exc_info=exc,
|
|
2980
|
+
)
|
|
2771
2981
|
|
|
2772
2982
|
def _update_action_retention(
|
|
2773
|
-
self,
|
|
2983
|
+
self,
|
|
2984
|
+
action: Action,
|
|
2985
|
+
action_digest: Digest,
|
|
2986
|
+
retention_hours: float | None,
|
|
2987
|
+
instance_name: str,
|
|
2774
2988
|
) -> None:
|
|
2775
2989
|
if not self.asset_client or not retention_hours:
|
|
2776
2990
|
return
|
|
@@ -2791,14 +3005,18 @@ class Scheduler:
|
|
|
2791
3005
|
instance_name=instance_name,
|
|
2792
3006
|
)
|
|
2793
3007
|
LOGGER.debug(
|
|
2794
|
-
"Extended the retention of action.",
|
|
3008
|
+
"Extended the retention of action.",
|
|
3009
|
+
tags=dict(digest=action_digest, retention_hours=retention_hours),
|
|
2795
3010
|
)
|
|
2796
3011
|
except Exception:
|
|
2797
3012
|
LOGGER.exception("Failed to push action as an asset.", tags=dict(digest=action_digest))
|
|
2798
3013
|
# Not a fatal path, don't reraise here
|
|
2799
3014
|
|
|
2800
3015
|
def _update_action_result_retention(
|
|
2801
|
-
self,
|
|
3016
|
+
self,
|
|
3017
|
+
action_result: ActionResult,
|
|
3018
|
+
retention_hours: float | None,
|
|
3019
|
+
instance_name: str,
|
|
2802
3020
|
) -> None:
|
|
2803
3021
|
if not self.asset_client or not retention_hours:
|
|
2804
3022
|
return
|
|
@@ -2842,11 +3060,16 @@ class Scheduler:
|
|
|
2842
3060
|
instance_name=instance_name,
|
|
2843
3061
|
)
|
|
2844
3062
|
LOGGER.debug(
|
|
2845
|
-
"Extended the retention of action result.",
|
|
3063
|
+
"Extended the retention of action result.",
|
|
3064
|
+
tags=dict(digest=digest, retention_hours=retention_hours),
|
|
2846
3065
|
)
|
|
2847
3066
|
|
|
2848
3067
|
except Exception as e:
|
|
2849
|
-
LOGGER.exception(
|
|
3068
|
+
LOGGER.exception(
|
|
3069
|
+
"Failed to push action_result as an asset.",
|
|
3070
|
+
tags=dict(digest=digest),
|
|
3071
|
+
exc_info=e,
|
|
3072
|
+
)
|
|
2850
3073
|
# Not a fatal path, don't reraise here
|
|
2851
3074
|
|
|
2852
3075
|
def _record_bot_locality_hint(self, session: Session, bot_name: str, locality_hint: str) -> None:
|
|
@@ -2885,7 +3108,8 @@ class Scheduler:
|
|
|
2885
3108
|
# Delete all hints older than the K-th most recent
|
|
2886
3109
|
session.execute(
|
|
2887
3110
|
delete(BotLocalityHintEntry).where(
|
|
2888
|
-
BotLocalityHintEntry.bot_name == bot_name,
|
|
3111
|
+
BotLocalityHintEntry.bot_name == bot_name,
|
|
3112
|
+
BotLocalityHintEntry.sequence_number < k_th_seq,
|
|
2889
3113
|
)
|
|
2890
3114
|
)
|
|
2891
3115
|
|
|
@@ -2983,7 +3207,11 @@ class Scheduler:
|
|
|
2983
3207
|
|
|
2984
3208
|
LOGGER.warning(
|
|
2985
3209
|
"Instance usage not updated.",
|
|
2986
|
-
tags={
|
|
3210
|
+
tags={
|
|
3211
|
+
"cohort": bot_cohort,
|
|
3212
|
+
"instance_name": instance_name,
|
|
3213
|
+
"delta": delta,
|
|
3214
|
+
},
|
|
2987
3215
|
)
|
|
2988
3216
|
return False
|
|
2989
3217
|
return True
|