primitive 0.2.39__tar.gz → 0.2.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {primitive-0.2.39 → primitive-0.2.40}/PKG-INFO +1 -1
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/__about__.py +1 -1
- primitive-0.2.40/src/primitive/agent/actions.py +150 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/commands.py +1 -1
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/runner.py +39 -36
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/client.py +3 -2
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/actions.py +31 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/fragments.py +1 -0
- primitive-0.2.40/src/primitive/monitor/actions.py +196 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/actions.py +24 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/exceptions.py +11 -0
- primitive-0.2.40/src/primitive/utils/psutil.py +26 -0
- primitive-0.2.39/src/primitive/agent/actions.py +0 -114
- primitive-0.2.39/src/primitive/db/base.py +0 -5
- primitive-0.2.39/src/primitive/db/models.py +0 -88
- primitive-0.2.39/src/primitive/db/sqlite.py +0 -70
- primitive-0.2.39/src/primitive/monitor/actions.py +0 -247
- {primitive-0.2.39 → primitive-0.2.40}/.git-hooks/pre-commit +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.gitattributes +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/lint.yml +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/publish.yml +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/pyright.yml +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.gitignore +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.vscode/extensions.json +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/.vscode/settings.json +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/LICENSE.txt +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/Makefile +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/README.md +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/linux setup.md +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/pyproject.toml +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/uploader.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/cli.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/launch_agents.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/launch_service.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/ui.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/interactive.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/relay.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/sdk.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/utility_fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/android.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/ui.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/monitor/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/commands.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/fragments.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/mutations.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/queries.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/actions.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/auth.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/cache.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/chunk_size.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/config.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/daemons.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/logging.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/memory_size.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/printer.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/shell.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/text.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/tests/__init__.py +0 -0
- {primitive-0.2.39 → primitive-0.2.40}/uv.lock +0 -0
{primitive-0.2.39 → primitive-0.2.40}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: primitive
-Version: 0.2.39
+Version: 0.2.40
 Project-URL: Documentation, https://github.com//primitivecorp/primitive-cli#readme
 Project-URL: Issues, https://github.com//primitivecorp/primitive-cli/issues
 Project-URL: Source, https://github.com//primitivecorp/primitive-cli
primitive-0.2.40/src/primitive/agent/actions.py
@@ -0,0 +1,150 @@
+import sys
+from time import sleep
+from typing import Optional
+
+from loguru import logger
+
+from primitive.__about__ import __version__
+from primitive.agent.runner import Runner
+from primitive.agent.uploader import Uploader
+from primitive.utils.actions import BaseAction
+
+
+class Agent(BaseAction):
+    def start(self, job_run_id: Optional[str] = None):
+        logger.remove()
+        logger.add(
+            sink=sys.stderr,
+            format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <level>{message}</level>",
+            backtrace=True,
+            diagnose=True,
+            level="DEBUG" if self.primitive.DEBUG else "INFO",
+        )
+        logger.info("primitive agent")
+        logger.info(f"Version: {__version__}")
+
+        # TODO: tighten logic for determining if we're running in a container
+        RUNNING_IN_CONTAINER = False
+        if job_run_id is not None:
+            logger.info("Running in container...")
+            RUNNING_IN_CONTAINER = True
+
+        # Create uploader
+        uploader = Uploader(primitive=self.primitive)
+
+        try:
+            while True:
+                logger.debug("Scanning for files to upload...")
+                uploader.scan()
+
+                logger.debug("Checking for pending job runs for this device...")
+
+                # From Dylan June 30th:
+                # If passed an explicit job_run_id:
+                # - check if the JobRun exists in the API
+                # - if it does, set it to request_in_progress
+                # - if it does not, log an error and stop execution
+                # If no job_run_id is passed:
+                # - verify that this is a Node with an active Reservation
+                # - if the Reservation is active AND it has a JobRun associated with it,
+                #   then query for that JobRun
+                # - if no JobRuns are found in the API, wait for another active reservation
+                # - if a JobRun is found, set it to request_in_progress
+                # - then wait for the JobRun to be in_progress from the API
+
+                active_reservation_id = None
+                job_run_data: dict = {}
+
+                if RUNNING_IN_CONTAINER and job_run_id:
+                    job_run_result = self.primitive.jobs.get_job_run(id=job_run_id)
+                    if job_run_result.data:
+                        job_run_data = job_run_result.data.get("jobRun", {})
+                else:
+                    hardware = self.primitive.hardware.get_own_hardware_details()
+                    # fetch the latest hardware and activeReservation details
+                    if active_reservation_data := hardware["activeReservation"]:
+                        active_reservation_id = active_reservation_data.get("id", None)
+
+                    if active_reservation_id is not None:
+                        job_run_data = (
+                            self.primitive.reservations.get_job_run_for_reservation_id(
+                                reservation_id=active_reservation_id
+                            )
+                        )
+                        job_run_id = job_run_data.get("id", None)
+
+                if (
+                    len(job_run_data.keys()) == 0
+                    or not job_run_data.get("id")
+                    or job_run_id is None
+                ):
+                    if RUNNING_IN_CONTAINER:
+                        logger.info("Running in container, exiting due to no JobRun.")
+                        break
+                    logger.debug("No pending Job Run found, sleeping...")
+                    sleep(5)
+                    continue
+
+                logger.debug("Found pending Job Run")
+                logger.debug(f"Job Run ID: {job_run_data.get('id')}")
+                logger.debug(f"Job Name: {job_run_data.get('job').get('name')}")
+
+                logger.info(
+                    f"Setting JobRun {job_run_data.get('job').get('name')} to request_in_progress"
+                )
+                # we are setting to request_in_progress here which puts a started_at time on the JobRun in the API's database
+                # any time spent pulling Git repositories, setting up, etc, counts as compute time
+                job_run_result = self.primitive.jobs.job_run_update(
+                    id=job_run_id, status="request_in_progress"
+                )
+
+                while job_run_data["status"] != "in_progress":
+                    logger.info(
+                        f"Waiting for JobRun {job_run_data.get('name')} to be in_progress"
+                    )
+                    sleep(1)
+                    job_run_result = self.primitive.jobs.get_job_run(id=job_run_id)
+                    if job_run_result.data is not None:
+                        job_run_data = job_run_result.data.get("jobRun", {})
+
+                runner = Runner(
+                    primitive=self.primitive,
+                    job_run=job_run_data,
+                )
+
+                try:
+                    runner.setup()
+                except Exception as exception:
+                    logger.exception(
+                        f"Exception while initializing runner: {exception}"
+                    )
+                    self.primitive.jobs.job_run_update(
+                        id=job_run_id,
+                        status="request_completed",
+                        conclusion="failure",
+                    )
+                    continue
+
+                try:
+                    runner.execute_job_run()
+                except Exception as exception:
+                    logger.exception(f"Exception while executing job: {exception}")
+                    self.primitive.jobs.job_run_update(
+                        id=job_run_id,
+                        status="request_completed",
+                        conclusion="failure",
+                    )
+                finally:
+                    runner.cleanup()
+
+                # NOTE: also run scan here to force upload of artifacts
+                # This should probably eventually be another daemon?
+                uploader.scan()
+
+                if RUNNING_IN_CONTAINER:
+                    logger.info("Running in container, exiting after job run")
+                    break
+
+                sleep(5)
+        except KeyboardInterrupt:
+            logger.info("Stopping primitive agent...")
{primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/runner.py
@@ -10,10 +10,10 @@ from typing import Dict, List, TypedDict
 import yaml
 from loguru import logger

-from
-from
-from
-from
+from primitive.utils.cache import get_artifacts_cache, get_logs_cache, get_sources_cache
+from primitive.utils.logging import fmt, log_context
+from primitive.utils.psutil import kill_process_and_children
+from primitive.utils.shell import env_to_dict

 try:
     from yaml import CLoader as Loader
@@ -109,27 +109,21 @@ class Runner:
             self.job_settings["rootDirectory"]
         )

-
-
-            logger.info(f"Using job config from database for {self.job['slug']}")
-            self.config = db_config
-        else:
-            # Attempt to parse the job yaml file
-            job_filename = self.job_settings["repositoryFilename"]
-            logger.info(f"Scanning directory for job file {job_filename}")
+        job_filename = self.job_settings["repositoryFilename"]
+        logger.info(f"Scanning directory for job file {job_filename}")

-
+        job_config_file = Path(self.source_dir / ".primitive" / job_filename)

-
-
-
-
-
-
-
-
-
-
+        if job_config_file.exists():
+            logger.info(
+                f"Found job description for {self.job['slug']} at {job_config_file}"
+            )
+            self.config = yaml.load(open(job_config_file, "r"), Loader=Loader)
+        else:
+            logger.error(
+                f"No job description with matching filename '{job_filename}' found"
+            )
+            raise FileNotFoundError

         # Setup initial process environment
         self.initial_env = os.environ
@@ -147,15 +141,11 @@ class Runner:
         )

     @log_context(label="execute")
-    def
-        logger.info(f"Executing {self.job['slug']} job")
-        self.primitive.jobs.job_run_update(
-            self.job_run["id"], status="request_in_progress"
-        )
-
+    def execute_job_run(self) -> None:
         self.modified_env = {**self.initial_env}
         task_failed = False
         cancelled = False
+        timed_out = False

         for task in self.config["executes"]:
             # Everything inside this loop should be contextualized with the task label
@@ -171,6 +161,9 @@ class Runner:
             if status_value == "completed" and conclusion_value == "cancelled":
                 cancelled = True
                 break
+            if status_value == "completed" and conclusion_value == "timed_out":
+                timed_out = True
+                break

             # Everything within this block should be contextualized as user logs
             with logger.contextualize(type="user"):
@@ -186,11 +179,17 @@ class Runner:
                 conclusion_value = status.data["jobRun"]["conclusion"]
                 if status_value == "completed" and conclusion_value == "cancelled":
                     cancelled = True
+                if status_value == "completed" and conclusion_value == "timed_out":
+                    timed_out = True

         if cancelled:
             logger.warning("Job cancelled by user")
             return

+        if timed_out:
+            logger.error("Job timed out")
+            return
+
         conclusion = "success"
         if task_failed:
             conclusion = "failure"
@@ -270,9 +269,17 @@ class Runner:
             stderr=asyncio.subprocess.PIPE,
         )

-
-
-
+        try:
+            await self.primitive.jobs.ajob_run_update(
+                self.job_run["id"],
+                parent_pid=process.pid,
+            )
+        except ValueError:
+            logger.error(
+                f"Failed to update job run {self.job_run['id']} with process PID {process.pid}"
+            )
+            kill_process_and_children(pid=process.pid)
+            return False

         stdout_failed, stderr_failed = await asyncio.gather(
             self.log_cmd(
@@ -289,10 +296,6 @@ class Runner:
             f"Finished executing command {i + 1}/{len(commands)}: {cmd} with return code {returncode}"
         )

-        JobRun.objects.filter_by(job_run_id=self.job_run["id"]).update(
-            {"pid": None}
-        )
-
         if returncode > 0:
             logger.error(
                 f"Task {task['label']} failed on '{cmd}' with return code {returncode}"
{primitive-0.2.39 → primitive-0.2.40}/src/primitive/client.py
@@ -1,8 +1,9 @@
+from typing import Optional
+
 from gql import Client
 from loguru import logger
 from rich.logging import RichHandler
 from rich.traceback import install
-from typing import Optional

 from .agent.actions import Agent
 from .auth.actions import Auth
@@ -12,11 +13,11 @@ from .files.actions import Files
 from .git.actions import Git
 from .hardware.actions import Hardware
 from .jobs.actions import Jobs
+from .monitor.actions import Monitor
 from .organizations.actions import Organizations
 from .projects.actions import Projects
 from .provisioning.actions import Provisioning
 from .reservations.actions import Reservations
-from .monitor.actions import Monitor
 from .utils.config import read_config_file


{primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/actions.py
@@ -115,6 +115,7 @@ class Jobs(BaseAction):
         conclusion: str = None,
         file_ids: Optional[List[str]] = [],
         number_of_files_produced: Optional[int] = None,
+        parent_pid: Optional[int] = None,
     ):
         mutation = gql(job_run_update_mutation)
         input = {"id": id}
@@ -126,12 +127,42 @@ class Jobs(BaseAction):
             input["files"] = file_ids
         if number_of_files_produced is not None:
             input["numberOfFilesProduced"] = number_of_files_produced
+        if parent_pid is not None:
+            input["parentPid"] = parent_pid
         variables = {"input": input}
         result = self.primitive.session.execute(
             mutation, variable_values=variables, get_execution_result=True
         )
         return result

+    @guard
+    async def ajob_run_update(
+        self,
+        id: str,
+        status: str = None,
+        conclusion: str = None,
+        file_ids: Optional[List[str]] = [],
+        number_of_files_produced: Optional[int] = None,
+        parent_pid: Optional[int] = None,
+    ):
+        mutation = gql(job_run_update_mutation)
+        input = {"id": id}
+        if status:
+            input["status"] = status
+        if conclusion:
+            input["conclusion"] = conclusion
+        if file_ids and len(file_ids) > 0:
+            input["files"] = file_ids
+        if number_of_files_produced is not None:
+            input["numberOfFilesProduced"] = number_of_files_produced
+        if parent_pid is not None:
+            input["parentPid"] = parent_pid
+        variables = {"input": input}
+        result = await self.primitive.session.execute_async(
+            mutation, variable_values=variables, get_execution_result=True
+        )
+        return result
+
     @guard
     def github_access_token_for_job_run(self, job_run_id: str):
         query = gql(github_app_token_for_job_run_query)
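The new parent_pid field (and the async ajob_run_update variant built on session.execute_async) is how the runner records the PID of the task process it spawns, so the monitor can later clean it up. A hypothetical usage sketch, not taken from the package, assuming an already-constructed Primitive client object:

```python
import asyncio


async def report_parent_pid(primitive, job_run_id: str, pid: int) -> None:
    # Only explicitly passed fields are added to the mutation input, so this
    # sends {"id": job_run_id, "parentPid": pid} to the jobRunUpdate mutation.
    await primitive.jobs.ajob_run_update(job_run_id, parent_pid=pid)


# Example call (the ID and PID are placeholders):
# asyncio.run(report_parent_pid(primitive, job_run_id="Sm9iUnVuOjQy", pid=4242))
```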
primitive-0.2.40/src/primitive/monitor/actions.py
@@ -0,0 +1,196 @@
+import sys
+from time import sleep
+from typing import Optional
+
+from loguru import logger
+
+from primitive.__about__ import __version__
+from primitive.utils.actions import BaseAction
+from primitive.utils.exceptions import P_CLI_100, P_CLI_101
+from primitive.utils.psutil import kill_process_and_children
+
+MAX_GET_STATUS_TIMEOUT = 30
+
+
+class Monitor(BaseAction):
+    def start(self, job_run_id: Optional[str] = None):
+        logger.remove()
+        logger.add(
+            sink=sys.stderr,
+            format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <level>{message}</level>",
+            backtrace=True,
+            diagnose=True,
+            level="DEBUG" if self.primitive.DEBUG else "INFO",
+        )
+        logger.info("primitive monitor")
+        logger.info(f"Version: {__version__}")
+
+        # TODO: tighten logic for determining if we're running in a container
+        RUNNING_IN_CONTAINER = False
+        if job_run_id is not None:
+            logger.info("Running in container...")
+            RUNNING_IN_CONTAINER = True
+
+        # can't check if if it is a container
+        if not RUNNING_IN_CONTAINER:
+            try:
+                # hey stupid:
+                # do not set is_available to True here, it will mess up the reservation logic
+                # only set is_available after we've checked that no active reservation is present
+                # setting is_available of the parent also effects the children,
+                # which may have active reservations as well
+                self.primitive.hardware.check_in_http(is_online=True)
+            except Exception as exception:
+                logger.exception(f"Error checking in hardware: {exception}")
+                sys.exit(1)
+
+        # From Dylan on June 30th:
+        # If passed an explicit job_run_id we know it is running in a container.
+        # If no job_run_id is passed, we need to check that this device has an active reservation.
+        # Fetch the active reservations. If it exists AND has a JobRun associated with it:
+        # - check if the JobRun exists in the API
+        # - if it does exist, check if it is already running
+        # - if it is in status [pending, request_in_progress, in_progress] where we should wait for the PID
+        # - if it is in status [request_completed, completed] and there is a PID, kill it
+        # Finally, if running in a container, kill the process.
+        # Else, wait for a new active reservation to be created.
+
+        try:
+            active_reservation_data = None
+            previous_reservation_id = None
+            active_reservation_id = None
+
+            while True:
+                # this block determines if there is a reservation at all
+                # handles cleanup of old reservations
+                # obtains an active JobRun's ID
+                if not RUNNING_IN_CONTAINER:
+                    hardware = self.primitive.hardware.get_own_hardware_details()
+                    # fetch the latest hardware and activeReservation details
+                    if active_reservation_data := hardware["activeReservation"]:
+                        active_reservation_id = active_reservation_data.get("id", None)
+                        if previous_reservation_id is None:
+                            previous_reservation_id = active_reservation_id
+                    else:
+                        active_reservation_data = None
+                        active_reservation_id = None
+
+                    # if there is no activeReservation or previous reservation, sync + sleep
+                    if (
+                        active_reservation_data is None
+                        and active_reservation_id is None
+                        and previous_reservation_id is None
+                    ):
+                        self.primitive.hardware.check_in_http(
+                            is_available=True, is_online=True
+                        )
+                        self.primitive.hardware._sync_children(hardware=hardware)
+
+                        sleep_amount = 5
+                        logger.info(
+                            f"No active reservation found... [sleeping {sleep_amount} seconds]"
+                        )
+                        sleep(sleep_amount)
+                        continue
+
+                    # if there is a previous_reservation_id but no activeReservation, cleanup
+                    elif active_reservation_data is None and previous_reservation_id:
+                        logger.info(
+                            f"Cleaning up previous reservation {previous_reservation_id}..."
+                        )
+                        self.primitive.provisioning.remove_reservation_authorized_keys(
+                            reservation_id=previous_reservation_id
+                        )
+                        job_run_data = (
+                            self.primitive.reservations.get_job_run_for_reservation_id(
+                                reservation_id=previous_reservation_id
+                            )
+                        )
+                        job_run_id = job_run_data.get("id")
+                        previous_reservation_id = None
+
+                    # if we are on the new reservation
+                    elif (
+                        (previous_reservation_id is not None)
+                        and (active_reservation_id is not None)
+                        and (previous_reservation_id == active_reservation_id)
+                    ):
+                        self.primitive.provisioning.add_reservation_authorized_keys(
+                            reservation_id=active_reservation_id
+                        )
+
+                    # we have an active reservation, check if we have JobRuns attached to it
+                    if active_reservation_id is not None:
+                        logger.info(f"Active Reservation ID: {active_reservation_id}")
+                        job_run_data = (
+                            self.primitive.reservations.get_job_run_for_reservation_id(
+                                reservation_id=active_reservation_id
+                            )
+                        )
+                        job_run_id = job_run_data.get("id")
+
+                    # Golden state for normal reservation
+                    if not job_run_id and active_reservation_id:
+                        self.primitive.hardware.check_in_http(
+                            is_available=False, is_online=True
+                        )
+                        sleep_amount = 5
+                        logger.info(
+                            f"Waiting for Job Runs... [sleeping {sleep_amount} seconds]"
+                        )
+                        sleep(sleep_amount)
+                        continue
+
+                # job_run_data can come from 3 places:
+                # 1. an explicitly passed job_run_id
+                # 2. the previous reservation has an job_run_id (kill old PIDs)
+                # 3. the active reservation has an job_run_id (check status)
+                while job_run_id:
+                    status_result = self.primitive.jobs.get_job_status(id=job_run_id)
+                    get_status_timeout = 0
+                    sleep_amount = 5
+
+                    while get_status_timeout < MAX_GET_STATUS_TIMEOUT:
+                        if not status_result or not status_result.data:
+                            logger.error(
+                                f"Error fetching job status for Job Run {job_run_id}. Retrying... [sleeping {sleep_amount} seconds]"
+                            )
+                            get_status_timeout += sleep_amount
+                            sleep(sleep_amount)
+                            continue
+                        else:
+                            break
+
+                    if not status_result or not status_result.data:
+                        raise P_CLI_101()
+
+                    status_value = status_result.data["jobRun"]["status"]
+                    parent_pid = status_result.data["jobRun"]["parentPid"]
+
+                    if status_value == "completed":
+                        logger.info(
+                            f"Job run {job_run_id} is completed. Killing children if they exist."
+                        )
+                        if parent_pid is not None:
+                            kill_process_and_children(pid=parent_pid)
+                        status_value = None
+                        job_run_id = None
+                    else:
+                        logger.info(
+                            f"Job Run {job_run_id} with Status {status_value} with PID {parent_pid}. [sleeping {sleep_amount} seconds]"
+                        )
+                        sleep(sleep_amount)
+                        continue
+
+        except KeyboardInterrupt:
+            logger.info("Stopping primitive monitor...")
+            try:
+                if not RUNNING_IN_CONTAINER:
+                    self.primitive.hardware.check_in_http(
+                        is_available=False, is_online=False, stopping_agent=True
+                    )
+
+            except P_CLI_100 as exception:
+                logger.error("Error stopping primitive monitor.")
+                logger.error(str(exception))
+            sys.exit()
{primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/actions.py
@@ -176,3 +176,27 @@ class Reservations(BaseAction):
         )

         return reservation_result
+
+    @guard
+    def get_job_run_for_reservation_id(self, reservation_id: str) -> dict:
+        if not reservation_id:
+            logger.error("No reservation ID provided.")
+            return {}
+
+        job_runs_for_reservation = self.primitive.jobs.get_job_runs(
+            first=1,
+            reservation_id=reservation_id,
+        )
+
+        while job_runs_for_reservation is None or job_runs_for_reservation.data is None:
+            logger.error("Error fetching job runs.")
+            sleep_amount = 5
+            logger.info(f"Error fetching job runs... [sleeping {sleep_amount} seconds]")
+            sleep(sleep_amount)
+            continue
+
+        if not job_runs_for_reservation.data["jobRuns"]["edges"]:
+            logger.error("No job runs found for the given reservation ID.")
+            return {}
+
+        return job_runs_for_reservation.data["jobRuns"]["edges"][0]["node"]
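get_job_run_for_reservation_id is the lookup both the new agent loop and the monitor use to map a device's active Reservation to the JobRun it should execute or watch. A hypothetical end-to-end sketch, not from the package, of resolving the JobRun ID for the current device:

```python
from typing import Optional


def job_run_id_for_active_reservation(primitive) -> Optional[str]:
    """Return the JobRun ID attached to this device's active reservation, if any."""
    hardware = primitive.hardware.get_own_hardware_details()
    reservation = hardware.get("activeReservation") or {}
    if not reservation.get("id"):
        return None  # no active reservation on this hardware
    job_run = primitive.reservations.get_job_run_for_reservation_id(
        reservation_id=reservation["id"]
    )
    return job_run.get("id")  # an empty dict is returned when no JobRun is attached
```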
{primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/exceptions.py
@@ -10,3 +10,14 @@ class P_CLI_100(Exception):

     def __str__(self):
         return f"{self.codename}: {self.message}"
+
+
+@dataclass
+class P_CLI_101(Exception):
+    """Could Not Get Status for JobRun"""
+
+    codename: str = "P_CLI_101"
+    message: str = "Could Not Get Status for JobRun"
+
+    def __str__(self):
+        return f"{self.codename}: {self.message}"
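Like the existing P_CLI_100, the new P_CLI_101 is a dataclass-backed exception whose codename and message are rendered by __str__; the monitor raises it when job status cannot be fetched within MAX_GET_STATUS_TIMEOUT. A small usage sketch, not from the package:

```python
from primitive.utils.exceptions import P_CLI_101

try:
    raise P_CLI_101()
except P_CLI_101 as exc:
    print(str(exc))  # -> "P_CLI_101: Could Not Get Status for JobRun"
```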
primitive-0.2.40/src/primitive/utils/psutil.py
@@ -0,0 +1,26 @@
+import psutil
+from loguru import logger
+
+
+def kill_process_and_children(pid: int) -> bool:
+    """Kill a process and all its children."""
+    try:
+        try:
+            parent = psutil.Process(pid)
+            logger.info(f"Process PID {parent.pid} found.")
+        except psutil.NoSuchProcess:
+            logger.info("Process not found")
+            return False
+
+        children = parent.children(recursive=True)
+
+        for child in children:
+            logger.info(f"Killing child process {child.pid}...")
+            child.kill()
+
+        logger.info(f"Killing parent process {parent.pid}...")
+        parent.kill()
+        return True
+    except psutil.NoSuchProcess:
+        logger.warning(f"Process with PID {pid} not found.")
+        return False