primitive 0.2.39__tar.gz → 0.2.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. {primitive-0.2.39 → primitive-0.2.40}/PKG-INFO +1 -1
  2. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/__about__.py +1 -1
  3. primitive-0.2.40/src/primitive/agent/actions.py +150 -0
  4. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/commands.py +1 -1
  5. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/runner.py +39 -36
  6. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/client.py +3 -2
  7. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/actions.py +31 -0
  8. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/fragments.py +1 -0
  9. primitive-0.2.40/src/primitive/monitor/actions.py +196 -0
  10. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/actions.py +24 -0
  11. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/exceptions.py +11 -0
  12. primitive-0.2.40/src/primitive/utils/psutil.py +26 -0
  13. primitive-0.2.39/src/primitive/agent/actions.py +0 -114
  14. primitive-0.2.39/src/primitive/db/base.py +0 -5
  15. primitive-0.2.39/src/primitive/db/models.py +0 -88
  16. primitive-0.2.39/src/primitive/db/sqlite.py +0 -70
  17. primitive-0.2.39/src/primitive/monitor/actions.py +0 -247
  18. {primitive-0.2.39 → primitive-0.2.40}/.git-hooks/pre-commit +0 -0
  19. {primitive-0.2.39 → primitive-0.2.40}/.gitattributes +0 -0
  20. {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/lint.yml +0 -0
  21. {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/publish.yml +0 -0
  22. {primitive-0.2.39 → primitive-0.2.40}/.github/workflows/pyright.yml +0 -0
  23. {primitive-0.2.39 → primitive-0.2.40}/.gitignore +0 -0
  24. {primitive-0.2.39 → primitive-0.2.40}/.vscode/extensions.json +0 -0
  25. {primitive-0.2.39 → primitive-0.2.40}/.vscode/settings.json +0 -0
  26. {primitive-0.2.39 → primitive-0.2.40}/LICENSE.txt +0 -0
  27. {primitive-0.2.39 → primitive-0.2.40}/Makefile +0 -0
  28. {primitive-0.2.39 → primitive-0.2.40}/README.md +0 -0
  29. {primitive-0.2.39 → primitive-0.2.40}/linux setup.md +0 -0
  30. {primitive-0.2.39 → primitive-0.2.40}/pyproject.toml +0 -0
  31. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/__init__.py +0 -0
  32. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/__init__.py +0 -0
  33. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/agent/uploader.py +0 -0
  34. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/__init__.py +0 -0
  35. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/actions.py +0 -0
  36. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/commands.py +0 -0
  37. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/graphql/__init__.py +0 -0
  38. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/auth/graphql/queries.py +0 -0
  39. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/cli.py +0 -0
  40. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/__init__.py +0 -0
  41. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/actions.py +0 -0
  42. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/commands.py +0 -0
  43. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/launch_agents.py +0 -0
  44. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/launch_service.py +0 -0
  45. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/daemons/ui.py +0 -0
  46. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/__init__.py +0 -0
  47. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/actions.py +0 -0
  48. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/commands.py +0 -0
  49. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/exec/interactive.py +0 -0
  50. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/__init__.py +0 -0
  51. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/actions.py +0 -0
  52. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/commands.py +0 -0
  53. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/__init__.py +0 -0
  54. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/fragments.py +0 -0
  55. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/mutations.py +0 -0
  56. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/files/graphql/queries.py +0 -0
  57. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/__init__.py +0 -0
  58. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/actions.py +0 -0
  59. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/commands.py +0 -0
  60. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/graphql/__init__.py +0 -0
  61. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/git/graphql/queries.py +0 -0
  62. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/__init__.py +0 -0
  63. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/relay.py +0 -0
  64. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/sdk.py +0 -0
  65. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/graphql/utility_fragments.py +0 -0
  66. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/__init__.py +0 -0
  67. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/actions.py +0 -0
  68. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/android.py +0 -0
  69. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/commands.py +0 -0
  70. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/__init__.py +0 -0
  71. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/fragments.py +0 -0
  72. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/mutations.py +0 -0
  73. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/graphql/queries.py +0 -0
  74. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/hardware/ui.py +0 -0
  75. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/__init__.py +0 -0
  76. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/commands.py +0 -0
  77. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/__init__.py +0 -0
  78. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/mutations.py +0 -0
  79. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/jobs/graphql/queries.py +0 -0
  80. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/monitor/commands.py +0 -0
  81. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/__init__.py +0 -0
  82. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/actions.py +0 -0
  83. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/commands.py +0 -0
  84. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/__init__.py +0 -0
  85. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/fragments.py +0 -0
  86. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/mutations.py +0 -0
  87. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/organizations/graphql/queries.py +0 -0
  88. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/__init__.py +0 -0
  89. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/actions.py +0 -0
  90. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/commands.py +0 -0
  91. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/__init__.py +0 -0
  92. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/fragments.py +0 -0
  93. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/mutations.py +0 -0
  94. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/projects/graphql/queries.py +0 -0
  95. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/__init__.py +0 -0
  96. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/actions.py +0 -0
  97. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/graphql/__init__.py +0 -0
  98. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/provisioning/graphql/queries.py +0 -0
  99. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/__init__.py +0 -0
  100. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/commands.py +0 -0
  101. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/__init__.py +0 -0
  102. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/fragments.py +0 -0
  103. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/mutations.py +0 -0
  104. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/reservations/graphql/queries.py +0 -0
  105. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/__init__.py +0 -0
  106. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/actions.py +0 -0
  107. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/auth.py +0 -0
  108. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/cache.py +0 -0
  109. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/chunk_size.py +0 -0
  110. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/config.py +0 -0
  111. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/daemons.py +0 -0
  112. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/logging.py +0 -0
  113. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/memory_size.py +0 -0
  114. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/printer.py +0 -0
  115. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/shell.py +0 -0
  116. {primitive-0.2.39 → primitive-0.2.40}/src/primitive/utils/text.py +0 -0
  117. {primitive-0.2.39 → primitive-0.2.40}/tests/__init__.py +0 -0
  118. {primitive-0.2.39 → primitive-0.2.40}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: primitive
3
- Version: 0.2.39
3
+ Version: 0.2.40
4
4
  Project-URL: Documentation, https://github.com//primitivecorp/primitive-cli#readme
5
5
  Project-URL: Issues, https://github.com//primitivecorp/primitive-cli/issues
6
6
  Project-URL: Source, https://github.com//primitivecorp/primitive-cli
@@ -1,4 +1,4 @@
1
1
  # SPDX-FileCopyrightText: 2024-present Dylan Stein <dylan@primitive.tech>
2
2
  #
3
3
  # SPDX-License-Identifier: MIT
4
- __version__ = "0.2.39"
4
+ __version__ = "0.2.40"
@@ -0,0 +1,150 @@
1
+ import sys
2
+ from time import sleep
3
+ from typing import Optional
4
+
5
+ from loguru import logger
6
+
7
+ from primitive.__about__ import __version__
8
+ from primitive.agent.runner import Runner
9
+ from primitive.agent.uploader import Uploader
10
+ from primitive.utils.actions import BaseAction
11
+
12
+
13
class Agent(BaseAction):
    """Agent action: resolves JobRuns and executes them through a ``Runner``."""

    def start(self, job_run_id: Optional[str] = None):
        """Run the agent loop.

        Each iteration scans for files to upload, resolves a JobRun, marks it
        ``request_in_progress``, polls until the API reports ``in_progress``,
        then sets up and executes it with a ``Runner``.

        Args:
            job_run_id: Explicit JobRun ID. Any non-``None`` value is treated as
                "running in a container": that JobRun is fetched directly and
                the loop exits after one attempt, or as soon as no JobRun can
                be resolved. When omitted, the JobRun is resolved from this
                hardware's active reservation and the loop keeps polling until
                interrupted with Ctrl-C.
        """
        logger.remove()
        logger.add(
            sink=sys.stderr,
            format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <level>{message}</level>",
            backtrace=True,
            diagnose=True,
            level="DEBUG" if self.primitive.DEBUG else "INFO",
        )
        logger.info("primitive agent")
        logger.info(f"Version: {__version__}")

        # TODO: tighten logic for determining if we're running in a container
        running_in_container = job_run_id is not None
        if running_in_container:
            logger.info("Running in container...")

        # Create uploader
        uploader = Uploader(primitive=self.primitive)

        try:
            while True:
                logger.debug("Scanning for files to upload...")
                uploader.scan()

                logger.debug("Checking for pending job runs for this device...")

                # From Dylan June 30th:
                # If passed an explicit job_run_id:
                # - check if the JobRun exists in the API
                #   - if it does, set it to request_in_progress
                #   - if it does not, log an error and stop execution
                # If no job_run_id is passed:
                # - verify that this is a Node with an active Reservation
                # - if the Reservation is active AND it has a JobRun associated with it,
                #   then query for that JobRun
                #   - if no JobRuns are found in the API, wait for another active reservation
                #   - if a JobRun is found, set it to request_in_progress
                #   - then wait for the JobRun to be in_progress from the API

                active_reservation_id = None
                job_run_data: dict = {}

                if running_in_container and job_run_id:
                    job_run_result = self.primitive.jobs.get_job_run(id=job_run_id)
                    if job_run_result.data:
                        # "jobRun" may be absent or null; normalise to a dict.
                        job_run_data = job_run_result.data.get("jobRun") or {}
                else:
                    hardware = self.primitive.hardware.get_own_hardware_details()
                    # fetch the latest hardware and activeReservation details
                    if active_reservation_data := hardware["activeReservation"]:
                        active_reservation_id = active_reservation_data.get("id", None)

                    if active_reservation_id is not None:
                        job_run_data = (
                            self.primitive.reservations.get_job_run_for_reservation_id(
                                reservation_id=active_reservation_id
                            )
                        )
                        job_run_id = job_run_data.get("id", None)

                if (
                    not job_run_data
                    or not job_run_data.get("id")
                    or job_run_id is None
                ):
                    if running_in_container:
                        logger.info("Running in container, exiting due to no JobRun.")
                        break
                    logger.debug("No pending Job Run found, sleeping...")
                    sleep(5)
                    continue

                # A JobRun without a "job" payload must not crash the logging below.
                job_name = (job_run_data.get("job") or {}).get("name")

                logger.debug("Found pending Job Run")
                logger.debug(f"Job Run ID: {job_run_data.get('id')}")
                logger.debug(f"Job Name: {job_name}")

                logger.info(f"Setting JobRun {job_name} to request_in_progress")
                # we are setting to request_in_progress here which puts a started_at time on the JobRun in the API's database
                # any time spent pulling Git repositories, setting up, etc, counts as compute time
                self.primitive.jobs.job_run_update(
                    id=job_run_id, status="request_in_progress"
                )

                # .get() rather than ["status"]: a poll may yield a payload
                # without a status, and that must not kill the agent.
                while job_run_data.get("status") != "in_progress":
                    logger.info(
                        f"Waiting for JobRun {job_run_data.get('name')} to be in_progress"
                    )
                    sleep(1)
                    job_run_result = self.primitive.jobs.get_job_run(id=job_run_id)
                    if job_run_result.data is not None:
                        # Keep the previous snapshot when the poll returns no
                        # jobRun, so the next check still has a dict to read.
                        job_run_data = (
                            job_run_result.data.get("jobRun") or job_run_data
                        )

                runner = Runner(
                    primitive=self.primitive,
                    job_run=job_run_data,
                )

                try:
                    runner.setup()
                except Exception as exception:
                    logger.exception(
                        f"Exception while initializing runner: {exception}"
                    )
                    self.primitive.jobs.job_run_update(
                        id=job_run_id,
                        status="request_completed",
                        conclusion="failure",
                    )
                    if running_in_container:
                        # Looping again would fetch the same job_run_id and
                        # overwrite the failure just reported with
                        # request_in_progress.
                        logger.info("Running in container, exiting after job run")
                        break
                    continue

                try:
                    runner.execute_job_run()
                except Exception as exception:
                    logger.exception(f"Exception while executing job: {exception}")
                    self.primitive.jobs.job_run_update(
                        id=job_run_id,
                        status="request_completed",
                        conclusion="failure",
                    )
                finally:
                    runner.cleanup()

                # NOTE: also run scan here to force upload of artifacts
                # This should probably eventually be another daemon?
                uploader.scan()

                if running_in_container:
                    logger.info("Running in container, exiting after job run")
                    break

                sleep(5)
        except KeyboardInterrupt:
            logger.info("Stopping primitive agent...")
@@ -12,4 +12,4 @@ if typing.TYPE_CHECKING:
12
12
  def cli(context, job_run_id: typing.Optional[str] = None):
13
13
  """agent"""
14
14
  primitive: Primitive = context.obj.get("PRIMITIVE")
15
- primitive.agent.execute(job_run_id=job_run_id)
15
+ primitive.agent.start(job_run_id=job_run_id)
@@ -10,10 +10,10 @@ from typing import Dict, List, TypedDict
10
10
  import yaml
11
11
  from loguru import logger
12
12
 
13
- from ..db.models import JobRun
14
- from ..utils.cache import get_artifacts_cache, get_logs_cache, get_sources_cache
15
- from ..utils.logging import fmt, log_context
16
- from ..utils.shell import env_to_dict
13
+ from primitive.utils.cache import get_artifacts_cache, get_logs_cache, get_sources_cache
14
+ from primitive.utils.logging import fmt, log_context
15
+ from primitive.utils.psutil import kill_process_and_children
16
+ from primitive.utils.shell import env_to_dict
17
17
 
18
18
  try:
19
19
  from yaml import CLoader as Loader
@@ -109,27 +109,21 @@ class Runner:
109
109
  self.job_settings["rootDirectory"]
110
110
  )
111
111
 
112
- db_config = self.job_settings.get("config", None)
113
- if db_config:
114
- logger.info(f"Using job config from database for {self.job['slug']}")
115
- self.config = db_config
116
- else:
117
- # Attempt to parse the job yaml file
118
- job_filename = self.job_settings["repositoryFilename"]
119
- logger.info(f"Scanning directory for job file {job_filename}")
112
+ job_filename = self.job_settings["repositoryFilename"]
113
+ logger.info(f"Scanning directory for job file {job_filename}")
120
114
 
121
- job_config_file = Path(self.source_dir / ".primitive" / job_filename)
115
+ job_config_file = Path(self.source_dir / ".primitive" / job_filename)
122
116
 
123
- if job_config_file.exists():
124
- logger.info(
125
- f"Found job description for {self.job['slug']} at {job_config_file}"
126
- )
127
- self.config = yaml.load(open(job_config_file, "r"), Loader=Loader)
128
- else:
129
- logger.error(
130
- f"No job description with matching filename '{job_filename}' found"
131
- )
132
- raise FileNotFoundError
117
+ if job_config_file.exists():
118
+ logger.info(
119
+ f"Found job description for {self.job['slug']} at {job_config_file}"
120
+ )
121
+ self.config = yaml.load(open(job_config_file, "r"), Loader=Loader)
122
+ else:
123
+ logger.error(
124
+ f"No job description with matching filename '{job_filename}' found"
125
+ )
126
+ raise FileNotFoundError
133
127
 
134
128
  # Setup initial process environment
135
129
  self.initial_env = os.environ
@@ -147,15 +141,11 @@ class Runner:
147
141
  )
148
142
 
149
143
  @log_context(label="execute")
150
- def execute(self) -> None:
151
- logger.info(f"Executing {self.job['slug']} job")
152
- self.primitive.jobs.job_run_update(
153
- self.job_run["id"], status="request_in_progress"
154
- )
155
-
144
+ def execute_job_run(self) -> None:
156
145
  self.modified_env = {**self.initial_env}
157
146
  task_failed = False
158
147
  cancelled = False
148
+ timed_out = False
159
149
 
160
150
  for task in self.config["executes"]:
161
151
  # Everything inside this loop should be contextualized with the task label
@@ -171,6 +161,9 @@ class Runner:
171
161
  if status_value == "completed" and conclusion_value == "cancelled":
172
162
  cancelled = True
173
163
  break
164
+ if status_value == "completed" and conclusion_value == "timed_out":
165
+ timed_out = True
166
+ break
174
167
 
175
168
  # Everything within this block should be contextualized as user logs
176
169
  with logger.contextualize(type="user"):
@@ -186,11 +179,17 @@ class Runner:
186
179
  conclusion_value = status.data["jobRun"]["conclusion"]
187
180
  if status_value == "completed" and conclusion_value == "cancelled":
188
181
  cancelled = True
182
+ if status_value == "completed" and conclusion_value == "timed_out":
183
+ timed_out = True
189
184
 
190
185
  if cancelled:
191
186
  logger.warning("Job cancelled by user")
192
187
  return
193
188
 
189
+ if timed_out:
190
+ logger.error("Job timed out")
191
+ return
192
+
194
193
  conclusion = "success"
195
194
  if task_failed:
196
195
  conclusion = "failure"
@@ -270,9 +269,17 @@ class Runner:
270
269
  stderr=asyncio.subprocess.PIPE,
271
270
  )
272
271
 
273
- JobRun.objects.filter_by(job_run_id=self.job_run["id"]).update(
274
- {"pid": process.pid}
275
- )
272
+ try:
273
+ await self.primitive.jobs.ajob_run_update(
274
+ self.job_run["id"],
275
+ parent_pid=process.pid,
276
+ )
277
+ except ValueError:
278
+ logger.error(
279
+ f"Failed to update job run {self.job_run['id']} with process PID {process.pid}"
280
+ )
281
+ kill_process_and_children(pid=process.pid)
282
+ return False
276
283
 
277
284
  stdout_failed, stderr_failed = await asyncio.gather(
278
285
  self.log_cmd(
@@ -289,10 +296,6 @@ class Runner:
289
296
  f"Finished executing command {i + 1}/{len(commands)}: {cmd} with return code {returncode}"
290
297
  )
291
298
 
292
- JobRun.objects.filter_by(job_run_id=self.job_run["id"]).update(
293
- {"pid": None}
294
- )
295
-
296
299
  if returncode > 0:
297
300
  logger.error(
298
301
  f"Task {task['label']} failed on '{cmd}' with return code {returncode}"
@@ -1,8 +1,9 @@
1
+ from typing import Optional
2
+
1
3
  from gql import Client
2
4
  from loguru import logger
3
5
  from rich.logging import RichHandler
4
6
  from rich.traceback import install
5
- from typing import Optional
6
7
 
7
8
  from .agent.actions import Agent
8
9
  from .auth.actions import Auth
@@ -12,11 +13,11 @@ from .files.actions import Files
12
13
  from .git.actions import Git
13
14
  from .hardware.actions import Hardware
14
15
  from .jobs.actions import Jobs
16
+ from .monitor.actions import Monitor
15
17
  from .organizations.actions import Organizations
16
18
  from .projects.actions import Projects
17
19
  from .provisioning.actions import Provisioning
18
20
  from .reservations.actions import Reservations
19
- from .monitor.actions import Monitor
20
21
  from .utils.config import read_config_file
21
22
 
22
23
 
@@ -115,6 +115,7 @@ class Jobs(BaseAction):
115
115
  conclusion: str = None,
116
116
  file_ids: Optional[List[str]] = [],
117
117
  number_of_files_produced: Optional[int] = None,
118
+ parent_pid: Optional[int] = None,
118
119
  ):
119
120
  mutation = gql(job_run_update_mutation)
120
121
  input = {"id": id}
@@ -126,12 +127,42 @@ class Jobs(BaseAction):
126
127
  input["files"] = file_ids
127
128
  if number_of_files_produced is not None:
128
129
  input["numberOfFilesProduced"] = number_of_files_produced
130
+ if parent_pid is not None:
131
+ input["parentPid"] = parent_pid
129
132
  variables = {"input": input}
130
133
  result = self.primitive.session.execute(
131
134
  mutation, variable_values=variables, get_execution_result=True
132
135
  )
133
136
  return result
134
137
 
138
@guard
async def ajob_run_update(
    self,
    id: str,
    status: Optional[str] = None,
    conclusion: Optional[str] = None,
    file_ids: Optional[List[str]] = None,
    number_of_files_produced: Optional[int] = None,
    parent_pid: Optional[int] = None,
):
    """Asynchronously send the job-run update mutation for one JobRun.

    Only the fields that were actually supplied are placed in the mutation
    input; ``id`` is always sent.

    Args:
        id: ID of the JobRun to update.
        status: New status; omitted from the input when falsy.
        conclusion: New conclusion; omitted from the input when falsy.
        file_ids: File IDs sent as ``files``; omitted when ``None`` or empty.
        number_of_files_produced: Sent as ``numberOfFilesProduced`` unless
            ``None`` (so ``0`` is sent).
        parent_pid: Sent as ``parentPid`` unless ``None``.

    Returns:
        The execution result returned by the session's ``execute_async``.
    """
    mutation = gql(job_run_update_mutation)
    # Named mutation_input so the builtin input() is not shadowed.
    mutation_input = {"id": id}
    if status:
        mutation_input["status"] = status
    if conclusion:
        mutation_input["conclusion"] = conclusion
    if file_ids:
        mutation_input["files"] = file_ids
    if number_of_files_produced is not None:
        mutation_input["numberOfFilesProduced"] = number_of_files_produced
    if parent_pid is not None:
        mutation_input["parentPid"] = parent_pid
    variables = {"input": mutation_input}
    result = await self.primitive.session.execute_async(
        mutation, variable_values=variables, get_execution_result=True
    )
    return result
165
+
135
166
  @guard
136
167
  def github_access_token_for_job_run(self, job_run_id: str):
137
168
  query = gql(github_app_token_for_job_run_query)
@@ -50,5 +50,6 @@ fragment JobRunStatusFragment on JobRun {
50
50
  id
51
51
  status
52
52
  conclusion
53
+ parentPid
53
54
  }
54
55
  """
@@ -0,0 +1,196 @@
1
+ import sys
2
+ from time import sleep
3
+ from typing import Optional
4
+
5
+ from loguru import logger
6
+
7
+ from primitive.__about__ import __version__
8
+ from primitive.utils.actions import BaseAction
9
+ from primitive.utils.exceptions import P_CLI_100, P_CLI_101
10
+ from primitive.utils.psutil import kill_process_and_children
11
+
12
+ MAX_GET_STATUS_TIMEOUT = 30
13
+
14
+
15
class Monitor(BaseAction):
    """Monitor action: tracks reservations and JobRun status for this hardware."""

    def start(self, job_run_id: Optional[str] = None):
        """Run the monitor loop until interrupted.

        Outside a container the loop checks the hardware in, follows its
        active reservation (adding and removing reservation authorized keys),
        and resolves the JobRun attached to that reservation. For whichever
        JobRun is being watched it polls the status and, once the status is
        ``completed``, kills the recorded parent PID and its children.

        Args:
            job_run_id: Explicit JobRun ID. Any non-``None`` value is treated as
                "running in a container": hardware check-ins and reservation
                handling are skipped and only that JobRun is watched.

        Raises:
            P_CLI_101: If the JobRun status cannot be fetched within
                ``MAX_GET_STATUS_TIMEOUT`` seconds of retrying.
        """
        logger.remove()
        logger.add(
            sink=sys.stderr,
            format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | <level>{level: <8}</level> | <level>{message}</level>",
            backtrace=True,
            diagnose=True,
            level="DEBUG" if self.primitive.DEBUG else "INFO",
        )
        logger.info("primitive monitor")
        logger.info(f"Version: {__version__}")

        # TODO: tighten logic for determining if we're running in a container
        running_in_container = job_run_id is not None
        if running_in_container:
            logger.info("Running in container...")

        # The hardware check-in is skipped when running in a container.
        if not running_in_container:
            try:
                # IMPORTANT:
                # do not set is_available to True here, it will mess up the reservation logic
                # only set is_available after we've checked that no active reservation is present
                # setting is_available of the parent also affects the children,
                # which may have active reservations as well
                self.primitive.hardware.check_in_http(is_online=True)
            except Exception as exception:
                logger.exception(f"Error checking in hardware: {exception}")
                sys.exit(1)

        # From Dylan on June 30th:
        # If passed an explicit job_run_id we know it is running in a container.
        # If no job_run_id is passed, we need to check that this device has an active reservation.
        # Fetch the active reservations. If it exists AND has a JobRun associated with it:
        # - check if the JobRun exists in the API
        #   - if it does exist, check if it is already running
        #   - if it is in status [pending, request_in_progress, in_progress] where we should wait for the PID
        #   - if it is in status [request_completed, completed] and there is a PID, kill it
        # Finally, if running in a container, kill the process.
        # Else, wait for a new active reservation to be created.

        try:
            active_reservation_data = None
            previous_reservation_id = None
            active_reservation_id = None

            while True:
                # this block determines if there is a reservation at all
                # handles cleanup of old reservations
                # obtains an active JobRun's ID
                #
                # Everything that reads `hardware` or the reservation state
                # lives under this branch, so `hardware` is always bound
                # where it is used.
                if not running_in_container:
                    hardware = self.primitive.hardware.get_own_hardware_details()
                    # fetch the latest hardware and activeReservation details
                    if active_reservation_data := hardware["activeReservation"]:
                        active_reservation_id = active_reservation_data.get("id", None)
                        if previous_reservation_id is None:
                            previous_reservation_id = active_reservation_id
                    else:
                        active_reservation_data = None
                        active_reservation_id = None

                    # if there is no activeReservation or previous reservation, sync + sleep
                    if (
                        active_reservation_data is None
                        and active_reservation_id is None
                        and previous_reservation_id is None
                    ):
                        self.primitive.hardware.check_in_http(
                            is_available=True, is_online=True
                        )
                        self.primitive.hardware._sync_children(hardware=hardware)

                        sleep_amount = 5
                        logger.info(
                            f"No active reservation found... [sleeping {sleep_amount} seconds]"
                        )
                        sleep(sleep_amount)
                        continue

                    # if there is a previous_reservation_id but no activeReservation, cleanup
                    elif active_reservation_data is None and previous_reservation_id:
                        logger.info(
                            f"Cleaning up previous reservation {previous_reservation_id}..."
                        )
                        self.primitive.provisioning.remove_reservation_authorized_keys(
                            reservation_id=previous_reservation_id
                        )
                        job_run_data = (
                            self.primitive.reservations.get_job_run_for_reservation_id(
                                reservation_id=previous_reservation_id
                            )
                        )
                        job_run_id = job_run_data.get("id")
                        previous_reservation_id = None

                    # if we are on the new reservation
                    elif (
                        (previous_reservation_id is not None)
                        and (active_reservation_id is not None)
                        and (previous_reservation_id == active_reservation_id)
                    ):
                        self.primitive.provisioning.add_reservation_authorized_keys(
                            reservation_id=active_reservation_id
                        )

                    # we have an active reservation, check if we have JobRuns attached to it
                    if active_reservation_id is not None:
                        logger.info(f"Active Reservation ID: {active_reservation_id}")
                        job_run_data = (
                            self.primitive.reservations.get_job_run_for_reservation_id(
                                reservation_id=active_reservation_id
                            )
                        )
                        job_run_id = job_run_data.get("id")

                    # Golden state for normal reservation
                    if not job_run_id and active_reservation_id:
                        self.primitive.hardware.check_in_http(
                            is_available=False, is_online=True
                        )
                        sleep_amount = 5
                        logger.info(
                            f"Waiting for Job Runs... [sleeping {sleep_amount} seconds]"
                        )
                        sleep(sleep_amount)
                        continue

                # job_run_data can come from 3 places:
                # 1. an explicitly passed job_run_id
                # 2. the previous reservation has a job_run_id (kill old PIDs)
                # 3. the active reservation has a job_run_id (check status)
                while job_run_id:
                    sleep_amount = 5
                    get_status_timeout = 0
                    status_result = self.primitive.jobs.get_job_status(id=job_run_id)

                    # Retry the fetch itself; sleeping without asking again
                    # could never recover from a transient failure.
                    while (
                        not status_result or not status_result.data
                    ) and get_status_timeout < MAX_GET_STATUS_TIMEOUT:
                        logger.error(
                            f"Error fetching job status for Job Run {job_run_id}. Retrying... [sleeping {sleep_amount} seconds]"
                        )
                        get_status_timeout += sleep_amount
                        sleep(sleep_amount)
                        status_result = self.primitive.jobs.get_job_status(
                            id=job_run_id
                        )

                    if not status_result or not status_result.data:
                        raise P_CLI_101()

                    status_value = status_result.data["jobRun"]["status"]
                    parent_pid = status_result.data["jobRun"]["parentPid"]

                    if status_value == "completed":
                        logger.info(
                            f"Job run {job_run_id} is completed. Killing children if they exist."
                        )
                        if parent_pid is not None:
                            kill_process_and_children(pid=parent_pid)
                        # Clearing the ID ends this inner polling loop.
                        job_run_id = None
                    else:
                        logger.info(
                            f"Job Run {job_run_id} with Status {status_value} with PID {parent_pid}. [sleeping {sleep_amount} seconds]"
                        )
                        sleep(sleep_amount)
                        continue

                if running_in_container:
                    # Nothing in this loop assigns a new job_run_id in
                    # container mode, so idle instead of spinning.
                    # NOTE(review): the plan comment above may intend the
                    # monitor to exit here instead - confirm.
                    sleep(5)

        except KeyboardInterrupt:
            logger.info("Stopping primitive monitor...")
            try:
                if not running_in_container:
                    self.primitive.hardware.check_in_http(
                        is_available=False, is_online=False, stopping_agent=True
                    )

            except P_CLI_100 as exception:
                logger.error("Error stopping primitive monitor.")
                logger.error(str(exception))
            sys.exit()
@@ -176,3 +176,27 @@ class Reservations(BaseAction):
176
176
  )
177
177
 
178
178
  return reservation_result
179
+
180
@guard
def get_job_run_for_reservation_id(self, reservation_id: str) -> dict:
    """Return the first JobRun node attached to a reservation.

    The job-runs query is re-issued every 5 seconds for as long as it yields
    no result or no data.

    Args:
        reservation_id: ID of the reservation to look up.

    Returns:
        The ``node`` dict of the first JobRun edge, or ``{}`` when no
        reservation ID was given or the reservation has no JobRuns.
    """
    if not reservation_id:
        logger.error("No reservation ID provided.")
        return {}

    sleep_amount = 5
    while True:
        job_runs_for_reservation = self.primitive.jobs.get_job_runs(
            first=1,
            reservation_id=reservation_id,
        )
        if (
            job_runs_for_reservation is not None
            and job_runs_for_reservation.data is not None
        ):
            break

        # The query must be re-sent on every retry: sleeping on a stale
        # failed result could never leave this loop.
        logger.error("Error fetching job runs.")
        logger.info(f"Error fetching job runs... [sleeping {sleep_amount} seconds]")
        sleep(sleep_amount)

    edges = job_runs_for_reservation.data["jobRuns"]["edges"]
    if not edges:
        logger.error("No job runs found for the given reservation ID.")
        return {}

    return edges[0]["node"]
@@ -10,3 +10,14 @@ class P_CLI_100(Exception):
10
10
 
11
11
  def __str__(self):
12
12
  return f"{self.codename}: {self.message}"
13
+
14
+
15
@dataclass
class P_CLI_101(Exception):
    """Error raised when the status of a JobRun could not be obtained."""

    # Short identifier shown as the prefix of the rendered error.
    codename: str = "P_CLI_101"
    # Human-readable description shown after the codename.
    message: str = "Could Not Get Status for JobRun"

    def __str__(self):
        # Rendered as "<codename>: <message>".
        return "{}: {}".format(self.codename, self.message)
@@ -0,0 +1,26 @@
1
+ import psutil
2
+ from loguru import logger
3
+
4
+
5
def kill_process_and_children(pid: int) -> bool:
    """Kill a process and all of its descendants.

    Children are killed first, then the parent. A child that has already
    exited by the time it is killed is skipped, so the remaining children
    and the parent are still killed.

    Args:
        pid: PID of the parent process.

    Returns:
        ``True`` if the parent was found and killed; ``False`` if the parent
        did not exist or disappeared before it could be killed.
    """
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        logger.info("Process not found")
        return False
    logger.info(f"Process PID {parent.pid} found.")

    try:
        children = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        # The parent exited between the lookup and the enumeration.
        logger.warning(f"Process with PID {pid} not found.")
        return False

    for child in children:
        logger.info(f"Killing child process {child.pid}...")
        try:
            child.kill()
        except psutil.NoSuchProcess:
            # The child went away on its own; carry on with the rest
            # instead of abandoning the whole kill.
            logger.info(f"Child process {child.pid} already exited.")

    logger.info(f"Killing parent process {parent.pid}...")
    try:
        parent.kill()
    except psutil.NoSuchProcess:
        logger.warning(f"Process with PID {pid} not found.")
        return False
    return True