aind-data-transfer-service 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aind-data-transfer-service might be problematic. Click here for more details.

@@ -0,0 +1,492 @@
1
+ """Module to contain models for hpc rest api responses."""
2
+
3
+ import json
4
+ from pathlib import PurePosixPath
5
+ from typing import Any, List, Literal, Optional
6
+
7
+ from pydantic import Extra, Field, SecretStr
8
+ from pydantic_settings import BaseSettings
9
+
10
+
11
class HpcJobSubmitSettings(BaseSettings):
    """Configs to send in a post request. v0.0.36 of slurm rest api.

    Field names (and, except where noted, descriptions) mirror the slurm
    REST API spec so serialized settings map directly onto the request
    payload. ``extra = Extra.forbid`` means only fields named here may be
    supplied. Values can also come from env vars prefixed with ``HPC_``.
    """

    account: Optional[str] = Field(
        None,
        description="Charge resources used by this job to specified account.",
    )
    # 'freqency' (sic) is intentional.
    # This is the way it is written in the slurm documentation
    account_gather_freqency: Optional[str] = Field(
        None,
        description=(
            "Define the job accounting and profiling sampling intervals."
        ),
    )
    argv: Optional[List[str]] = Field(
        None, description="Arguments to the script."
    )
    array: Optional[str] = Field(
        None,
        description=(
            "Submit a job array, multiple jobs to be executed with identical "
            "parameters. The indexes specification identifies what array "
            "index values should be used."
        ),
    )
    batch_features: Optional[str] = Field(
        None, description="features required for batch script's node"
    )
    begin_time: Optional[str] = Field(
        None,
        description=(
            "Submit the batch script to the Slurm controller immediately, "
            "like normal, but tell the controller to defer the allocation of "
            "the job until the specified time."
        ),
    )
    burst_buffer: Optional[str] = Field(
        None, description="Burst buffer specification."
    )
    cluster_constraints: Optional[str] = Field(
        None,
        description=(
            "Specifies features that a federated cluster must have to have a "
            "sibling job submitted to it."
        ),
    )
    comment: Optional[str] = Field(None, description="An arbitrary comment.")
    constraints: Optional[str] = Field(
        None, description="node features required by job."
    )
    core_specification: Optional[int] = Field(
        None,
        description=(
            "Count of specialized threads per node reserved by the job for "
            "system operations and not used by the application."
        ),
    )
    cores_per_socket: Optional[int] = Field(
        None,
        description=(
            "Restrict node selection to nodes with at least the specified "
            "number of cores per socket."
        ),
    )
    cpu_binding: Optional[str] = Field(None, description="Cpu binding")
    cpu_binding_hint: Optional[str] = Field(
        None, description="Cpu binding hint"
    )
    cpu_frequency: Optional[str] = Field(
        None,
        description=(
            "Request that job steps initiated by srun commands inside this "
            "sbatch script be run at some requested frequency if possible, on "
            "the CPUs selected for the step on the compute node(s)."
        ),
    )
    cpus_per_gpu: Optional[str] = Field(
        None, description="Number of CPUs requested per allocated GPU."
    )
    cpus_per_task: Optional[int] = Field(
        None,
        description=(
            "Advise the Slurm controller that ensuing job steps will require "
            "ncpus number of processors per task."
        ),
    )
    # NOTE: the upstream spec's description for this field duplicated
    # standard_out's text; corrected here to describe the working directory.
    current_working_directory: Optional[str] = Field(
        None,
        description=(
            "Instruct Slurm to change to this directory before executing "
            "the batch script."
        ),
    )
    deadline: Optional[str] = Field(
        None,
        description=(
            "Remove the job if no ending is possible before this deadline "
            "(start > (deadline - time[-min]))."
        ),
    )
    delay_boot: Optional[int] = Field(
        None,
        description=(
            "Do not reboot nodes in order to satisfied this job's feature "
            "specification if the job has been eligible to run for less than "
            "this time period."
        ),
    )
    dependency: Optional[str] = Field(
        None,
        description=(
            "Defer the start of this job until the specified dependencies "
            "have been satisfied completed."
        ),
    )
    distribution: Optional[str] = Field(
        None,
        description=(
            "Specify alternate distribution methods for remote processes."
        ),
    )
    environment: Optional[dict] = Field(
        None, description="Dictionary of environment entries."
    )
    exclusive: Optional[Literal["user", "mcs", "true", "false"]] = Field(
        None,
        description=(
            "The job allocation can share nodes just other users with the "
            "'user' option or with the 'mcs' option)."
        ),
    )
    get_user_environment: Optional[bool] = Field(
        None, description="Load new login environment for user on job node."
    )
    gres: Optional[str] = Field(
        None,
        description=(
            "Specifies a comma delimited list of generic consumable resources."
        ),
    )
    gres_flags: Optional[
        Literal["disable-binding", "enforce-binding"]
    ] = Field(
        None, description="Specify generic resource task binding options."
    )
    gpu_binding: Optional[str] = Field(
        None, description="Requested binding of tasks to GPU."
    )
    gpu_frequency: Optional[str] = Field(
        None, description="Requested GPU frequency."
    )
    gpus: Optional[str] = Field(None, description="GPUs per job.")
    gpus_per_node: Optional[str] = Field(None, description="GPUs per node.")
    gpus_per_socket: Optional[str] = Field(
        None, description="GPUs per socket."
    )
    gpus_per_task: Optional[str] = Field(None, description="GPUs per task.")
    hold: Optional[bool] = Field(
        None,
        description=(
            "Specify the job is to be submitted in a held state "
            "(priority of zero)."
        ),
    )
    kill_on_invalid_dependency: Optional[bool] = Field(
        None,
        description=(
            "If a job has an invalid dependency, then Slurm is to "
            "terminate it."
        ),
    )
    licenses: Optional[str] = Field(
        None,
        description=(
            "Specification of licenses (or other resources available on all "
            "nodes of the cluster) which must be allocated to this job."
        ),
    )
    mail_type: Optional[str] = Field(
        None,
        description="Notify user by email when certain event types occur.",
    )
    mail_user: Optional[str] = Field(
        None,
        description=(
            "User to receive email notification of state changes as defined "
            "by mail_type."
        ),
    )
    mcs_label: Optional[str] = Field(
        None,
        description="This parameter is a group among the groups of the user.",
    )
    memory_binding: Optional[str] = Field(
        None, description="Bind tasks to memory."
    )
    memory_per_cpu: Optional[int] = Field(
        None, description="Minimum real memory per cpu (MB)."
    )
    memory_per_gpu: Optional[int] = Field(
        None, description="Minimum memory required per allocated GPU."
    )
    memory_per_node: Optional[int] = Field(
        None, description="Minimum real memory per node (MB)."
    )
    minimum_cpus_per_node: Optional[int] = Field(
        None, description="Minimum number of CPUs per node."
    )
    minimum_nodes: Optional[bool] = Field(
        None,
        description=(
            "If a range of node counts is given, prefer the smaller count."
        ),
    )
    name: Optional[str] = Field(
        None, description="Specify a name for the job allocation."
    )
    nice: Optional[str] = Field(
        None,
        description=(
            "Run the job with an adjusted scheduling priority within Slurm."
        ),
    )
    no_kill: Optional[bool] = Field(
        None,
        description=(
            "Do not automatically terminate a job if one of the nodes it has "
            "been allocated fails."
        ),
    )
    nodes: Optional[List[int]] = Field(
        None,
        description=(
            "Request that a minimum of minnodes nodes and a maximum node "
            "count."
        ),
    )
    open_mode: Optional[Literal["append", "truncate"]] = Field(
        None,
        description=(
            "Open the output and error files using append or truncate mode "
            "as specified."
        ),
    )
    partition: Optional[str] = Field(
        None,
        description=(
            "Request a specific partition for the resource allocation."
        ),
    )
    priority: Optional[str] = Field(
        None, description="Request a specific job priority."
    )
    # Set this to "production" for production environment
    qos: Optional[str] = Field(
        None, description="Request a quality of service for the job."
    )
    requeue: Optional[bool] = Field(
        None,
        description=(
            "Specifies that the batch job should eligible to being requeue."
        ),
    )
    reservation: Optional[str] = Field(
        None,
        description=(
            "Allocate resources for the job from the named reservation."
        ),
    )
    signal: Optional[str] = Field(
        None,
        description=(
            "When a job is within sig_time seconds of its end time, send it "
            "the signal sig_num."
        ),
    )
    sockets_per_node: Optional[int] = Field(
        None,
        description=(
            "Restrict node selection to nodes with at least the specified "
            "number of sockets."
        ),
    )
    spread_job: Optional[bool] = Field(
        None,
        description=(
            "Spread the job allocation over as many nodes as possible and "
            "attempt to evenly distribute tasks across the allocated nodes."
        ),
    )
    standard_error: Optional[str] = Field(
        None,
        description=(
            "Instruct Slurm to connect the batch script's standard error "
            "directly to the file name."
        ),
    )
    standard_in: Optional[str] = Field(
        None,
        description=(
            "Instruct Slurm to connect the batch script's standard input "
            "directly to the file name specified."
        ),
    )
    standard_out: Optional[str] = Field(
        None,
        description=(
            "Instruct Slurm to connect the batch script's standard output "
            "directly to the file name."
        ),
    )
    tasks: Optional[int] = Field(
        None,
        description=(
            "Advises the Slurm controller that job steps run within the "
            "allocation will launch a maximum of number tasks and to provide "
            "for sufficient resources."
        ),
    )
    tasks_per_core: Optional[int] = Field(
        None, description="Request the maximum ntasks be invoked on each core."
    )
    tasks_per_node: Optional[int] = Field(
        None, description="Request the maximum ntasks be invoked on each node."
    )
    tasks_per_socket: Optional[int] = Field(
        None,
        description="Request the maximum ntasks be invoked on each socket.",
    )
    thread_specification: Optional[int] = Field(
        None,
        description=(
            "Count of specialized threads per node reserved by the job for "
            "system operations and not used by the application."
        ),
    )
    threads_per_core: Optional[int] = Field(
        None,
        description=(
            "Restrict node selection to nodes with at least the specified "
            "number of threads per core."
        ),
    )
    time_limit: Optional[int] = Field(None, description="Step time limit.")
    time_minimum: Optional[int] = Field(
        None, description="Minimum run time in minutes."
    )
    wait_all_nodes: Optional[bool] = Field(
        None,
        description=(
            "Do not begin execution until all nodes are ready for use."
        ),
    )
    wckey: Optional[str] = Field(
        None, description="Specify wckey to be used with job."
    )

    class Config:
        """Config to set env var prefix to HPC"""

        # Reject unknown fields so typos don't silently drop slurm options.
        extra = Extra.forbid
        env_prefix = "HPC_"
373
+
374
+ @staticmethod
375
+ def script_command_str(sif_loc_str) -> str:
376
+ """This is the command that will be sent to the hpc"""
377
+ command_str = [
378
+ "#!/bin/bash",
379
+ "\nsingularity",
380
+ "exec",
381
+ "--cleanenv",
382
+ sif_loc_str,
383
+ "python",
384
+ "-m",
385
+ "aind_data_transfer.jobs.basic_job",
386
+ ]
387
+ return " ".join(command_str)
388
+
389
+ @staticmethod
390
+ def _set_default_val(values: dict, key: str, default_value: Any) -> None:
391
+ """Util method to set a default if value not in dict[key]"""
392
+ if values.get(key) is None:
393
+ values[key] = default_value
394
+ return None
395
+
396
+ @classmethod
397
+ def from_upload_job_configs(
398
+ cls,
399
+ logging_directory: PurePosixPath,
400
+ aws_secret_access_key: SecretStr,
401
+ aws_access_key_id: str,
402
+ aws_default_region: str,
403
+ aws_session_token: Optional[SecretStr] = None,
404
+ **kwargs,
405
+ ):
406
+ """
407
+ Class constructor to use when submitting a basic upload job request
408
+ Parameters
409
+ ----------
410
+ logging_directory : PurePosixPath
411
+ aws_secret_access_key : SecretStr
412
+ aws_access_key_id : str
413
+ aws_default_region : str
414
+ aws_session_token : Optional[SecretStr]
415
+ kwargs : dict
416
+ Hpc settings
417
+ """
418
+ hpc_env = {
419
+ "PATH": "/bin:/usr/bin/:/usr/local/bin/",
420
+ "LD_LIBRARY_PATH": "/lib/:/lib64/:/usr/local/lib",
421
+ "SINGULARITYENV_AWS_SECRET_ACCESS_KEY": (
422
+ aws_secret_access_key.get_secret_value()
423
+ ),
424
+ "SINGULARITYENV_AWS_ACCESS_KEY_ID": aws_access_key_id,
425
+ "SINGULARITYENV_AWS_DEFAULT_REGION": aws_default_region,
426
+ }
427
+ if aws_session_token is not None:
428
+ hpc_env[
429
+ "SINGULARITYENV_AWS_SESSION_TOKEN"
430
+ ] = aws_session_token.get_secret_value()
431
+ cls._set_default_val(kwargs, "environment", hpc_env)
432
+ cls._set_default_val(
433
+ kwargs,
434
+ "standard_out",
435
+ str(logging_directory / (kwargs["name"] + ".out")),
436
+ )
437
+ cls._set_default_val(
438
+ kwargs,
439
+ "standard_error",
440
+ str(logging_directory / (kwargs["name"] + "_error.out")),
441
+ )
442
+ return cls(**kwargs)
443
+
444
+ @classmethod
445
+ def attach_configs_to_script(
446
+ cls,
447
+ script: str,
448
+ base_configs: dict,
449
+ upload_configs_aws_param_store_name: Optional[str],
450
+ staging_directory: Optional[str],
451
+ ) -> str:
452
+ """
453
+ Helper method to attach configs to a base run command string.
454
+ Parameters
455
+ ----------
456
+ script : str
457
+ Can be like
458
+ '#!/bin/bash \nsingularity exec --cleanenv
459
+ feat_289.sif python -m aind_data_transfer.jobs.basic_job'
460
+ base_configs : dict
461
+ job_configs to attach as --json-args
462
+ upload_configs_aws_param_store_name : Optional[str]
463
+ Will supply this config if not in base_configs and not None
464
+ staging_directory : Optional[str]
465
+ Will supply this config if not in base_configs and not None
466
+
467
+ Returns
468
+ -------
469
+ str
470
+ The run command script to send to submit to the slurm cluster
471
+
472
+ """
473
+ if staging_directory is not None:
474
+ cls._set_default_val(
475
+ base_configs, "temp_directory", staging_directory
476
+ )
477
+ if upload_configs_aws_param_store_name is not None:
478
+ cls._set_default_val(
479
+ base_configs,
480
+ "aws_param_store_name",
481
+ upload_configs_aws_param_store_name,
482
+ )
483
+
484
+ return " ".join(
485
+ [
486
+ script,
487
+ "--json-args",
488
+ "'",
489
+ json.dumps(base_configs),
490
+ "'",
491
+ ]
492
+ )
@@ -0,0 +1,58 @@
1
+ """Module to handle setting up logger"""
2
+
3
+ import logging
4
+ from typing import Literal, Optional
5
+
6
+ from logging_loki import LokiHandler
7
+ from pydantic import Field
8
+ from pydantic_settings import BaseSettings
9
+
10
+
11
+ class LoggingConfigs(BaseSettings):
12
+ """Configs for logger"""
13
+
14
+ env_name: Optional[str] = Field(
15
+ default=None, description="Can be used to help tag logging source."
16
+ )
17
+ loki_uri: Optional[str] = Field(
18
+ default=None, description="URI of Loki logging server."
19
+ )
20
+ log_level: Literal[
21
+ "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"
22
+ ] = Field(default="INFO", description="Log level")
23
+
24
+ @property
25
+ def app_name(self):
26
+ """Build app name from configs"""
27
+ package_name = __package__
28
+ base_name = package_name.split(".")[0].replace("_", "-")
29
+ app_name = (
30
+ base_name
31
+ if self.env_name is None
32
+ else f"{base_name}-{self.env_name}"
33
+ )
34
+ return app_name
35
+
36
+ @property
37
+ def loki_path(self):
38
+ """Full path to log loki messages to"""
39
+ return (
40
+ None
41
+ if self.loki_uri is None
42
+ else f"{self.loki_uri}/loki/api/v1/push"
43
+ )
44
+
45
+
46
def get_logger(log_configs: LoggingConfigs) -> logging.Logger:
    """Return a logger that can be used to log messages.

    Parameters
    ----------
    log_configs : LoggingConfigs
      Supplies the log level, optional loki endpoint, and app name tag.

    Returns
    -------
    logging.Logger
      Module-level logger with its level set; a LokiHandler is attached
      when a loki_uri is configured.
    """
    level = logging.getLevelName(log_configs.log_level)
    logger = logging.getLogger(__name__)
    logger.setLevel(level)
    # Fix: logging.getLogger returns a shared instance, so the original
    # code attached a new LokiHandler on every call, duplicating shipped
    # logs. Only attach one if none is present yet.
    if log_configs.loki_uri is not None and not any(
        isinstance(h, LokiHandler) for h in logger.handlers
    ):
        handler = LokiHandler(
            url=log_configs.loki_path,
            version="1",
            tags={"application": log_configs.app_name},
        )
        logger.addHandler(handler)
    return logger
@@ -0,0 +1 @@
1
+ """Package for models"""