pybiolib 0.2.951__py3-none-any.whl → 1.2.1890__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biolib/__init__.py +357 -11
- biolib/_data_record/data_record.py +380 -0
- biolib/_index/__init__.py +0 -0
- biolib/_index/index.py +55 -0
- biolib/_index/query_result.py +103 -0
- biolib/_internal/__init__.py +0 -0
- biolib/_internal/add_copilot_prompts.py +58 -0
- biolib/_internal/add_gui_files.py +81 -0
- biolib/_internal/data_record/__init__.py +1 -0
- biolib/_internal/data_record/data_record.py +85 -0
- biolib/_internal/data_record/push_data.py +116 -0
- biolib/_internal/data_record/remote_storage_endpoint.py +43 -0
- biolib/_internal/errors.py +5 -0
- biolib/_internal/file_utils.py +125 -0
- biolib/_internal/fuse_mount/__init__.py +1 -0
- biolib/_internal/fuse_mount/experiment_fuse_mount.py +209 -0
- biolib/_internal/http_client.py +159 -0
- biolib/_internal/lfs/__init__.py +1 -0
- biolib/_internal/lfs/cache.py +51 -0
- biolib/_internal/libs/__init__.py +1 -0
- biolib/_internal/libs/fusepy/__init__.py +1257 -0
- biolib/_internal/push_application.py +488 -0
- biolib/_internal/runtime.py +22 -0
- biolib/_internal/string_utils.py +13 -0
- biolib/_internal/templates/__init__.py +1 -0
- biolib/_internal/templates/copilot_template/.github/instructions/general-app-knowledge.instructions.md +10 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-general.instructions.md +20 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-python.instructions.md +16 -0
- biolib/_internal/templates/copilot_template/.github/instructions/style-react-ts.instructions.md +47 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_app_inputs.prompt.md +11 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_onboard_repo.prompt.md +19 -0
- biolib/_internal/templates/copilot_template/.github/prompts/biolib_run_apps.prompt.md +12 -0
- biolib/_internal/templates/dashboard_template/.biolib/config.yml +5 -0
- biolib/_internal/templates/github_workflow_template/.github/workflows/biolib.yml +21 -0
- biolib/_internal/templates/gitignore_template/.gitignore +10 -0
- biolib/_internal/templates/gui_template/.yarnrc.yml +1 -0
- biolib/_internal/templates/gui_template/App.tsx +53 -0
- biolib/_internal/templates/gui_template/Dockerfile +27 -0
- biolib/_internal/templates/gui_template/biolib-sdk.ts +82 -0
- biolib/_internal/templates/gui_template/dev-data/output.json +7 -0
- biolib/_internal/templates/gui_template/index.css +5 -0
- biolib/_internal/templates/gui_template/index.html +13 -0
- biolib/_internal/templates/gui_template/index.tsx +10 -0
- biolib/_internal/templates/gui_template/package.json +27 -0
- biolib/_internal/templates/gui_template/tsconfig.json +24 -0
- biolib/_internal/templates/gui_template/vite-plugin-dev-data.ts +50 -0
- biolib/_internal/templates/gui_template/vite.config.mts +10 -0
- biolib/_internal/templates/init_template/.biolib/config.yml +19 -0
- biolib/_internal/templates/init_template/Dockerfile +14 -0
- biolib/_internal/templates/init_template/requirements.txt +1 -0
- biolib/_internal/templates/init_template/run.py +12 -0
- biolib/_internal/templates/init_template/run.sh +4 -0
- biolib/_internal/templates/templates.py +25 -0
- biolib/_internal/tree_utils.py +106 -0
- biolib/_internal/utils/__init__.py +65 -0
- biolib/_internal/utils/auth.py +46 -0
- biolib/_internal/utils/job_url.py +33 -0
- biolib/_internal/utils/multinode.py +263 -0
- biolib/_runtime/runtime.py +157 -0
- biolib/_session/session.py +44 -0
- biolib/_shared/__init__.py +0 -0
- biolib/_shared/types/__init__.py +74 -0
- biolib/_shared/types/account.py +12 -0
- biolib/_shared/types/account_member.py +8 -0
- biolib/_shared/types/app.py +9 -0
- biolib/_shared/types/data_record.py +40 -0
- biolib/_shared/types/experiment.py +32 -0
- biolib/_shared/types/file_node.py +17 -0
- biolib/_shared/types/push.py +6 -0
- biolib/_shared/types/resource.py +37 -0
- biolib/_shared/types/resource_deploy_key.py +11 -0
- biolib/_shared/types/resource_permission.py +14 -0
- biolib/_shared/types/resource_version.py +19 -0
- biolib/_shared/types/result.py +14 -0
- biolib/_shared/types/typing.py +10 -0
- biolib/_shared/types/user.py +19 -0
- biolib/_shared/utils/__init__.py +7 -0
- biolib/_shared/utils/resource_uri.py +75 -0
- biolib/api/__init__.py +6 -0
- biolib/api/client.py +168 -0
- biolib/app/app.py +252 -49
- biolib/app/search_apps.py +45 -0
- biolib/biolib_api_client/api_client.py +126 -31
- biolib/biolib_api_client/app_types.py +24 -4
- biolib/biolib_api_client/auth.py +31 -8
- biolib/biolib_api_client/biolib_app_api.py +147 -52
- biolib/biolib_api_client/biolib_job_api.py +161 -141
- biolib/biolib_api_client/job_types.py +21 -5
- biolib/biolib_api_client/lfs_types.py +7 -23
- biolib/biolib_api_client/user_state.py +56 -0
- biolib/biolib_binary_format/__init__.py +1 -4
- biolib/biolib_binary_format/file_in_container.py +105 -0
- biolib/biolib_binary_format/module_input.py +24 -7
- biolib/biolib_binary_format/module_output_v2.py +149 -0
- biolib/biolib_binary_format/remote_endpoints.py +34 -0
- biolib/biolib_binary_format/remote_stream_seeker.py +59 -0
- biolib/biolib_binary_format/saved_job.py +3 -2
- biolib/biolib_binary_format/{attestation_document.py → stdout_and_stderr.py} +8 -8
- biolib/biolib_binary_format/system_status_update.py +3 -2
- biolib/biolib_binary_format/utils.py +175 -0
- biolib/biolib_docker_client/__init__.py +11 -2
- biolib/biolib_errors.py +36 -0
- biolib/biolib_logging.py +27 -10
- biolib/cli/__init__.py +38 -0
- biolib/cli/auth.py +46 -0
- biolib/cli/data_record.py +164 -0
- biolib/cli/index.py +32 -0
- biolib/cli/init.py +421 -0
- biolib/cli/lfs.py +101 -0
- biolib/cli/push.py +50 -0
- biolib/cli/run.py +63 -0
- biolib/cli/runtime.py +14 -0
- biolib/cli/sdk.py +16 -0
- biolib/cli/start.py +56 -0
- biolib/compute_node/cloud_utils/cloud_utils.py +110 -161
- biolib/compute_node/job_worker/cache_state.py +66 -88
- biolib/compute_node/job_worker/cache_types.py +1 -6
- biolib/compute_node/job_worker/docker_image_cache.py +112 -37
- biolib/compute_node/job_worker/executors/__init__.py +0 -3
- biolib/compute_node/job_worker/executors/docker_executor.py +532 -199
- biolib/compute_node/job_worker/executors/docker_types.py +9 -1
- biolib/compute_node/job_worker/executors/types.py +19 -9
- biolib/compute_node/job_worker/job_legacy_input_wait_timeout_thread.py +30 -0
- biolib/compute_node/job_worker/job_max_runtime_timer_thread.py +3 -5
- biolib/compute_node/job_worker/job_storage.py +108 -0
- biolib/compute_node/job_worker/job_worker.py +397 -212
- biolib/compute_node/job_worker/large_file_system.py +87 -38
- biolib/compute_node/job_worker/network_alloc.py +99 -0
- biolib/compute_node/job_worker/network_buffer.py +240 -0
- biolib/compute_node/job_worker/utilization_reporter_thread.py +197 -0
- biolib/compute_node/job_worker/utils.py +9 -24
- biolib/compute_node/remote_host_proxy.py +400 -98
- biolib/compute_node/utils.py +31 -9
- biolib/compute_node/webserver/compute_node_results_proxy.py +189 -0
- biolib/compute_node/webserver/proxy_utils.py +28 -0
- biolib/compute_node/webserver/webserver.py +130 -44
- biolib/compute_node/webserver/webserver_types.py +2 -6
- biolib/compute_node/webserver/webserver_utils.py +77 -12
- biolib/compute_node/webserver/worker_thread.py +183 -42
- biolib/experiments/__init__.py +0 -0
- biolib/experiments/experiment.py +356 -0
- biolib/jobs/__init__.py +1 -0
- biolib/jobs/job.py +741 -0
- biolib/jobs/job_result.py +185 -0
- biolib/jobs/types.py +50 -0
- biolib/py.typed +0 -0
- biolib/runtime/__init__.py +14 -0
- biolib/sdk/__init__.py +91 -0
- biolib/tables.py +34 -0
- biolib/typing_utils.py +2 -7
- biolib/user/__init__.py +1 -0
- biolib/user/sign_in.py +54 -0
- biolib/utils/__init__.py +162 -0
- biolib/utils/cache_state.py +94 -0
- biolib/utils/multipart_uploader.py +194 -0
- biolib/utils/seq_util.py +150 -0
- biolib/utils/zip/remote_zip.py +640 -0
- pybiolib-1.2.1890.dist-info/METADATA +41 -0
- pybiolib-1.2.1890.dist-info/RECORD +177 -0
- {pybiolib-0.2.951.dist-info → pybiolib-1.2.1890.dist-info}/WHEEL +1 -1
- pybiolib-1.2.1890.dist-info/entry_points.txt +2 -0
- README.md +0 -17
- biolib/app/app_result.py +0 -68
- biolib/app/utils.py +0 -62
- biolib/biolib-js/0-biolib.worker.js +0 -1
- biolib/biolib-js/1-biolib.worker.js +0 -1
- biolib/biolib-js/2-biolib.worker.js +0 -1
- biolib/biolib-js/3-biolib.worker.js +0 -1
- biolib/biolib-js/4-biolib.worker.js +0 -1
- biolib/biolib-js/5-biolib.worker.js +0 -1
- biolib/biolib-js/6-biolib.worker.js +0 -1
- biolib/biolib-js/index.html +0 -10
- biolib/biolib-js/main-biolib.js +0 -1
- biolib/biolib_api_client/biolib_account_api.py +0 -21
- biolib/biolib_api_client/biolib_large_file_system_api.py +0 -108
- biolib/biolib_binary_format/aes_encrypted_package.py +0 -42
- biolib/biolib_binary_format/module_output.py +0 -58
- biolib/biolib_binary_format/rsa_encrypted_aes_package.py +0 -57
- biolib/biolib_push.py +0 -114
- biolib/cli.py +0 -203
- biolib/cli_utils.py +0 -273
- biolib/compute_node/cloud_utils/enclave_parent_types.py +0 -7
- biolib/compute_node/enclave/__init__.py +0 -2
- biolib/compute_node/enclave/enclave_remote_hosts.py +0 -53
- biolib/compute_node/enclave/nitro_secure_module_utils.py +0 -64
- biolib/compute_node/job_worker/executors/base_executor.py +0 -18
- biolib/compute_node/job_worker/executors/pyppeteer_executor.py +0 -173
- biolib/compute_node/job_worker/executors/remote/__init__.py +0 -1
- biolib/compute_node/job_worker/executors/remote/nitro_enclave_utils.py +0 -81
- biolib/compute_node/job_worker/executors/remote/remote_executor.py +0 -51
- biolib/lfs.py +0 -196
- biolib/pyppeteer/.circleci/config.yml +0 -100
- biolib/pyppeteer/.coveragerc +0 -3
- biolib/pyppeteer/.gitignore +0 -89
- biolib/pyppeteer/.pre-commit-config.yaml +0 -28
- biolib/pyppeteer/CHANGES.md +0 -253
- biolib/pyppeteer/CONTRIBUTING.md +0 -26
- biolib/pyppeteer/LICENSE +0 -12
- biolib/pyppeteer/README.md +0 -137
- biolib/pyppeteer/docs/Makefile +0 -177
- biolib/pyppeteer/docs/_static/custom.css +0 -28
- biolib/pyppeteer/docs/_templates/layout.html +0 -10
- biolib/pyppeteer/docs/changes.md +0 -1
- biolib/pyppeteer/docs/conf.py +0 -299
- biolib/pyppeteer/docs/index.md +0 -21
- biolib/pyppeteer/docs/make.bat +0 -242
- biolib/pyppeteer/docs/reference.md +0 -211
- biolib/pyppeteer/docs/server.py +0 -60
- biolib/pyppeteer/poetry.lock +0 -1699
- biolib/pyppeteer/pyppeteer/__init__.py +0 -135
- biolib/pyppeteer/pyppeteer/accessibility.py +0 -286
- biolib/pyppeteer/pyppeteer/browser.py +0 -401
- biolib/pyppeteer/pyppeteer/browser_fetcher.py +0 -194
- biolib/pyppeteer/pyppeteer/command.py +0 -22
- biolib/pyppeteer/pyppeteer/connection/__init__.py +0 -242
- biolib/pyppeteer/pyppeteer/connection/cdpsession.py +0 -101
- biolib/pyppeteer/pyppeteer/coverage.py +0 -346
- biolib/pyppeteer/pyppeteer/device_descriptors.py +0 -787
- biolib/pyppeteer/pyppeteer/dialog.py +0 -79
- biolib/pyppeteer/pyppeteer/domworld.py +0 -597
- biolib/pyppeteer/pyppeteer/emulation_manager.py +0 -53
- biolib/pyppeteer/pyppeteer/errors.py +0 -48
- biolib/pyppeteer/pyppeteer/events.py +0 -63
- biolib/pyppeteer/pyppeteer/execution_context.py +0 -156
- biolib/pyppeteer/pyppeteer/frame/__init__.py +0 -299
- biolib/pyppeteer/pyppeteer/frame/frame_manager.py +0 -306
- biolib/pyppeteer/pyppeteer/helpers.py +0 -245
- biolib/pyppeteer/pyppeteer/input.py +0 -371
- biolib/pyppeteer/pyppeteer/jshandle.py +0 -598
- biolib/pyppeteer/pyppeteer/launcher.py +0 -683
- biolib/pyppeteer/pyppeteer/lifecycle_watcher.py +0 -169
- biolib/pyppeteer/pyppeteer/models/__init__.py +0 -103
- biolib/pyppeteer/pyppeteer/models/_protocol.py +0 -12460
- biolib/pyppeteer/pyppeteer/multimap.py +0 -82
- biolib/pyppeteer/pyppeteer/network_manager.py +0 -678
- biolib/pyppeteer/pyppeteer/options.py +0 -8
- biolib/pyppeteer/pyppeteer/page.py +0 -1728
- biolib/pyppeteer/pyppeteer/pipe_transport.py +0 -59
- biolib/pyppeteer/pyppeteer/target.py +0 -147
- biolib/pyppeteer/pyppeteer/task_queue.py +0 -24
- biolib/pyppeteer/pyppeteer/timeout_settings.py +0 -36
- biolib/pyppeteer/pyppeteer/tracing.py +0 -93
- biolib/pyppeteer/pyppeteer/us_keyboard_layout.py +0 -305
- biolib/pyppeteer/pyppeteer/util.py +0 -18
- biolib/pyppeteer/pyppeteer/websocket_transport.py +0 -47
- biolib/pyppeteer/pyppeteer/worker.py +0 -101
- biolib/pyppeteer/pyproject.toml +0 -97
- biolib/pyppeteer/spell.txt +0 -137
- biolib/pyppeteer/tox.ini +0 -72
- biolib/pyppeteer/utils/generate_protocol_types.py +0 -603
- biolib/start_cli.py +0 -7
- biolib/utils.py +0 -47
- biolib/validators/validate_app_version.py +0 -183
- biolib/validators/validate_argument.py +0 -134
- biolib/validators/validate_module.py +0 -323
- biolib/validators/validate_zip_file.py +0 -40
- biolib/validators/validator_utils.py +0 -103
- pybiolib-0.2.951.dist-info/LICENSE +0 -21
- pybiolib-0.2.951.dist-info/METADATA +0 -61
- pybiolib-0.2.951.dist-info/RECORD +0 -153
- pybiolib-0.2.951.dist-info/entry_points.txt +0 -3
- /LICENSE → /pybiolib-1.2.1890.dist-info/licenses/LICENSE +0 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
import time
|
|
3
|
+
import subprocess
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
|
|
6
|
+
from docker.models.containers import Container # type: ignore
|
|
7
|
+
|
|
8
|
+
import biolib.api.client
|
|
9
|
+
from biolib.biolib_logging import logger_no_user_data
|
|
10
|
+
from biolib.typing_utils import List, TypedDict, Optional, Dict, cast
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class UtilizationMetricSample(TypedDict):
    """One utilization measurement collected per iteration of the reporter thread's sampling loop."""

    # Container CPU time delta relative to the system CPU time delta, times 100
    cpu_usage_in_percent: float
    # None when GPU stats are not requested or reading them failed
    gpu_usage_in_percent: Optional[float]
    # Container memory usage divided by its memory limit, times 100
    memory_usage_in_percent: float
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AggregatedUtilizationMetrics(TypedDict):
    """Average and maximum utilization over a batch of samples; this is the payload posted to the API."""

    cpu_average_usage_in_percent: float
    cpu_max_usage_in_percent: float
    # GPU fields are None when no sample in the batch carried a GPU reading
    gpu_average_usage_in_percent: Optional[float]
    gpu_max_usage_in_percent: Optional[float]
    memory_average_usage_in_percent: float
    memory_max_usage_in_percent: float
    # ISO 8601 timestamp (UTC) taken when the aggregate is built
    recorded_at: str
    # Length of the time window the aggregate is reported for, in milliseconds
    sampling_period_in_milliseconds: int
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class CpuUsage(TypedDict):
    """The ``cpu_usage`` entry of a container stats payload (only the key read by this module)."""

    # Cumulative CPU time consumed by the container (presumably nanoseconds - TODO confirm against Docker docs)
    total_usage: float
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class _CpuStats(TypedDict, total=False):
    """Base part of the ``cpu_stats`` entry; ``total=False`` marks every key as optional."""

    cpu_usage: CpuUsage
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CpuStats(_CpuStats, total=False):
    """The ``cpu_stats`` entry of a container stats payload; every key is optional."""

    # Cumulative CPU time of the whole host; the sampling loop treats a missing value as 0.0
    system_cpu_usage: float
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class MemoryStats(TypedDict, total=False):
    """The ``memory_stats`` entry of a container stats payload; both keys may be absent."""

    # Current memory usage of the container (presumably bytes - TODO confirm)
    usage: float
    # Memory limit of the container, in the same unit as ``usage``
    limit: float
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ContainerStats(TypedDict):
    """The subset of the ``Container.stats(stream=False)`` result that the reporter thread reads."""

    cpu_stats: CpuStats
    memory_stats: MemoryStats
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class UtilizationReporterThread(threading.Thread):
    """Sample a job container's CPU, memory and (optionally) GPU utilization and report aggregates.

    The thread polls the container's stats in a loop, sleeping one sampling period between polls,
    and posts an aggregated metric to the BioLib API each time ``_samples_between_writes`` samples
    have been collected. When the stats can no longer be fetched the loop ends and the samples
    that have not been reported yet are sent in a final request.
    """

    def __init__(self, container: Container, job_uuid: str, compute_node_auth_token: str, include_gpu_stats: bool):
        """Store the container to monitor and the identifiers used when reporting.

        Args:
            container: Docker container whose stats are sampled.
            job_uuid: UUID of the job; used in log messages and in the reporting URL.
            compute_node_auth_token: Sent as the ``Compute-Node-Auth-Token`` header when reporting.
            include_gpu_stats: Whether to query ``nvidia-smi`` for GPU utilization.
        """
        super().__init__(daemon=False)  # Do not run as daemon thread to ensure final reporting request goes through
        self._container_object: Container = container
        self._job_uuid: str = job_uuid
        self._compute_node_auth_token: str = compute_node_auth_token

        self._sampling_period_in_milliseconds = 1_000
        self._samples_between_writes = 60
        self._attempt_to_get_gpu_stats = include_gpu_stats

    def run(self) -> None:
        """Thread entry point: run the sampling loop and log (never propagate) any error."""
        try:
            self._run_helper()
        except BaseException as error:
            logger_no_user_data.exception(f'UtilizationReporterThread hit error: {error}')

    def _run_helper(self) -> None:
        """Collect one sample per iteration until container stats are unavailable, then flush."""
        logger_no_user_data.debug(f'Job "{self._job_uuid}" utilization metrics reporter thread started')
        # NOTE(review): the baseline starts at zero, so the first sample compares the cumulative
        # counters against nothing rather than against a previous reading.
        prev_cpu_usage: float = 0.0
        prev_cpu_system_usage: float = 0.0
        metric_samples: List[UtilizationMetricSample] = []
        while True:
            stats = self._get_container_stats()
            if not stats:
                break

            cpu_total_usage = stats['cpu_stats']['cpu_usage']['total_usage']
            cpu_system_usage = stats['cpu_stats'].get('system_cpu_usage', 0.0)

            # Calculate CPU usage
            cpu_usage_delta_ns = cpu_total_usage - prev_cpu_usage
            cpu_system_usage_delta_ns = cpu_system_usage - prev_cpu_system_usage

            cpu_usage_in_percent = 0.0
            if cpu_system_usage_delta_ns:
                cpu_usage_in_percent = (cpu_usage_delta_ns / cpu_system_usage_delta_ns) * 100

            # Set previous usage
            prev_cpu_usage = cpu_total_usage
            prev_cpu_system_usage = cpu_system_usage

            memory_usage_in_percent = 0.0
            memory_stats = stats['memory_stats']
            memory_limit = memory_stats.get('limit')
            # A missing or zero limit leaves the sample at 0.0 instead of raising ZeroDivisionError,
            # which would end the thread and drop the samples collected so far.
            if 'usage' in memory_stats and memory_limit:
                memory_usage_in_percent = memory_stats['usage'] / memory_limit * 100

            gpu_usage_in_percent = self._get_gpu_utilization_in_percent()

            metric_sample = UtilizationMetricSample(
                cpu_usage_in_percent=cpu_usage_in_percent,
                memory_usage_in_percent=memory_usage_in_percent,
                gpu_usage_in_percent=gpu_usage_in_percent,
            )
            metric_samples.append(metric_sample)

            if len(metric_samples) >= self._samples_between_writes:
                self._report_aggregated_utilization_metric(metric_samples)
                metric_samples = []

            time.sleep(self._sampling_period_in_milliseconds / 1_000)

        logger_no_user_data.debug(f'Job "{self._job_uuid}" reporting remaining samples after container has exited')
        self._report_aggregated_utilization_metric(metric_samples)
        logger_no_user_data.debug(f'Job "{self._job_uuid}" utilization metrics reporter thread exiting')

    def _get_gpu_utilization_in_percent(self) -> Optional[float]:
        """Return the utilization of the first GPU listed by ``nvidia-smi``, or None.

        None is returned when GPU stats are disabled. Any failure is logged and disables
        further attempts for the lifetime of the thread.
        """
        if not self._attempt_to_get_gpu_stats:
            return None
        try:
            # Argument list instead of a shell string: no shell is needed for a fixed command
            cmd = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader']
            utilization = subprocess.check_output(cmd, stderr=subprocess.DEVNULL).decode('utf-8')
            # One line per GPU, each parsed by stripping the ' %' suffix
            utilization_for_each_gpu = [float(x.replace(' %', '')) for x in utilization.strip().split('\n')]
            utilization_for_first_gpu = utilization_for_each_gpu[0]
            return utilization_for_first_gpu
        except BaseException as error:
            logger_no_user_data.exception(f'Failed to get GPU utilization got error: {error}')
            self._attempt_to_get_gpu_stats = False
            return None

    def _get_container_stats(self) -> Optional[ContainerStats]:
        """Return a one-shot stats reading of the container, or None if it cannot be fetched."""
        try:
            return cast(ContainerStats, self._container_object.stats(stream=False))
        except BaseException as error:
            # Assume the container no longer exists and return None
            logger_no_user_data.debug(f'Job "{self._job_uuid}" could not get container stats: {error}')
            return None

    def _get_aggregated_utilization_metric_from_metric_samples(
        self,
        metric_samples: List[UtilizationMetricSample],
    ) -> AggregatedUtilizationMetrics:
        """Reduce a non-empty batch of samples to average and maximum values.

        Args:
            metric_samples: Samples to aggregate; must not be empty.

        Returns:
            The aggregate, stamped with the current UTC time and the period the batch covers.

        Raises:
            ZeroDivisionError: If ``metric_samples`` is empty.
        """
        cpu_values = [sample['cpu_usage_in_percent'] for sample in metric_samples]
        memory_values = [sample['memory_usage_in_percent'] for sample in metric_samples]
        # Only samples that actually carry a GPU reading take part in the GPU aggregate
        gpu_values = [
            sample['gpu_usage_in_percent']
            for sample in metric_samples
            if sample['gpu_usage_in_percent'] is not None
        ]

        sample_count = len(metric_samples)
        cpu_average_usage_in_percent = sum(cpu_values, 0.0) / sample_count
        memory_average_usage_in_percent = sum(memory_values, 0.0) / sample_count

        gpu_average_usage_in_percent: Optional[float] = None
        gpu_max_usage_in_percent: Optional[float] = None
        if gpu_values:
            # Divide by the number of GPU readings, not by the batch size, so that samples
            # without a reading (e.g. after GPU sampling was disabled) do not drag the average down.
            gpu_average_usage_in_percent = sum(gpu_values, 0.0) / len(gpu_values)
            gpu_max_usage_in_percent = max([0.0, *gpu_values])

        return AggregatedUtilizationMetrics(
            cpu_average_usage_in_percent=cpu_average_usage_in_percent,
            # Maxima are floored at 0.0, as a running maximum that starts at 0.0 would be
            cpu_max_usage_in_percent=max([0.0, *cpu_values]),
            gpu_average_usage_in_percent=gpu_average_usage_in_percent,
            gpu_max_usage_in_percent=gpu_max_usage_in_percent,
            memory_average_usage_in_percent=memory_average_usage_in_percent,
            memory_max_usage_in_percent=max([0.0, *memory_values]),
            recorded_at=datetime.now(timezone.utc).isoformat(),
            # Use the real batch size so the final, partial batch does not claim a full window
            sampling_period_in_milliseconds=self._sampling_period_in_milliseconds * sample_count,
        )

    def _report_aggregated_utilization_metric(self, metric_samples: List[UtilizationMetricSample]) -> None:
        """Aggregate the samples and post them to the API; failures are logged, not raised."""
        if not metric_samples:
            logger_no_user_data.debug(f'Job "{self._job_uuid}" no metric samples to aggregate. Skipping reporting.')
            return

        aggregated_metrics = self._get_aggregated_utilization_metric_from_metric_samples(metric_samples)
        logger_no_user_data.debug(f'Job "{self._job_uuid}" reporting aggregated metrics {aggregated_metrics}')

        try:
            biolib.api.client.post(
                path=f'/internal/compute-nodes/jobs/{self._job_uuid}/utilization-metrics/',
                headers={'Compute-Node-Auth-Token': self._compute_node_auth_token},
                data=cast(Dict, aggregated_metrics),
            )
        except BaseException as error:
            logger_no_user_data.error(
                f'Job "{self._job_uuid}" failed to report metrics: {aggregated_metrics} got error: {error}'
            )
|
|
@@ -1,10 +1,7 @@
|
|
|
1
|
-
import logging
|
|
2
1
|
import subprocess
|
|
3
2
|
|
|
4
|
-
from biolib.biolib_logging import logger
|
|
3
|
+
from biolib.biolib_logging import logger, logger_no_user_data
|
|
5
4
|
from biolib import utils
|
|
6
|
-
from biolib.compute_node.cloud_utils.cloud_utils import CloudUtils
|
|
7
|
-
from biolib.compute_node.utils import SystemExceptionCodes
|
|
8
5
|
from biolib.typing_utils import Callable
|
|
9
6
|
|
|
10
7
|
|
|
@@ -12,18 +9,17 @@ class ComputeProcessException(Exception):
|
|
|
12
9
|
def __init__(self, original_error: Exception, biolib_error_code,
|
|
13
10
|
# Not using SendSystemExceptionType since importing it leads to many circular import problems
|
|
14
11
|
# TODO: Fix circular import problems when importing SendSystemExceptionType
|
|
15
|
-
send_system_exception: Callable[[
|
|
12
|
+
send_system_exception: Callable[[int], None],
|
|
16
13
|
may_contain_user_data: bool = True):
|
|
17
14
|
super().__init__()
|
|
18
15
|
|
|
19
|
-
if
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
)
|
|
16
|
+
if not may_contain_user_data:
|
|
17
|
+
logger_no_user_data.error(str(original_error))
|
|
18
|
+
else:
|
|
19
|
+
logger_no_user_data.debug('Hit a ComputeProcessException that may contain user data')
|
|
20
|
+
logger.error(str(original_error))
|
|
24
21
|
|
|
25
22
|
send_system_exception(biolib_error_code)
|
|
26
|
-
logger.error(original_error)
|
|
27
23
|
|
|
28
24
|
|
|
29
25
|
def log_disk_and_memory_usage_info() -> None:
|
|
@@ -31,16 +27,5 @@ def log_disk_and_memory_usage_info() -> None:
|
|
|
31
27
|
disk_usage_info = subprocess.run(['df', '-h'], check=False, capture_output=True)
|
|
32
28
|
memory_usage_info = subprocess.run(['free', '-h', '--si'], check=False, capture_output=True)
|
|
33
29
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
log_message=disk_usage_info.stdout.decode(),
|
|
37
|
-
level=logging.DEBUG
|
|
38
|
-
)
|
|
39
|
-
CloudUtils.log(
|
|
40
|
-
log_message=memory_usage_info.stdout.decode(),
|
|
41
|
-
level=logging.DEBUG
|
|
42
|
-
)
|
|
43
|
-
|
|
44
|
-
else:
|
|
45
|
-
logger.debug(disk_usage_info)
|
|
46
|
-
logger.debug(memory_usage_info)
|
|
30
|
+
logger_no_user_data.debug(disk_usage_info)
|
|
31
|
+
logger_no_user_data.debug(memory_usage_info)
|