matrice-compute 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- matrice_compute/__init__.py +12 -1
- matrice_compute/action_instance.py +32 -1
- matrice_compute/instance_manager.py +9 -8
- matrice_compute/scaling.py +300 -218
- {matrice_compute-0.1.11.dist-info → matrice_compute-0.1.13.dist-info}/METADATA +1 -1
- {matrice_compute-0.1.11.dist-info → matrice_compute-0.1.13.dist-info}/RECORD +9 -9
- {matrice_compute-0.1.11.dist-info → matrice_compute-0.1.13.dist-info}/WHEEL +0 -0
- {matrice_compute-0.1.11.dist-info → matrice_compute-0.1.13.dist-info}/licenses/LICENSE.txt +0 -0
- {matrice_compute-0.1.11.dist-info → matrice_compute-0.1.13.dist-info}/top_level.txt +0 -0
matrice_compute/__init__.py
CHANGED
|
@@ -1,9 +1,20 @@
|
|
|
1
1
|
"""Module providing __init__ functionality."""
|
|
2
2
|
|
|
3
|
+
import subprocess
|
|
3
4
|
|
|
4
5
|
from matrice_common.utils import dependencies_check
|
|
5
6
|
|
|
6
|
-
dependencies_check(
|
|
7
|
+
dependencies_check(
|
|
8
|
+
["docker", "psutil", "cryptography", "notebook", "aiohttp", "kafka-python"]
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
subprocess.run( # Re-upgrade docker to avoid missing DOCKER_HOST connection error
|
|
12
|
+
["pip", "install", "--upgrade", "docker"],
|
|
13
|
+
check=True,
|
|
14
|
+
stdout=subprocess.DEVNULL, # suppress normal output
|
|
15
|
+
stderr=subprocess.DEVNULL # suppress warnings/progress
|
|
16
|
+
)
|
|
17
|
+
|
|
7
18
|
from matrice_compute.instance_manager import InstanceManager # noqa: E402
|
|
8
19
|
|
|
9
20
|
__all__ = ["InstanceManager"]
|
|
@@ -74,7 +74,8 @@ class ActionInstance:
|
|
|
74
74
|
"streaming_gateway": streaming_gateway_execute,
|
|
75
75
|
"facial_recognition_setup": facial_recognition_setup_execute,
|
|
76
76
|
"fe_fs_streaming": fe_fs_streaming_execute,
|
|
77
|
-
"inference_ws_server": inference_ws_server_execute
|
|
77
|
+
"inference_ws_server": inference_ws_server_execute,
|
|
78
|
+
"lpr_setup": lpr_setup_execute
|
|
78
79
|
}
|
|
79
80
|
if self.action_type not in self.actions_map:
|
|
80
81
|
raise ValueError(f"Unknown action type: {self.action_type}")
|
|
@@ -1100,6 +1101,36 @@ def facial_recognition_setup_execute(self: ActionInstance):
|
|
|
1100
1101
|
# Docker Command run
|
|
1101
1102
|
self.start(worker_cmd, "facial_recognition_setup")
|
|
1102
1103
|
|
|
1104
|
+
@log_errors(raise_exception=False)
|
|
1105
|
+
def lpr_setup_execute(self: ActionInstance):
|
|
1106
|
+
"""
|
|
1107
|
+
Creates and setup the database for license plate server.
|
|
1108
|
+
"""
|
|
1109
|
+
action_details = self.get_action_details()
|
|
1110
|
+
|
|
1111
|
+
if not action_details:
|
|
1112
|
+
return
|
|
1113
|
+
image = self.docker_container
|
|
1114
|
+
external_port = self.scaling.get_open_port()
|
|
1115
|
+
|
|
1116
|
+
self.setup_action_requirements(action_details)
|
|
1117
|
+
|
|
1118
|
+
# Add worker container run command
|
|
1119
|
+
worker_cmd = (
|
|
1120
|
+
f"docker run -d --pull=always "
|
|
1121
|
+
f"--name lpr-worker "
|
|
1122
|
+
f"-p {external_port}:8082 "
|
|
1123
|
+
f'-e ENV="{os.environ.get("ENV", "prod")}" '
|
|
1124
|
+
f'-e MATRICE_SECRET_ACCESS_KEY="{self.matrice_secret_access_key}" '
|
|
1125
|
+
f'-e MATRICE_ACCESS_KEY_ID="{self.matrice_access_key_id}" '
|
|
1126
|
+
f'-e ACTION_ID="{self.action_record_id}" '
|
|
1127
|
+
f"{image}"
|
|
1128
|
+
)
|
|
1129
|
+
print("Worker docker run command:", worker_cmd)
|
|
1130
|
+
|
|
1131
|
+
# Docker Command run
|
|
1132
|
+
self.start(worker_cmd, "lpr_setup")
|
|
1133
|
+
|
|
1103
1134
|
@log_errors(raise_exception=False)
|
|
1104
1135
|
def inference_ws_server_execute(self: ActionInstance):
|
|
1105
1136
|
"""
|
|
@@ -153,7 +153,8 @@ class InstanceManager:
|
|
|
153
153
|
key,
|
|
154
154
|
value,
|
|
155
155
|
) in manual_instance_info.items():
|
|
156
|
-
|
|
156
|
+
if value is not None:
|
|
157
|
+
os.environ[key] = str(value)
|
|
157
158
|
if not (os.environ.get("SERVICE_PROVIDER") and os.environ.get("INSTANCE_ID")):
|
|
158
159
|
raise Exception(
|
|
159
160
|
"SERVICE_PROVIDER and INSTANCE_ID must be set as environment variables or passed as arguments"
|
|
@@ -220,13 +221,13 @@ class InstanceManager:
|
|
|
220
221
|
"Error in shutdown_manager handle_shutdown: %s",
|
|
221
222
|
str(exc),
|
|
222
223
|
)
|
|
223
|
-
try:
|
|
224
|
-
|
|
225
|
-
except Exception as exc:
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
224
|
+
# try:
|
|
225
|
+
# self.scale_down_manager.auto_scaledown_actions()
|
|
226
|
+
# except Exception as exc:
|
|
227
|
+
# logging.error(
|
|
228
|
+
# "Error in scale_down_manager auto_scaledown_actions: %s",
|
|
229
|
+
# str(exc),
|
|
230
|
+
# )
|
|
230
231
|
try:
|
|
231
232
|
self.machine_resources_tracker.update_available_resources()
|
|
232
233
|
except Exception as exc:
|
matrice_compute/scaling.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import os
|
|
4
4
|
import logging
|
|
5
5
|
from matrice_common.utils import log_errors
|
|
6
|
-
from kafka import KafkaProducer, KafkaConsumer
|
|
6
|
+
# from kafka import KafkaProducer, KafkaConsumer
|
|
7
7
|
import uuid
|
|
8
8
|
import json
|
|
9
9
|
import time
|
|
@@ -37,32 +37,34 @@ class Scaling:
|
|
|
37
37
|
"Initialized Scaling with instance_id: %s",
|
|
38
38
|
instance_id,
|
|
39
39
|
)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
40
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
41
|
+
# self.kafka_config = {
|
|
42
|
+
# "bootstrap_servers": self.get_kafka_bootstrap_servers(),
|
|
43
|
+
# "api_request_topic": "action_requests",
|
|
44
|
+
# "api_response_topic": "action_responses",
|
|
45
|
+
# "scaling_request_topic": "compute_requests",
|
|
46
|
+
# "scaling_response_topic": "compute_responses"
|
|
47
|
+
# }
|
|
48
|
+
# self.kafka_producer = KafkaProducer(
|
|
49
|
+
# bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
50
|
+
# value_serializer=lambda v: json.dumps(v).encode("utf-8"),)
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
54
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
55
|
+
# @log_errors(default_return=(None, "Error creating Kafka producer", "Kafka producer creation failed"), log_error=True)
|
|
56
|
+
# def get_kafka_bootstrap_servers(self):
|
|
57
|
+
# """Get Kafka bootstrap servers from API and decode base64 fields."""
|
|
58
|
+
# path = "/v1/actions/get_kafka_info"
|
|
59
|
+
# response = self.rpc.get(path=path)
|
|
60
|
+
# if not response or not response.get("success"):
|
|
61
|
+
# raise ValueError(f"Failed to fetch Kafka config: {response.get('message', 'No response')}")
|
|
62
|
+
# encoded_ip = response["data"]["ip"]
|
|
63
|
+
# encoded_port = response["data"]["port"]
|
|
64
|
+
# ip = base64.b64decode(encoded_ip).decode("utf-8")
|
|
65
|
+
# port = base64.b64decode(encoded_port).decode("utf-8")
|
|
66
|
+
# bootstrap_servers = f"{ip}:{port}"
|
|
67
|
+
# return bootstrap_servers
|
|
66
68
|
|
|
67
69
|
@log_errors(default_return=(None, "Error processing response", "Response processing failed"), log_error=True)
|
|
68
70
|
def handle_response(self, resp, success_message, error_message):
|
|
@@ -285,34 +287,44 @@ class Scaling:
|
|
|
285
287
|
|
|
286
288
|
@log_errors(log_error=True)
|
|
287
289
|
def get_action_details(self, action_status_id):
|
|
288
|
-
"""Get details for a specific action using
|
|
290
|
+
"""Get details for a specific action using REST API.
|
|
291
|
+
|
|
292
|
+
Args:
|
|
293
|
+
action_status_id: ID of the action status to fetch
|
|
294
|
+
|
|
295
|
+
Returns:
|
|
296
|
+
Tuple of (data, error, message) from API response
|
|
297
|
+
"""
|
|
289
298
|
logging.info("Getting action details for action %s", action_status_id)
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
#
|
|
293
|
-
data, error, message, kafka_response_received = self._send_kafka_request(
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
)
|
|
300
|
-
if
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
#
|
|
299
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
300
|
+
# api = "get_action_details"
|
|
301
|
+
# payload = {"actionRecordId": action_status_id}
|
|
302
|
+
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
303
|
+
# api=api,
|
|
304
|
+
# payload=payload,
|
|
305
|
+
# request_topic=self.kafka_config["api_request_topic"],
|
|
306
|
+
# response_topic=self.kafka_config["api_response_topic"],
|
|
307
|
+
# timeout=60
|
|
308
|
+
# )
|
|
309
|
+
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
310
|
+
# if kafka_response_received:
|
|
311
|
+
# if error:
|
|
312
|
+
# logging.warning("Kafka returned error for get_action_details: %s. Falling back to REST API.", error)
|
|
313
|
+
# else:
|
|
314
|
+
# return data, error, message
|
|
315
|
+
|
|
316
|
+
# Using REST API directly
|
|
304
317
|
try:
|
|
305
318
|
path = f"/v1/actions/action/{action_status_id}/details"
|
|
306
319
|
resp = self.rpc.get(path=path)
|
|
307
320
|
return self.handle_response(
|
|
308
321
|
resp,
|
|
309
|
-
"Task details fetched successfully
|
|
310
|
-
"Could not fetch the task details
|
|
322
|
+
"Task details fetched successfully",
|
|
323
|
+
"Could not fetch the task details",
|
|
311
324
|
)
|
|
312
325
|
except Exception as e:
|
|
313
|
-
logging.error("REST
|
|
314
|
-
|
|
315
|
-
return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
326
|
+
logging.error("REST API failed (get_action_details): %s", e)
|
|
327
|
+
return None, f"Failed via REST: {e}", "REST API failed"
|
|
316
328
|
|
|
317
329
|
|
|
318
330
|
@log_errors(log_error=True)
|
|
@@ -327,11 +339,26 @@ class Scaling:
|
|
|
327
339
|
service="",
|
|
328
340
|
job_params=None,
|
|
329
341
|
):
|
|
330
|
-
"""Update an action using
|
|
342
|
+
"""Update an action using REST API.
|
|
343
|
+
|
|
344
|
+
Args:
|
|
345
|
+
id: Action ID
|
|
346
|
+
step_code: Step code
|
|
347
|
+
action_type: Type of action
|
|
348
|
+
status: Status of the action
|
|
349
|
+
sub_action: Sub-action details
|
|
350
|
+
status_description: Description of the status
|
|
351
|
+
service: Service name
|
|
352
|
+
job_params: Job parameters dictionary
|
|
353
|
+
|
|
354
|
+
Returns:
|
|
355
|
+
Tuple of (data, error, message) from API response
|
|
356
|
+
"""
|
|
331
357
|
if job_params is None:
|
|
332
358
|
job_params = {}
|
|
333
359
|
logging.info("Updating action %s", id)
|
|
334
|
-
|
|
360
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
361
|
+
# api = "update_action"
|
|
335
362
|
payload = {
|
|
336
363
|
"_id": id,
|
|
337
364
|
"stepCode": step_code,
|
|
@@ -342,63 +369,85 @@ class Scaling:
|
|
|
342
369
|
"serviceName": service,
|
|
343
370
|
"jobParams": job_params,
|
|
344
371
|
}
|
|
345
|
-
data, error, message, kafka_response_received = self._send_kafka_request(
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
)
|
|
352
|
-
if
|
|
353
|
-
|
|
372
|
+
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
373
|
+
# api=api,
|
|
374
|
+
# payload=payload,
|
|
375
|
+
# request_topic=self.kafka_config["api_request_topic"],
|
|
376
|
+
# response_topic=self.kafka_config["api_response_topic"],
|
|
377
|
+
# timeout=60
|
|
378
|
+
# )
|
|
379
|
+
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
380
|
+
# if kafka_response_received:
|
|
381
|
+
# if error:
|
|
382
|
+
# logging.warning("Kafka returned error for update_action: %s. Falling back to REST API.", error)
|
|
383
|
+
# else:
|
|
384
|
+
# return data, error, message
|
|
385
|
+
|
|
386
|
+
# Using REST API directly
|
|
354
387
|
try:
|
|
355
388
|
path = "/v1/actions"
|
|
356
389
|
resp = self.rpc.put(path=path, payload=payload)
|
|
357
390
|
return self.handle_response(
|
|
358
391
|
resp,
|
|
359
|
-
"Error logged successfully
|
|
360
|
-
"Could not log the errors
|
|
392
|
+
"Error logged successfully",
|
|
393
|
+
"Could not log the errors",
|
|
361
394
|
)
|
|
362
395
|
except Exception as e:
|
|
363
|
-
logging.error("REST
|
|
364
|
-
|
|
365
|
-
return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
396
|
+
logging.error("REST API failed (update_action): %s", e)
|
|
397
|
+
return None, f"Failed via REST: {e}", "REST API failed"
|
|
366
398
|
|
|
367
399
|
|
|
368
400
|
@log_errors(log_error=True)
|
|
369
401
|
def assign_jobs(self, is_gpu):
|
|
370
|
-
"""Assign jobs to the instance using
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
402
|
+
"""Assign jobs to the instance using REST API.
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
is_gpu: Boolean or any value indicating if this is a GPU instance.
|
|
406
|
+
Will be converted to proper boolean.
|
|
407
|
+
|
|
408
|
+
Returns:
|
|
409
|
+
Tuple of (data, error, message) from API response
|
|
410
|
+
"""
|
|
411
|
+
# Convert is_gpu to proper boolean
|
|
412
|
+
is_gpu_bool = bool(is_gpu)
|
|
413
|
+
logging.info("Assigning jobs for instance %s (GPU: %s)", self.instance_id, is_gpu_bool)
|
|
414
|
+
|
|
415
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
416
|
+
# api = "assign_jobs"
|
|
417
|
+
# payload = {
|
|
418
|
+
# "instanceID": self.instance_id,
|
|
419
|
+
# "isGPUInstance": is_gpu_bool,
|
|
420
|
+
# }
|
|
421
|
+
|
|
422
|
+
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
423
|
+
# api=api,
|
|
424
|
+
# payload=payload,
|
|
425
|
+
# request_topic=self.kafka_config["api_request_topic"],
|
|
426
|
+
# response_topic=self.kafka_config["api_response_topic"],
|
|
427
|
+
# timeout=60
|
|
428
|
+
# )
|
|
429
|
+
|
|
430
|
+
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
431
|
+
# if kafka_response_received:
|
|
432
|
+
# if error:
|
|
433
|
+
# logging.warning("Kafka returned error for assign_jobs: %s. Falling back to REST API.", error)
|
|
434
|
+
# else:
|
|
435
|
+
# return data, error, message
|
|
436
|
+
|
|
437
|
+
# Using REST API directly
|
|
390
438
|
try:
|
|
391
|
-
|
|
439
|
+
# Convert boolean to lowercase string for API endpoint
|
|
440
|
+
is_gpu_str = str(is_gpu_bool).lower()
|
|
441
|
+
path = f"/v1/actions/assign_jobs/{is_gpu_str}/{self.instance_id}"
|
|
392
442
|
resp = self.rpc.get(path=path)
|
|
393
443
|
return self.handle_response(
|
|
394
444
|
resp,
|
|
395
|
-
"Pinged successfully
|
|
396
|
-
"Could not ping the scaling jobs
|
|
445
|
+
"Pinged successfully",
|
|
446
|
+
"Could not ping the scaling jobs",
|
|
397
447
|
)
|
|
398
448
|
except Exception as e:
|
|
399
|
-
logging.error("REST
|
|
400
|
-
|
|
401
|
-
return None, f"Failed via Kafka and REST: {e}", "Cached for retry"
|
|
449
|
+
logging.error("REST API failed (assign_jobs): %s", e)
|
|
450
|
+
return None, f"Failed via REST: {e}", "REST API failed"
|
|
402
451
|
|
|
403
452
|
|
|
404
453
|
@log_errors(log_error=True)
|
|
@@ -409,7 +458,17 @@ class Scaling:
|
|
|
409
458
|
availableMemory=0,
|
|
410
459
|
availableGPUMemory=0,
|
|
411
460
|
):
|
|
412
|
-
"""Update available resources for the instance using
|
|
461
|
+
"""Update available resources for the instance using REST API.
|
|
462
|
+
|
|
463
|
+
Args:
|
|
464
|
+
availableCPU: Available CPU resources
|
|
465
|
+
availableGPU: Available GPU resources
|
|
466
|
+
availableMemory: Available memory
|
|
467
|
+
availableGPUMemory: Available GPU memory
|
|
468
|
+
|
|
469
|
+
Returns:
|
|
470
|
+
Tuple of (data, error, message) from API response
|
|
471
|
+
"""
|
|
413
472
|
logging.info("Updating available resources for instance %s", self.instance_id)
|
|
414
473
|
payload = {
|
|
415
474
|
"instance_id": self.instance_id,
|
|
@@ -418,63 +477,84 @@ class Scaling:
|
|
|
418
477
|
"availableGPUMemory": availableGPUMemory,
|
|
419
478
|
"availableGPU": availableGPU,
|
|
420
479
|
}
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
data, error, message, kafka_response_received = self._send_kafka_request(
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
)
|
|
432
|
-
|
|
433
|
-
if
|
|
434
|
-
|
|
480
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
481
|
+
# api = "update_available_resources"
|
|
482
|
+
# correlation_id = str(uuid.uuid4())
|
|
483
|
+
|
|
484
|
+
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
485
|
+
# api=api,
|
|
486
|
+
# payload=payload,
|
|
487
|
+
# request_topic=self.kafka_config["scaling_request_topic"],
|
|
488
|
+
# response_topic=self.kafka_config["scaling_response_topic"],
|
|
489
|
+
# timeout=60
|
|
490
|
+
# )
|
|
491
|
+
|
|
492
|
+
# # Check if Kafka response was received
|
|
493
|
+
# # Response format: {'correlationId': 'id', 'status': 'success'/'error', 'data': ..., 'error': 'error message'}
|
|
494
|
+
# if kafka_response_received:
|
|
495
|
+
# if error:
|
|
496
|
+
# logging.warning("Kafka returned error for update_available_resources: %s. Falling back to REST API.", error)
|
|
497
|
+
# else:
|
|
498
|
+
# return data, error, message
|
|
499
|
+
|
|
500
|
+
# Using REST API directly
|
|
435
501
|
try:
|
|
436
502
|
path = f"/v1/scaling/update_available_resources/{self.instance_id}"
|
|
437
503
|
resp = self.rpc.put(path=path, payload=payload)
|
|
438
504
|
return self.handle_response(
|
|
439
505
|
resp,
|
|
440
|
-
"Resources updated successfully
|
|
441
|
-
"Could not update the resources
|
|
506
|
+
"Resources updated successfully",
|
|
507
|
+
"Could not update the resources",
|
|
442
508
|
)
|
|
443
509
|
except Exception as e:
|
|
444
|
-
logging.error("REST
|
|
445
|
-
|
|
446
|
-
return None, f"Failed to update available resources via Kafka and REST: {e}", "Cached for retry"
|
|
510
|
+
logging.error("REST API failed (update_available_resources): %s", e)
|
|
511
|
+
return None, f"Failed to update available resources via REST: {e}", "REST API failed"
|
|
447
512
|
|
|
448
513
|
@log_errors(log_error=True)
|
|
449
514
|
def update_action_docker_logs(self, action_record_id, log_content):
|
|
450
|
-
"""Update docker logs for an action using
|
|
515
|
+
"""Update docker logs for an action using REST API.
|
|
516
|
+
|
|
517
|
+
Args:
|
|
518
|
+
action_record_id: ID of the action record
|
|
519
|
+
log_content: Content of the logs to update
|
|
520
|
+
|
|
521
|
+
Returns:
|
|
522
|
+
Tuple of (data, error, message) from API response
|
|
523
|
+
"""
|
|
451
524
|
logging.info("Updating docker logs for action %s", action_record_id)
|
|
452
|
-
|
|
525
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
526
|
+
# api = "update_action_docker_logs"
|
|
453
527
|
payload = {
|
|
454
528
|
"actionRecordId": action_record_id,
|
|
455
529
|
"logContent": log_content,
|
|
456
|
-
|
|
457
530
|
}
|
|
458
|
-
data, error, message, kafka_response_received = self._send_kafka_request(
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
if
|
|
467
|
-
|
|
531
|
+
# data, error, message, kafka_response_received = self._send_kafka_request(
|
|
532
|
+
# api=api,
|
|
533
|
+
# payload=payload,
|
|
534
|
+
# request_topic=self.kafka_config["api_request_topic"],
|
|
535
|
+
# response_topic=self.kafka_config["api_response_topic"],
|
|
536
|
+
# timeout=60
|
|
537
|
+
# )
|
|
538
|
+
|
|
539
|
+
# # Check if Kafka response was received and if it's an error, log and fallback to REST API
|
|
540
|
+
# if kafka_response_received:
|
|
541
|
+
# if error:
|
|
542
|
+
# logging.warning("Kafka returned error for update_action_docker_logs: %s. Falling back to REST API.", error)
|
|
543
|
+
# else:
|
|
544
|
+
# return data, error, message
|
|
545
|
+
|
|
546
|
+
# Using REST API directly
|
|
468
547
|
try:
|
|
469
548
|
path = "/v1/actions/update_action_docker_logs"
|
|
470
549
|
resp = self.rpc.put(path=path, payload=payload)
|
|
471
550
|
return self.handle_response(
|
|
472
551
|
resp,
|
|
473
|
-
"Docker logs updated successfully
|
|
474
|
-
"Could not update the docker logs
|
|
552
|
+
"Docker logs updated successfully",
|
|
553
|
+
"Could not update the docker logs",
|
|
475
554
|
)
|
|
476
555
|
except Exception as e:
|
|
477
|
-
logging.error("REST
|
|
556
|
+
logging.error("REST API failed (update_action_docker_logs): %s", e)
|
|
557
|
+
return None, f"Failed via REST: {e}", "REST API failed"
|
|
478
558
|
|
|
479
559
|
|
|
480
560
|
@log_errors(log_error=True)
|
|
@@ -533,7 +613,8 @@ class Scaling:
|
|
|
533
613
|
if port in self.used_ports:
|
|
534
614
|
continue
|
|
535
615
|
self.used_ports.add(port)
|
|
536
|
-
|
|
616
|
+
ports_value = ",".join(str(p) for p in self.used_ports)
|
|
617
|
+
os.environ["USED_PORTS"] = str(ports_value)
|
|
537
618
|
logging.info("Found available port: %s", port)
|
|
538
619
|
return port
|
|
539
620
|
logging.error(
|
|
@@ -790,98 +871,99 @@ class Scaling:
|
|
|
790
871
|
"Could not fetch internal keys",
|
|
791
872
|
)
|
|
792
873
|
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
874
|
+
# KAFKA TEMPORARILY DISABLED - Using REST API directly
|
|
875
|
+
# @log_errors(log_error=True)
|
|
876
|
+
# def handle_kafka_response(self, msg, success_message, error_message):
|
|
877
|
+
# """
|
|
878
|
+
# Helper to process Kafka response messages in a consistent way.
|
|
879
|
+
# """
|
|
880
|
+
# if msg.get("status") == "success":
|
|
881
|
+
# data = msg.get("data")
|
|
882
|
+
# error = None
|
|
883
|
+
# message = success_message
|
|
884
|
+
# logging.info(message)
|
|
885
|
+
# else:
|
|
886
|
+
# data = msg.get("data")
|
|
887
|
+
# error = msg.get("error", "Unknown error")
|
|
888
|
+
# message = error_message
|
|
889
|
+
# logging.error("%s: %s", message, error)
|
|
890
|
+
# return data, error, message
|
|
891
|
+
|
|
892
|
+
# def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=60):
|
|
893
|
+
# """
|
|
894
|
+
# Helper to send a request to Kafka and wait for a response.
|
|
895
|
+
# Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
|
|
896
|
+
# """
|
|
897
|
+
# correlation_id = str(uuid.uuid4())
|
|
898
|
+
# request_message = {
|
|
899
|
+
# "correlationId": correlation_id,
|
|
900
|
+
# "api": api,
|
|
901
|
+
# "payload": payload,
|
|
902
|
+
# }
|
|
903
|
+
|
|
904
|
+
# consumer = KafkaConsumer(
|
|
905
|
+
# response_topic,
|
|
906
|
+
# bootstrap_servers=self.kafka_config["bootstrap_servers"],
|
|
907
|
+
# group_id=None,
|
|
908
|
+
# value_deserializer=lambda m: json.loads(m.decode("utf-8")),
|
|
909
|
+
# auto_offset_reset='latest',
|
|
910
|
+
# enable_auto_commit=True,
|
|
911
|
+
# )
|
|
912
|
+
|
|
913
|
+
# try:
|
|
914
|
+
# if hasattr(self.session.rpc, 'AUTH_TOKEN'):
|
|
915
|
+
# self.session.rpc.AUTH_TOKEN.set_bearer_token()
|
|
916
|
+
# auth_token = self.session.rpc.AUTH_TOKEN.bearer_token
|
|
917
|
+
# auth_token = auth_token.replace("Bearer ", "")
|
|
918
|
+
# headers = [("Authorization", bytes(f"{auth_token}", "utf-8"))]
|
|
919
|
+
# else:
|
|
920
|
+
# headers = None
|
|
921
|
+
# self.kafka_producer.send(request_topic, request_message, headers=headers)
|
|
922
|
+
# # self.kafka_producer.flush()
|
|
923
|
+
# logging.info("Sent %s request to Kafka topic %s", api, request_topic)
|
|
924
|
+
# except Exception as e:
|
|
925
|
+
# logging.error("Kafka producer error: %s", e)
|
|
926
|
+
# return None, f"Kafka producer error: {e}", "Kafka send failed", False
|
|
927
|
+
# try:
|
|
928
|
+
# start = time.time()
|
|
929
|
+
# while time.time() - start < timeout:
|
|
930
|
+
# # Poll for messages with a short timeout to avoid blocking forever
|
|
931
|
+
# message_batch = consumer.poll(timeout_ms=1000)
|
|
932
|
+
# if message_batch:
|
|
933
|
+
# for topic_partition, messages in message_batch.items():
|
|
934
|
+
# for message in messages:
|
|
935
|
+
# print("trying to fetch message")
|
|
936
|
+
# msg = message.value
|
|
937
|
+
# if msg.get("correlationId") == correlation_id:
|
|
938
|
+
# consumer.close()
|
|
939
|
+
# # Always treat a received response as final, even if error
|
|
940
|
+
# return self.handle_kafka_response(
|
|
941
|
+
# msg,
|
|
942
|
+
# f"Fetched via Kafka for {api}",
|
|
943
|
+
# f"Kafka error response for {api}"
|
|
944
|
+
# ) + (True,)
|
|
945
|
+
# else:
|
|
946
|
+
# print(f"No messages received, waiting... ({time.time() - start:.1f}s/{timeout}s)")
|
|
947
|
+
#
|
|
948
|
+
# consumer.close()
|
|
949
|
+
# logging.warning("Kafka response timeout for %s after %d seconds", api, timeout)
|
|
950
|
+
# return None, "Kafka response timeout", "Kafka response timeout", False
|
|
951
|
+
# except Exception as e:
|
|
952
|
+
# logging.error("Kafka consumer error: %s", e)
|
|
953
|
+
# return None, f"Kafka consumer error: {e}", "Kafka consumer error", False
|
|
954
|
+
|
|
955
|
+
# def _cache_failed_request(self, api, payload):
|
|
956
|
+
# """Cache the failed request for retry. Here, we use a simple file cache as a placeholder."""
|
|
957
|
+
# try:
|
|
958
|
+
# cache_file = os.path.join(os.path.dirname(__file__), 'request_cache.json')
|
|
959
|
+
# if os.path.exists(cache_file):
|
|
960
|
+
# with open(cache_file, 'r') as f:
|
|
961
|
+
# cache = json.load(f)
|
|
962
|
+
# else:
|
|
963
|
+
# cache = []
|
|
964
|
+
# cache.append({"api": api, "payload": payload, "ts": time.time()})
|
|
965
|
+
# with open(cache_file, 'w') as f:
|
|
966
|
+
# json.dump(cache, f)
|
|
967
|
+
# logging.info("Cached failed request for api %s", api)
|
|
968
|
+
# except Exception as e:
|
|
969
|
+
# logging.error("Failed to cache request: %s", e)
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
matrice_compute/__init__.py,sha256=
|
|
2
|
-
matrice_compute/action_instance.py,sha256=
|
|
1
|
+
matrice_compute/__init__.py,sha256=ZzQcFsT005VCgq9VZUh565f4upOooEb_FwZ6RgweNZs,597
|
|
2
|
+
matrice_compute/action_instance.py,sha256=6IVMNODznEagFlwifjP1neO6OK0H46vuvMYDw02gYF0,58985
|
|
3
3
|
matrice_compute/actions_manager.py,sha256=5U-xM6tl_Z6x96bi-c7AJM9ru80LqTN8f5Oce8dAu_A,7780
|
|
4
4
|
matrice_compute/actions_scaledown_manager.py,sha256=pJ0nduNwHWZ10GnqJNx0Ok7cVWabQ_M8E2Vb9pH3A_k,2002
|
|
5
|
-
matrice_compute/instance_manager.py,sha256=
|
|
5
|
+
matrice_compute/instance_manager.py,sha256=8USyX09ZxLvnVNIrjRogbyUeMCfgWnasuRqYkkVF4tQ,10146
|
|
6
6
|
matrice_compute/instance_utils.py,sha256=tIFVUi8HJPy4GY-jtfVx2zIgmXNta7s3jCIRzBga1hI,21977
|
|
7
7
|
matrice_compute/prechecks.py,sha256=W9YmNF3RcLhOf4U8WBlExvFqDw1aGWSNTlJtA73lbDQ,17196
|
|
8
8
|
matrice_compute/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
matrice_compute/resources_tracker.py,sha256=My26LPglDHcQcTkxxiXwpfdqkpEAt3clrqJ-k1fAl1M,17878
|
|
10
|
-
matrice_compute/scaling.py,sha256=
|
|
10
|
+
matrice_compute/scaling.py,sha256=hlPpEW8uggMKHW9kwu71obOnbNXhoqRlkmux4Fc3OP0,35202
|
|
11
11
|
matrice_compute/shutdown_manager.py,sha256=0MYV_AqygqR9NEntYf7atUC-PbWXyNkm1f-8c2aizgA,13234
|
|
12
12
|
matrice_compute/task_utils.py,sha256=ML9uTrYQiWgEMJitYxoGlVOa9KUXNKV_WqnousOTK6k,2762
|
|
13
|
-
matrice_compute-0.1.
|
|
14
|
-
matrice_compute-0.1.
|
|
15
|
-
matrice_compute-0.1.
|
|
16
|
-
matrice_compute-0.1.
|
|
17
|
-
matrice_compute-0.1.
|
|
13
|
+
matrice_compute-0.1.13.dist-info/licenses/LICENSE.txt,sha256=_uQUZpgO0mRYL5-fPoEvLSbNnLPv6OmbeEDCHXhK6Qc,1066
|
|
14
|
+
matrice_compute-0.1.13.dist-info/METADATA,sha256=aX4hxZ2ll6w9miiYJ9Ed-FZtEVUEvwNb6vUplVYNm0w,1038
|
|
15
|
+
matrice_compute-0.1.13.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
16
|
+
matrice_compute-0.1.13.dist-info/top_level.txt,sha256=63Plr3L1GzBUWZO5JZaFkiv8IcB10xUPU-9w3i6ptvE,16
|
|
17
|
+
matrice_compute-0.1.13.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|