matrice-compute 0.1.1__tar.gz → 0.1.12__tar.gz
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/PKG-INFO +1 -1
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/PKG-INFO +1 -1
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/__init__.py +1 -1
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/action_instance.py +13 -2
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/actions_manager.py +1 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/instance_manager.py +8 -8
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/scaling.py +28 -21
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/LICENSE.txt +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/README.md +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/SOURCES.txt +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/dependency_links.txt +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/not-zip-safe +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/top_level.txt +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/pyproject.toml +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/setup.cfg +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/setup.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/actions_scaledown_manager.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/instance_utils.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/prechecks.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/py.typed +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/resources_tracker.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/shutdown_manager.py +0 -0
- {matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/task_utils.py +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
|
|
4
4
|
from matrice_common.utils import dependencies_check
|
|
5
5
|
|
|
6
|
-
dependencies_check(["docker", "psutil", "cryptography", "notebook", "aiohttp"])
|
|
6
|
+
dependencies_check(["docker", "psutil", "cryptography", "notebook", "aiohttp", "kafka-python"])
|
|
7
7
|
from matrice_compute.instance_manager import InstanceManager # noqa: E402
|
|
8
8
|
|
|
9
9
|
__all__ = ["InstanceManager"]
|
|
@@ -348,9 +348,14 @@ class ActionInstance:
|
|
|
348
348
|
"-v /var/run/docker.sock:/var/run/docker.sock" if mount_docker_sock else "",
|
|
349
349
|
]
|
|
350
350
|
pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
|
|
351
|
+
|
|
351
352
|
pkgs = ["matrice_common", "matrice"]
|
|
352
353
|
pkgs.extend(extra_pkgs)
|
|
353
|
-
|
|
354
|
+
if env == 'dev':
|
|
355
|
+
pkgs = [pkg + ">=1.0.0" for pkg in pkgs]
|
|
356
|
+
pip_install_matrice = f"pip install --pre --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
|
|
357
|
+
else:
|
|
358
|
+
pip_install_matrice = f"pip install --upgrade --force-reinstall --index-url {pypi_index} {' '.join(pkgs)}"
|
|
354
359
|
pip_install_requirements = (
|
|
355
360
|
"if [ -f requirements.txt ]; then pip install -r requirements.txt; fi "
|
|
356
361
|
)
|
|
@@ -1490,6 +1495,12 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1490
1495
|
# Build the docker command directly to match user's pattern
|
|
1491
1496
|
pypi_index = f"https://{'test.' if env != 'prod' else ''}pypi.org/simple/"
|
|
1492
1497
|
|
|
1498
|
+
if env == 'dev':
|
|
1499
|
+
pypi_index = f"https://test.pypi.org/simple/ --pre"
|
|
1500
|
+
pkgs = f"matrice_common>=1.0.0 matrice>=1.0.0"
|
|
1501
|
+
else:
|
|
1502
|
+
pkgs = f"matrice_common matrice"
|
|
1503
|
+
|
|
1493
1504
|
cmd = (
|
|
1494
1505
|
f"docker run -p {host_port}:{container_port} "
|
|
1495
1506
|
f"{env_args} "
|
|
@@ -1499,7 +1510,7 @@ def kafka_setup_execute(self: ActionInstance):
|
|
|
1499
1510
|
f"source venv/bin/activate && "
|
|
1500
1511
|
f"/opt/kafka/bin/startup.sh & "
|
|
1501
1512
|
f"if [ -f requirements.txt ]; then venv/bin/python3 -m pip install -r requirements.txt; fi && "
|
|
1502
|
-
f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index}
|
|
1513
|
+
f"venv/bin/python3 -m pip install --upgrade --force-reinstall --index-url {pypi_index} {pkgs} && "
|
|
1503
1514
|
f"sleep 20 && "
|
|
1504
1515
|
f'venv/bin/python3 main.py {self.action_record_id} {host_port}"'
|
|
1505
1516
|
)
|
|
@@ -196,6 +196,7 @@ class ActionsManager:
|
|
|
196
196
|
def start_actions_manager(self) -> None:
|
|
197
197
|
"""Start the actions manager main loop."""
|
|
198
198
|
while True:
|
|
199
|
+
waiting_time = self.poll_interval # Default wait time
|
|
199
200
|
try:
|
|
200
201
|
mem_usage = get_mem_usage()
|
|
201
202
|
logging.info("Memory usage: %d", mem_usage)
|
|
@@ -91,7 +91,7 @@ class InstanceManager:
|
|
|
91
91
|
self.actions_resources_tracker = ActionsResourcesTracker(self.scaling)
|
|
92
92
|
logging.info("InstanceManager initialized with actions resources tracker")
|
|
93
93
|
self.poll_interval = 10
|
|
94
|
-
|
|
94
|
+
# Note: encryption_key is set in _setup_env_credentials
|
|
95
95
|
logging.info("InstanceManager initialized.")
|
|
96
96
|
|
|
97
97
|
@log_errors(default_return=None, raise_exception=True, log_error=True)
|
|
@@ -220,13 +220,13 @@ class InstanceManager:
|
|
|
220
220
|
"Error in shutdown_manager handle_shutdown: %s",
|
|
221
221
|
str(exc),
|
|
222
222
|
)
|
|
223
|
-
try:
|
|
224
|
-
|
|
225
|
-
except Exception as exc:
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
223
|
+
# try:
|
|
224
|
+
# self.scale_down_manager.auto_scaledown_actions()
|
|
225
|
+
# except Exception as exc:
|
|
226
|
+
# logging.error(
|
|
227
|
+
# "Error in scale_down_manager auto_scaledown_actions: %s",
|
|
228
|
+
# str(exc),
|
|
229
|
+
# )
|
|
230
230
|
try:
|
|
231
231
|
self.machine_resources_tracker.update_available_resources()
|
|
232
232
|
except Exception as exc:
|
|
@@ -99,7 +99,7 @@ class Scaling:
|
|
|
99
99
|
"Getting downscaled ids for instance %s",
|
|
100
100
|
self.instance_id,
|
|
101
101
|
)
|
|
102
|
-
path = f"/v1/
|
|
102
|
+
path = f"/v1/compute/down_scaled_ids/{self.instance_id}"
|
|
103
103
|
resp = self.rpc.get(path=path)
|
|
104
104
|
return self.handle_response(
|
|
105
105
|
resp,
|
|
@@ -295,7 +295,7 @@ class Scaling:
|
|
|
295
295
|
payload=payload,
|
|
296
296
|
request_topic=self.kafka_config["api_request_topic"],
|
|
297
297
|
response_topic=self.kafka_config["api_response_topic"],
|
|
298
|
-
timeout=
|
|
298
|
+
timeout=60
|
|
299
299
|
)
|
|
300
300
|
if kafka_response_received:
|
|
301
301
|
return data, error, message
|
|
@@ -347,7 +347,7 @@ class Scaling:
|
|
|
347
347
|
payload=payload,
|
|
348
348
|
request_topic=self.kafka_config["api_request_topic"],
|
|
349
349
|
response_topic=self.kafka_config["api_response_topic"],
|
|
350
|
-
timeout=
|
|
350
|
+
timeout=60
|
|
351
351
|
)
|
|
352
352
|
if kafka_response_received:
|
|
353
353
|
return data, error, message
|
|
@@ -380,7 +380,7 @@ class Scaling:
|
|
|
380
380
|
payload=payload,
|
|
381
381
|
request_topic=self.kafka_config["api_request_topic"],
|
|
382
382
|
response_topic=self.kafka_config["api_response_topic"],
|
|
383
|
-
timeout=
|
|
383
|
+
timeout=60
|
|
384
384
|
)
|
|
385
385
|
|
|
386
386
|
if kafka_response_received:
|
|
@@ -427,7 +427,7 @@ class Scaling:
|
|
|
427
427
|
payload=payload,
|
|
428
428
|
request_topic=self.kafka_config["scaling_request_topic"],
|
|
429
429
|
response_topic=self.kafka_config["scaling_response_topic"],
|
|
430
|
-
timeout=
|
|
430
|
+
timeout=60
|
|
431
431
|
)
|
|
432
432
|
|
|
433
433
|
if kafka_response_received:
|
|
@@ -460,7 +460,7 @@ class Scaling:
|
|
|
460
460
|
payload=payload,
|
|
461
461
|
request_topic=self.kafka_config["api_request_topic"],
|
|
462
462
|
response_topic=self.kafka_config["api_response_topic"],
|
|
463
|
-
timeout=
|
|
463
|
+
timeout=60
|
|
464
464
|
)
|
|
465
465
|
|
|
466
466
|
if kafka_response_received:
|
|
@@ -782,7 +782,7 @@ class Scaling:
|
|
|
782
782
|
Returns:
|
|
783
783
|
Tuple of (data, error, message) from API response
|
|
784
784
|
"""
|
|
785
|
-
path = f"/v1/
|
|
785
|
+
path = f"/v1/actions/get_internal_api_key/{action_id}/{self.instance_id}"
|
|
786
786
|
resp = self.rpc.get(path=path)
|
|
787
787
|
return self.handle_response(
|
|
788
788
|
resp,
|
|
@@ -807,7 +807,7 @@ class Scaling:
|
|
|
807
807
|
logging.error("%s: %s", message, error)
|
|
808
808
|
return data, error, message
|
|
809
809
|
|
|
810
|
-
def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=
|
|
810
|
+
def _send_kafka_request(self, api, payload, request_topic, response_topic, timeout=60):
|
|
811
811
|
"""
|
|
812
812
|
Helper to send a request to Kafka and wait for a response.
|
|
813
813
|
Returns (data, error, message, kafka_response_received) where kafka_response_received is True if a response was received (even if error), False if transport error/timeout.
|
|
@@ -844,20 +844,27 @@ class Scaling:
|
|
|
844
844
|
return None, f"Kafka producer error: {e}", "Kafka send failed", False
|
|
845
845
|
try:
|
|
846
846
|
start = time.time()
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
847
|
+
while time.time() - start < timeout:
|
|
848
|
+
# Poll for messages with a short timeout to avoid blocking forever
|
|
849
|
+
message_batch = consumer.poll(timeout_ms=1000)
|
|
850
|
+
if message_batch:
|
|
851
|
+
for topic_partition, messages in message_batch.items():
|
|
852
|
+
for message in messages:
|
|
853
|
+
print("trying to fetch message")
|
|
854
|
+
msg = message.value
|
|
855
|
+
if msg.get("correlationId") == correlation_id:
|
|
856
|
+
consumer.close()
|
|
857
|
+
# Always treat a received response as final, even if error
|
|
858
|
+
return self.handle_kafka_response(
|
|
859
|
+
msg,
|
|
860
|
+
f"Fetched via Kafka for {api}",
|
|
861
|
+
f"Kafka error response for {api}"
|
|
862
|
+
) + (True,)
|
|
863
|
+
else:
|
|
864
|
+
print(f"No messages received, waiting... ({time.time() - start:.1f}s/{timeout}s)")
|
|
865
|
+
|
|
859
866
|
consumer.close()
|
|
860
|
-
logging.warning("Kafka response timeout for %s", api)
|
|
867
|
+
logging.warning("Kafka response timeout for %s after %d seconds", api, timeout)
|
|
861
868
|
return None, "Kafka response timeout", "Kafka response timeout", False
|
|
862
869
|
except Exception as e:
|
|
863
870
|
logging.error("Kafka consumer error: %s", e)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.1 → matrice_compute-0.1.12}/matrice_compute.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{matrice_compute-0.1.1 → matrice_compute-0.1.12}/src/matrice_compute/actions_scaledown_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|