kubetorch 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. kubetorch/__init__.py +59 -0
  2. kubetorch/cli.py +1939 -0
  3. kubetorch/cli_utils.py +967 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +269 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +159 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +140 -0
  30. kubetorch/resources/callables/module.py +1315 -0
  31. kubetorch/resources/callables/utils.py +203 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +253 -0
  34. kubetorch/resources/compute/compute.py +2414 -0
  35. kubetorch/resources/compute/decorators.py +137 -0
  36. kubetorch/resources/compute/utils.py +1026 -0
  37. kubetorch/resources/compute/websocket.py +135 -0
  38. kubetorch/resources/images/__init__.py +1 -0
  39. kubetorch/resources/images/image.py +412 -0
  40. kubetorch/resources/images/images.py +64 -0
  41. kubetorch/resources/secrets/__init__.py +2 -0
  42. kubetorch/resources/secrets/kubernetes_secrets_client.py +377 -0
  43. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  44. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  45. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  46. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  47. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  48. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  49. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  50. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  51. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  52. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  53. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  54. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  55. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  56. kubetorch/resources/secrets/provider_secrets/providers.py +92 -0
  57. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  58. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  59. kubetorch/resources/secrets/secret.py +224 -0
  60. kubetorch/resources/secrets/secret_factory.py +64 -0
  61. kubetorch/resources/secrets/utils.py +222 -0
  62. kubetorch/resources/volumes/__init__.py +0 -0
  63. kubetorch/resources/volumes/volume.py +340 -0
  64. kubetorch/servers/__init__.py +0 -0
  65. kubetorch/servers/http/__init__.py +0 -0
  66. kubetorch/servers/http/distributed_utils.py +2968 -0
  67. kubetorch/servers/http/http_client.py +802 -0
  68. kubetorch/servers/http/http_server.py +1622 -0
  69. kubetorch/servers/http/server_metrics.py +255 -0
  70. kubetorch/servers/http/utils.py +722 -0
  71. kubetorch/serving/__init__.py +0 -0
  72. kubetorch/serving/autoscaling.py +153 -0
  73. kubetorch/serving/base_service_manager.py +344 -0
  74. kubetorch/serving/constants.py +77 -0
  75. kubetorch/serving/deployment_service_manager.py +431 -0
  76. kubetorch/serving/knative_service_manager.py +487 -0
  77. kubetorch/serving/raycluster_service_manager.py +526 -0
  78. kubetorch/serving/service_manager.py +18 -0
  79. kubetorch/serving/templates/deployment_template.yaml +17 -0
  80. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  81. kubetorch/serving/templates/kt_setup_template.sh.j2 +91 -0
  82. kubetorch/serving/templates/pod_template.yaml +198 -0
  83. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  84. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  85. kubetorch/serving/templates/service_template.yaml +21 -0
  86. kubetorch/serving/templates/workerset_template.yaml +36 -0
  87. kubetorch/serving/utils.py +344 -0
  88. kubetorch/utils.py +263 -0
  89. kubetorch-0.2.5.dist-info/METADATA +75 -0
  90. kubetorch-0.2.5.dist-info/RECORD +92 -0
  91. kubetorch-0.2.5.dist-info/WHEEL +4 -0
  92. kubetorch-0.2.5.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,203 @@
1
+ import importlib.metadata as metadata
2
+ import inspect
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Callable, Optional, Type, Union
7
+
8
+ from kubetorch.logger import get_logger
9
+
10
+ logger = get_logger(__name__)
11
+
12
+ SHELL_COMMANDS = {"ssh", "run_bash", "rsync"}
13
+
14
+
15
class NotebookError(Exception):
    """Signals that a function defined in a notebook environment could not be properly handled."""
19
+
20
+
21
def prepare_notebook_fn(fn_pointers, name):
    """Write a notebook-defined function out to a dedicated ``.py`` file so it can be imported on the cluster.

    The file is created in the current working directory and is named ``{name}_fn.py`` when ``name``
    is truthy, otherwise ``sent_fn.py``.

    Args:
        fn_pointers: Indexable of pointers; index 0 is returned unchanged and index 2 is the name of the
            function to look up.
        name: Optional name used to build the output file name.

    Returns:
        tuple: ``(fn_pointers[0], <stem of the written file>, fn_pointers[2])``.

    Raises:
        NotebookError: If the function cannot be found in the inspected frame or its source cannot be
            retrieved. The underlying exception, if any, is chained as ``__cause__``.
    """
    module_path = Path.cwd() / (f"{name}_fn.py" if name else "sent_fn.py")
    logger.info(
        f"Function is defined in a notebook, writing it out to {str(module_path)} "
        f"to make it importable. Please make sure the function does not rely on any local variables, "
        f"including imports (which should be moved inside the function body). "
        f"This restriction does not apply to functions defined in normal Python files."
    )
    try:
        # Try to pull the frame variable for the function by name.
        user_fn_name = fn_pointers[2]
        # Two frames up from this call; presumably the user's notebook cell -- depends on the call chain.
        frame = inspect.stack()[2].frame
        user_fn = frame.f_globals.get(user_fn_name) or frame.f_locals.get(user_fn_name)
        source = inspect.getsource(user_fn).strip() if user_fn else None
        if source is None:
            raise NotebookError(
                f"Failed to load source code for function {user_fn_name}. "
                f"Please ensure the function is defined in the notebook and not relying on local variables."
            )
    except NotebookError:
        # Already the right type with the right message; no need to wrap it again.
        raise
    except Exception as e:
        # Keep the original exception attached so the root cause stays visible in tracebacks.
        raise NotebookError(str(e)) from e

    # Python reads source files as UTF-8 by default, so write UTF-8 regardless of the locale encoding.
    with module_path.open("w", encoding="utf-8") as f:
        f.write(source)

    return fn_pointers[0], module_path.stem, fn_pointers[2]
49
+
50
+
51
def extract_pointers(raw_cls_or_fn: Union[Type, Callable]):
    """Return the ``(root_path, module_name, cls_or_fn_name)`` triple needed to import the object on the server.

    Raises:
        TypeError: If the argument is neither a type nor a callable.
    """
    is_type = isinstance(raw_cls_or_fn, Type)
    if not is_type and not isinstance(raw_cls_or_fn, Callable):
        raise TypeError(f"Expected Type or Callable but received {type(raw_cls_or_fn)}")

    pointers = _get_module_import_info(raw_cls_or_fn)
    return pointers
58
+
59
+
60
def _get_module_import_info(raw_cls_or_fn: Union[Type, Callable]):
    """Collect what another Python process needs in order to import the given class or function.

    Returns:
        tuple: ``(root_path, module_name, cls_or_fn_name)``.

    Raises:
        Exception: If the object's module belongs to a package but exactly one package directory
            containing the module file cannot be identified.
    """
    # Background on the import-related dunders used here: https://docs.python.org/3/reference/import.html
    py_module = inspect.getmodule(raw_cls_or_fn)

    # Resolved path of the defining module's file, or None when the module has no __file__.
    module_path = _extract_module_path(raw_cls_or_fn)

    # TODO better way of detecting if in a notebook or interactive Python env
    defined_interactively = not module_path or module_path.endswith("ipynb")
    if defined_interactively:
        # No usable file (interactive interpreter / notebook): fall back to the cwd and a placeholder
        # module name. This is a short-term hack.
        root_path = os.getcwd()
        module_name = "notebook"
        cls_or_fn_name = raw_cls_or_fn.__name__
    else:
        root_path = os.path.dirname(module_path)
        module_name = inspect.getmodulename(module_path)
        # TODO __qualname__ doesn't work when fn is aliased funnily, like torch.sum
        cls_or_fn_name = getattr(raw_cls_or_fn, "__qualname__", raw_cls_or_fn.__name__)

    package_name = getattr(py_module, "__package__", None)
    if not package_name:
        return root_path, module_name, cls_or_fn_name

    # Adapted from https://github.com/modal-labs/modal-client/blob/main/modal/_function_utils.py#L94
    module_path = os.path.abspath(py_module.__file__)
    base_dirs = []
    for raw_package_path in __import__(package_name).__path__:
        package_path = os.path.abspath(raw_package_path)
        # Keep only package directories that actually contain the module file.
        if os.path.commonpath((package_path, module_path)) == package_path:
            base_dirs.append(package_path)

    if len(base_dirs) != 1:
        raise Exception("Wasn't able to find the package directory!")

    root_path = os.path.dirname(base_dirs[0])
    module_name = py_module.__spec__.name
    return root_path, module_name, cls_or_fn_name
101
+
102
+
103
+ def _extract_module_path(raw_cls_or_fn: Union[Type, Callable]):
104
+ py_module = inspect.getmodule(raw_cls_or_fn)
105
+
106
+ # Need to resolve in case just filename is given
107
+ module_path = str(Path(inspect.getfile(py_module)).resolve()) if hasattr(py_module, "__file__") else None
108
+
109
+ return module_path
110
+
111
+
112
def locate_working_dir(start_dir=None):
    """
    Locate the working directory of the project.

    Walks upward from ``start_dir`` looking for a directory that contains one of the usual project
    marker files.

    Args:
        start_dir (str, optional): The directory to start searching from. Defaults to the current working directory.

    Returns:
        tuple: ``(working_dir, found)``. ``working_dir`` is the matching directory when ``found`` is
        ``True``, otherwise ``start_dir`` itself.
    """
    search_root = os.getcwd() if start_dir is None else start_dir

    # Anything that represents a Python package / project root.
    project_markers = [".git", "setup.py", "setup.cfg", "pyproject.toml", "requirements.txt"]

    project_dir = _find_directory_containing_any_file(search_root, project_markers, searched_dirs=set())
    if project_dir is None:
        return search_root, False
    return project_dir, True
137
+
138
+
139
+ def _find_directory_containing_any_file(dir_path, files, searched_dirs=None):
140
+ if Path(dir_path) == Path.home() or dir_path == Path("/"):
141
+ return None
142
+
143
+ if any(Path(dir_path, file).exists() for file in files):
144
+ return str(dir_path)
145
+
146
+ searched_dirs.add(dir_path)
147
+ parent_path = Path(dir_path).parent
148
+ if parent_path in searched_dirs:
149
+ return None
150
+ return _find_directory_containing_any_file(parent_path, files, searched_dirs=searched_dirs)
151
+
152
+
153
def get_local_install_path(package_name: str) -> Optional[str]:
    """Return the local filesystem path a package was installed from, if it was installed from a ``file://`` URL.

    Looks through the installed distributions for one whose name matches ``package_name``
    (case-insensitively) and reads the ``url`` field of its ``direct_url.json``.

    Args:
        package_name: Distribution name to look up.

    Returns:
        The URL with the ``file://`` prefix stripped, or ``None`` when no matching distribution
        has a ``direct_url.json`` pointing at a ``file://`` URL.
    """
    from importlib.metadata import distributions

    wanted = package_name.lower()
    for dist in distributions():
        # Broken installs may lack a Name entry; skip them instead of failing the whole lookup.
        dist_name = dist.metadata.get("Name")
        if not dist_name or dist_name.lower() != wanted:
            continue

        # Only read direct_url.json for the matching distribution rather than for every installed one.
        direct_url_json = dist.read_text("direct_url.json")
        if not direct_url_json:
            continue

        try:
            url = json.loads(direct_url_json).get("url", None)
        except json.JSONDecodeError:
            # Best effort: an unreadable direct_url.json is treated as "no local install path".
            continue

        if url and url.startswith("file://"):
            return url[len("file://") :]
    return None
167
+
168
+
169
def find_locally_installed_version(package_name: str) -> Optional[str]:
    """Return the installed version string of ``package_name``, or ``None`` if it is not installed."""
    try:
        installed_version = metadata.version(package_name)
    except metadata.PackageNotFoundError:
        return None
    return installed_version
174
+
175
+
176
def get_names_for_reload_fallbacks(name: str, prefixes: Optional[list[str]] = None):
    """Build the ordered list of candidate service names to try when reloading ``name``.

    Args:
        name: Base service name.
        prefixes: Explicit prefixes to try. When empty or omitted, the configured username and the
            validated current git branch are used (in that order), followed by the bare name.

    Returns:
        list[str]: Candidate names, each prefixed candidate passed through
        ``clean_and_validate_k8s_name``. The bare ``name`` is appended last only when no explicit
        prefixes were given and it is not already present.
    """
    from kubetorch.globals import config
    from kubetorch.servers.http.utils import clean_and_validate_k8s_name
    from kubetorch.utils import current_git_branch, validate_username

    current_prefix = config.username

    if prefixes:
        fallback_prefixes = prefixes
    else:
        # try reloading based on current username or current git branch (in that order)
        branch = current_git_branch()
        if branch:
            # Ensure that we use the truncated branch name that was used to create the service initially
            valid_branch = validate_username(branch)
            # Note: username/prefix takes precedence over branch (in the event they differ)
            fallback_prefixes = [v for v in (current_prefix, valid_branch) if v is not None]
        else:
            fallback_prefixes = [current_prefix] if current_prefix else []

    potential_names = [
        clean_and_validate_k8s_name(f"{prefix}-{name}", allow_full_length=True) for prefix in fallback_prefixes
    ]
    if not prefixes and name not in potential_names:
        # try loading the bare name (i.e. prod mode) last, but only if we're not looking for specific prefixes
        potential_names.append(name)

    return potential_names
File without changes
@@ -0,0 +1,253 @@
1
+ import os
2
+ import re
3
+ import signal
4
+ import sys
5
+ import threading
6
+ import time
7
+ from datetime import datetime, timezone
8
+ from typing import Dict
9
+
10
+ from kubetorch.globals import config
11
+ from kubetorch.logger import get_logger
12
+
13
+ from kubetorch.resources.callables.module import Module
14
+ from kubetorch.resources.compute.compute import Compute
15
+ from kubetorch.resources.compute.utils import ServiceTimeoutError
16
+ from kubetorch.servers.http.utils import is_running_in_kubernetes
17
+ from kubetorch.utils import get_kt_install_url
18
+
19
+ logger = get_logger(__name__)
20
+
21
+
22
class App(Module):
    """A :class:`Module` that runs a CLI command (as given to ``kt run``) on remote compute."""

    MODULE_TYPE = "app"

    def __init__(
        self,
        compute: Compute,
        cli_command: str,
        pointers: tuple,
        name: str = None,
        run_async: bool = False,
    ):
        """
        Initialize an App object for remote execution.

        .. note::

            To create an App, please use the factory method :func:`app` in conjunction with the `kt run` CLI command.

        Args:
            compute (Compute): Compute on which to run the app.
            cli_command (str): CLI command to run on the compute.
            pointers (tuple): A tuple containing references needed to locate the app file, of the format
                (current working directory, path of file relative to cwd, None)
            name (str, optional): Name to assign the app. If not provided, will be based on the name of the file in
                which the app was defined.
            run_async (bool, optional): Whether to run the app async. (Default: ``False``)
        """
        super().__init__(name=name, pointers=pointers)
        self.cli_command = cli_command
        self.pointers = pointers
        self.name = name or self.module_name
        self._compute = compute
        self._run_async = run_async
        self._remote_pointers = None

        self._http_client = None

    @property
    def module_name(self):
        """Relative path of the app file (``pointers[1]``) with its extension removed."""
        return os.path.splitext(self.pointers[1])[0]

    def from_name(self):
        """Reloading an app by name is not supported; always raises ``ValueError``."""
        raise ValueError("Reloading app is not supported.")

    def setup_signal_handlers(self):
        """Route SIGINT and SIGTERM to :meth:`handle_termination_signal`."""
        signal.signal(signal.SIGINT, self.handle_termination_signal)
        signal.signal(signal.SIGTERM, self.handle_termination_signal)

    def handle_termination_signal(self, signum, frame):
        """Log the received signal, print follow-up ``kt`` commands, and exit with status 0."""
        # ANSI escape codes: red foreground / reset.
        red = "\u001b[31m"
        reset = "\u001b[0m"

        logger.info(f"{red}Received {signal.Signals(signum).name}. Exiting parent process.{reset}")
        self._print_kt_cmds()
        sys.exit(0)

    def deploy(self):
        """
        Deploy the app to the compute specified by the app arguments.
        """
        self.compute.service_name = self.service_name

        install_url, use_editable = get_kt_install_url(self.compute.freeze)
        if not self.compute.freeze:
            # Code is only synced (and the deployment timestamped) when the compute is not frozen.
            deployment_timestamp = datetime.now(timezone.utc).isoformat()
            self._rsync_repo_and_image_patches(install_url, use_editable, init_args={})
        else:
            deployment_timestamp = None

        self.setup_signal_handlers()

        # Logs are streamed only for synchronous runs.
        stream_logs = not self._run_async
        self._launch_service(install_url, use_editable, deployment_timestamp, stream_logs)

    def _get_service_dockerfile(self, metadata_env_vars):
        """Extend the parent's image instructions with a ``CMD`` that runs the CLI command remotely.

        Occurrences of the app file's relative path in ``cli_command`` are replaced with its
        remote location (``remote_pointers[0]`` joined with ``remote_pointers[1]``).
        """
        image_instructions = super()._get_service_dockerfile(metadata_env_vars)

        remote_script = os.path.join(self.remote_pointers[0], self.remote_pointers[1])
        local_script = r"\b" + re.escape(self.remote_pointers[1]) + r"\b"
        # Use a callable replacement so the remote path is inserted literally; a plain string
        # replacement would have backslashes / group references in the path interpreted by ``re``.
        remote_cmd = re.sub(local_script, lambda _match: remote_script, self.cli_command)

        image_instructions += f"CMD {remote_cmd}\n"
        return image_instructions

    def _launch_service(
        self,
        install_url,
        use_editable,
        deployment_timestamp,
        stream_logs,
    ):
        """Launch the service via the parent implementation, in a background thread when running async.

        If the compute was already up before launching, a reload is triggered through
        :meth:`_update_service`.

        Raises:
            ServiceTimeoutError: In async mode, when the compute was not previously up and is still
                not up after roughly 60 seconds of polling.
        """
        # Captured before launching: an already-running service needs an explicit reload call.
        trigger_reload = self.compute.is_up()
        if self._run_async:
            # Positional args; presumably they line up with the keyword names used in the
            # synchronous branch below (init_args, deployment_timestamp, stream_logs, verbosity, dryrun).
            thread = threading.Thread(
                target=super()._launch_service,
                args=(
                    install_url,
                    use_editable,
                    {},
                    deployment_timestamp,
                    stream_logs,
                    config.log_verbosity,
                    False,
                ),
            )
            thread.start()

            if trigger_reload:
                # NOTE: in async mode _update_service ends with sys.exit().
                self._update_service(stream_logs, deployment_timestamp)
                time.sleep(1)
            else:
                # wait for pods to be ready before exiting out
                start_time = time.time()
                while not self.compute.is_up() and time.time() - start_time < 60:
                    time.sleep(5)

                if not self.compute.is_up():
                    raise ServiceTimeoutError(f"Service {self.service_name} is not up after 60 seconds.")
        else:
            super()._launch_service(
                install_url,
                use_editable,
                init_args={},
                deployment_timestamp=deployment_timestamp,
                stream_logs=stream_logs,
                verbosity=config.log_verbosity,
                dryrun=False,
            )

            if trigger_reload:
                self._update_service(stream_logs, deployment_timestamp)

    def _update_service(self, stream_logs, deployment_timestamp):
        """Call the reload endpoint, passing the deployment timestamp in the ``X-Deployed-As-Of`` header.

        In async mode the call is made from a background thread and the calling thread then
        exits via ``sys.exit()`` after a one second pause.
        """
        client = self._client()

        if self._run_async:
            thread = threading.Thread(
                target=client.call_method,
                args=(
                    self.endpoint(),
                    stream_logs,
                ),
                kwargs={"headers": {"X-Deployed-As-Of": deployment_timestamp}},
            )
            thread.start()
            time.sleep(1)
            sys.exit()
        else:
            client.call_method(
                self.endpoint(),
                stream_logs=stream_logs,
                headers={"X-Deployed-As-Of": deployment_timestamp},
            )

    def _print_kt_cmds(self):
        """Log the ``kt`` commands for viewing logs and tearing the service down."""
        logger.info(f"To see logs, run: kt logs {self.service_name}.")
        logger.info(f"To teardown service, run: kt teardown {self.service_name}")

    def endpoint(self):
        """Return the URL of the service's ``_reload_image`` endpoint."""
        return f"{self.base_endpoint}/_reload_image"
182
+
183
+
184
def app(
    name: str = None,
    port: int = None,
    health_check: str = None,
    **kwargs: Dict,
):
    """
    Builds an instance of :class:`App` when invoked through the ``kt run`` CLI command.

    Returns ``None`` without doing anything unless the ``KT_RUN`` environment variable is ``"1"``
    and the process is not running inside Kubernetes.

    Args:
        name (str, optional): Name to give the remote app. If not provided, will be based off the name of the file in
            which the app was defined.
        port (int, optional): Server port to expose, if the app starts an HTTP server.
        health_check (str, optional): Health check endpoint, if running a server, to check when server is up and ready.
        **kwargs: Compute kwargs, to define the compute on which to run the app on.

    Returns:
        App or None: The constructed app, or ``None`` when not running under ``kt run``.

    Raises:
        ValueError: If ``name`` conflicts with the name given to ``kt run`` (``KT_RUN_NAME``), or if
            ``KT_RUN_CMD`` is not set.

    Examples:

    Define the ``kt.app`` object and compute in your Python file:

    .. code-block:: python

        import kubetorch as kt

        # Define the app at the top of the Python file to deploy
        # train.py
        kt.app(name="my-app", image=kt.Image("docker-latest"), cpus="0.01")

        if __name__ == "__main__":
            ...

    Deploy and run the app remotely using the ``kt run`` CLI command:

    .. code-block:: bash

        kt run python train.py --epochs 5
        kt run fastapi run my_app.py --name fastapi-app
    """
    if os.getenv("KT_RUN") != "1" or is_running_in_kubernetes():
        return None

    run_name = os.getenv("KT_RUN_NAME")
    if name and run_name and name != run_name:
        raise ValueError(f"Name mismatch between kt.App definition ({name}) and kt run command ({run_name}).")
    name = name or run_name

    cli_command = os.getenv("KT_RUN_CMD")  # set in kt run
    if cli_command is None:
        raise ValueError("KT_RUN_CMD is not set; kt.app must be launched through the `kt run` command.")

    # Environment variables are strings: comparing against the int 1 could never be true.
    run_async = os.getenv("KT_RUN_ASYNC") == "1"

    # Copy so the dict supplied by the caller is not mutated.
    env_vars = dict(kwargs.get("env_vars") or {})
    if port:
        env_vars["KT_APP_PORT"] = port
    if health_check:
        env_vars["KT_APP_HEALTHCHECK"] = health_check
    kwargs["env_vars"] = env_vars
    compute = Compute(**kwargs)

    main_file = os.getenv("KT_RUN_FILE") or os.path.abspath(sys.modules["__main__"].__file__)
    relative_path = os.path.relpath(main_file, os.getcwd())
    pointers = [os.getcwd(), relative_path, None]
    # Literal substitution: a file path is not a regex ("." and "+" would be treated as
    # metacharacters) nor a replacement template (backslashes would be treated as escapes).
    relative_cli_command = cli_command.replace(main_file, relative_path)

    kt_app = App(
        compute=compute,
        cli_command=relative_cli_command,
        pointers=pointers,
        name=name,
        run_async=run_async,
    )
    return kt_app