tuft 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tuft/__main__.py +7 -0
- tuft/cli.py +41 -8
- tuft/config.py +6 -4
- tuft/sampling_controller.py +3 -1
- tuft/training_controller.py +6 -1
- {tuft-0.1.1.dist-info → tuft-0.1.2.dist-info}/METADATA +2 -2
- {tuft-0.1.1.dist-info → tuft-0.1.2.dist-info}/RECORD +10 -9
- {tuft-0.1.1.dist-info → tuft-0.1.2.dist-info}/WHEEL +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.2.dist-info}/entry_points.txt +0 -0
- {tuft-0.1.1.dist-info → tuft-0.1.2.dist-info}/licenses/LICENSE +0 -0
tuft/__main__.py
ADDED
tuft/cli.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import logging
|
|
6
|
+
import os
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
8
9
|
import typer
|
|
@@ -14,34 +15,66 @@ from .telemetry import init_telemetry
|
|
|
14
15
|
from .telemetry.metrics import ResourceMetricsCollector
|
|
15
16
|
|
|
16
17
|
|
|
17
|
-
app = typer.Typer(help="TuFT - Tenant-unified Fine-Tuning Server.")
|
|
18
|
+
app = typer.Typer(help="TuFT - Tenant-unified Fine-Tuning Server.", no_args_is_help=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Required for Typer to recognize subcommands when using no_args_is_help=True
# The function body is intentionally empty apart from its docstring; it exists
# so that the app has a registered callback.
# NOTE(review): Typer presumably surfaces this docstring as the top-level help
# text, so treat it as user-facing wording — confirm before editing it.
@app.callback()
def callback() -> None:
    """TuFT - Tenant-unified Fine-Tuning Server."""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Default paths are rooted at TUFT_HOME.
# - An unset *or empty* TUFT_HOME falls back to ~/.tuft; an empty value would
#   otherwise become Path(""), i.e. the current working directory.
# - A leading "~" in the variable is expanded explicitly, since it reaches us
#   unexpanded when the value was quoted or set outside a shell.
_TUFT_HOME = Path(os.environ.get("TUFT_HOME") or Path.home() / ".tuft").expanduser()
_DEFAULT_CONFIG_PATH = _TUFT_HOME / "configs" / "tuft_config.yaml"
_DEFAULT_CHECKPOINT_DIR = _TUFT_HOME / "checkpoints"
|
|
18
31
|
|
|
19
32
|
_HOST_OPTION = typer.Option("127.0.0.1", "--host", help="Interface to bind", envvar="TUFT_HOST")
|
|
20
33
|
_PORT_OPTION = typer.Option(10610, "--port", "-p", help="Port to bind", envvar="TUFT_PORT")
|
|
21
|
-
_LOG_LEVEL_OPTION = typer.Option(
|
|
34
|
+
_LOG_LEVEL_OPTION = typer.Option(
|
|
35
|
+
"info", "--log-level", help="Uvicorn log level", envvar="TUFT_LOG_LEVEL"
|
|
36
|
+
)
|
|
22
37
|
_RELOAD_OPTION = typer.Option(False, "--reload", help="Enable auto-reload (development only)")
|
|
23
38
|
_CONFIG_OPTION = typer.Option(
|
|
24
39
|
None,
|
|
25
40
|
"--config",
|
|
26
41
|
"-c",
|
|
27
|
-
help="Path to a TuFT configuration file (YAML)",
|
|
42
|
+
help=f"Path to a TuFT configuration file (YAML). Defaults to {_DEFAULT_CONFIG_PATH}",
|
|
43
|
+
envvar="TUFT_CONFIG",
|
|
28
44
|
)
|
|
29
45
|
_CHECKPOINT_DIR_OPTION = typer.Option(
|
|
30
46
|
None,
|
|
31
47
|
"--checkpoint-dir",
|
|
32
|
-
help="Override checkpoint_dir from config file. Defaults to
|
|
48
|
+
help=f"Override checkpoint_dir from config file. Defaults to {_DEFAULT_CHECKPOINT_DIR}",
|
|
49
|
+
envvar="TUFT_CHECKPOINT_DIR",
|
|
33
50
|
)
|
|
34
51
|
|
|
35
52
|
|
|
53
|
+
def _resolve_config_path(config_path: Path | None) -> Path:
    """Resolve which configuration file to load.

    Args:
        config_path: Path supplied via ``--config`` / ``TUFT_CONFIG``, or
            ``None`` when neither was given.

    Returns:
        ``config_path`` with a leading ``~`` expanded when it is provided,
        otherwise ``_DEFAULT_CONFIG_PATH`` if a file exists there.

    Raises:
        typer.BadParameter: If no path was provided and there is no default
            configuration file.
    """
    if config_path is not None:
        # Expand "~" just like the --checkpoint-dir override does, so values
        # that arrive unexpanded (e.g. a quoted TUFT_CONFIG) still resolve.
        return config_path.expanduser()
    # is_file() rather than exists(): a directory sitting at the default
    # location is not a usable config and should trigger the guidance below.
    if _DEFAULT_CONFIG_PATH.is_file():
        return _DEFAULT_CONFIG_PATH
    raise typer.BadParameter(
        "Configuration file must be provided via --config or TUFT_CONFIG, "
        f"or create a default config at {_DEFAULT_CONFIG_PATH}"
    )
|
|
63
|
+
|
|
64
|
+
|
|
36
65
|
def _build_config(
|
|
37
66
|
config_path: Path | None,
|
|
38
67
|
checkpoint_dir: Path | None,
|
|
39
68
|
) -> AppConfig:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
69
|
+
resolved_config_path = _resolve_config_path(config_path)
|
|
70
|
+
config = load_yaml_config(resolved_config_path)
|
|
71
|
+
# Apply checkpoint_dir override, or use default if not in config
|
|
43
72
|
if checkpoint_dir is not None:
|
|
44
73
|
config.checkpoint_dir = checkpoint_dir.expanduser()
|
|
74
|
+
elif config.checkpoint_dir is None:
|
|
75
|
+
config.checkpoint_dir = _DEFAULT_CHECKPOINT_DIR
|
|
76
|
+
# Guarantee checkpoint_dir is set after resolution
|
|
77
|
+
assert config.checkpoint_dir is not None, "checkpoint_dir must be set after config resolution"
|
|
45
78
|
config.ensure_directories()
|
|
46
79
|
return config
|
|
47
80
|
|
|
@@ -84,7 +117,7 @@ def launch(
|
|
|
84
117
|
|
|
85
118
|
|
|
86
119
|
def main() -> None:
|
|
87
|
-
app()
|
|
120
|
+
app(prog_name="tuft")
|
|
88
121
|
|
|
89
122
|
|
|
90
123
|
if __name__ == "__main__":
|
tuft/config.py
CHANGED
|
@@ -9,8 +9,9 @@ from typing import Dict, Iterable, List
|
|
|
9
9
|
from .persistence import PersistenceConfig
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
def _default_checkpoint_dir() -> Path:
|
|
13
|
-
|
|
12
|
+
def _default_checkpoint_dir() -> Path | None:
|
|
13
|
+
"""Return None to let CLI set the default based on TUFT_HOME."""
|
|
14
|
+
return None
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
def _default_persistence_config() -> PersistenceConfig:
|
|
@@ -42,7 +43,7 @@ def _default_telemetry_config() -> TelemetryConfig:
|
|
|
42
43
|
class AppConfig:
|
|
43
44
|
"""Runtime configuration for the TuFT server."""
|
|
44
45
|
|
|
45
|
-
checkpoint_dir: Path = field(default_factory=_default_checkpoint_dir)
|
|
46
|
+
checkpoint_dir: Path | None = field(default_factory=_default_checkpoint_dir)
|
|
46
47
|
supported_models: List[ModelConfig] = field(default_factory=list)
|
|
47
48
|
model_owner: str = "local-user"
|
|
48
49
|
toy_backend_seed: int = 0
|
|
@@ -53,7 +54,8 @@ class AppConfig:
|
|
|
53
54
|
telemetry: TelemetryConfig = field(default_factory=_default_telemetry_config)
|
|
54
55
|
|
|
55
56
|
def ensure_directories(self) -> None:
|
|
56
|
-
self.checkpoint_dir
|
|
57
|
+
if self.checkpoint_dir is not None:
|
|
58
|
+
self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
|
57
59
|
|
|
58
60
|
def check_validity(self) -> None:
|
|
59
61
|
if not self.supported_models:
|
tuft/sampling_controller.py
CHANGED
|
@@ -181,8 +181,10 @@ class SamplingController:
|
|
|
181
181
|
if model_path:
|
|
182
182
|
# model_path should have higher priority than base_model
|
|
183
183
|
try:
|
|
184
|
+
assert self.config.checkpoint_dir is not None
|
|
184
185
|
parsed_checkpoint = CheckpointRecord.from_tinker_path(
|
|
185
|
-
model_path,
|
|
186
|
+
model_path,
|
|
187
|
+
self.config.checkpoint_dir,
|
|
186
188
|
)
|
|
187
189
|
except FileNotFoundError as exc:
|
|
188
190
|
raise CheckpointNotFoundException(checkpoint_id=model_path) from exc
|
tuft/training_controller.py
CHANGED
|
@@ -508,6 +508,7 @@ class TrainingController:
|
|
|
508
508
|
logger.info("Checkpoint save begin: %s", checkpoint_id)
|
|
509
509
|
|
|
510
510
|
setattr(training_run, counter_attr, counter + 1)
|
|
511
|
+
assert self.config.checkpoint_dir is not None
|
|
511
512
|
checkpoint = CheckpointRecord.from_training_run(
|
|
512
513
|
training_run_id=training_run.training_run_id,
|
|
513
514
|
checkpoint_name=checkpoint_name,
|
|
@@ -567,7 +568,11 @@ class TrainingController:
|
|
|
567
568
|
) -> None:
|
|
568
569
|
"""Load a checkpoint."""
|
|
569
570
|
try:
|
|
570
|
-
|
|
571
|
+
assert self.config.checkpoint_dir is not None
|
|
572
|
+
parsed_checkpoint = CheckpointRecord.from_tinker_path(
|
|
573
|
+
path,
|
|
574
|
+
self.config.checkpoint_dir,
|
|
575
|
+
)
|
|
571
576
|
except FileNotFoundError as exc:
|
|
572
577
|
raise CheckpointNotFoundException(checkpoint_id=model_id) from exc
|
|
573
578
|
source_model_id = parsed_checkpoint.training_run_id or model_id
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: tuft
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: A multi-tenant fine-tuning platform for LLMs with Tinker-compatible API
|
|
5
5
|
Author-email: TuFT Developers <tuft@list.alibaba-inc.com>
|
|
6
6
|
License: MIT License
|
|
@@ -29,6 +29,7 @@ Requires-Python: >=3.11
|
|
|
29
29
|
Requires-Dist: fastapi>=0.125.0
|
|
30
30
|
Requires-Dist: httpx>=0.28.1
|
|
31
31
|
Requires-Dist: numpy<2.0.0
|
|
32
|
+
Requires-Dist: nvidia-ml-py>=13.0.0
|
|
32
33
|
Requires-Dist: omegaconf>=2.3.0
|
|
33
34
|
Requires-Dist: opentelemetry-api>=1.20.0
|
|
34
35
|
Requires-Dist: opentelemetry-exporter-otlp>=1.20.0
|
|
@@ -36,7 +37,6 @@ Requires-Dist: opentelemetry-instrumentation-fastapi>=0.41b0
|
|
|
36
37
|
Requires-Dist: opentelemetry-instrumentation-logging>=0.41b0
|
|
37
38
|
Requires-Dist: opentelemetry-sdk>=1.20.0
|
|
38
39
|
Requires-Dist: psutil>=5.9.0
|
|
39
|
-
Requires-Dist: pynvml>=11.5.0
|
|
40
40
|
Requires-Dist: ray>=2.50.0
|
|
41
41
|
Requires-Dist: tinker>=0.7.0
|
|
42
42
|
Requires-Dist: transformers<5.0.0,>=4.57.3
|
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
tuft/__init__.py,sha256=BJu6iJ_QGwcJXRXDgR1LjV25KgM6sVd7_WqIXVTEuVM,97
|
|
2
|
+
tuft/__main__.py,sha256=MPhC9msQXf9py5xkLPQ4JoqrvCpL_qXVwksasNUj7ig,131
|
|
2
3
|
tuft/auth.py,sha256=2Wk9ATXlAiGm1Irpj66CfIyORuHzciSNAOzVwM8PeO0,1071
|
|
3
4
|
tuft/backend.py,sha256=ftiaaNds2MXroszZW8l6DEq515qiw1KmrODI3x6AHE4,10254
|
|
4
5
|
tuft/checkpoints.py,sha256=bObo2NzDrfzp5BiS6I_FIA3frLFic_sT4o4c-PEzfpk,6917
|
|
5
|
-
tuft/cli.py,sha256
|
|
6
|
-
tuft/config.py,sha256=
|
|
6
|
+
tuft/cli.py,sha256=-WhmHGIHmWtL46LvXRlhTPVPhBUjZHVLJi0nYR_pqoE,4024
|
|
7
|
+
tuft/config.py,sha256=bX6NuSora0Wqhk5Q5lsnc0lojeevxnLHfiijJHMdtVg,4380
|
|
7
8
|
tuft/exceptions.py,sha256=_xdsL8bx3Y6jvC5VYHVCa73uAEWXxcl2YwVc09lJXFk,4088
|
|
8
9
|
tuft/futures.py,sha256=0gRLgDJJQRGGmULYsKdUs3VDsrLN8QfuFfXV00kxHO4,16375
|
|
9
|
-
tuft/sampling_controller.py,sha256=
|
|
10
|
+
tuft/sampling_controller.py,sha256=c02VQ6Qww9IQC9VJYzQO9Z9v45kK2QeaOKlknYWjSI4,15250
|
|
10
11
|
tuft/server.py,sha256=NUapRGdQbQH6PbuCfMZeMVi_7vM6nM7xmxepCPkgyko,24996
|
|
11
12
|
tuft/state.py,sha256=J9R5Wd9JlMtpYcaY_6t5RvgJbY3EX5ZJTZfoQhwZ9hU,12853
|
|
12
|
-
tuft/training_controller.py,sha256=
|
|
13
|
+
tuft/training_controller.py,sha256=V4JMgyEnf4wYGrk72AR5rHH1iYl488vt7d0c-ubTrO0,30008
|
|
13
14
|
tuft/backends/__init__.py,sha256=7A6Pu-vEMbcMWapAh-zkI1O5WtBHO0OxwED8qAy9kAQ,262
|
|
14
15
|
tuft/backends/base_backend.py,sha256=bdlx3hRyEj00GKFlh2fAczn7h4zANz7bdKgXb_F18y4,3462
|
|
15
16
|
tuft/backends/hf_training_model.py,sha256=XQa598SpY7DnYYU0rTaHjlh-5dRCPueFtcdxrcjXWIc,16993
|
|
@@ -28,8 +29,8 @@ tuft/telemetry/__init__.py,sha256=dlSGiJ_pMElhwEe31olGg88ZrjoBeGUBn2P17qFNymM,33
|
|
|
28
29
|
tuft/telemetry/metrics.py,sha256=Yz6s2AQ5CptFXvEm-PbO-Ib17-aF0rnoG8vZxH-Pawo,11538
|
|
29
30
|
tuft/telemetry/provider.py,sha256=jGKqTMsP-WekKGCMN9QHwt-g_1Lk1xUOy1BO-__xG5I,6700
|
|
30
31
|
tuft/telemetry/tracing.py,sha256=GL-wEEQtzM1ycgfI4sMsHUeIC7qj5MyOH-sBwHihbsE,957
|
|
31
|
-
tuft-0.1.
|
|
32
|
-
tuft-0.1.
|
|
33
|
-
tuft-0.1.
|
|
34
|
-
tuft-0.1.
|
|
35
|
-
tuft-0.1.
|
|
32
|
+
tuft-0.1.2.dist-info/METADATA,sha256=UlTE_gR3cPFLzV69GyIHD6TOm-dHSmSM5NcEHt8L0Pg,20381
|
|
33
|
+
tuft-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
34
|
+
tuft-0.1.2.dist-info/entry_points.txt,sha256=T48zU7Vdi2ZsARDeOZ9jK6XGuYNaCbSaUTd5POouLms,39
|
|
35
|
+
tuft-0.1.2.dist-info/licenses/LICENSE,sha256=fJHdoqbikZ-GATzLNmixfKDot1w_cJuHKY3mH4qSmYs,1069
|
|
36
|
+
tuft-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|