fkat 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. fkat/__init__.py +147 -0
  2. fkat/data/__init__.py +15 -0
  3. fkat/data/data_module.py +198 -0
  4. fkat/data/datasets/__init__.py +19 -0
  5. fkat/data/datasets/dict.py +78 -0
  6. fkat/data/datasets/json.py +176 -0
  7. fkat/data/datasets/map.py +90 -0
  8. fkat/data/datasets/parquet.py +242 -0
  9. fkat/data/datasets/sized.py +31 -0
  10. fkat/data/dict.py +42 -0
  11. fkat/data/samplers/__init__.py +9 -0
  12. fkat/data/samplers/dict.py +38 -0
  13. fkat/data/samplers/sized.py +16 -0
  14. fkat/data/samplers/strategies.py +68 -0
  15. fkat/data/sharded.py +718 -0
  16. fkat/data/shm.py +364 -0
  17. fkat/predict.py +32 -0
  18. fkat/py.typed +0 -0
  19. fkat/pytorch/__init__.py +3 -0
  20. fkat/pytorch/actions/__init__.py +11 -0
  21. fkat/pytorch/actions/aws/__init__.py +3 -0
  22. fkat/pytorch/actions/aws/batch.py +29 -0
  23. fkat/pytorch/actions/aws/ec2.py +61 -0
  24. fkat/pytorch/callbacks/__init__.py +2 -0
  25. fkat/pytorch/callbacks/cuda/__init__.py +16 -0
  26. fkat/pytorch/callbacks/cuda/cache.py +115 -0
  27. fkat/pytorch/callbacks/cuda/memory.py +200 -0
  28. fkat/pytorch/callbacks/cuda/nsys.py +199 -0
  29. fkat/pytorch/callbacks/cuda/nvtx.py +288 -0
  30. fkat/pytorch/callbacks/cuda/xid.py +173 -0
  31. fkat/pytorch/callbacks/debugging/__init__.py +9 -0
  32. fkat/pytorch/callbacks/debugging/introspection.py +569 -0
  33. fkat/pytorch/callbacks/debugging/optimizer.py +45 -0
  34. fkat/pytorch/callbacks/gc.py +146 -0
  35. fkat/pytorch/callbacks/loggers.py +211 -0
  36. fkat/pytorch/callbacks/logging/__init__.py +12 -0
  37. fkat/pytorch/callbacks/logging/heartbeat.py +76 -0
  38. fkat/pytorch/callbacks/logging/throughput.py +253 -0
  39. fkat/pytorch/callbacks/logging/validation_metrics.py +94 -0
  40. fkat/pytorch/callbacks/monitoring/__init__.py +14 -0
  41. fkat/pytorch/callbacks/monitoring/crash.py +162 -0
  42. fkat/pytorch/callbacks/monitoring/dp.py +130 -0
  43. fkat/pytorch/callbacks/monitoring/hardware_stats.py +135 -0
  44. fkat/pytorch/callbacks/monitoring/shutdown.py +170 -0
  45. fkat/pytorch/callbacks/profiling/__init__.py +13 -0
  46. fkat/pytorch/callbacks/profiling/flops.py +574 -0
  47. fkat/pytorch/callbacks/profiling/memray.py +212 -0
  48. fkat/pytorch/callbacks/profiling/torch.py +197 -0
  49. fkat/pytorch/callbacks/profiling/viztracer.py +197 -0
  50. fkat/pytorch/loggers.py +284 -0
  51. fkat/pytorch/schedule/__init__.py +27 -0
  52. fkat/pytorch/schedule/base.py +308 -0
  53. fkat/pytorch/schedule/mlflow.py +143 -0
  54. fkat/pytorch/utilities.py +49 -0
  55. fkat/test.py +31 -0
  56. fkat/train.py +32 -0
  57. fkat/utils/__init__.py +28 -0
  58. fkat/utils/aws/__init__.py +3 -0
  59. fkat/utils/aws/imds.py +137 -0
  60. fkat/utils/boto3.py +24 -0
  61. fkat/utils/config.py +194 -0
  62. fkat/utils/cuda/__init__.py +3 -0
  63. fkat/utils/cuda/preflight/__init__.py +3 -0
  64. fkat/utils/cuda/preflight/health_check/aws_instance_config.py +82 -0
  65. fkat/utils/cuda/preflight/health_check/constants.py +23 -0
  66. fkat/utils/cuda/preflight/health_check/ddb_client.py +82 -0
  67. fkat/utils/cuda/preflight/health_check/gpu_connection_test.py +104 -0
  68. fkat/utils/cuda/preflight/health_check/gpu_stress_test.py +122 -0
  69. fkat/utils/cuda/preflight/health_check/helpers.py +297 -0
  70. fkat/utils/cuda/preflight/health_check/logger.py +205 -0
  71. fkat/utils/cuda/preflight/health_check/timer.py +31 -0
  72. fkat/utils/cuda/preflight/run.py +560 -0
  73. fkat/utils/cuda/xid.py +48 -0
  74. fkat/utils/logging.py +28 -0
  75. fkat/utils/mlflow.py +33 -0
  76. fkat/utils/pandas.py +25 -0
  77. fkat/utils/pdb.py +84 -0
  78. fkat/utils/pool.py +81 -0
  79. fkat/utils/profiler.py +18 -0
  80. fkat/utils/pyarrow.py +21 -0
  81. fkat/utils/rng.py +27 -0
  82. fkat/utils/shm.py +184 -0
  83. fkat/validate.py +31 -0
  84. fkat-0.1.2.dist-info/METADATA +134 -0
  85. fkat-0.1.2.dist-info/RECORD +88 -0
  86. fkat-0.1.2.dist-info/WHEEL +4 -0
  87. fkat-0.1.2.dist-info/licenses/LICENSE +175 -0
  88. fkat-0.1.2.dist-info/licenses/NOTICE +1 -0
fkat/utils/pdb.py ADDED
@@ -0,0 +1,84 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import os
4
+ import pdb
5
+ import sys
6
+ from typing import Any
7
+ from types import FrameType, TracebackType
8
+ from typing_extensions import override
9
+
10
+
11
+ class ForkedPdb(pdb.Pdb):
12
+ def __init__(self) -> None:
13
+ super().__init__()
14
+ self.rank = os.environ.get(
15
+ "RANK", # PyTorch DDP
16
+ os.environ.get(
17
+ "PMI_RANK", # MPI
18
+ os.environ.get(
19
+ "OMPI_COMM_WORLD_RANK", # OpenMPI
20
+ "unknown",
21
+ ),
22
+ ),
23
+ )
24
+
25
+ @override
26
+ def interaction(self, frame: FrameType | None, traceback: TracebackType | None, *args: Any, **kwargs: Any) -> None:
27
+ _stdin = sys.stdin
28
+ try:
29
+ sys.stdin = open("/dev/stdin")
30
+ self.print_rank_info()
31
+ pdb.Pdb.interaction(self, frame, traceback, *args, **kwargs)
32
+ finally:
33
+ sys.stdin = _stdin
34
+
35
+ def print_rank_info(self) -> None:
36
+ print(f"\n[RANK={self.rank}, PID={os.getpid()}]:")
37
+
38
+ @override
39
+ def default(self, line: str) -> None:
40
+ self.print_rank_info()
41
+ super().default(line)
42
+
43
+ @override
44
+ def do_continue(self, arg: str) -> bool | None:
45
+ self.print_rank_info()
46
+ return super().do_continue(arg)
47
+
48
+ @override
49
+ def do_next(self, arg: str) -> bool | None:
50
+ self.print_rank_info()
51
+ return super().do_next(arg)
52
+
53
+ @override
54
+ def do_step(self, arg: str) -> bool | None:
55
+ self.print_rank_info()
56
+ return super().do_step(arg)
57
+
58
+ @override
59
+ def do_return(self, arg: str) -> bool | None:
60
+ self.print_rank_info()
61
+ return super().do_return(arg)
62
+
63
+ @override
64
+ def do_quit(self, arg: str) -> bool | None:
65
+ self.print_rank_info()
66
+ return super().do_quit(arg)
67
+
68
+ @override
69
+ def do_jump(self, arg: str) -> bool | None:
70
+ self.print_rank_info()
71
+ return super().do_jump(arg)
72
+
73
+ @override
74
+ def precmd(self, line: str) -> str:
75
+ self.print_rank_info()
76
+ return line
77
+
78
+ def post_mortem(self, tb: TracebackType | None) -> None:
79
+ self.reset()
80
+ self.interaction(None, tb)
81
+
82
+
83
def post_mortem() -> None:
    """Install a ``sys.excepthook`` that opens a ``ForkedPdb`` post-mortem session on the traceback."""

    def _debug_uncaught(exc_type: Any, exc_value: Any, tb: Any) -> None:
        # A new debugger instance is created for every uncaught exception.
        return ForkedPdb().post_mortem(tb)

    sys.excepthook = _debug_uncaught
fkat/utils/pool.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from concurrent.futures import ThreadPoolExecutor, Future
4
+ from typing import Any, TypeVar
5
+ import multiprocessing as mp
6
+ from multiprocessing.pool import AsyncResult
7
+ from collections.abc import Callable, Iterable, Mapping
8
+
9
+
10
# Result type carried by FutureResult.
T = TypeVar("T", covariant=False)
# Covariant result type used in ThreadPool.apply_async's signature.
T_co = TypeVar("T_co", covariant=True)
12
+
13
+
14
+ class FutureResult(AsyncResult[T]):
15
+ """An AsyncResult implementation for concurrent.future Future object"""
16
+
17
+ def __init__(self, fut: Future[T]) -> None:
18
+ self.fut = fut
19
+
20
+ def ready(self) -> bool:
21
+ return self.fut.done()
22
+
23
+ def get(self, timeout: float | None = None) -> T:
24
+ return self.fut.result(timeout)
25
+
26
+ def wait(self, timeout: float | None = None) -> None:
27
+ self.fut.exception(timeout)
28
+
29
+ def successful(self) -> bool:
30
+ return self.fut.exception() is None
31
+
32
+
33
class ThreadPool:
    """Pool-like facade (``apply_async`` / ``close`` / ``join``) backed by a ThreadPoolExecutor."""

    def __init__(self, **kwargs: Any) -> None:
        # All keyword arguments are forwarded to ThreadPoolExecutor unchanged.
        self.pool = ThreadPoolExecutor(**kwargs)

    def apply_async(
        self,
        func: Callable[..., T_co],
        args: Iterable[Any] | None = None,
        kwds: Mapping[str, Any] | None = None,
    ) -> FutureResult[T_co]:
        """Submit ``func(*args, **kwds)`` to the executor and wrap the future in a FutureResult."""
        positional = args if args else ()
        keyword = kwds if kwds else {}
        submitted = self.pool.submit(func, *positional, **keyword)
        return FutureResult(submitted)

    def close(self) -> None:
        """Shut the executor down."""
        self.pool.shutdown()

    def join(self) -> None:
        """Wait on the pool.

        If the executor is still accepting work, a no-op task is submitted and awaited;
        otherwise ``close()`` is called again.
        """
        if not self.pool._shutdown:
            self.pool.submit(lambda: None).result()
            return
        self.close()
56
+
57
+
58
class NoDaemonProcess(mp.Process):
    """Process whose ``daemon`` attribute always reads ``False``, whatever is assigned to it."""

    # Reads always report False; writes are accepted and discarded.
    daemon = property(
        fget=lambda self: False,
        fset=lambda self, value: None,
    )
68
+
69
+
70
# The base class is the type of whatever context mp.get_context() returns when this module is imported.
# NOTE(review): calling mp.get_context() at import time may pin the default start method -- confirm
# that callers do not rely on mp.set_start_method() after importing this module.
class NoDaemonContext(type(mp.get_context())):  # type: ignore[misc]
    """A multiprocessing Context that uses NoDaemonProcess"""

    # Process class this context exposes.
    Process = NoDaemonProcess
74
+
75
+
76
class NoDaemonPool(mp.pool.Pool):  # type: ignore[unresolved-attribute]
    """multiprocessing Pool that always runs with a fresh NoDaemonContext."""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Any caller-supplied ``context`` keyword is replaced.
        forwarded = {**kwargs, "context": NoDaemonContext()}
        super().__init__(*args, **forwarded)
fkat/utils/profiler.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import atexit
4
+
5
+ from lightning.pytorch.profilers import Profiler
6
+
7
+
8
def profile_until_exit(profiler: Profiler, action: str, filename_suffix: str | None = None) -> None:
    """Start ``profiler`` for ``action`` and register an ``atexit`` hook that stops and reports it.

    Args:
        profiler (Profiler): Profiler to start now and stop at interpreter exit.
        action (str): Action name passed to ``profiler.start`` and ``profiler.stop``.
        filename_suffix (str, optional): Appended to ``profiler.filename`` when both are non-empty.
    """

    def _finish_profiling() -> None:
        # Runs at interpreter exit: stop the action, then call summary() and describe().
        profiler.stop(action)
        profiler.summary()
        profiler.describe()

    atexit.register(_finish_profiling)

    if profiler.filename and filename_suffix:
        profiler.filename = profiler.filename + filename_suffix

    profiler.start(action)
fkat/utils/pyarrow.py ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import pyarrow as pa
4
+ from typing import Any
5
+ from collections.abc import Iterator
6
+
7
+
8
def iter_rows(table: pa.Table, chunk_size: int) -> Iterator[dict[str, Any]]:
    """
    Lazily yield the rows of a PyArrow table as dictionaries, converting one batch at a time.

    Args:
        table (pa.Table): PyArrow table.
        chunk_size (int): Passed to ``table.to_batches`` to bound the rows converted per batch.
    Yields:
        Dict[str, Any]: Mapping of column name to the value in that row.
    """
    for batch in table.to_batches(chunk_size):
        # Convert the whole batch to Python lists once, then slice it row by row.
        by_column = batch.to_pydict()
        for row_idx in range(batch.num_rows):
            yield {name: values[row_idx] for name, values in by_column.items()}
fkat/utils/rng.py ADDED
@@ -0,0 +1,27 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from random import getstate as python_get_rng_state
4
+ from random import setstate as python_set_rng_state
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
def get_rng_states() -> dict[str, Any]:
    r"""Snapshot the global random state of :mod:`torch`, :mod:`numpy` and Python's :mod:`random`.

    Returns:
        dict with the keys ``"torch"``, ``"numpy"`` and ``"python"``.
    """
    return {
        "torch": torch.get_rng_state(),
        "numpy": np.random.get_state(),
        "python": python_get_rng_state(),
    }
19
+
20
+
21
def set_rng_states(rng_state_dict: dict[str, Any]) -> None:
    r"""Restore the global random state of :mod:`torch`, :mod:`numpy` and Python's :mod:`random`.

    Args:
        rng_state_dict: Mapping with the keys ``"torch"``, ``"numpy"`` and ``"python"``.
    """
    torch.set_rng_state(rng_state_dict["torch"])
    np.random.set_state(rng_state_dict["numpy"])
    py_version, py_internal, py_gauss_next = rng_state_dict["python"]
    # The middle element is converted to a tuple before being handed to random.setstate.
    python_set_rng_state((py_version, tuple(py_internal), py_gauss_next))
fkat/utils/shm.py ADDED
@@ -0,0 +1,184 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ """Distributed Shared Memory Utility for Dataloader"""
4
+
5
+ import logging
6
+ import mmap
7
+ import os
8
+ import pickle
9
+ import shutil
10
+ import time
11
+ import uuid
12
+ from pathlib import Path
13
+ from typing import Any, SupportsIndex
14
+ from collections.abc import Callable, Iterable, Iterator
15
+
16
+ import torch.distributed as dist
17
+
18
logger = logging.getLogger(__name__)

# Name of the empty marker file that save() creates last and saved() looks for.
COMPLETE = ".complete"


# NOTE(review): save_iter, load_iter and POISON_PILL are defined in this module but not listed here -- confirm
# whether that is intentional.
__all__ = ["save", "load"]
24
+
25
+
26
def save(obj: Any, path: Path | None = None) -> Path:
    """Serialize obj with out-of-band data to path for zero-copy shared memory usage.

    If the object to be serialized itself, or the objects it uses for data
    storage (such as numpy arrays), implement the pickle protocol version 5
    pickle.PickleBuffer type in __reduce_ex__, then this function can store
    these buffers out-of-band as files in `path` so that they can subsequently
    be re-used for zero-copy sharing across processes.

    Args:
        obj (object):
            Object to serialize. For example a PyArrow Table, a Pandas Dataframe or
            any type that relies on NumPy to store the binary data.
        path (pathlib.Path, optional):
            Empty folder used to save serialized data. Usually a folder in /dev/shm.
            When omitted, a path from generate_path() is used.
    Returns:
        pathlib.Path where the data was serialized
    """
    root: Path = path or generate_path()
    root.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []

    def buffer_callback(buf: pickle.PickleBuffer) -> None:
        # Buffers are stored as 0.bin, 1.bin, ... in the order pickle hands them over.
        target = root / f"{len(written)}.bin"
        with open(target, "wb") as out:
            out.write(buf)
        written.append(target)

    with open(root / "meta.pkl", "wb") as meta:
        pickle.dump(obj, meta, protocol=5, buffer_callback=buffer_callback)

    # The marker is created only after everything else has been written.
    (root / COMPLETE).touch()
    return root
60
+
61
+
62
def generate_path() -> Path:
    """Return a new ``/dev/shm/<rank>-<uuid4>`` path; the rank is 0 when torch.distributed is not initialized."""
    global_rank = 0
    if dist.is_initialized():
        global_rank = dist.get_rank()  # type: ignore[possibly-unbound-attribute]
    return Path(f"/dev/shm/{global_rank}-{uuid.uuid4()}")
67
+
68
+
69
def save_iter(
    it: Iterable[Any],
    path: Path | None = None,
    max_items: int = 0,
    should_stop: Callable[[], bool] = lambda: False,
    truncation_threshold: int | None = None,
) -> Path:
    """Save each element of ``it`` into its own numbered sub-folder of ``path``, then a POISON_PILL.

    Element ``i`` is written with ``save()`` to ``path / str(i)``. Once the loop ends, ``POISON_PILL``
    is saved at the index following the last element written.

    Args:
        it: Elements to serialize, in order.
        path: Destination folder; a path from ``generate_path()`` is used when omitted.
        max_items: When positive, saving pauses while ``path`` already holds at least this many
            sub-folders.
        should_stop: Polled while pausing and before each save; a true value ends the loop.
        truncation_threshold: When set, the loop ends on reaching the element with this index,
            without saving it.
    Returns:
        The folder the elements were written to.
    """
    logger.debug("save iter %r ... started", path)
    path = path or generate_path()
    next_idx = 0
    for i, e in enumerate(it):
        logger.debug("save iter %r ...", path)
        if max_items > 0:
            # Count the sub-folders currently present (0 if ``path`` does not exist yet) and wait while full.
            while (cnt := sum(x.is_dir() for x in path.iterdir()) if path.exists() else 0) >= max_items:
                logger.debug("save iter ... %r dirs of %r stop? %r", cnt, max_items, should_stop())
                if should_stop():
                    break
                time.sleep(0.001)  # busy wait
        # NOTE(review): the nesting of this check was reconstructed from a listing without indentation;
        # confirm it belongs at loop level rather than inside the ``max_items`` branch.
        if should_stop():
            break
        if truncation_threshold is not None and i == truncation_threshold:
            logger.info(f"reached {truncation_threshold=}, stop saving microbatches")
            break
        save(e, path / str(i))
        next_idx = i + 1
    # Terminator written after the last saved element.
    save(POISON_PILL, path / str(next_idx))
    logger.debug("save iter %r ... finished after %r microbatches", path, next_idx)
    return path
97
+
98
+
99
+ class Sentinel:
100
+ """
101
+ Create a unique sentinel object that is pickled as a constant.
102
+ """
103
+
104
+ def __init__(self, name: str) -> None:
105
+ self.name = name
106
+
107
+ def __repr__(self) -> str:
108
+ return self.name # pragma: no cover
109
+
110
+ def __copy__(self) -> "Sentinel":
111
+ return self # pragma: no cover
112
+
113
+ def __deepcopy__(self, memo: Any) -> "Sentinel":
114
+ return self # pragma: no cover
115
+
116
+ def __reduce__(self) -> str | tuple[Any, ...]:
117
+ return self.name
118
+
119
+ def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]:
120
+ return self.name
121
+
122
+
123
# End-of-stream marker: save_iter() writes it after the last element and load_iter() stops on it.
# The variable name must match the Sentinel's name, because the Sentinel is pickled by that name.
POISON_PILL = Sentinel("POISON_PILL")
124
+
125
+
126
def load(path: Path) -> Any:
    """Load serialized object with out-of-band data from path based on zero-copy shared memory.

    The folder is removed once the object has been unpickled successfully.

    Args:
        path (pathlib.Path):
            Folder written by save(). Usually a folder in /dev/shm. Only load folders
            you trust: the content is unpickled.
    Returns:
        Raw deserialized data
    Raises:
        RuntimeError: if ``path`` does not contain the completion marker.
    """
    if not saved(path):
        raise RuntimeError(f"The object at {path} is corrupted or not saved")
    buffers: list[pickle.PickleBuffer | mmap.mmap] = []
    try:
        num_buffers = len(list(path.iterdir())) - 2  # exclude meta.pkl and .complete
        for idx in range(num_buffers):
            fpath = path / f"{idx}.bin"
            if os.stat(fpath).st_size == 0:
                # An empty file cannot be memory-mapped; substitute an empty buffer.
                buffers.append(pickle.PickleBuffer(b""))
            else:
                with open(fpath, "rb") as f:
                    buffers.append(mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ))
        with open(path / "meta.pkl", "rb") as f:
            obj = pickle.load(f, buffers=buffers)
        shutil.rmtree(path)
        logger.debug("removed %r", path)
    finally:
        # Release the buffers on failure as well, so mappings do not outlive a broken load.
        for b in buffers:
            if isinstance(b, pickle.PickleBuffer):
                b.release()
            else:
                try:
                    b.close()
                except BufferError:
                    # The loaded object still references this mapping zero-copy, so it cannot be
                    # closed here; it stays open for as long as the object keeps using it.
                    pass
    return obj
156
+
157
+
158
def saved(path: Path) -> bool:
    """Return whether ``path`` contains the completion marker that save() creates last."""
    marker = path / COMPLETE
    return marker.exists()
160
+
161
+
162
def load_iter(
    path: Path, next_timeout: int = 10 * 60, wait_callback: Callable[[], None] = lambda: None
) -> Iterator[Any]:
    """Yield the objects stored under ``path / "0"``, ``path / "1"``, ... until POISON_PILL is loaded.

    Args:
        path: Folder whose numbered sub-folders are loaded in order with load().
        next_timeout: Maximum number of seconds to wait for each sub-folder to be completely saved.
        wait_callback: Called on every polling round while the next sub-folder is not ready.
    Yields:
        Each loaded object, in index order.
    Raises:
        TimeoutError: if the next sub-folder is not ready within ``next_timeout`` seconds.
    """
    idx = 0
    while True:
        # Monotonic clock: the deadline must not move when the wall clock is adjusted.
        start_time = time.monotonic()
        wait_time_threshold = start_time + next_timeout
        chunk_path = path / str(idx)
        while not saved(chunk_path):
            wait_callback()
            logger.debug("waiting for data in %r", chunk_path)
            if time.monotonic() > wait_time_threshold:
                logger.error("timed out waiting for %r", chunk_path)
                raise TimeoutError(f"timed out after {next_timeout} s waiting for {chunk_path}")
            time.sleep(0.001)  # busy wait
        chunk = load(chunk_path)
        if chunk is POISON_PILL:
            logger.debug("poison pill!")
            break
        logger.debug("fetching microbatch took %r s", time.monotonic() - start_time)
        yield chunk
        idx += 1
fkat/validate.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #!/usr/bin/env python
4
+
5
+ """
6
+ The ``fkat.validate`` entrypoint processes the provided config,
7
+ instantiates the ``trainer``, ``model`` and ``data`` sections and calls ``trainer.validate()``.
8
+ """
9
+
10
+ import hydra
11
+ import lightning as L
12
+ from omegaconf import DictConfig
13
+
14
+ from fkat import initialize, run_main
15
+
16
+
17
@hydra.main(version_base="1.3")
def main(cfg: DictConfig) -> None:
    """Initialize the run from ``cfg`` and call ``trainer.validate()`` on the model.

    A ``LightningDataModule`` is passed as ``datamodule``. Any other data object has its
    ``val_dataloader()`` result passed as ``dataloaders`` (``None`` when no data is configured).

    Args:
        cfg (DictConfig): Hydra configuration handed to ``initialize``.
    """
    s = initialize(cfg)
    kwargs = {
        "ckpt_path": s.ckpt_path,
    }
    if isinstance(s.data, L.LightningDataModule):
        kwargs["datamodule"] = s.data
    else:
        # Lightning's Trainer.validate() accepts plain dataloaders through ``dataloaders``;
        # it has no ``val_dataloaders`` parameter.
        kwargs["dataloaders"] = s.data.val_dataloader() if s.data else None
    s.trainer.validate(s.model, **kwargs)
28
+
29
+
30
# Script entry point: hand the Hydra-decorated main() to fkat's run_main helper.
if __name__ == "__main__":
    run_main(main)
@@ -0,0 +1,134 @@
1
+ Metadata-Version: 2.4
2
+ Name: fkat
3
+ Version: 0.1.2
4
+ Summary: Foundational Kit for AI Training
5
+ Project-URL: homepage, https://github.com/amzn/fkat
6
+ Author: FKAT Contributors
7
+ License: Apache-2.0
8
+ License-File: LICENSE
9
+ License-File: NOTICE
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: awswrangler>=3.5.1
12
+ Requires-Dist: boto3>=1.35.89
13
+ Requires-Dist: datasets>=3.0.0
14
+ Requires-Dist: evaluate
15
+ Requires-Dist: fsspec[s3]
16
+ Requires-Dist: hydra-core
17
+ Requires-Dist: importlib-metadata
18
+ Requires-Dist: lightning!=2022.*
19
+ Requires-Dist: lightning-utilities
20
+ Requires-Dist: mlflow-skinny<=3.8.1
21
+ Requires-Dist: nvidia-ml-py
22
+ Requires-Dist: pandas
23
+ Requires-Dist: pyarrow<21.0.0,>=15.0.0
24
+ Requires-Dist: tensorboard
25
+ Requires-Dist: torch!=2.3.0,!=2.6.0,>=2.0.1
26
+ Requires-Dist: torchaudio
27
+ Requires-Dist: torchmetrics>=0.11.4
28
+ Requires-Dist: torchvision
29
+ Requires-Dist: transformers
30
+ Requires-Dist: wandb
31
+ Provides-Extra: docs
32
+ Requires-Dist: docutils<0.21,>=0.16; extra == 'docs'
33
+ Requires-Dist: myst-parser<3.0.0,>=0.18.1; extra == 'docs'
34
+ Requires-Dist: pandoc<=2.3,>=1.0; extra == 'docs'
35
+ Requires-Dist: sphinx-autobuild; extra == 'docs'
36
+ Requires-Dist: sphinx-autodoc-typehints==1.23.4; extra == 'docs'
37
+ Requires-Dist: sphinx-book-theme; extra == 'docs'
38
+ Requires-Dist: sphinx-copybutton<=0.5.2,>=0.3; extra == 'docs'
39
+ Requires-Dist: sphinx-multiproject; extra == 'docs'
40
+ Requires-Dist: sphinx-paramlinks<=0.6.0,>=0.5.1; extra == 'docs'
41
+ Requires-Dist: sphinx-prompt==1.5.0; extra == 'docs'
42
+ Requires-Dist: sphinx-rtd-dark-mode==1.3.0; extra == 'docs'
43
+ Requires-Dist: sphinx-togglebutton<=0.3.2,>=0.2; extra == 'docs'
44
+ Requires-Dist: sphinx-toolbox==3.5.0; extra == 'docs'
45
+ Requires-Dist: sphinx<6.0,>5.0; extra == 'docs'
46
+ Requires-Dist: sphinxcontrib-mockautodoc; extra == 'docs'
47
+ Requires-Dist: sphinxcontrib-video==0.2.0; extra == 'docs'
48
+ Provides-Extra: test
49
+ Requires-Dist: captum>=0.4.0; extra == 'test'
50
+ Requires-Dist: contourpy<1.3.1; extra == 'test'
51
+ Requires-Dist: deepspeed; extra == 'test'
52
+ Requires-Dist: ipython==8.18.1; extra == 'test'
53
+ Requires-Dist: moto==5.0.18; extra == 'test'
54
+ Requires-Dist: nvidia-ml-py; extra == 'test'
55
+ Requires-Dist: nvtx; extra == 'test'
56
+ Requires-Dist: peft; extra == 'test'
57
+ Requires-Dist: pre-commit; extra == 'test'
58
+ Requires-Dist: pre-commit-hooks; extra == 'test'
59
+ Requires-Dist: pytest; extra == 'test'
60
+ Requires-Dist: pytest-cov; extra == 'test'
61
+ Requires-Dist: pytest-timeout; extra == 'test'
62
+ Requires-Dist: pytest-xdist; extra == 'test'
63
+ Requires-Dist: ruff; extra == 'test'
64
+ Requires-Dist: tox-venv; extra == 'test'
65
+ Requires-Dist: tox-wheel==1.0.0; extra == 'test'
66
+ Requires-Dist: tox==3.26.0; extra == 'test'
67
+ Requires-Dist: trl==0.7.1; extra == 'test'
68
+ Requires-Dist: twine; extra == 'test'
69
+ Requires-Dist: ty; extra == 'test'
70
+ Requires-Dist: types-boto3[batch,dynamodb,ec2,s3]; extra == 'test'
71
+ Requires-Dist: viztracer; extra == 'test'
72
+ Description-Content-Type: text/markdown
73
+
74
+ # FKAT
75
+
76
+ [![Documentation](https://img.shields.io/badge/docs-gh--pages-blue)](https://amzn.github.io/fkat/)
77
+ [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE)
78
+ [![Python](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
79
+ [![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-ee4c2c.svg)](https://pytorch.org/)
80
+ [![Lightning](https://img.shields.io/badge/Lightning-2.0+-792ee5.svg)](https://lightning.ai/)
81
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
82
+
83
+ Foundational Kit for AI Training
84
+
85
+ ## Documentation
86
+
87
+ 📚 **[Read the full documentation](https://amzn.github.io/fkat/)**
88
+
89
+ ## Dependencies
90
+
91
+ This project depends on third-party open source packages that are installed via PyPI.
92
+
93
+ Key dependencies include:
94
+ - PyTorch (BSD-3-Clause)
95
+ - Lightning (Apache-2.0)
96
+ - Transformers (Apache-2.0)
97
+ - Hydra (MIT)
98
+ - MLflow (Apache-2.0)
99
+ - AWS SDK for Python / Boto3 (Apache-2.0)
100
+ - PyArrow (Apache-2.0)
101
+
102
+ For a complete list of dependencies and their licenses, see `pyproject.toml` and run `pip-licenses` after installation.
103
+
104
+ ## Setup
105
+
106
+ ```bash
107
+ pip install hatch
108
+ hatch env create
109
+ ```
110
+
111
+ ## Development
112
+
113
+ ```bash
114
+ hatch run test:test
115
+ hatch run lint:check
116
+ ```
117
+
118
+ ## Documentation
119
+
120
+ Docs are automatically built and deployed to GitHub Pages on push to main/mainline.
121
+
122
+ Build locally:
123
+ ```bash
124
+ hatch run docs:build
125
+ hatch run docs:serve
126
+ ```
127
+
128
+ ## Contributing
129
+
130
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
131
+
132
+ ## Code of Conduct
133
+
134
+ See [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md).
@@ -0,0 +1,88 @@
1
+ fkat/__init__.py,sha256=kuYbZ-7cqDIEA5dNggMVe49LibdC-RwE1pL3yPhH-2U,5149
2
+ fkat/predict.py,sha256=e2FSuw5HIz_eocoxTbncklNch18e_5mffIgObJXBrFk,878
3
+ fkat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ fkat/test.py,sha256=CKqP0MLmXqVdXlBTcjtN5CmPT_YnCi2COyUg7a7rMBM,811
5
+ fkat/train.py,sha256=Gx_td6GyhJ6VAIQEAwMOa2me9FT2t3_OL1PTt6wwOVI,892
6
+ fkat/validate.py,sha256=cHg9zW9EpHm_B8x30jb7xBSzu_x8N5haJY5d00sNaXE,821
7
+ fkat/data/__init__.py,sha256=261egC2KYrpUnaDXnCLnOFoudKcywYkFZ25QlrdJWkk,419
8
+ fkat/data/data_module.py,sha256=RACUbTNdFtYXpKc-pvj7iN4kp1dPDqouQfbrcNz9QVA,7793
9
+ fkat/data/dict.py,sha256=-ivMLelLGDOhVcw9gEygp4LLdOgiFBOVmta-mZCYHiw,1442
10
+ fkat/data/sharded.py,sha256=6BSi975P4RYJfewwZJklT3Oyd9CcD4Y1AJbCnEv6ayc,29219
11
+ fkat/data/shm.py,sha256=_1ZeRhZRih_TMZabSQPq3Gm9PInZtwed0Lgy4My4yZ8,14337
12
+ fkat/data/datasets/__init__.py,sha256=hUU88kfAIWNHhkz-v5w3fHqNcHihYIKj8VUf-ps65D4,540
13
+ fkat/data/datasets/dict.py,sha256=9d1PWVaLP5G_E6w0V9AO0mFZsx6-HqqKjve9RYWnpyg,2700
14
+ fkat/data/datasets/json.py,sha256=oNn4czQThutKBpixnLdG1UBAlIa2pkWcNb1OC1C4rc8,6318
15
+ fkat/data/datasets/map.py,sha256=gE_xnUVtl6h-ouTBZbiAvu7C0-dPZ7OTR-DxHG0mvhM,2568
16
+ fkat/data/datasets/parquet.py,sha256=bGSjrrFJw3DE5WEFGIuxkA_XH9WjC9l_3TFVYH1ZMt8,8819
17
+ fkat/data/datasets/sized.py,sha256=tmQX1cd9_0b4jmnJ3560oPX2r_o9OvXljeweQb5f9xk,724
18
+ fkat/data/samplers/__init__.py,sha256=HDPLdpb7UwttzARf16Z6-vABaHfzEXXpqHv-HPAxdo8,233
19
+ fkat/data/samplers/dict.py,sha256=nppgM6z0WfG16LkOiT7CIzqyC7IkePu5VjI_VeFQ1gI,1559
20
+ fkat/data/samplers/sized.py,sha256=uo1Y-XgH10qaH6HYiKv39YCiRWQuzKMJ4Ha6Oqu3Gj0,410
21
+ fkat/data/samplers/strategies.py,sha256=jsG5C8jCP6QNgwlNbMRx0UF2Kp0hfUTDEB6sRkqDxeE,1902
22
+ fkat/pytorch/__init__.py,sha256=cdA5mS4EFfP6aggb1H9JDyT2ZTGwRDJwFEAtOEOLBSg,108
23
+ fkat/pytorch/loggers.py,sha256=sSFTlv8e5s5Nyz4B50HPbQWjhJmgB-BK4wDAeWAOs5I,9683
24
+ fkat/pytorch/utilities.py,sha256=VIMjJ7OjSh-dA-7OjPi10WjLY9ZMFtopggtEOJHXyXU,1446
25
+ fkat/pytorch/actions/__init__.py,sha256=YukooKLQ5FhRrJHIVIKusClD8BXFJYpcWF0_vKCblBs,412
26
+ fkat/pytorch/actions/aws/__init__.py,sha256=cdA5mS4EFfP6aggb1H9JDyT2ZTGwRDJwFEAtOEOLBSg,108
27
+ fkat/pytorch/actions/aws/batch.py,sha256=nMGvAuSOtHuY4DnxXojI_cbhFCq8HpIal9hV5cBpxQo,976
28
+ fkat/pytorch/actions/aws/ec2.py,sha256=xwsy9QFdFCkhkYWka5kl_f6tdRDJZCsfKU1kCrsAbMg,2631
29
+ fkat/pytorch/callbacks/__init__.py,sha256=LTiYHnMO8kpe1MOUjnqYLW55bRL2vI3iW6616zwBCCI,107
30
+ fkat/pytorch/callbacks/gc.py,sha256=FHwjci0Kj6znp5hfnr5fgNeTblSSICkKJCyRqMuWBmI,5087
31
+ fkat/pytorch/callbacks/loggers.py,sha256=hQNpajn6YwUbEE_hfrtJ0WL1inAi4WUNWaTrPk8320E,7297
32
+ fkat/pytorch/callbacks/cuda/__init__.py,sha256=gILx7c3Gu4jXEIrDVrdD388X-xq4qkRbD_AEaDjPHRU,330
33
+ fkat/pytorch/callbacks/cuda/cache.py,sha256=nqUJVxQlRw76NqtM0-vxwAxp3NgVRWIySD3T8sPNbLQ,3788
34
+ fkat/pytorch/callbacks/cuda/memory.py,sha256=N_JucoxovoAeawWgTwXu89o23T8gk0fRgmsF34YaeUs,8575
35
+ fkat/pytorch/callbacks/cuda/nsys.py,sha256=6MrWSZHvGCNeCcSfF6BX5xyXV_VqGDyu3pwZeznKcY4,7039
36
+ fkat/pytorch/callbacks/cuda/nvtx.py,sha256=zWWdl-kcRSVeo2R7VaoeQ5qT0m_9rl07qddfcCKXEbM,11179
37
+ fkat/pytorch/callbacks/cuda/xid.py,sha256=RZaZDjmRutimg5LTBQ1waRygmUkfzUTmItjzMHfZ6Oo,6736
38
+ fkat/pytorch/callbacks/debugging/__init__.py,sha256=npRowOXLh-swYsIUBjTKWkX9T00ISA9gIB4qdhzjnUY,250
39
+ fkat/pytorch/callbacks/debugging/introspection.py,sha256=ENaqSgNoh-0fAoS4LsJRKqqCtUuypaMjYHYCsQNv54w,21565
40
+ fkat/pytorch/callbacks/debugging/optimizer.py,sha256=suzSAJu0stuJo9W9j-1f63Y5LzZrAimibRavUWJjQjM,1696
41
+ fkat/pytorch/callbacks/logging/__init__.py,sha256=Br3Hwt5HexjdFZcuvO5kWxa3yzO7pVtWBvXIKVk52jk,301
42
+ fkat/pytorch/callbacks/logging/heartbeat.py,sha256=Re29g8zy-5Hucvkr7CjWkssB-Sywmy84aivmzqnajz8,2801
43
+ fkat/pytorch/callbacks/logging/throughput.py,sha256=E42193nY3FqWo04YWHQ-hDArOu_O1jFoTTX2Mxmaj_c,9427
44
+ fkat/pytorch/callbacks/logging/validation_metrics.py,sha256=nyImmN0EeUG0gbpmoXtXPaRabVgHu5Az202K851iFO8,3680
45
+ fkat/pytorch/callbacks/monitoring/__init__.py,sha256=-0w2E1H0698aLNeyz4QozY67ioQviQhkI0y3Z7m1jyQ,354
46
+ fkat/pytorch/callbacks/monitoring/crash.py,sha256=AIGMUZl73YK8MMrRXIZXBNl2J7Qr77SNRfRFkI9aRgU,6077
47
+ fkat/pytorch/callbacks/monitoring/dp.py,sha256=NEcMg6cpi5--xnA77BHQfbqxhHyojwK1z75loO5b5j4,4698
48
+ fkat/pytorch/callbacks/monitoring/hardware_stats.py,sha256=_yrpdi8RJdCm74YflUj_Xx3e_odhuufiqVctSMGSE9A,5394
49
+ fkat/pytorch/callbacks/monitoring/shutdown.py,sha256=2luwvueNeK1_am4Y8gttEunzJOgdwqedpVbINw9iT7Y,6469
50
+ fkat/pytorch/callbacks/profiling/__init__.py,sha256=4TSwRLkqEGRcYCx-hgqgGrLRV6D4nUjwVd-TFtCU72A,293
51
+ fkat/pytorch/callbacks/profiling/flops.py,sha256=UWNim5nUYRQJu4Sg_8plKiLktRC-OySdb2xLzHnFF4k,23281
52
+ fkat/pytorch/callbacks/profiling/memray.py,sha256=mG18EuNqRG2hCAxwa13RVLPAsKDRf0-cQMOIMym-fgA,7694
53
+ fkat/pytorch/callbacks/profiling/torch.py,sha256=SZ5Nd_ypbtP56lYHQmfdfgXN8cqCy-8JfLMHv7xRB3U,7558
54
+ fkat/pytorch/callbacks/profiling/viztracer.py,sha256=TKc-4A1tO2jxjhIZeKpEUeVtVNbXWg8q2pPb8G7OnOc,7421
55
+ fkat/pytorch/schedule/__init__.py,sha256=ritz02ozwavSurhRaK4tpU7Oj5edBlXO1uwuAs2X4RA,458
56
+ fkat/pytorch/schedule/base.py,sha256=1HuDBBPK-Czc5xX2ppWZaBdzJpjMGNuMtSwsJ5w2_eA,9462
57
+ fkat/pytorch/schedule/mlflow.py,sha256=3hDH85yrejnht4eupiBmiIJu9bebvFF2skBBCIR-sJw,5980
58
+ fkat/utils/__init__.py,sha256=gmLZEQAEHy2pth2jU-oM-afQKXqVl7NJxtgBpY2OW3w,808
59
+ fkat/utils/boto3.py,sha256=vFmgaM4AbpvUkhnNybK_7AIbvPT6PKMEGDcbdiUo9co,634
60
+ fkat/utils/config.py,sha256=ypvvU-T7ibQwlGMOWre_tuJtVjdAjbyOJsUTH_EZ7O0,6238
61
+ fkat/utils/logging.py,sha256=7k-7-rAxtut9fMJyGkGTv8vy1udanzPHKE7w-UKjvQs,838
62
+ fkat/utils/mlflow.py,sha256=gUyE6JrD_8glhsHkg77eHA6fXI_6gu3pgvuRHPgs4HA,1075
63
+ fkat/utils/pandas.py,sha256=kUcNHhuWFK45oQ9c5iEyBrTfBmzubL-2uF8MfByYJ34,925
64
+ fkat/utils/pdb.py,sha256=q-yzt0_iJKfN7wFHvm8SIwPiVWZO7R0WEl_qlP6aT_Q,2329
65
+ fkat/utils/pool.py,sha256=2TxOGBZTxR_ZyM923VYJeiU-kLSLCIwBxDtfrfrUFmk,2309
66
+ fkat/utils/profiler.py,sha256=lp-VMMTSnoe7JmomfZ4AoTBo3X9SotiXE7VOdcFMr2E,547
67
+ fkat/utils/pyarrow.py,sha256=QooFPFRgKd27405JygZyxhVD_9dQHUSBsEwNApKnzvE,733
68
+ fkat/utils/rng.py,sha256=2dUlU4IgZgAnrHZZ7MnrKdv9aRO8buki9nv1SGPWUDo,971
69
+ fkat/utils/shm.py,sha256=hTYgycIzLI-MskK-JHkTEIFb9c29XJ4z8QtI--6Kbjg,5941
70
+ fkat/utils/aws/__init__.py,sha256=cdA5mS4EFfP6aggb1H9JDyT2ZTGwRDJwFEAtOEOLBSg,108
71
+ fkat/utils/aws/imds.py,sha256=xv60coWcD2gp4HiC5rWtEujsGEHMrMnDMNtPrx6OKdc,4474
72
+ fkat/utils/cuda/__init__.py,sha256=cdA5mS4EFfP6aggb1H9JDyT2ZTGwRDJwFEAtOEOLBSg,108
73
+ fkat/utils/cuda/xid.py,sha256=xdfqQ6gBnNjSjktPAdkrAwZmktJL5uyxuwdDJNi2B30,1611
74
+ fkat/utils/cuda/preflight/__init__.py,sha256=cdA5mS4EFfP6aggb1H9JDyT2ZTGwRDJwFEAtOEOLBSg,108
75
+ fkat/utils/cuda/preflight/run.py,sha256=p_MlLcyrQbqbyvuqeO5Q42IKc88UDS-yk9IMaBHcgiw,21561
76
+ fkat/utils/cuda/preflight/health_check/aws_instance_config.py,sha256=db54ylrFRpHvfihaZOmmHIA-4C9vpG4maQcnQwtHDsg,1989
77
+ fkat/utils/cuda/preflight/health_check/constants.py,sha256=Xblr9IyLBVYG54rOBusKkVeSSpSzlko-WdyRn4vmf9Y,738
78
+ fkat/utils/cuda/preflight/health_check/ddb_client.py,sha256=x0FLnoao8iEM9yDJqhfDNZDZ8VtVbXuirQh6ODpXT04,3250
79
+ fkat/utils/cuda/preflight/health_check/gpu_connection_test.py,sha256=Dmmq4lDoXYPABY-V51oV-cl3c64oFqa4um9288cC1Dw,4185
80
+ fkat/utils/cuda/preflight/health_check/gpu_stress_test.py,sha256=YAZNWtRgkPCqthhIxmr-U6vuWW-e8_6ZZdsbT-JTGCA,4693
81
+ fkat/utils/cuda/preflight/health_check/helpers.py,sha256=rXw6fhKN36y4Xui4yDVQpym0RmY1pnhMLTScVZuQneQ,11291
82
+ fkat/utils/cuda/preflight/health_check/logger.py,sha256=XAIJPvf3Pp0gPTCWE_W9HBMrqh7T2y794YvsG2pXLGM,8894
83
+ fkat/utils/cuda/preflight/health_check/timer.py,sha256=lvMEEH8EBLLIMj0hVlaL4OuCaq7GYlBNsB6JXmh0YRA,912
84
+ fkat-0.1.2.dist-info/METADATA,sha256=7Cn7bE7zumd2mDduxMdj8QMcX5_CBSEv9k6x9Lm67PQ,4480
85
+ fkat-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
86
+ fkat-0.1.2.dist-info/licenses/LICENSE,sha256=CeipvOyAZxBGUsFoaFqwkx54aPnIKEtm9a5u2uXxEws,10142
87
+ fkat-0.1.2.dist-info/licenses/NOTICE,sha256=1CkO1kwu3Q_OHYTj-d-yiBJA_lNN73a4zSntavaD4oc,67
88
+ fkat-0.1.2.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any