fractfs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fractfs/__init__.py ADDED
@@ -0,0 +1,172 @@
1
+ """fractfs — drop-in tiered file storage for apps on ephemeral nodes.
2
+
3
+ The whole drop-in surface is three calls::
4
+
5
+ import fractfs
6
+ fractfs.init() # load config, provision symlinks, restore, start syncing
7
+ fractfs.sync_now() # optional: force a checkpoint (e.g. before shutdown)
8
+ fractfs.status() # optional: inspect tiers and last sync time
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import atexit
14
+ import logging
15
+ import os
16
+ from dataclasses import dataclass
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ from .backend import make_backend
20
+ from .config import Config, load_config
21
+ from .provisioner import ClobberError, provision, warnings_for
22
+ from .resolver import Tier, resolve
23
+ from .sync import SyncDaemon, SyncEngine
24
+
25
+ __all__ = [
26
+ "init",
27
+ "sync_now",
28
+ "status",
29
+ "shutdown",
30
+ "resolve",
31
+ "Tier",
32
+ "Config",
33
+ "load_config",
34
+ "ClobberError",
35
+ ]
36
+
37
+ __version__ = "0.1.0"
38
+
39
+ log = logging.getLogger("fractfs")
40
+
41
+
42
@dataclass
class _Runtime:
    """Process-wide state recorded by :func:`init` and read by the other calls."""

    # Configuration returned by ``load_config``.
    cfg: Config
    # Engine that ``sync_now`` calls ``checkpoint()`` on.
    engine: SyncEngine
    # Background sync daemon; ``None`` when none was started, and reset to
    # ``None`` by ``shutdown`` once the daemon has been stopped.
    daemon: Optional[SyncDaemon] = None
47
+
48
+
49
+ _RUNTIME: Optional[_Runtime] = None
50
+
51
+
52
def init(
    root: Optional[os.PathLike] = None,
    *,
    force: bool = False,
    start_daemon: bool = True,
    restore: bool = True,
) -> Config:
    """Initialise fractfs: load config, provision symlinks, restore, start syncing.

    Blocks on restore before returning so the app never reads cold state.
    Intended to be called once at startup. If it is called again, the daemon
    started by the previous call is stopped (with a final checkpoint) before
    the new runtime is installed, so at most one sync daemon is running.

    Args:
        root: Application root handed to ``load_config``; ``None`` lets the
            loader choose its default.
        force: Forwarded to ``provision``; lets provisioning migrate a
            non-empty real local dir into the Volume (see :class:`ClobberError`).
        start_daemon: Start a ``SyncDaemon`` when the config is provisionable
            and ``sync_interval`` is positive.
        restore: Run ``engine.restore()`` before returning.

    Returns:
        The loaded :class:`Config`.
    """
    global _RUNTIME

    cfg = load_config(root)
    provisionable = cfg.is_provisionable()
    engine = SyncEngine(cfg, backend=make_backend(cfg)) if provisionable else None

    # Re-initialisation: stop the daemon left over from an earlier init() so it
    # is not orphaned (and does not checkpoint while we provision/restore).
    # Done only after the new config and engine were built successfully, so a
    # failing re-init leaves the previous runtime untouched.
    shutdown(final_sync=True)

    for warning in warnings_for(cfg):
        log.warning("fractfs config: %s", warning)

    if engine is not None:
        for action in provision(cfg, force=force):
            log.debug("provision: %s", action)
        # Identify the deployed bundle before restore so it's excluded from the
        # checkpoint (re-supplied from the image on every cold start anyway).
        if cfg.auto_ignore_bundle:
            bundle = engine.detect_bundle()
            if bundle:
                log.info("fractfs auto-ignoring %d deploy-bundle file(s)", len(bundle))
        # Cold-start ordering: restore must finish before the app reads anything.
        if restore:
            restored = engine.restore()
            if restored:
                log.info("fractfs restored %d checkpointed file(s)", len(restored))
    else:
        log.warning(
            "fractfs: no fractfs_VOLUME_ROOT set; running in passthrough mode "
            "(no redirect, no checkpoint)."
        )

    daemon = None
    if engine is not None and start_daemon and cfg.sync_interval > 0:
        daemon = SyncDaemon(engine, cfg.sync_interval)
        daemon.start()
        # unregister() is a no-op when the hook is absent; doing it first keeps
        # exactly one exit hook no matter how often init() runs.
        atexit.unregister(_atexit_stop)
        atexit.register(_atexit_stop)

    _RUNTIME = _Runtime(
        cfg=cfg,
        # Without a Volume there is no real engine; a no-op stand-in keeps
        # sync_now()/status() callable.
        engine=engine if engine is not None else _NullEngine(cfg),  # type: ignore[arg-type]
        daemon=daemon,
    )
    return cfg
104
+
105
+
106
def sync_now() -> List[str]:
    """Run a checkpoint right now.

    Returns:
        The rel paths that were copied, as reported by the engine.

    Raises:
        RuntimeError: if ``fractfs.init()`` has not been called yet.
    """
    engine = _require_runtime().engine
    return engine.checkpoint()
110
+
111
+
112
def status() -> Dict[str, Any]:
    """Describe the active configuration, per-directory tiers and sync state.

    Returns:
        A plain dict snapshot; ``dirs`` maps every configured directory to the
        ``.value`` of the tier ``resolve`` assigns to it.

    Raises:
        RuntimeError: if ``fractfs.init()`` has not been called yet.
    """
    runtime = _require_runtime()
    cfg = runtime.cfg
    engine = runtime.engine

    tiers: Dict[str, str] = {path: resolve(path, cfg).value for path in cfg.dir_paths}

    if cfg.volume_root:
        volume_root = str(cfg.volume_root)
    else:
        volume_root = None

    # getattr defaults: the engine object is not required to expose these.
    last_sync = getattr(engine, "last_sync_time", None)
    bundle_paths = getattr(engine, "bundle_paths", set())

    return {
        "backend": cfg.backend,
        "volume_root": volume_root,
        "scratch": str(cfg.scratch),
        "sync_interval": cfg.sync_interval,
        "provisionable": cfg.is_provisionable(),
        "daemon_running": runtime.daemon is not None,
        "last_sync_time": last_sync,
        "auto_ignore_bundle": cfg.auto_ignore_bundle,
        "bundle_file_count": len(bundle_paths),
        "dirs": tiers,
        "ignore_patterns": list(cfg.ignore_patterns),
        "local_patterns": list(cfg.local_patterns),
        "warnings": warnings_for(cfg),
    }
134
+
135
+
136
def shutdown(*, final_sync: bool = True) -> None:
    """Stop the sync daemon, if one is running.

    ``final_sync`` is passed through to the daemon's ``stop`` call. Does
    nothing when ``init`` was never called or no daemon is active, so it is
    safe to call repeatedly.
    """
    runtime = _RUNTIME
    if runtime is None:
        return
    daemon = runtime.daemon
    if daemon is None:
        return
    daemon.stop(final_sync=final_sync)
    # Cleared only after stop() returned, so a failed stop can be retried.
    runtime.daemon = None
142
+
143
+
144
def _require_runtime() -> _Runtime:
    """Return the runtime stored by ``init``.

    Raises:
        RuntimeError: when no runtime has been stored yet.
    """
    runtime = _RUNTIME
    if runtime is not None:
        return runtime
    raise RuntimeError("fractfs.init() has not been called")
148
+
149
+
150
def _atexit_stop() -> None:
    """Exit hook: stop the daemon with a final checkpoint, never raising.

    Failures are logged instead of propagated so interpreter shutdown is not
    disturbed, but they are no longer discarded silently: a failed final
    checkpoint means the last changes may not have been persisted.
    """
    try:
        shutdown(final_sync=True)
    except Exception:  # best-effort on interpreter shutdown
        log.warning("fractfs: final sync at interpreter exit failed", exc_info=True)
155
+
156
+
157
class _NullEngine:
    """No-op engine used when there is no Volume.

    Exposes the attributes and methods the module reads from an engine
    (``last_sync_time``, ``bundle_paths``, ``detect_bundle``, ``checkpoint``,
    ``restore``); every operation returns an empty result.
    """

    def __init__(self, cfg: Config):
        self.cfg = cfg
        # Never synced, and no bundle files known.
        self.last_sync_time = None
        self.bundle_paths: set = set()

    def detect_bundle(self) -> set:
        """Return an empty set: nothing is detected."""
        return set()

    def checkpoint(self) -> List[str]:
        """Return an empty list: nothing is copied."""
        return list()

    def restore(self, **_: Any) -> List[str]:
        """Accept and ignore any keyword arguments; nothing is restored."""
        return list()
fractfs/backend.py ADDED
@@ -0,0 +1,112 @@
1
+ """Backend abstraction for the remote/durable store.
2
+
3
+ The primary target (Databricks Volumes) is a POSIX FUSE mount, and ``local``
4
+ backing is just a directory, so both are served by :class:`PosixBackend`. The
5
+ ``fsspec``-backed path (``s3`` and friends) is wired through :class:`FsspecBackend`
6
+ which is imported lazily so the base install stays dependency-light.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import os
12
+ import shutil
13
+ from pathlib import Path
14
+ from typing import Iterable, Protocol, runtime_checkable
15
+
16
+
17
@runtime_checkable
class Backend(Protocol):
    """Structural interface a durable store must offer.

    This is the minimal surface the provisioner and sync daemon need. Being
    ``runtime_checkable``, ``isinstance(obj, Backend)`` only verifies that the
    methods exist, not their signatures.
    """

    def exists(self, path: str) -> bool:
        """Return whether ``path`` is present in the store."""
        ...

    def makedirs(self, path: str) -> None:
        """Create the directory ``path``."""
        ...

    def put_file(self, local_path: os.PathLike, remote_path: str) -> None:
        """Copy a local file to ``remote_path``, atomically where possible."""
        ...

    def get_file(self, remote_path: str, local_path: os.PathLike) -> None:
        """Copy ``remote_path`` down to a local file, atomically where possible."""
        ...

    def list_files(self, path: str) -> Iterable[str]:
        """Yield remote-relative paths of every file under ``path`` (recursive)."""
        ...

    def remove(self, path: str) -> None:
        """Delete ``path`` from the store."""
        ...
35
+
36
+
37
class PosixBackend:
    """Backend over a POSIX-visible root (a Volume mount or a local directory).

    ``root`` is the absolute remote root; ``path`` arguments to every method are
    interpreted relative to it (or accepted as already-absolute paths under it).
    """

    def __init__(self, root: os.PathLike, *, atomic_rename: bool = True):
        self.root = Path(root)
        # FUSE mounts don't always honour atomic rename; callers can disable it
        # to fall back to a plain copy (see plan: FUSE atomicity open question).
        self.atomic_rename = atomic_rename

    def _abs(self, path: str) -> Path:
        """Resolve ``path`` against ``root`` unless it is already absolute."""
        p = Path(path)
        return p if p.is_absolute() else self.root / p

    def exists(self, path: str) -> bool:
        """Return whether ``path`` exists under the root."""
        return self._abs(path).exists()

    def makedirs(self, path: str) -> None:
        """Create ``path`` and any missing parents; existing dirs are fine."""
        self._abs(path).mkdir(parents=True, exist_ok=True)

    def put_file(self, local_path: os.PathLike, remote_path: str) -> None:
        """Copy ``local_path`` into the store, creating parent dirs as needed."""
        dst = self._abs(remote_path)
        dst.parent.mkdir(parents=True, exist_ok=True)
        self._write_atomic(Path(local_path), dst)

    def get_file(self, remote_path: str, local_path: os.PathLike) -> None:
        """Copy ``remote_path`` out of the store, creating local parent dirs."""
        dst = Path(local_path)
        dst.parent.mkdir(parents=True, exist_ok=True)
        self._write_atomic(self._abs(remote_path), dst)

    def list_files(self, path: str) -> Iterable[str]:
        """Yield root-relative paths of every file under ``path`` (recursive).

        Paths always use forward slashes, regardless of the host OS, matching
        the forward-slash relative strings used by the config and by the
        fsspec backend. Yields nothing when ``path`` does not exist.
        """
        base = self._abs(path)
        if not base.exists():
            return
        for dirpath, _dirnames, filenames in os.walk(base):
            here = Path(dirpath)
            for name in filenames:
                yield (here / name).relative_to(self.root).as_posix()

    def remove(self, path: str) -> None:
        """Delete ``path``: a real directory recursively, anything else by unlink.

        Symlinks (including dangling ones and links to directories) are
        unlinked, never followed. Missing paths are ignored.
        """
        p = self._abs(path)
        if p.is_dir() and not p.is_symlink():
            shutil.rmtree(p)
        elif p.exists() or p.is_symlink():
            p.unlink()

    def _write_atomic(self, src: Path, dst: Path) -> None:
        """Copy ``src`` to ``dst`` via a temp file + rename when atomic_rename."""
        if not self.atomic_rename:
            shutil.copy2(src, dst)
            return
        # Temp file lives next to dst so os.replace stays on one filesystem.
        tmp = dst.with_name(f".{dst.name}.fractfs.tmp.{os.getpid()}")
        try:
            shutil.copy2(src, tmp)
            os.replace(tmp, dst)
        finally:
            # Only still present when the copy or the rename failed.
            if tmp.exists():
                tmp.unlink()
98
+
99
+
100
def make_backend(cfg) -> Backend:
    """Build the store backend selected by ``cfg.backend``.

    ``"volumes"`` and ``"local"`` map to :class:`PosixBackend` rooted at
    ``cfg.volume_root``; ``"s3"`` maps to the lazily imported fsspec backend.

    Raises:
        ValueError: for a POSIX backend without ``volume_root``, or for a
            backend name with no implementation.
    """
    kind = cfg.backend

    if kind == "s3":
        # Imported here so the optional dependency is only needed on demand.
        from .fsspec_backend import FsspecBackend

        return FsspecBackend(cfg)

    if kind not in ("volumes", "local"):
        raise ValueError(f"no backend implementation for {kind!r}")

    if cfg.volume_root is None:
        raise ValueError(
            f"backend {kind!r} requires fractfs_VOLUME_ROOT to be set"
        )
    return PosixBackend(cfg.volume_root)
fractfs/config.py ADDED
@@ -0,0 +1,160 @@
1
+ """Config loader: parse ``.fractfs.toml`` + environment into a ``Config``.
2
+
3
+ Env vars override the TOML file for the scalar fields so deployments can tune
4
+ behaviour (backend, volume root, cadence) without editing the repo.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import sys
11
+ from dataclasses import dataclass, field
12
+ from pathlib import Path
13
+ from typing import List, Optional
14
+
15
+ import pathspec
16
+
17
+ if sys.version_info >= (3, 11):
18
+ import tomllib
19
+ else: # pragma: no cover - exercised only on <3.11
20
+ import tomli as tomllib
21
+
22
+ CONFIG_FILENAME = ".fractfs.toml"
23
+
24
+ # Backends we know how to provision against.
25
+ BACKENDS = ("volumes", "s3", "local")
26
+
27
+ # pathspec renamed the gitignore factory; prefer the current name, fall back for
28
+ # older pathspec releases that only ship "gitwildmatch".
29
+ try:
30
+ pathspec.PathSpec.from_lines("gitignore", [])
31
+ _PATHSPEC_FACTORY = "gitignore"
32
+ except (ValueError, KeyError, LookupError): # pragma: no cover - old pathspec
33
+ _PATHSPEC_FACTORY = "gitwildmatch"
34
+
35
+ _DEFAULT_SYNC_INTERVAL = 300
36
+ _DEFAULT_CHECKPOINT_SUBDIR = "_checkpoint"
37
+ _DEFAULT_SCRATCH = "/tmp/fractfs"
38
+
39
+
40
@dataclass
class Config:
    """Resolved configuration.

    ``root`` is the application root (the dir holding ``.fractfs.toml``); all
    ``dir_paths`` and resolver inputs are relative to it.

    Raises:
        ValueError: from ``__post_init__`` when ``backend`` is not one of
            ``BACKENDS``.
    """

    root: Path
    backend: str = "local"
    volume_root: Optional[Path] = None
    scratch: Path = Path(_DEFAULT_SCRATCH)
    sync_interval: int = _DEFAULT_SYNC_INTERVAL
    checkpoint_subdir: str = _DEFAULT_CHECKPOINT_SUBDIR
    dir_paths: List[str] = field(default_factory=list)
    ignore_patterns: List[str] = field(default_factory=list)
    local_patterns: List[str] = field(default_factory=list)
    use_content_hash: bool = False
    auto_ignore_bundle: bool = True

    # Compiled in __post_init__ from the two pattern lists above.
    ignore_spec: pathspec.PathSpec = field(init=False)
    local_spec: pathspec.PathSpec = field(init=False)

    def __post_init__(self) -> None:
        self.root = Path(self.root).resolve()
        if self.volume_root is not None:
            # NOTE(review): Path() collapses "//", so a URL-style root such as
            # "s3://bucket/x" is stored as "s3:/bucket/x"; consumers that need
            # the URL form have to restore the separator.
            self.volume_root = Path(self.volume_root)
        self.scratch = Path(self.scratch)
        # Normalize dir paths to forward-slash relative strings (drop "./",
        # leading and trailing "/"); entries that normalize to "" are dropped.
        self.dir_paths = [
            norm for norm in (self._normalize_dir(d) for d in self.dir_paths) if norm
        ]
        self.ignore_spec = pathspec.PathSpec.from_lines(_PATHSPEC_FACTORY, self.ignore_patterns)
        self.local_spec = pathspec.PathSpec.from_lines(_PATHSPEC_FACTORY, self.local_patterns)
        if self.backend not in BACKENDS:
            raise ValueError(
                f"unknown fractfs backend {self.backend!r}; expected one of {BACKENDS}"
            )

    @staticmethod
    def _normalize_dir(raw: str) -> str:
        """Return ``raw`` as a forward-slash relative path without "./" or edge slashes."""
        # Convert separators first so a trailing backslash is stripped as well.
        path = raw.replace("\\", "/")
        while path.startswith("./"):
            path = path[2:]
        return path.strip("/")

    # -- derived paths -----------------------------------------------------

    @property
    def checkpoint_root(self) -> Optional[Path]:
        """Absolute path under the Volume where LOCAL_SYNCED checkpoints land.

        ``None`` when no ``volume_root`` is configured.
        """
        if self.volume_root is None:
            return None
        return self.volume_root / self.checkpoint_subdir

    def is_provisionable(self) -> bool:
        """Whether dir-redirect / back-symlink provisioning can run.

        True exactly when a ``volume_root`` is configured.
        """
        return self.volume_root is not None
93
+
94
+
95
def _env(name: str) -> Optional[str]:
    """Look up ``fractfs_<name>``, falling back to ``FRACTFS_<name>``.

    An unset or empty lowercase variable defers to the uppercase one, whose
    value is returned as-is (``None`` when it is unset too).
    """
    lowercase = os.environ.get(f"fractfs_{name}")
    if lowercase:
        return lowercase
    return os.environ.get(f"FRACTFS_{name}")
97
+
98
+
99
def load_config(root: Optional[os.PathLike] = None) -> Config:
    """Load config from ``<root>/.fractfs.toml`` with env-var overrides.

    ``root`` defaults to ``$fractfs_ROOT`` then the current working directory.
    For every scalar field a non-empty environment variable wins over the TOML
    value, which wins over the built-in default; an empty environment variable
    is treated as unset.

    Raises:
        ValueError: if ``sync_interval`` is not an integer, or (from ``Config``)
            if the backend name is unknown.
    """
    if root is None:
        root = _env("ROOT") or os.getcwd()
    root = Path(root).resolve()

    data = _read_toml(root / CONFIG_FILENAME)

    # "or []" also covers a key that is present but empty/null-ish.
    dirs = data.get("dirs", {}).get("paths", []) or []
    ignore = data.get("ignore", {}).get("patterns", []) or []
    local = data.get("local", {}).get("patterns", []) or []

    backend = _env("BACKEND") or data.get("backend") or "local"

    volume_root = _env("VOLUME_ROOT") or data.get("volume_root")
    scratch = _env("SCRATCH") or data.get("scratch") or _DEFAULT_SCRATCH
    checkpoint_subdir = (
        _env("CHECKPOINT_SUBDIR") or data.get("checkpoint_subdir") or _DEFAULT_CHECKPOINT_SUBDIR
    )

    # Checked against None (not truthiness) so an explicit TOML 0 is kept.
    sync_interval_raw = _env("SYNC_INTERVAL") or data.get("sync_interval")
    if sync_interval_raw is None:
        sync_interval = _DEFAULT_SYNC_INTERVAL
    else:
        try:
            sync_interval = int(sync_interval_raw)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"invalid fractfs sync_interval {sync_interval_raw!r}: expected an integer"
            ) from exc

    hash_raw = _env("CONTENT_HASH") or data.get("content_hash")
    use_content_hash = _as_bool(hash_raw, default=False)

    # "or None": an empty env var must not count as an explicit "false" and
    # mask the TOML setting; every other field already treats it as unset.
    bundle_raw = _env("AUTO_IGNORE_BUNDLE") or None
    if bundle_raw is None:
        bundle_raw = data.get("auto_ignore_bundle")
    auto_ignore_bundle = _as_bool(bundle_raw, default=True)

    return Config(
        root=root,
        backend=backend,
        volume_root=volume_root,
        scratch=scratch,
        sync_interval=sync_interval,
        checkpoint_subdir=checkpoint_subdir,
        dir_paths=list(dirs),
        ignore_patterns=list(ignore),
        local_patterns=list(local),
        use_content_hash=use_content_hash,
        auto_ignore_bundle=auto_ignore_bundle,
    )
146
+
147
+
148
+ def _read_toml(path: Path) -> dict:
149
+ if not path.exists():
150
+ return {}
151
+ with open(path, "rb") as fh:
152
+ return tomllib.load(fh)
153
+
154
+
155
def _as_bool(value, default: bool) -> bool:
    """Coerce a config value to ``bool``.

    ``None`` gives ``default`` and real booleans pass through. Anything else is
    stringified, trimmed and lower-cased; only "1", "true", "yes" and "on"
    count as true.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    return text in ("1", "true", "yes", "on")
@@ -0,0 +1,61 @@
1
+ """fsspec-backed implementation (S3 / ADLS / GCS) — optional extra.
2
+
3
+ Kept in its own module so importing fractfs never pulls in ``fsspec``/``s3fs``
4
+ unless an fsspec backend is actually requested. Install with ``fractfs[s3]``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from typing import Iterable
11
+
12
+
13
class FsspecBackend:
    """Backend over any fsspec filesystem rooted at ``cfg.volume_root`` URL.

    ``volume_root`` here is an fsspec URL such as ``s3://bucket/prefix``.
    """

    def __init__(self, cfg):
        """Resolve ``cfg.volume_root`` into an fsspec filesystem and root path.

        Raises:
            ImportError: if ``fsspec`` is not installed.
            ValueError: if ``cfg.volume_root`` is ``None``.
        """
        try:
            import fsspec
        except ImportError as exc:  # pragma: no cover - depends on optional extra
            raise ImportError(
                "the s3/fsspec backend requires the 'fsspec' extra: pip install 'fractfs[s3]'"
            ) from exc
        if cfg.volume_root is None:
            raise ValueError("fsspec backend requires fractfs_VOLUME_ROOT (an fsspec URL)")
        # volume_root may have passed through pathlib, which collapses the "//"
        # after the scheme; without the repair fsspec would see no protocol.
        self.url_root = self._repair_url(str(cfg.volume_root)).rstrip("/")
        self.fs, self.path_root = fsspec.core.url_to_fs(self.url_root)

    @staticmethod
    def _repair_url(url: str) -> str:
        """Restore ``scheme://`` in a URL whose ``//`` was collapsed to ``/``.

        Strings that already contain ``://``, plain paths, and single-letter
        prefixes (Windows drive letters such as ``C:/``) are returned unchanged.
        """
        scheme, sep, rest = url.partition(":/")
        if sep and len(scheme) > 1 and scheme.isalnum() and not rest.startswith("/"):
            return f"{scheme}://{rest}"
        return url

    def _abs(self, path: str) -> str:
        """Return ``path`` as a filesystem path under ``path_root``.

        A path that already is ``path_root`` or lies below it is returned
        unchanged; the match must end on a path boundary, so a sibling such as
        ``<root>-old/x`` is treated as relative.
        """
        path = str(path)
        root = self.path_root
        if not root:
            # Nothing to prefix: every path is already rooted.
            return path
        if path == root or path.startswith(f"{root.rstrip('/')}/"):
            return path
        return f"{root}/{path.lstrip('/')}"

    def exists(self, path: str) -> bool:
        """Return whether ``path`` exists on the filesystem."""
        return self.fs.exists(self._abs(path))

    def makedirs(self, path: str) -> None:
        """Create ``path``; an existing directory is not an error."""
        self.fs.makedirs(self._abs(path), exist_ok=True)

    def put_file(self, local_path: os.PathLike, remote_path: str) -> None:
        """Upload ``local_path`` to ``remote_path``."""
        self.fs.put_file(str(local_path), self._abs(remote_path))

    def get_file(self, remote_path: str, local_path: os.PathLike) -> None:
        """Download ``remote_path`` to ``local_path``, creating its parent dir."""
        os.makedirs(os.path.dirname(local_path) or ".", exist_ok=True)
        self.fs.get_file(self._abs(remote_path), str(local_path))

    def list_files(self, path: str) -> Iterable[str]:
        """Yield paths, relative to ``path_root``, of everything found under ``path``."""
        base = self._abs(path)
        if not self.fs.exists(base):
            return
        for full in self.fs.find(base):
            rel = full[len(self.path_root):].lstrip("/")
            yield rel

    def remove(self, path: str) -> None:
        """Recursively delete ``path`` if it exists."""
        target = self._abs(path)
        if self.fs.exists(target):
            self.fs.rm(target, recursive=True)
fractfs/patterns.py ADDED
@@ -0,0 +1,63 @@
1
+ """Static analysis of gitignore patterns, used to decide what can be pinned.
2
+
3
+ A ``local``/``ignore`` file inside a Volume-redirected ``[dirs]`` directory only
4
+ stays node-local if a back-symlink exists *before* the write. We can pre-create
5
+ that symlink for any pattern naming a concrete location; we cannot for a glob,
6
+ because the filename isn't known until the app creates it.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import List, Tuple
12
+
13
+ from .resolver import _under_any_dir
14
+
15
+ # gitignore wildcard metacharacters. A pattern containing any of these names a
16
+ # set of paths, not one path, so its back-symlink location can't be predicted.
17
+ _GLOB_META = "*?["
18
+
19
+
20
def is_glob(pattern: str) -> bool:
    """True if ``pattern`` contains any gitignore wildcard metacharacter."""
    for meta in _GLOB_META:
        if meta in pattern:
            return True
    return False
22
+
23
+
24
def is_negation(pattern: str) -> bool:
    """True if the first non-whitespace character of ``pattern`` is ``!``."""
    return pattern.lstrip()[:1] == "!"
26
+
27
+
28
def is_dir_pattern(pattern: str) -> bool:
    """True if the pattern targets a directory (trailing slash).

    Trailing whitespace is ignored when looking for the slash.
    """
    return pattern.rstrip()[-1:] == "/"
31
+
32
+
33
def is_anchored(pattern: str) -> bool:
    """gitignore anchoring: a leading or internal slash pins the pattern to root.

    A trailing slash alone does not anchor; surrounding whitespace is ignored.
    """
    body = pattern.strip()
    return body.startswith("/") or "/" in body.rstrip("/")
39
+
40
+
41
def is_concrete(pattern: str) -> bool:
    """A pattern we can pre-create a back-symlink for (an exact path/name).

    That is: non-blank, not a negation and free of glob metacharacters.
    """
    if not pattern.strip():
        return False
    if is_negation(pattern):
        return False
    return not is_glob(pattern)
44
+
45
+
46
def pin_targets(pattern: str, dir_paths: List[str]) -> List[Tuple[str, bool]]:
    """Back-symlink targets for ``pattern`` that fall inside ``dir_paths``.

    Returns ``(rel_path, is_dir)`` tuples. The result is empty for globs and
    negations (their location can't be predicted) and for anchored concrete
    paths outside every Volume dir (those already live in the plain local tree
    and need no symlink).
    """
    if not is_concrete(pattern):
        return []

    rel = pattern.strip().strip("/")
    if not rel:
        # Nothing but slashes: names no location.
        return []

    targets_dir = is_dir_pattern(pattern)

    if not is_anchored(pattern):
        # Unanchored bare name (e.g. "manifest.json", ".locks/"): the common
        # case is one at the top of each Volume dir — reserve a slot there.
        return [(f"{d}/{rel}", targets_dir) for d in dir_paths]

    if _under_any_dir(rel, dir_paths):
        return [(rel, targets_dir)]
    return []