furu-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
+ import datetime
+ import enum
+ import hashlib
+ import importlib
+ import json
+ import pathlib
+ import textwrap
+ from pathlib import Path
+ from typing import Any
+
+ import chz
+
+ from ..errors import _FuruMissing
+ from pydantic import BaseModel as PydanticBaseModel
+
+
+ # Type alias for JSON-serializable values. We use Any here because this serialization
+ # library handles arbitrary user-defined objects that we cannot know at compile time.
+ JsonValue = Any
+
+
+ class FuruSerializer:
+     """Handles serialization, deserialization, and hashing of Furu objects."""
+
+     CLASS_MARKER = "__class__"
+
+     @staticmethod
+     def get_classname(obj: object) -> str:
+         """Get fully qualified class name."""
+         classname = obj.__class__.__module__
+         if classname == "__main__":
+             raise ValueError("Cannot serialize objects from __main__ module")
+
+         if isinstance(obj, enum.Enum):
+             return f"{classname}.{obj.__class__.__qualname__}:{obj.name}"
+         return f"{classname}.{obj.__class__.__qualname__}"
+
+     @classmethod
+     def to_dict(cls, obj: object) -> JsonValue:
+         """Convert object to JSON-serializable dictionary."""
+         if isinstance(obj, _FuruMissing):
+             raise ValueError("Cannot serialize Furu.MISSING")
+
+         if chz.is_chz(obj):
+             result = {cls.CLASS_MARKER: cls.get_classname(obj)}
+             for field_name in chz.chz_fields(obj):
+                 result[field_name] = cls.to_dict(getattr(obj, field_name))
+             return result
+
+         if isinstance(obj, pathlib.Path):
+             return str(obj)
+
+         if isinstance(obj, (list, tuple)):
+             return [cls.to_dict(v) for v in obj]
+
+         if isinstance(obj, dict):
+             return {k: cls.to_dict(v) for k, v in obj.items()}
+
+         return obj
+
+     @classmethod
+     def from_dict(cls, data: JsonValue) -> JsonValue:
+         """Reconstruct object from dictionary."""
+         if isinstance(data, dict) and cls.CLASS_MARKER in data:
+             module_path, _, class_name = data[cls.CLASS_MARKER].rpartition(".")
+             data_class = getattr(importlib.import_module(module_path), class_name)
+
+             kwargs = {
+                 k: cls.from_dict(v) for k, v in data.items() if k != cls.CLASS_MARKER
+             }
+
+             path_types = (Path, pathlib.Path)
+
+             if chz.is_chz(data_class):
+                 for name, field in chz.chz_fields(data_class).items():
+                     if field.final_type in path_types and isinstance(
+                         kwargs.get(name), str
+                     ):
+                         kwargs[name] = pathlib.Path(kwargs[name])
+             return data_class(**kwargs)
+
+         if isinstance(data, list):
+             return [cls.from_dict(v) for v in data]
+
+         if isinstance(data, dict):
+             return {k: cls.from_dict(v) for k, v in data.items()}
+
+         return data
+
+     @classmethod
+     def compute_hash(cls, obj: object, verbose: bool = False) -> str:
+         """Compute deterministic hash of object."""
+
+         def canonicalize(item: object) -> JsonValue:
+             if isinstance(item, _FuruMissing):
+                 raise ValueError("Cannot hash Furu.MISSING")
+
+             if chz.is_chz(item):
+                 fields = chz.chz_fields(item)
+                 return {
+                     "__class__": cls.get_classname(item),
+                     **{
+                         name: canonicalize(getattr(item, name))
+                         for name in fields
+                         if not name.startswith("_")
+                     },
+                 }
+
+             if isinstance(item, dict):
+                 filtered = item
+                 if cls.CLASS_MARKER in item:
+                     filtered = {
+                         k: v
+                         for k, v in item.items()
+                         if not (isinstance(k, str) and k.startswith("_"))
+                         or k == cls.CLASS_MARKER
+                     }
+                 return {k: canonicalize(v) for k, v in sorted(filtered.items())}
+
+             if isinstance(item, (list, tuple)):
+                 return [canonicalize(v) for v in item]
+
+             if isinstance(item, Path):
+                 return str(item)
+
+             if isinstance(item, enum.Enum):
+                 return {"__enum__": cls.get_classname(item)}
+
+             if isinstance(item, (set, frozenset)):
+                 return sorted(canonicalize(v) for v in item)
+
+             if isinstance(item, (bytes, bytearray, memoryview)):
+                 return {"__bytes__": hashlib.sha256(item).hexdigest()}
+
+             if isinstance(item, datetime.datetime):
+                 return item.astimezone(datetime.timezone.utc).isoformat(
+                     timespec="microseconds"
+                 )
+
+             if isinstance(item, (str, int, float, bool)) or item is None:
+                 return item
+
+             if isinstance(item, PydanticBaseModel):
+                 return {
+                     "__class__": cls.get_classname(item),
+                     **{k: canonicalize(v) for k, v in item.model_dump().items()},
+                 }
+
+             raise TypeError(f"Cannot hash type: {type(item)}")
+
+         canonical = canonicalize(obj)
+         json_str = json.dumps(canonical, sort_keys=True, separators=(",", ":"))
+
+         if verbose:
+             print(json_str)
+
+         return hashlib.blake2s(json_str.encode(), digest_size=10).hexdigest()
+
+     @classmethod
+     def to_python(cls, obj: object, multiline: bool = True) -> str:
+         """Convert object to Python code representation."""
+
+         def to_py_recursive(item: object, indent: int = 0) -> str:
+             if isinstance(item, _FuruMissing):
+                 raise ValueError("Cannot convert Furu.MISSING to Python")
+
+             pad = "" if not multiline else " " * indent
+             next_indent = indent + (4 if multiline else 0)
+
+             if chz.is_chz(item):
+                 cls_path = cls.get_classname(item)
+                 fields = []
+                 for name, field in chz.chz_fields(item).items():
+                     fields.append(
+                         f"{name}={to_py_recursive(getattr(item, name), next_indent)}"
+                     )
+
+                 if multiline:
+                     inner = (",\n" + " " * next_indent).join(fields)
+                     return f"{cls_path}(\n{pad} {inner}\n{pad})"
+                 return f"{cls_path}({', '.join(fields)})"
+
+             if isinstance(item, enum.Enum):
+                 return cls.get_classname(item)
+
+             if isinstance(item, pathlib.Path):
+                 return f"pathlib.Path({str(item)!r})"
+
+             if isinstance(item, datetime.datetime):
+                 iso = item.astimezone(datetime.timezone.utc).isoformat(
+                     timespec="microseconds"
+                 )
+                 return f"datetime.datetime.fromisoformat({iso!r})"
+
+             if isinstance(item, (bytes, bytearray, memoryview)):
+                 hex_str = hashlib.sha256(item).hexdigest()
+                 return f"bytes.fromhex({hex_str!r})"
+
+             if isinstance(item, list):
+                 items = ", ".join(to_py_recursive(v, next_indent) for v in item)
+                 return f"[{items}]"
+
+             if isinstance(item, tuple):
+                 items = ", ".join(to_py_recursive(v, next_indent) for v in item)
+                 comma = "," if len(item) == 1 else ""
+                 return f"({items}{comma})"
+
+             if isinstance(item, set):
+                 items = ", ".join(to_py_recursive(v, next_indent) for v in item)
+                 return f"{{{items}}}"
+
+             if isinstance(item, frozenset):
+                 items = ", ".join(to_py_recursive(v, next_indent) for v in item)
+                 return f"frozenset({{{items}}})"
+
+             if isinstance(item, dict):
+                 kv_pairs = [
+                     f"{to_py_recursive(k, next_indent)}: {to_py_recursive(v, next_indent)}"
+                     for k, v in item.items()
+                 ]
+
+                 if multiline:
+                     joined = (",\n" + " " * (indent + 4)).join(kv_pairs)
+                     return f"{{\n{pad} {joined}\n{pad}}}"
+                 else:
+                     return "{" + ", ".join(kv_pairs) + "}"
+
+             return repr(item)
+
+         result = to_py_recursive(obj, indent=0)
+         if multiline:
+             result = textwrap.dedent(result).strip()
+         return result
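
A minimal usage sketch of the serializer above (the import path below is assumed; this diff does not show file names). Plain containers round-trip through to_dict/from_dict, compute_hash yields a short blake2s digest, and chz, enum, and pydantic values additionally get the __class__ tagging implemented above.

from pathlib import Path

from furu.serialization.serializer import FuruSerializer  # hypothetical module path

cfg = {"dataset": Path("/data/train"), "seed": 42, "splits": ("train", "valid")}

payload = FuruSerializer.to_dict(cfg)         # Path -> str, tuple -> list, dicts recursed
restored = FuruSerializer.from_dict(payload)  # plain containers come back; the Path stays a str
digest = FuruSerializer.compute_hash(cfg)     # blake2s with digest_size=10 -> 20 hex characters
source = FuruSerializer.to_python(cfg, multiline=False)
# e.g. "{'dataset': pathlib.Path('/data/train'), 'seed': 42, 'splits': ('train', 'valid')}"
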
@@ -0,0 +1,32 @@
+ from .metadata import (
+     EnvironmentInfo,
+     GitInfo,
+     FuruMetadata,
+     MetadataManager,
+     clear_metadata_cache,
+ )
+ from .migration import MigrationManager, MigrationRecord
+ from .state import (
+     ComputeLockContext,
+     FuruErrorState,
+     StateAttempt,
+     StateManager,
+     StateOwner,
+     compute_lock,
+ )
+
+ __all__ = [
+     "ComputeLockContext",
+     "EnvironmentInfo",
+     "GitInfo",
+     "FuruErrorState",
+     "FuruMetadata",
+     "MetadataManager",
+     "MigrationManager",
+     "MigrationRecord",
+     "StateAttempt",
+     "StateManager",
+     "StateOwner",
+     "clear_metadata_cache",
+     "compute_lock",
+ ]
@@ -0,0 +1,282 @@
+ import datetime
+ import getpass
+ import json
+ import os
+ import platform
+ import socket
+ import subprocess
+ import sys
+ from pathlib import Path
+ from typing import TYPE_CHECKING
+
+ from pydantic import BaseModel, ConfigDict
+
+ from ..config import FURU_CONFIG
+ from ..serialization import BaseModel as PydanticBaseModel
+ from ..serialization import FuruSerializer
+ from ..serialization.serializer import JsonValue
+
+ if TYPE_CHECKING:
+     from ..core.furu import Furu
+
+ # Module-level cache for metadata (controlled via FURU_CACHE_METADATA)
+ _cached_git_info: "GitInfo | None" = None
+ _cached_git_info_time: float = 0.0
+
+
+ def clear_metadata_cache() -> None:
+     """Clear the cached metadata. Useful for testing or long-running processes."""
+     global _cached_git_info, _cached_git_info_time
+     _cached_git_info = None
+     _cached_git_info_time = 0.0
+
+
+ class GitInfo(BaseModel):
+     """Git repository information."""
+
+     model_config = ConfigDict(extra="forbid", strict=True)
+
+     git_commit: str
+     git_branch: str
+     git_remote: str | None
+     git_patch: str
+     git_submodules: dict[str, str]
+
+
+ class EnvironmentInfo(BaseModel):
+     """Runtime environment information."""
+
+     model_config = ConfigDict(extra="forbid", strict=True)
+
+     timestamp: str
+     command: str
+     python_version: str
+     executable: str
+     platform: str
+     hostname: str
+     user: str
+     pid: int
+
+
+ class FuruMetadata(BaseModel):
+     """Complete metadata for a Furu experiment."""
+
+     model_config = ConfigDict(extra="forbid", strict=True)
+
+     # Furu-specific fields
+     furu_python_def: str
+     furu_obj: JsonValue  # Serialized Furu object from FuruSerializer.to_dict()
+     furu_hash: str
+     furu_path: str
+
+     # Git info
+     git_commit: str
+     git_branch: str
+     git_remote: str | None
+     git_patch: str
+     git_submodules: dict[str, str]
+
+     # Environment info
+     timestamp: str
+     command: str
+     python_version: str
+     executable: str
+     platform: str
+     hostname: str
+     user: str
+     pid: int
+
+
+ class MetadataManager:
+     """Handles metadata collection and storage."""
+
+     INTERNAL_DIR = ".furu"
+     METADATA_FILE = "metadata.json"
+
+     @classmethod
+     def get_metadata_path(cls, directory: Path) -> Path:
+         return directory / cls.INTERNAL_DIR / cls.METADATA_FILE
+
+     @staticmethod
+     def run_git_command(args: list[str]) -> str:
+         """Run git command, return output."""
+         proc = subprocess.run(
+             ["git", *args], text=True, capture_output=True, timeout=10
+         )
+         if proc.returncode not in (0, 1):
+             proc.check_returncode()
+         return proc.stdout.strip()
+
+     @classmethod
+     def collect_git_info(cls, ignore_diff: bool = False) -> GitInfo:
+         """Collect git repository information."""
+         global _cached_git_info, _cached_git_info_time
+         import time
+
+         ttl = FURU_CONFIG.cache_metadata_ttl_sec
+         # Return cached result if caching is enabled and not expired
+         if ttl is not None and _cached_git_info is not None:
+             age = time.time() - _cached_git_info_time
+             if age < ttl:
+                 return _cached_git_info
+
+         if not FURU_CONFIG.require_git:
+             try:
+                 head = cls.run_git_command(["rev-parse", "HEAD"])
+                 branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
+             except subprocess.CalledProcessError:
+                 return GitInfo(
+                     git_commit="<no-git>",
+                     git_branch="<no-git>",
+                     git_remote=None,
+                     git_patch="<no-git>",
+                     git_submodules={},
+                 )
+         else:
+             head = cls.run_git_command(["rev-parse", "HEAD"])
+             branch = cls.run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
+
+         if FURU_CONFIG.require_git_remote:
+             remote = cls.run_git_command(["remote", "get-url", "origin"])
+         else:
+             try:
+                 remote = cls.run_git_command(["remote", "get-url", "origin"])
+             except subprocess.CalledProcessError:
+                 remote = None
+
+         if ignore_diff:
+             patch = "<ignored-diff>"
+         else:
+             unstaged = cls.run_git_command(["diff"])
+             staged = cls.run_git_command(["diff", "--cached"])
+             untracked = cls.run_git_command(
+                 ["ls-files", "--others", "--exclude-standard"]
+             ).splitlines()
+
+             untracked_patches = "\n".join(
+                 cls.run_git_command(["diff", "--no-index", "/dev/null", f])
+                 for f in untracked
+             )
+
+             patch = "\n".join(
+                 filter(
+                     None,
+                     [
+                         "# === unstaged ==================================================",
+                         unstaged,
+                         "# === staged ====================================================",
+                         staged,
+                         "# === untracked ================================================",
+                         untracked_patches,
+                     ],
+                 )
+             )
+
+             if len(patch) > 50_000:
+                 raise ValueError(
+                     f"Git diff too large ({len(patch):,} bytes). "
+                     "Use ignore_diff=True or FURU_IGNORE_DIFF=1"
+                 )
+
+         submodules: dict[str, str] = {}
+         for line in cls.run_git_command(["submodule", "status"]).splitlines():
+             parts = line.split()
+             if len(parts) >= 2:
+                 submodules[parts[1]] = parts[0]
+
+         result = GitInfo(
+             git_commit=head,
+             git_branch=branch,
+             git_remote=remote,
+             git_patch=patch,
+             git_submodules=submodules,
+         )
+
+         # Cache result if caching is enabled
+         if ttl is not None:
+             _cached_git_info = result
+             _cached_git_info_time = time.time()
+
+         return result
+
+     @staticmethod
+     def collect_environment_info() -> EnvironmentInfo:
+         """Collect environment information."""
+         return EnvironmentInfo(
+             timestamp=datetime.datetime.now(datetime.timezone.utc).isoformat(
+                 timespec="microseconds"
+             ),
+             command=" ".join(sys.argv) if sys.argv else "<unknown>",
+             python_version=sys.version,
+             executable=sys.executable,
+             platform=platform.platform(),
+             hostname=socket.gethostname(),
+             user=getpass.getuser(),
+             pid=os.getpid(),
+         )
+
+     @classmethod
+     def create_metadata(
+         cls, furu_obj: "Furu", directory: Path, ignore_diff: bool = False
+     ) -> FuruMetadata:
+         """Create complete metadata for a Furu object."""
+         git_info = cls.collect_git_info(ignore_diff)
+         env_info = cls.collect_environment_info()
+
+         serialized_obj = FuruSerializer.to_dict(furu_obj)
+         if not isinstance(serialized_obj, dict):
+             raise TypeError(
+                 f"Expected FuruSerializer.to_dict to return dict, got {type(serialized_obj)}"
+             )
+
+         return FuruMetadata(
+             furu_python_def=FuruSerializer.to_python(furu_obj, multiline=False),
+             furu_obj=serialized_obj,
+             furu_hash=FuruSerializer.compute_hash(furu_obj),
+             furu_path=str(directory.resolve()),
+             git_commit=git_info.git_commit,
+             git_branch=git_info.git_branch,
+             git_remote=git_info.git_remote,
+             git_patch=git_info.git_patch,
+             git_submodules=git_info.git_submodules,
+             timestamp=env_info.timestamp,
+             command=env_info.command,
+             python_version=env_info.python_version,
+             executable=env_info.executable,
+             platform=env_info.platform,
+             hostname=env_info.hostname,
+             user=env_info.user,
+             pid=env_info.pid,
+         )
+
+     @classmethod
+     def write_metadata(cls, metadata: FuruMetadata, directory: Path) -> None:
+         """Write metadata to file."""
+         metadata_path = cls.get_metadata_path(directory)
+         metadata_path.parent.mkdir(parents=True, exist_ok=True)
+         metadata_path.write_text(
+             json.dumps(
+                 metadata.model_dump(mode="json"),
+                 indent=2,
+                 default=lambda o: o.model_dump()
+                 if PydanticBaseModel is not None and isinstance(o, PydanticBaseModel)
+                 else str(o),
+             )
+         )
+
+     @classmethod
+     def read_metadata(cls, directory: Path) -> FuruMetadata:
+         """Read metadata from file."""
+         metadata_path = cls.get_metadata_path(directory)
+         if not metadata_path.is_file():
+             raise FileNotFoundError(f"Metadata not found: {metadata_path}")
+         data = json.loads(metadata_path.read_text())
+         return FuruMetadata.model_validate(data)
+
+     @classmethod
+     def read_metadata_raw(cls, directory: Path) -> dict[str, JsonValue] | None:
+         """Read raw metadata JSON from file, returning None if not found."""
+         metadata_path = cls.get_metadata_path(directory)
+         if not metadata_path.is_file():
+             return None
+         return json.loads(metadata_path.read_text())
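
A rough sketch of the collection-and-persistence flow defined above (the module path and run directory are assumptions; create_metadata normally derives the furu_* fields from a real Furu object, so placeholders stand in for them here). Running collect_git_info assumes a git checkout unless FURU_CONFIG.require_git is disabled.

from pathlib import Path

from furu.state.metadata import FuruMetadata, MetadataManager  # hypothetical module path

run_dir = Path("runs/example")  # hypothetical experiment directory

git = MetadataManager.collect_git_info(ignore_diff=True)  # skip capturing the working-tree patch
env = MetadataManager.collect_environment_info()

meta = FuruMetadata(
    furu_python_def="Example()",                # placeholder for FuruSerializer.to_python(obj)
    furu_obj={"__class__": "example.Example"},  # placeholder for FuruSerializer.to_dict(obj)
    furu_hash="0" * 20,                         # placeholder for FuruSerializer.compute_hash(obj)
    furu_path=str(run_dir.resolve()),
    **git.model_dump(),                         # git_commit, git_branch, git_remote, git_patch, git_submodules
    **env.model_dump(),                         # timestamp, command, python_version, ..., pid
)

MetadataManager.write_metadata(meta, run_dir)   # writes runs/example/.furu/metadata.json
assert MetadataManager.read_metadata(run_dir) == meta
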
@@ -0,0 +1,81 @@
+ from __future__ import annotations
+
+ import json
+ from pathlib import Path
+ from typing import Literal
+
+ from pydantic import BaseModel, ConfigDict
+
+ from ..config import FURU_CONFIG
+ from ..serialization.serializer import JsonValue
+
+
+ RootKind = Literal["data", "git"]
+ MigrationPolicy = Literal["alias", "move", "copy"]
+ MigrationKind = Literal["alias", "moved", "copied", "migrated"]
+
+
+ class MigrationRecord(BaseModel):
+     model_config = ConfigDict(extra="ignore", strict=True)
+
+     kind: MigrationKind
+     policy: MigrationPolicy
+     from_namespace: str
+     from_hash: str
+     from_root: RootKind
+     to_namespace: str
+     to_hash: str
+     to_root: RootKind
+     migrated_at: str
+     overwritten_at: str | None = None
+     default_values: dict[str, JsonValue] | None = None
+     origin: str | None = None
+     note: str | None = None
+
+
+ class MigrationManager:
+     INTERNAL_DIR = ".furu"
+     MIGRATION_FILE = "migration.json"
+
+     @classmethod
+     def get_migration_path(cls, directory: Path) -> Path:
+         return directory / cls.INTERNAL_DIR / cls.MIGRATION_FILE
+
+     @classmethod
+     def read_migration(cls, directory: Path) -> MigrationRecord | None:
+         path = cls.get_migration_path(directory)
+         if not path.is_file():
+             return None
+         data = json.loads(path.read_text())
+         return MigrationRecord.model_validate(data)
+
+     @classmethod
+     def write_migration(cls, record: MigrationRecord, directory: Path) -> None:
+         path = cls.get_migration_path(directory)
+         path.parent.mkdir(parents=True, exist_ok=True)
+         tmp = path.with_suffix(".tmp")
+         tmp.write_text(json.dumps(record.model_dump(mode="json"), indent=2))
+         tmp.replace(path)
+
+     @classmethod
+     def resolve_dir(
+         cls, record: MigrationRecord, *, target: Literal["from", "to"]
+     ) -> Path:
+         if target == "from":
+             namespace = record.from_namespace
+             furu_hash = record.from_hash
+             root_kind = record.from_root
+         else:
+             namespace = record.to_namespace
+             furu_hash = record.to_hash
+             root_kind = record.to_root
+         root = FURU_CONFIG.get_root(version_controlled=root_kind == "git")
+         return root / Path(*namespace.split(".")) / furu_hash
+
+     @classmethod
+     def root_kind_for_dir(cls, directory: Path) -> RootKind:
+         for version_controlled in (False, True):
+             root = FURU_CONFIG.get_root(version_controlled=version_controlled)
+             if directory.is_relative_to(root):
+                 return "git" if version_controlled else "data"
+         raise ValueError(f"Directory {directory} is not under a Furu root")