landmarkdiff 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- landmarkdiff/__init__.py +40 -0
- landmarkdiff/__main__.py +207 -0
- landmarkdiff/api_client.py +316 -0
- landmarkdiff/arcface_torch.py +583 -0
- landmarkdiff/audit.py +338 -0
- landmarkdiff/augmentation.py +293 -0
- landmarkdiff/benchmark.py +213 -0
- landmarkdiff/checkpoint_manager.py +361 -0
- landmarkdiff/cli.py +252 -0
- landmarkdiff/clinical.py +223 -0
- landmarkdiff/conditioning.py +278 -0
- landmarkdiff/config.py +358 -0
- landmarkdiff/curriculum.py +191 -0
- landmarkdiff/data.py +405 -0
- landmarkdiff/data_version.py +301 -0
- landmarkdiff/displacement_model.py +745 -0
- landmarkdiff/ensemble.py +330 -0
- landmarkdiff/evaluation.py +415 -0
- landmarkdiff/experiment_tracker.py +231 -0
- landmarkdiff/face_verifier.py +947 -0
- landmarkdiff/fid.py +244 -0
- landmarkdiff/hyperparam.py +347 -0
- landmarkdiff/inference.py +754 -0
- landmarkdiff/landmarks.py +432 -0
- landmarkdiff/log.py +90 -0
- landmarkdiff/losses.py +348 -0
- landmarkdiff/manipulation.py +651 -0
- landmarkdiff/masking.py +316 -0
- landmarkdiff/metrics_agg.py +313 -0
- landmarkdiff/metrics_viz.py +464 -0
- landmarkdiff/model_registry.py +362 -0
- landmarkdiff/morphometry.py +342 -0
- landmarkdiff/postprocess.py +600 -0
- landmarkdiff/py.typed +0 -0
- landmarkdiff/safety.py +395 -0
- landmarkdiff/synthetic/__init__.py +23 -0
- landmarkdiff/synthetic/augmentation.py +188 -0
- landmarkdiff/synthetic/pair_generator.py +208 -0
- landmarkdiff/synthetic/tps_warp.py +273 -0
- landmarkdiff/validation.py +324 -0
- landmarkdiff-0.2.3.dist-info/METADATA +1173 -0
- landmarkdiff-0.2.3.dist-info/RECORD +46 -0
- landmarkdiff-0.2.3.dist-info/WHEEL +5 -0
- landmarkdiff-0.2.3.dist-info/entry_points.txt +2 -0
- landmarkdiff-0.2.3.dist-info/licenses/LICENSE +21 -0
- landmarkdiff-0.2.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Dataset versioning and provenance tracking.
|
|
2
|
+
|
|
3
|
+
Tracks dataset composition, checksums, and lineage for reproducible training.
|
|
4
|
+
Creates manifest files that record exactly which data was used for each
|
|
5
|
+
training run.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from landmarkdiff.data_version import DataManifest
|
|
9
|
+
|
|
10
|
+
manifest = DataManifest.from_directory("data/training")
|
|
11
|
+
manifest.save("data/training/manifest.json")
|
|
12
|
+
|
|
13
|
+
# Later, verify data hasn't changed
|
|
14
|
+
manifest2 = DataManifest.from_directory("data/training")
|
|
15
|
+
assert manifest.checksum == manifest2.checksum
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import hashlib
|
|
21
|
+
import json
|
|
22
|
+
from dataclasses import dataclass, field
|
|
23
|
+
from datetime import datetime, timezone
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class FileEntry:
    """Metadata for a single dataset file."""

    path: str
    size_bytes: int
    checksum: str  # md5 of first 64KB (fast approximate)
    procedure: str = ""

    @staticmethod
    def from_path(filepath: Path, base_dir: Path | None = None) -> FileEntry:
        """Build an entry describing ``filepath``.

        Args:
            filepath: File to describe.
            base_dir: When given, the recorded path is relative to this directory.

        Returns:
            FileEntry with size, fast checksum, and inferred procedure label.
        """
        recorded = str(filepath.relative_to(base_dir) if base_dir else filepath)

        # Fast approximate checksum: hash only the first 64KB of the file.
        with open(filepath, "rb") as fh:
            digest = hashlib.md5(fh.read(65536)).hexdigest()

        # Infer the surgical procedure from the file name or its parent path.
        known_procedures = (
            "rhinoplasty",
            "blepharoplasty",
            "rhytidectomy",
            "orthognathic",
            "brow_lift",
            "mentoplasty",
        )
        parent_text = str(filepath.parent)
        procedure = next(
            (name for name in known_procedures if name in filepath.name or name in parent_text),
            "",
        )

        return FileEntry(
            path=recorded,
            size_bytes=filepath.stat().st_size,
            checksum=digest,
            procedure=procedure,
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class DataManifest:
    """Dataset manifest for versioning and reproducibility.

    Attributes:
        version: Manifest format version.
        created_at: Creation timestamp (UTC, ISO-8601).
        root_dir: Root directory of the dataset.
        files: List of file entries.
        metadata: Additional dataset metadata.
    """

    version: str = "1.0"
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    root_dir: str = ""
    files: list[FileEntry] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def total_files(self) -> int:
        """Number of files recorded in the manifest."""
        return len(self.files)

    @property
    def total_size_bytes(self) -> int:
        """Combined size of all recorded files, in bytes."""
        return sum(entry.size_bytes for entry in self.files)

    @property
    def total_size_mb(self) -> float:
        """Combined size of all recorded files, in MB."""
        return self.total_size_bytes / (1024 * 1024)

    @property
    def checksum(self) -> str:
        """Compute aggregate checksum from all file checksums.

        Entries are hashed in sorted path order so the result is independent
        of the order in which files were collected.
        """
        digest = hashlib.md5()
        for entry in sorted(self.files, key=lambda e: e.path):
            digest.update(f"{entry.path}:{entry.checksum}:{entry.size_bytes}".encode())
        return digest.hexdigest()

    @property
    def by_procedure(self) -> dict[str, int]:
        """Count files by procedure (entries without one count as "unknown")."""
        counts: dict[str, int] = {}
        for entry in self.files:
            label = entry.procedure or "unknown"
            counts[label] = counts.get(label, 0) + 1
        return counts

    @staticmethod
    def from_directory(
        directory: str | Path,
        extensions: set[str] | None = None,
        include_patterns: list[str] | None = None,
    ) -> DataManifest:
        """Create manifest from a directory of dataset files.

        Args:
            directory: Path to dataset directory.
            extensions: File extensions to include (default: image types).
            include_patterns: Glob patterns to include; overrides extension filtering.

        Returns:
            DataManifest with entries for all matching files.

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        root = Path(directory)
        if not root.exists():
            raise FileNotFoundError(f"Directory not found: {root}")

        allowed = (
            extensions
            if extensions is not None
            else {".png", ".jpg", ".jpeg", ".webp", ".npy", ".npz"}
        )

        if include_patterns:
            # Explicit patterns win: collect each pattern's matches in sorted order.
            candidates = (
                fp
                for pattern in include_patterns
                for fp in sorted(root.glob(pattern))
                if fp.is_file()
            )
        else:
            # Default: recursive walk filtered by extension (case-insensitive).
            candidates = (
                fp
                for fp in sorted(root.rglob("*"))
                if fp.is_file() and fp.suffix.lower() in allowed
            )
        entries = [FileEntry.from_path(fp, base_dir=root) for fp in candidates]

        return DataManifest(
            root_dir=str(root),
            files=entries,
            metadata={
                "extensions": sorted(allowed),
                "host": _get_hostname(),
            },
        )

    def save(self, path: str | Path) -> Path:
        """Save manifest to JSON.

        Args:
            path: Output file path.

        Returns:
            Path to saved manifest.
        """
        out = Path(path)
        out.parent.mkdir(parents=True, exist_ok=True)

        payload: dict[str, Any] = {
            "version": self.version,
            "created_at": self.created_at,
            "root_dir": self.root_dir,
            "checksum": self.checksum,
            "total_files": self.total_files,
            "total_size_mb": round(self.total_size_mb, 2),
            "by_procedure": self.by_procedure,
            "metadata": self.metadata,
            "files": [
                {
                    "path": entry.path,
                    "size_bytes": entry.size_bytes,
                    "checksum": entry.checksum,
                    "procedure": entry.procedure,
                }
                for entry in self.files
            ],
        }

        with open(out, "w") as fh:
            json.dump(payload, fh, indent=2)

        return out

    @staticmethod
    def load(path: str | Path) -> DataManifest:
        """Load manifest from JSON.

        Args:
            path: Path to manifest file.

        Returns:
            DataManifest instance.
        """
        with open(path) as fh:
            raw = json.load(fh)

        entries = [
            FileEntry(
                path=item["path"],
                size_bytes=item["size_bytes"],
                checksum=item["checksum"],
                procedure=item.get("procedure", ""),
            )
            for item in raw.get("files", [])
        ]

        return DataManifest(
            version=raw.get("version", "1.0"),
            created_at=raw.get("created_at", ""),
            root_dir=raw.get("root_dir", ""),
            files=entries,
            metadata=raw.get("metadata", {}),
        )

    def verify(self, directory: str | Path | None = None) -> tuple[bool, list[str]]:
        """Verify dataset matches this manifest.

        Args:
            directory: Directory to verify (default: original root_dir).

        Returns:
            (all_match, list_of_issues)
        """
        base = Path(directory or self.root_dir)
        issues: list[str] = []

        for entry in self.files:
            target = base / entry.path
            if not target.exists():
                issues.append(f"Missing: {entry.path}")
                continue

            on_disk = target.stat().st_size
            if on_disk != entry.size_bytes:
                issues.append(
                    f"Size mismatch: {entry.path} (expected {entry.size_bytes}, got {on_disk})"
                )

            # Recompute the fast checksum (first 64KB) and compare.
            digest = hashlib.md5()
            with open(target, "rb") as fh:
                digest.update(fh.read(65536))
            if digest.hexdigest() != entry.checksum:
                issues.append(f"Checksum mismatch: {entry.path}")

        return not issues, issues

    def diff(self, other: DataManifest) -> dict[str, list[str]]:
        """Compare two manifests.

        Returns:
            Dict with 'added', 'removed', 'modified' file lists.
        """
        mine = {entry.path: entry for entry in self.files}
        theirs = {entry.path: entry for entry in other.files}

        shared = set(mine) & set(theirs)
        return {
            "added": sorted(set(theirs) - set(mine)),
            "removed": sorted(set(mine) - set(theirs)),
            "modified": sorted(
                p for p in shared if mine[p].checksum != theirs[p].checksum
            ),
        }

    def summary(self) -> str:
        """Human-readable summary."""
        lines = [
            f"Dataset Manifest v{self.version}",
            f" Root: {self.root_dir}",
            f" Files: {self.total_files}",
            f" Size: {self.total_size_mb:.1f} MB",
            f" Checksum: {self.checksum}",
        ]
        procs = self.by_procedure
        if procs:
            lines.append(" By procedure:")
            lines.extend(f" {proc}: {count}" for proc, count in sorted(procs.items()))
        return "\n".join(lines)
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _get_hostname() -> str:
|
|
295
|
+
"""Get hostname safely."""
|
|
296
|
+
try:
|
|
297
|
+
import socket
|
|
298
|
+
|
|
299
|
+
return socket.gethostname()
|
|
300
|
+
except Exception:
|
|
301
|
+
return "unknown"
|