landmarkdiff 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. landmarkdiff/__init__.py +40 -0
  2. landmarkdiff/__main__.py +207 -0
  3. landmarkdiff/api_client.py +316 -0
  4. landmarkdiff/arcface_torch.py +583 -0
  5. landmarkdiff/audit.py +338 -0
  6. landmarkdiff/augmentation.py +293 -0
  7. landmarkdiff/benchmark.py +213 -0
  8. landmarkdiff/checkpoint_manager.py +361 -0
  9. landmarkdiff/cli.py +252 -0
  10. landmarkdiff/clinical.py +223 -0
  11. landmarkdiff/conditioning.py +278 -0
  12. landmarkdiff/config.py +358 -0
  13. landmarkdiff/curriculum.py +191 -0
  14. landmarkdiff/data.py +405 -0
  15. landmarkdiff/data_version.py +301 -0
  16. landmarkdiff/displacement_model.py +745 -0
  17. landmarkdiff/ensemble.py +330 -0
  18. landmarkdiff/evaluation.py +415 -0
  19. landmarkdiff/experiment_tracker.py +231 -0
  20. landmarkdiff/face_verifier.py +947 -0
  21. landmarkdiff/fid.py +244 -0
  22. landmarkdiff/hyperparam.py +347 -0
  23. landmarkdiff/inference.py +754 -0
  24. landmarkdiff/landmarks.py +432 -0
  25. landmarkdiff/log.py +90 -0
  26. landmarkdiff/losses.py +348 -0
  27. landmarkdiff/manipulation.py +651 -0
  28. landmarkdiff/masking.py +316 -0
  29. landmarkdiff/metrics_agg.py +313 -0
  30. landmarkdiff/metrics_viz.py +464 -0
  31. landmarkdiff/model_registry.py +362 -0
  32. landmarkdiff/morphometry.py +342 -0
  33. landmarkdiff/postprocess.py +600 -0
  34. landmarkdiff/py.typed +0 -0
  35. landmarkdiff/safety.py +395 -0
  36. landmarkdiff/synthetic/__init__.py +23 -0
  37. landmarkdiff/synthetic/augmentation.py +188 -0
  38. landmarkdiff/synthetic/pair_generator.py +208 -0
  39. landmarkdiff/synthetic/tps_warp.py +273 -0
  40. landmarkdiff/validation.py +324 -0
  41. landmarkdiff-0.2.3.dist-info/METADATA +1173 -0
  42. landmarkdiff-0.2.3.dist-info/RECORD +46 -0
  43. landmarkdiff-0.2.3.dist-info/WHEEL +5 -0
  44. landmarkdiff-0.2.3.dist-info/entry_points.txt +2 -0
  45. landmarkdiff-0.2.3.dist-info/licenses/LICENSE +21 -0
  46. landmarkdiff-0.2.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,301 @@
1
+ """Dataset versioning and provenance tracking.
2
+
3
+ Tracks dataset composition, checksums, and lineage for reproducible training.
4
+ Creates manifest files that record exactly which data was used for each
5
+ training run.
6
+
7
+ Usage:
8
+ from landmarkdiff.data_version import DataManifest
9
+
10
+ manifest = DataManifest.from_directory("data/training")
11
+ manifest.save("data/training/manifest.json")
12
+
13
+ # Later, verify data hasn't changed
14
+ manifest2 = DataManifest.from_directory("data/training")
15
+ assert manifest.checksum == manifest2.checksum
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import hashlib
21
+ import json
22
+ from dataclasses import dataclass, field
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+ from typing import Any
26
+
27
+
28
@dataclass
class FileEntry:
    """Per-file record stored in a dataset manifest."""

    # Path relative to the manifest root (or absolute when no base_dir given).
    path: str
    # File size on disk, in bytes.
    size_bytes: int
    checksum: str  # md5 of first 64KB (fast approximate)
    # Surgical procedure inferred from the file/directory name; "" if unknown.
    procedure: str = ""

    @staticmethod
    def from_path(filepath: Path, base_dir: Path | None = None) -> FileEntry:
        """Build an entry for ``filepath``.

        The stored path is made relative to ``base_dir`` when one is given.
        Only the first 64KB of the file is hashed — a deliberate speed/accuracy
        trade-off for large image datasets.
        """
        # Approximate content fingerprint: md5 over the leading 64KB.
        digest = hashlib.md5()
        with open(filepath, "rb") as fh:
            digest.update(fh.read(65536))

        # Procedure label is taken from the first known name that appears in
        # either the filename or the immediate parent path.
        known_procedures = (
            "rhinoplasty",
            "blepharoplasty",
            "rhytidectomy",
            "orthognathic",
            "brow_lift",
            "mentoplasty",
        )
        haystacks = (filepath.name, str(filepath.parent))
        procedure = next(
            (p for p in known_procedures if any(p in h for h in haystacks)),
            "",
        )

        return FileEntry(
            path=str(filepath.relative_to(base_dir)) if base_dir else str(filepath),
            size_bytes=filepath.stat().st_size,
            checksum=digest.hexdigest(),
            procedure=procedure,
        )
63
+
64
+
65
@dataclass
class DataManifest:
    """Dataset manifest for versioning and reproducibility.

    Attributes:
        version: Manifest format version.
        created_at: Creation timestamp (UTC, ISO 8601).
        root_dir: Root directory of the dataset.
        files: List of file entries.
        metadata: Additional dataset metadata.
    """

    version: str = "1.0"
    created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    root_dir: str = ""
    files: list[FileEntry] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def total_files(self) -> int:
        """Number of files recorded in the manifest."""
        return len(self.files)

    @property
    def total_size_bytes(self) -> int:
        """Total recorded size of all files, in bytes."""
        return sum(f.size_bytes for f in self.files)

    @property
    def total_size_mb(self) -> float:
        """Total recorded size of all files, in mebibytes."""
        return self.total_size_bytes / (1024 * 1024)

    @property
    def checksum(self) -> str:
        """Aggregate checksum over all per-file checksums.

        Entries are sorted by path before hashing, so the result is
        independent of scan order: two manifests over identical data
        always produce the same digest.
        """
        h = hashlib.md5()
        for f in sorted(self.files, key=lambda x: x.path):
            h.update(f"{f.path}:{f.checksum}:{f.size_bytes}".encode())
        return h.hexdigest()

    @property
    def by_procedure(self) -> dict[str, int]:
        """Count files grouped by procedure label ("unknown" when unset)."""
        counts: dict[str, int] = {}
        for f in self.files:
            key = f.procedure or "unknown"
            counts[key] = counts.get(key, 0) + 1
        return counts

    @staticmethod
    def from_directory(
        directory: str | Path,
        extensions: set[str] | None = None,
        include_patterns: list[str] | None = None,
    ) -> DataManifest:
        """Create manifest from a directory of dataset files.

        Args:
            directory: Path to dataset directory.
            extensions: File extensions to include (default: common image and
                numpy-array types). Ignored when ``include_patterns`` is given.
            include_patterns: Glob patterns (relative to ``directory``) to
                include.

        Returns:
            DataManifest with one entry per matching file.

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        directory = Path(directory)
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        if extensions is None:
            extensions = {".png", ".jpg", ".jpeg", ".webp", ".npy", ".npz"}

        files: list[FileEntry] = []
        if include_patterns:
            # Overlapping patterns can match the same file more than once;
            # dedupe so counts and the aggregate checksum stay correct.
            seen: set[str] = set()
            for pattern in include_patterns:
                for fp in sorted(directory.glob(pattern)):
                    if fp.is_file() and str(fp) not in seen:
                        seen.add(str(fp))
                        files.append(FileEntry.from_path(fp, base_dir=directory))
        else:
            for fp in sorted(directory.rglob("*")):
                if fp.is_file() and fp.suffix.lower() in extensions:
                    files.append(FileEntry.from_path(fp, base_dir=directory))

        return DataManifest(
            root_dir=str(directory),
            files=files,
            metadata={
                "extensions": sorted(extensions),
                "host": _get_hostname(),
            },
        )

    def save(self, path: str | Path) -> Path:
        """Save manifest to JSON.

        Derived values (checksum, totals, per-procedure counts) are embedded
        so the JSON is self-describing without re-reading the dataset.

        Args:
            path: Output file path (parent directories are created).

        Returns:
            Path to the saved manifest.
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "version": self.version,
            "created_at": self.created_at,
            "root_dir": self.root_dir,
            "checksum": self.checksum,
            "total_files": self.total_files,
            "total_size_mb": round(self.total_size_mb, 2),
            "by_procedure": self.by_procedure,
            "metadata": self.metadata,
            "files": [
                {
                    "path": f.path,
                    "size_bytes": f.size_bytes,
                    "checksum": f.checksum,
                    "procedure": f.procedure,
                }
                for f in self.files
            ],
        }

        # Explicit UTF-8 so non-ASCII paths round-trip regardless of the
        # platform's default locale encoding.
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)

        return path

    @staticmethod
    def load(path: str | Path) -> DataManifest:
        """Load manifest from JSON.

        Derived fields in the JSON (checksum, totals) are ignored; they are
        recomputed from the file entries on demand.

        Args:
            path: Path to manifest file.

        Returns:
            DataManifest instance.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        files = [
            FileEntry(
                path=fe["path"],
                size_bytes=fe["size_bytes"],
                checksum=fe["checksum"],
                procedure=fe.get("procedure", ""),
            )
            for fe in data.get("files", [])
        ]

        return DataManifest(
            version=data.get("version", "1.0"),
            created_at=data.get("created_at", ""),
            root_dir=data.get("root_dir", ""),
            files=files,
            metadata=data.get("metadata", {}),
        )

    def verify(self, directory: str | Path | None = None) -> tuple[bool, list[str]]:
        """Verify that on-disk data matches this manifest.

        Args:
            directory: Directory to verify (default: original ``root_dir``).

        Returns:
            ``(all_match, list_of_issues)`` — one issue string per missing
            file, size mismatch, or checksum mismatch.
        """
        directory = Path(directory or self.root_dir)
        issues: list[str] = []

        for entry in self.files:
            fp = directory / entry.path
            if not fp.exists():
                issues.append(f"Missing: {entry.path}")
                continue

            actual_size = fp.stat().st_size
            if actual_size != entry.size_bytes:
                issues.append(
                    f"Size mismatch: {entry.path} (expected {entry.size_bytes}, got {actual_size})"
                )

            # Checksum is checked even after a size mismatch so a single pass
            # reports every discrepancy for the file.
            h = hashlib.md5()
            with open(fp, "rb") as f:
                h.update(f.read(65536))
            if h.hexdigest() != entry.checksum:
                issues.append(f"Checksum mismatch: {entry.path}")

        return len(issues) == 0, issues

    def diff(self, other: DataManifest) -> dict[str, list[str]]:
        """Compare two manifests, treating ``self`` as the baseline.

        Returns:
            Dict with 'added', 'removed', 'modified' sorted path lists.
        """
        self_files = {f.path: f for f in self.files}
        other_files = {f.path: f for f in other.files}

        self_paths = set(self_files.keys())
        other_paths = set(other_files.keys())

        added = sorted(other_paths - self_paths)
        removed = sorted(self_paths - other_paths)
        modified = sorted(
            p for p in self_paths & other_paths if self_files[p].checksum != other_files[p].checksum
        )

        return {"added": added, "removed": removed, "modified": modified}

    def summary(self) -> str:
        """Return a human-readable multi-line summary of the manifest."""
        lines = [
            f"Dataset Manifest v{self.version}",
            f"  Root: {self.root_dir}",
            f"  Files: {self.total_files}",
            f"  Size: {self.total_size_mb:.1f} MB",
            f"  Checksum: {self.checksum}",
        ]
        procs = self.by_procedure
        if procs:
            lines.append("  By procedure:")
            for proc, count in sorted(procs.items()):
                lines.append(f"    {proc}: {count}")
        return "\n".join(lines)
292
+
293
+
294
+ def _get_hostname() -> str:
295
+ """Get hostname safely."""
296
+ try:
297
+ import socket
298
+
299
+ return socket.gethostname()
300
+ except Exception:
301
+ return "unknown"