finwave-wavefront 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: finwave-wavefront
3
+ Version: 0.1.0
4
+ Summary: Official Python client for fetching finwave datasets over the dataset-API handshake.
5
+ Project-URL: Homepage, https://operationalecology.io
6
+ Project-URL: Source, https://github.com/Operational-Ecology/Wavefront
7
+ Project-URL: finwave, https://finwave.io
8
+ Author-email: Alexander Barnhill <alex.c.barnhill@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: conservation,datasets,finwave,photo-identification,wildlife,yolo
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Scientific/Engineering
17
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
18
+ Requires-Python: >=3.9
19
+ Requires-Dist: httpx>=0.24
20
+ Provides-Extra: test
21
+ Requires-Dist: pytest>=7; extra == 'test'
22
+ Requires-Dist: respx>=0.20; extra == 'test'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # wavefront
26
+
27
+ The official Python client for **[Finwave](https://finwave.io)** datasets.
28
+
29
+ Finwave serves frozen, versioned wildlife photo-identification and detector
30
+ datasets behind a small handshake API. `wavefront` turns that into one call.
31
+
32
+ ```bash
33
+ pip install finwave-wavefront
34
+ ```
35
+
36
+ ## Quick start
37
+
38
+ ```python
39
+ import wavefront
40
+
41
+ # the API key is read from $FW_API_TOKEN (or passed as api_key=...)
42
+ ds = wavefront.fetch("a7673931-9810-4c52-9654-1c9b1fafb63d", format="yolo")
43
+
44
+ print(ds.path) # extracted, ready to train on
45
+ print(ds.classes) # ['fluke']
46
+ print(ds.num_images) # 497
47
+ print(ds.fingerprint) # content hash — record it next to any model you train
48
+ ```
49
+
50
+ `ds` is path-like, so it drops straight into a trainer:
51
+
52
+ ```python
53
+ from ultralytics import YOLO
54
+ YOLO("yolo11n.pt").train(data=f"{ds.path}/data.yaml")
55
+ ```
56
+
57
+ ### Pre-flight without downloading
58
+
59
+ ```python
60
+ m = wavefront.manifest("a7673931-9810-4c52-9654-1c9b1fafb63d")
61
+ print(m.name, m.sample_count, m.available_formats) # Flukes v1 497 ['Yolo']
62
+ ```
63
+
64
+ ### A reusable client
65
+
66
+ ```python
67
+ from wavefront import Client
68
+ client = Client(api_key="...", base_url="https://finwave.io")
69
+ ds = client.fetch(dataset_id, format="yolo", dest="./data/flukes")
70
+ ```
71
+
72
+ ### Command line
73
+
74
+ ```bash
75
+ export FW_API_TOKEN=...
76
+ wavefront manifest a7673931-9810-4c52-9654-1c9b1fafb63d
77
+ wavefront fetch a7673931-9810-4c52-9654-1c9b1fafb63d --format yolo --dest ./data/flukes
78
+ ```
79
+
80
+ ## How it works
81
+
82
+ 1. `GET /api/datasets-api/{id}/manifest` — cheap metadata + which export formats are ready.
83
+ 2. `GET /api/datasets-api/{id}?format=…` — a **handshake** that mints a short-lived signed download URL.
84
+ 3. Download that URL → a zip → extract → a `Dataset`.
85
+
86
+ Downloads are **cached by content fingerprint**, so re-fetching a frozen
87
+ version is a no-op. The key needs the dataset-download scope.
88
+
89
+ ## Authentication
90
+
91
+ Provide the key explicitly (`fetch(..., api_key=...)`) or set **`FW_API_TOKEN`**.
92
+ For compatibility, `WAVEFRONT_API_KEY`, `FINWAVE_DATASET_API_KEY` and
93
+ `DATASET_API_KEY` are also accepted (in that order).
94
+
95
+ ## Errors
96
+
97
+ All errors subclass `wavefront.WavefrontError`:
98
+
99
+ | Exception | When |
100
+ |---|---|
101
+ | `AuthError` | key missing / rejected (401/403) |
102
+ | `DatasetNotFoundError` | no such version, or not visible to the key (404) |
103
+ | `FormatNotAvailableError` | the version exists but that export hasn't been generated yet (`.available` lists what is) |
104
+ | `APIError` | any other non-success response |
105
+
106
+ ## License
107
+
108
+ MIT © Alexander Barnhill / [Operational Ecology](https://operationalecology.io).
109
+ A partnership artifact between finwave and Operational Ecology.
@@ -0,0 +1,11 @@
1
+ wavefront/__init__.py,sha256=TE1Xkoxce6mGQuRHuE44j8s7ZAfFsLEgxwapy2MRjnM,2353
2
+ wavefront/__main__.py,sha256=H4gQQ-7sbs7aNcuH9HkWJF48M14jBnz6FB9HLaYxpmo,3452
3
+ wavefront/_art.py,sha256=rS7Eg2VHcaAw1wzOoLF1OhSbi0XK7j9BywBzDIoaBOc,3768
4
+ wavefront/client.py,sha256=UQU6LP7HuhXJFVNL6lyLBftdUToCSI1CgnDtgLwU8vQ,11619
5
+ wavefront/exceptions.py,sha256=euSj98BqtPwLQjOn8g0vlYMwo650ilnA8KGPrcnv138,1612
6
+ wavefront/models.py,sha256=10XzDm33ND0N6R5LNBqOT9n-8bj9vbvTB46oWOKdPuc,3551
7
+ finwave_wavefront-0.1.0.dist-info/METADATA,sha256=YK2A9-pcAmUM3jsCsQTxX9so9HhmvGq_TNOdAttiaP4,3563
8
+ finwave_wavefront-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ finwave_wavefront-0.1.0.dist-info/entry_points.txt,sha256=ywBYOo2r3QbUAZJTAa9q-u1t3F_y9a60zqYykqahz5k,54
10
+ finwave_wavefront-0.1.0.dist-info/licenses/LICENSE,sha256=HOVL0nFm4EJpu4aftbrw6bElA9TYRozpE8QP5ZEsmjE,1097
11
+ finwave_wavefront-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ wavefront = wavefront.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Alexander Barnhill / Operational Ecology
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
wavefront/__init__.py ADDED
@@ -0,0 +1,69 @@
1
+ """wavefront — the official Python client for finwave datasets.
2
+
3
+ finwave (https://finwave.io) serves frozen, versioned wildlife photo-ID and
4
+ detector datasets behind a small handshake API. ``wavefront`` turns that into
5
+ one call:
6
+
7
+ >>> import wavefront
8
+ >>> ds = wavefront.fetch("a7673931-9810-4c52-9654-1c9b1fafb63d", format="yolo")
9
+ >>> ds.path, ds.classes, ds.num_images
10
+ (PosixPath('.../Yolo-81f97dec8667'), ['fluke'], 497)
11
+
12
+ The key is read from the ``FW_API_TOKEN`` environment variable (or passed
13
+ explicitly as ``api_key=``); ``WAVEFRONT_API_KEY``, ``FINWAVE_DATASET_API_KEY``
14
+ and ``DATASET_API_KEY`` are also accepted for compatibility. For repeated or
15
+ configured use, construct a :class:`Client`.
16
+
17
+ Every step logs on the ``wavefront`` logger. The library attaches a
18
+ ``NullHandler`` and never configures logging itself — enable output with
19
+ ``logging.basicConfig(level=logging.INFO)`` in your application.
20
+
21
+ Built by Operational Ecology (https://operationalecology.io).
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ from typing import Optional
27
+
28
+ from .client import API_KEY_ENV, DEFAULT_BASE_URL, Client
29
+
30
+ logging.getLogger("wavefront").addHandler(logging.NullHandler())
31
+ from .exceptions import (
32
+ APIError,
33
+ AuthError,
34
+ DatasetNotFoundError,
35
+ FormatNotAvailableError,
36
+ IntegrityError,
37
+ WavefrontError,
38
+ )
39
+ from .models import Dataset, Manifest
40
+
41
+ __version__ = "0.1.0"
42
+ __all__ = [
43
+ "fetch",
44
+ "manifest",
45
+ "Client",
46
+ "Dataset",
47
+ "Manifest",
48
+ "WavefrontError",
49
+ "AuthError",
50
+ "DatasetNotFoundError",
51
+ "FormatNotAvailableError",
52
+ "IntegrityError",
53
+ "APIError",
54
+ "DEFAULT_BASE_URL",
55
+ "API_KEY_ENV",
56
+ "__version__",
57
+ ]
58
+
59
+
60
def fetch(dataset_version_id: str, *, format: str = "yolo",
          api_key: Optional[str] = None, base_url: str = DEFAULT_BASE_URL, **kwargs) -> Dataset:
    """Download and extract one dataset version through a throwaway client.

    A new :class:`Client` is built from ``api_key`` and ``base_url``; any extra
    keyword arguments are forwarded unchanged to :meth:`Client.fetch`.
    """
    one_off = Client(api_key, base_url=base_url)
    return one_off.fetch(dataset_version_id, format=format, **kwargs)
64
+
65
+
66
def manifest(dataset_version_id: str, *,
             api_key: Optional[str] = None, base_url: str = DEFAULT_BASE_URL) -> Manifest:
    """Look up a dataset version's manifest through a throwaway client.

    A new :class:`Client` is built from ``api_key`` and ``base_url`` and its
    :meth:`Client.manifest` result is returned as-is.
    """
    one_off = Client(api_key, base_url=base_url)
    return one_off.manifest(dataset_version_id)
wavefront/__main__.py ADDED
@@ -0,0 +1,85 @@
1
+ """Command-line interface: ``wavefront fetch|manifest <id>``."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import logging
6
+ import sys
7
+
8
+ from . import __version__, _art
9
+ from .client import Client
10
+ from .exceptions import WavefrontError
11
+
12
+
13
def _fmt_bytes(n: int) -> str:
    """Format a byte count as a whole number with a B/KB/MB/GB suffix.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (GB) is reached, then rendered with no decimals and no space.
    """
    units = ("B", "KB", "MB", "GB")
    last = len(units) - 1
    value = n
    idx = 0
    while idx < last and not (value < 1024):
        value = value / 1024
        idx += 1
    return f"{value:.0f}{units[idx]}"
19
+
20
+
21
def main(argv=None) -> int:
    """Run the ``wavefront`` command-line interface.

    Parameters
    ----------
    argv:
        Argument list to parse. ``None`` lets ``argparse`` read the process
        arguments.

    Returns
    -------
    int
        ``0`` on success; ``1`` when a :class:`WavefrontError` was raised (its
        message is printed to stderr as ``error: ...``).

    Notes
    -----
    ``--quiet`` suppresses the wave animation, the download progress line and
    the post-fetch summary; the extracted path is still printed to stdout so
    the command stays scriptable.
    """
    p = argparse.ArgumentParser(prog="wavefront", description="Fetch finwave datasets.")
    p.add_argument("--version", action="version", version=f"wavefront {__version__}")
    p.add_argument("--api-key", default=None, help="overrides $FW_API_TOKEN")
    p.add_argument("--base-url", default=None, help="finwave base URL")
    p.add_argument("-v", "--verbose", action="store_true", help="debug-level logging")
    p.add_argument("-q", "--quiet", action="store_true", help="warnings and errors only")
    p.add_argument("--no-art", action="store_true", help="disable the wave animation")
    sub = p.add_subparsers(dest="cmd", required=False)

    m = sub.add_parser("manifest", help="print a version's metadata + formats")
    m.add_argument("dataset_version_id")

    f = sub.add_parser("fetch", help="download + extract a dataset version")
    f.add_argument("dataset_version_id")
    f.add_argument("--format", default="yolo")
    f.add_argument("--dest", default=None, help="extract dir (default: cache)")
    f.add_argument("--force", action="store_true", help="ignore cache")

    args = p.parse_args(argv)
    level = logging.WARNING if args.quiet else (logging.DEBUG if args.verbose else logging.INFO)
    logging.basicConfig(level=level, format="%(message)s", stream=sys.stderr)
    show_art = not (args.no_art or args.quiet)
    if args.cmd is None:  # bare `wavefront` → wave + wordmark
        if show_art:
            _art.banner()
        return 0
    kw = {}
    if args.base_url:
        kw["base_url"] = args.base_url
    try:
        client = Client(args.api_key, **kw)
        if args.cmd == "manifest":
            mf = client.manifest(args.dataset_version_id)
            print(f"{mf.name} (v{mf.version_number})")
            print(f" samples: {mf.sample_count} annotations: {mf.annotation_count}")
            print(f" formats: {mf.available_formats or '(none generated yet)'}")
            print(f" fingerprint: {mf.fingerprint}")
            return 0
        if args.cmd == "fetch":
            if show_art:
                _art.wave()
            last = [0.0]      # byte count at the previously drawn progress line
            drawn = [False]   # whether a progress line was written at all

            def prog(got, total):
                pct = f" {100*got/total:.0f}%" if total else ""
                if got - last[0] >= (1 << 23) or got == total:  # ~8MB steps
                    print(f"\r downloading {_fmt_bytes(got)}{pct}", end="", file=sys.stderr)
                    last[0] = got
                    drawn[0] = True

            ds = client.fetch(args.dataset_version_id, format=args.format,
                              dest=args.dest, force=args.force,
                              progress=None if args.quiet else prog)
            if drawn[0]:
                # Terminate the carriage-return progress line; nothing to
                # terminate on a cache hit or in quiet mode.
                print("", file=sys.stderr)
            print(ds.path)
            if not args.quiet:
                print(f" {ds.num_images} images, {ds.num_labels} labels, classes={ds.classes}",
                      file=sys.stderr)
            return 0
    except WavefrontError as e:
        print(f"error: {e}", file=sys.stderr)
        return 1
    return 0
82
+
83
+
84
+ if __name__ == "__main__":
85
+ raise SystemExit(main())
wavefront/_art.py ADDED
@@ -0,0 +1,107 @@
1
+ """A small finwave-blue wave flourish for the terminal. Purely cosmetic.
2
+
3
+ Animates a travelling, foam-tipped wave in the finwave palette. No-ops on
4
+ non-TTY streams, under ``NO_COLOR``, or for dumb terminals, so it never
5
+ corrupts piped or logged output.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import math
10
+ import os
11
+ import shutil
12
+ import sys
13
+ import time
14
+ from typing import Optional
15
+
16
+ # finwave palette, deep water → crest (mirrors the logo's blue gradient)
17
+ _GRAD = [
18
+ (14, 63, 133), (21, 88, 184), (31, 111, 230),
19
+ (59, 143, 255), (93, 158, 255), (130, 185, 255),
20
+ ]
21
+ _FOAM = (224, 238, 255)
22
+ _BLOCKS = " ▁▂▃▄▅▆▇█"
23
+
24
+
25
def supported(stream) -> bool:
    """Tell whether the animation may be drawn on ``stream``.

    Requires a stream whose ``isatty()`` is truthy, neither ``NO_COLOR`` nor
    ``WAVEFRONT_NO_ART`` set to a non-empty value, and a ``TERM`` that is
    neither empty nor ``dumb``.
    """
    if not hasattr(stream, "isatty"):
        return False
    is_tty = stream.isatty()
    if not is_tty:
        return is_tty
    env = os.environ
    if env.get("NO_COLOR"):
        return False
    if env.get("WAVEFRONT_NO_ART"):
        return False
    return env.get("TERM", "") not in ("", "dumb")
33
+
34
+
35
def _rgb(rgb) -> str:
    """Build the ANSI 24-bit foreground-colour escape for an ``(r, g, b)`` triple."""
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return f"\x1b[38;2;{red};{green};{blue}m"
37
+
38
+
39
def _surface(width: int, rows: int, phase: float) -> list[float]:
    """Compute the water height (0..rows, in cells) for each of ``width`` columns.

    The height is the sum of two sine waves travelling in opposite directions
    around a 0.52 baseline, clamped to the 0..1 range and scaled by ``rows``.
    """
    def height_at(x: int) -> float:
        primary = 0.30 * math.sin(x * 0.26 - phase)
        secondary = 0.14 * math.sin(x * 0.11 + phase * 0.6)
        level = 0.52 + primary + secondary
        return max(0.0, min(1.0, level)) * rows

    return [height_at(x) for x in range(width)]
46
+
47
+
48
def _render_frame(width: int, rows: int, phase: float) -> str:
    """Render one animation frame as ``rows`` newline-joined lines.

    For every column the water height from :func:`_surface` is compared with
    each row: rows above the water get a space, the row containing the water
    line gets a partial block in the foam colour, and rows fully under water
    get a full block in a gradient colour. Each line ends with an SGR reset.

    Parameters
    ----------
    width:
        Number of columns to draw.
    rows:
        Number of text rows in the frame (row 0 is the top).
    phase:
        Wave phase passed through to :func:`_surface`.
    """
    surf = _surface(width, rows, phase)
    last_shade = len(_GRAD) - 1
    lines = []
    for r in range(rows):  # r = 0 is the top row
        # Submerged cells are shaded by row index counted from the top of the
        # frame, clamped to the palette length.
        # NOTE(review): row 0 therefore uses _GRAD[0], which the palette
        # comment describes as the deep-water end — confirm this is the
        # intended direction of the gradient.
        shade = _GRAD[min(last_shade, r)]
        rows_below = rows - 1 - r  # whole cells of water that belong to lower rows
        cells = []
        for x in range(width):
            band = surf[x] - rows_below  # cells of water in this row (>1 = full)
            if band <= 0:
                cells.append(" ")
            elif band < 1.0:
                # Topmost filled cell of the column: partial block, foam colour.
                glyph = _BLOCKS[min(8, max(1, int(round(band * 8))))]
                cells.append(_rgb(_FOAM) + glyph)
            else:
                cells.append(_rgb(shade) + "█")
        lines.append("".join(cells) + "\x1b[0m")
    return "\n".join(lines)
69
+
70
+
71
def wave(stream=None, *, duration: float = 1.3, fps: int = 30,
         width: Optional[int] = None, rows: int = 4) -> None:
    """Play the wave flourish, then leave the terminal clean.

    Parameters
    ----------
    stream:
        Where to draw; defaults to ``sys.stderr``. Nothing is written when
        :func:`supported` rejects the stream.
    duration:
        Approximate animation length in seconds.
    fps:
        Frames per second. A non-positive value disables the animation.
    width:
        Columns to draw; defaults to the terminal width minus two. Always
        capped at 60 and never less than 1.
    rows:
        Height of the wave in text rows. Values below 1 disable the animation.
    """
    stream = stream or sys.stderr
    if not supported(stream) or rows < 1 or fps <= 0:
        return
    cols = shutil.get_terminal_size((80, 24)).columns
    # Clamp to at least one column so very narrow terminals still get a
    # well-formed (if tiny) frame instead of empty rows.
    w = max(1, min(width or cols - 2, 60))
    frames = max(1, int(duration * fps))
    # A zero parameter to "cursor up" is treated as 1 by terminals, so a
    # single-row wave must rewind with a bare carriage return only.
    rewind = f"\x1b[{rows - 1}A\r" if rows > 1 else "\r"
    stream.write("\x1b[?25l")  # hide cursor
    try:
        for f in range(frames):
            stream.write(_render_frame(w, rows, f / fps * 6.5))
            if f < frames - 1:
                stream.write(rewind)  # back to top of the wave
            stream.flush()
            time.sleep(1.0 / fps)
        stream.write("\n")
    finally:
        stream.write("\x1b[?25h\x1b[0m")  # restore cursor + reset
        stream.flush()
92
+
93
+
94
def banner(stream=None) -> None:
    """Show the wave followed by the wordmark line (bare ``wavefront`` command).

    On a stream that :func:`supported` accepts the wordmark is coloured;
    otherwise a plain-text line is written instead.
    """
    from . import __version__
    out = stream or sys.stderr
    wave(out, duration=1.1)
    if not supported(out):
        out.write(f"wavefront v{__version__} — finwave datasets, one call\n")
        return
    name_part = _rgb(_GRAD[3]) + " wavefront "
    version_part = _rgb(_GRAD[5]) + f"v{__version__}\x1b[0m "
    tagline = "\x1b[2mfinwave datasets, one call\x1b[0m\n"
    out.write(name_part + version_part + tagline)
104
+
105
+
106
+ if __name__ == "__main__": # quick visual check: `python -m wavefront._art`
107
+ wave(sys.stdout, duration=3.0)
wavefront/client.py ADDED
@@ -0,0 +1,277 @@
1
+ """The finwave dataset client.
2
+
3
+ The flow mirrors the finwave dataset API exactly:
4
+
5
+ 1. ``GET /api/datasets-api/{id}/manifest`` → cheap metadata + available formats
6
+ 2. ``GET /api/datasets-api/{id}?format=...`` → a *handshake* that mints a short-
7
+ lived signed download URL (no bytes yet)
8
+ 3. download the signed URL → a zip → extract → a :class:`~wavefront.models.Dataset`
9
+
10
+ Authentication is the ``X-API-KEY`` header; the key needs the dataset-download
11
+ scope. Downloads are cached by content fingerprint, so a repeated fetch of the
12
+ same frozen version is a no-op.
13
+
14
+ Every step emits an ``INFO`` log line on the ``wavefront`` logger so a caller
15
+ can see exactly what happened; the library installs a ``NullHandler`` and never
16
+ configures logging itself.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ import os
22
+ import shutil
23
+ import tempfile
24
+ import time
25
+ import zipfile
26
+ from pathlib import Path
27
+ from typing import Callable, Optional
28
+
29
+ import httpx
30
+
31
+ from .exceptions import (
32
+ APIError,
33
+ AuthError,
34
+ DatasetNotFoundError,
35
+ FormatNotAvailableError,
36
+ )
37
+ from .models import Dataset, Manifest
38
+
39
+ log = logging.getLogger("wavefront")
40
+
41
+ DEFAULT_BASE_URL = "https://finwave.io"
42
+ #: Environment variables consulted for the API key, in order. ``FW_API_TOKEN``
43
+ #: is the canonical name; the rest are accepted for compatibility.
44
+ API_KEY_ENV = ("FW_API_TOKEN", "WAVEFRONT_API_KEY", "FINWAVE_DATASET_API_KEY", "DATASET_API_KEY")
45
+ _FORMAT_ALIASES = {"yolo": "Yolo", "coco": "Coco", "pascalvoc": "PascalVoc", "voc": "PascalVoc"}
46
+ _COMPLETE_MARKER = ".wavefront-complete"
47
+
48
+
49
def _mask(key: str) -> str:
    """Return a safe-to-log fingerprint of a secret: never the secret itself.

    Keys of at least 12 characters are shown as their first three and last two
    characters plus the total length. Shorter keys are reported only as
    ``"set"``, because revealing five characters of a short key would expose
    most of it.
    """
    min_len = 12  # below this, a 3+2 character preview reveals too much
    if len(key) < min_len:
        return "set"
    return f"{key[:3]}…{key[-2:]} ({len(key)} chars)"
52
+
53
+
54
def _resolve_key(api_key: Optional[str]) -> tuple[str, str]:
    """Pick the API key to use and report where it came from.

    Returns ``(key, source)`` where ``source`` is ``"argument"`` for an
    explicitly passed key, or the name of the first variable in
    ``API_KEY_ENV`` that holds a non-empty value. Raises :class:`AuthError`
    when neither yields a key.
    """
    if api_key:
        return api_key, "argument"
    from_env = next(
        ((os.environ[name], name) for name in API_KEY_ENV if os.environ.get(name)),
        None,
    )
    if from_env is not None:
        return from_env
    fallbacks = ", ".join(API_KEY_ENV[1:])
    raise AuthError(
        "No API key provided. Pass api_key=... or set the FW_API_TOKEN "
        "environment variable (also accepted: " + fallbacks + ")."
    )
66
+
67
+
68
def _canonical_format(fmt: str) -> str:
    """Map a known, case-insensitive format alias to its canonical spelling.

    Names not present in ``_FORMAT_ALIASES`` are returned exactly as given.
    """
    key = fmt.lower()
    if key in _FORMAT_ALIASES:
        return _FORMAT_ALIASES[key]
    return fmt
70
+
71
+
72
def _default_cache_root() -> Path:
    """Locate the directory under which downloads are cached.

    ``$WAVEFRONT_CACHE`` wins when set to a non-empty value; otherwise the
    result is ``wavefront`` inside ``$XDG_CACHE_HOME``, falling back to
    ``~/.cache`` when that variable is unset or empty.
    """
    explicit = os.environ.get("WAVEFRONT_CACHE")
    if explicit:
        return Path(explicit)
    cache_home = os.environ.get("XDG_CACHE_HOME")
    if not cache_home:
        cache_home = os.path.join(os.path.expanduser("~"), ".cache")
    return Path(cache_home, "wavefront")
78
+
79
+
80
def _human_bytes(n: Optional[int]) -> str:
    """Render a byte count for log lines, e.g. ``"512 B"`` or ``"1.5 MB"``.

    A missing or zero count becomes ``"?"``. Plain bytes are shown without
    decimals; every larger unit (up to TB) is shown with one decimal.
    """
    if not n:
        return "?"
    units = ("B", "KB", "MB", "GB", "TB")
    top = len(units) - 1
    size = float(n)
    step = 0
    while step < top and not (size < 1024):
        size /= 1024
        step += 1
    if step == 0:
        return f"{size:.0f} B"
    return f"{size:.1f} {units[step]}"
89
+
90
+
91
class Client:
    """A reusable finwave dataset client.

    Parameters
    ----------
    api_key:
        Dataset-download-scoped key. If omitted, the ``FW_API_TOKEN`` environment
        variable is used (also accepted: ``WAVEFRONT_API_KEY``,
        ``FINWAVE_DATASET_API_KEY``, ``DATASET_API_KEY``).
    base_url:
        finwave base URL (default ``https://finwave.io``).
    timeout:
        Per-request timeout in seconds for the API calls (the large artifact
        download uses a longer, separate timeout).

    Raises
    ------
    AuthError
        At construction time, when no key is passed and none of the
        environment variables holds one.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        *,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 30.0,
    ) -> None:
        self.api_key, source = _resolve_key(api_key)
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        # Only a masked preview of the key is ever logged.
        log.info("wavefront client ready: base_url=%s, key from %s [%s]",
                 self.base_url, source, _mask(self.api_key))

    # ── low-level ────────────────────────────────────────────────────────────
    def _get(self, path: str, **kwargs) -> httpx.Response:
        """GET ``/api/datasets-api/{path}`` with the ``X-API-KEY`` header.

        Network-level failures become :class:`APIError`; HTTP 401/403 become
        :class:`AuthError`. Any other response is returned for the caller to
        interpret.
        """
        url = f"{self.base_url}/api/datasets-api/{path}"
        log.debug("GET %s %s", url, kwargs.get("params", ""))
        try:
            resp = httpx.get(url, headers={"X-API-KEY": self.api_key},
                             timeout=self.timeout, **kwargs)
        except httpx.HTTPError as e:  # network-level
            log.error("request to %s failed: %s", url, e)
            raise APIError(f"request to {url} failed: {e}") from e
        log.debug("→ HTTP %d (%s)", resp.status_code, _human_bytes(len(resp.content)))
        if resp.status_code in (401, 403):
            raise AuthError(
                "API key rejected (HTTP %d) — check the key and that it has the "
                "dataset-download scope." % resp.status_code
            )
        return resp

    @staticmethod
    def _error_payload(resp: httpx.Response) -> dict:
        """Best-effort parse of an error body; ``{}`` unless it is a JSON object."""
        try:
            body = resp.json()
        except ValueError:  # not JSON (or not decodable) — deliberately ignored
            return {}
        return body if isinstance(body, dict) else {}

    @staticmethod
    def _json_body(resp: httpx.Response, what: str) -> dict:
        """Parse a success body that must be a JSON object, else raise :class:`APIError`."""
        try:
            body = resp.json()
        except ValueError as e:
            raise APIError(f"{what} response was not valid JSON",
                           status_code=resp.status_code) from e
        if not isinstance(body, dict):
            raise APIError(f"{what} response was not a JSON object",
                           status_code=resp.status_code)
        return body

    # ── public API ───────────────────────────────────────────────────────────
    def manifest(self, dataset_version_id: str) -> Manifest:
        """Return version metadata + available export formats (no download).

        Raises
        ------
        DatasetNotFoundError
            On HTTP 404.
        AuthError
            On HTTP 401/403.
        APIError
            On any other non-200 response, a network failure, or a body that
            is not a JSON object.
        """
        log.info("manifest: requesting %s", dataset_version_id)
        resp = self._get(f"{dataset_version_id}/manifest")
        if resp.status_code == 404:
            raise DatasetNotFoundError(
                f"dataset version {dataset_version_id!r} not found (or not visible to this key)"
            )
        if resp.status_code != 200:
            raise APIError("manifest request failed", status_code=resp.status_code,
                           payload=self._error_payload(resp))
        m = Manifest.from_response(self._json_body(resp, "manifest"))
        log.info("manifest: '%s' v%d — %d samples, %d annotations, formats=%s",
                 m.name, m.version_number, m.sample_count, m.annotation_count,
                 m.available_formats or "none yet")
        return m

    def fetch(
        self,
        dataset_version_id: str,
        *,
        format: str = "yolo",
        dest: Optional[os.PathLike] = None,
        cache: bool = True,
        force: bool = False,
        progress: Optional[Callable[[int, Optional[int]], None]] = None,
    ) -> Dataset:
        """Fetch + extract a dataset version, returning a :class:`Dataset`.

        Parameters
        ----------
        format:
            Export format, case-insensitive (``"yolo"`` by default).
        dest:
            Directory to extract into. Defaults to the fingerprint-keyed cache.
            WARNING: on a (re-)download every existing entry inside this
            directory is deleted before the archive is extracted.
        cache:
            Reuse a previously-completed download of the same frozen fingerprint.
        force:
            Re-download even if a cached copy exists.
        progress:
            Optional callback ``(bytes_downloaded, total_or_None)`` for the
            artifact download.

        Raises
        ------
        FormatNotAvailableError
            The manifest (or the handshake) says the format is not generated.
        DatasetNotFoundError, AuthError
            As for :meth:`manifest`.
        APIError
            Handshake or download failure, or a downloaded artifact that is
            not a valid zip archive.
        """
        fmt = _canonical_format(format)
        log.info("fetch: %s (format=%s)", dataset_version_id, fmt)
        m = self.manifest(dataset_version_id)
        if not m.has_format(fmt):
            raise FormatNotAvailableError(
                f"format {fmt!r} is not available for '{m.name}'. "
                f"Available: {m.available_formats or 'none yet — an export must be generated'}.",
                available=m.available_formats,
            )

        if dest is not None:
            out = Path(dest)
        else:
            out = _default_cache_root() / f"{dataset_version_id}" / f"{fmt}-{m.fingerprint[:12]}"

        marker = out / _COMPLETE_MARKER
        if cache and not force and marker.exists() and marker.read_text().strip() == m.fingerprint:
            ds = Dataset.from_extracted(root=out, manifest=m, fmt=fmt)
            log.info("fetch: cache hit (fingerprint %s) → %s [%d images]",
                     m.fingerprint[:12], out, ds.num_images)
            return ds

        log.info("fetch: requesting download handshake…")
        download_url = self._handshake(dataset_version_id, fmt)
        out.mkdir(parents=True, exist_ok=True)
        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
            tmp_path = Path(tmp.name)
        try:
            self._download(download_url, tmp_path, progress=progress)
            log.info("fetch: extracting to %s", out)
            try:
                # Opening the archive first validates it, so a corrupt
                # download is rejected before any existing content is removed.
                with zipfile.ZipFile(tmp_path) as zf:
                    # Drop the completion marker before touching the contents:
                    # if extraction fails part-way, a surviving marker would
                    # make the next fetch report a false cache hit.
                    marker.unlink(missing_ok=True)
                    for child in out.iterdir():
                        if child.is_dir() and not child.is_symlink():
                            shutil.rmtree(child)
                        else:
                            child.unlink()
                    zf.extractall(out)
            except zipfile.BadZipFile as e:
                raise APIError(f"downloaded artifact is not a valid zip archive: {e}") from e
        finally:
            tmp_path.unlink(missing_ok=True)
        marker.write_text(m.fingerprint)
        ds = Dataset.from_extracted(root=out, manifest=m, fmt=fmt)
        log.info("fetch: ready → %s [%d images, %d labels, classes=%s]",
                 out, ds.num_images, ds.num_labels, ds.classes)
        return ds

    # ── internals ────────────────────────────────────────────────────────────
    def _handshake(self, dataset_version_id: str, fmt: str) -> str:
        """Request a download for ``fmt`` and return the signed URL from the response.

        A 404 whose ``detail`` mentions "format" is reported as
        :class:`FormatNotAvailableError`; any other 404 as
        :class:`DatasetNotFoundError`; other failures as :class:`APIError`.
        """
        resp = self._get(dataset_version_id, params={"format": fmt})
        if resp.status_code == 404:
            payload = self._error_payload(resp)
            detail = str(payload.get("detail") or "").lower()
            if "format" in detail:
                raise FormatNotAvailableError(
                    f"format {fmt!r} has not been produced for this version yet "
                    "(exports are generated separately from freezing)."
                )
            raise DatasetNotFoundError(f"dataset version {dataset_version_id!r} not found")
        if resp.status_code != 200:
            raise APIError("handshake failed", status_code=resp.status_code,
                           payload=self._error_payload(resp))
        body = self._json_body(resp, "handshake")
        url = body.get("downloadUrl")
        if not url:
            raise APIError("handshake response had no downloadUrl",
                           status_code=resp.status_code, payload=body)
        log.info("handshake: signed URL minted (expires %s)",
                 body.get("sasExpiresAt", "soon"))
        return url

    def _download(self, url: str, dest: Path, *,
                  progress: Optional[Callable[[int, Optional[int]], None]] = None) -> None:
        """Stream ``url`` into the file ``dest``, reporting progress per chunk.

        Raises :class:`APIError` on a non-200 status or a network-level
        failure, matching how :meth:`_get` reports errors.
        """
        # The download URL is a pre-signed object URL — no API key, long timeout.
        t0 = time.monotonic()
        got = 0
        try:
            with httpx.stream("GET", url, timeout=httpx.Timeout(None, connect=30.0),
                              follow_redirects=True) as resp:
                if resp.status_code != 200:
                    raise APIError(f"artifact download failed (HTTP {resp.status_code})",
                                   status_code=resp.status_code)
                try:
                    total = int(resp.headers.get("Content-Length", 0)) or None
                except ValueError:  # malformed header: size is simply unknown
                    total = None
                log.info("download: %s …", _human_bytes(total))
                with open(dest, "wb") as f:
                    for chunk in resp.iter_bytes(1 << 20):
                        f.write(chunk)
                        got += len(chunk)
                        if progress is not None:
                            progress(got, total)
        except httpx.HTTPError as e:  # network-level
            # The signed URL is itself a credential, so it is kept out of the
            # log line and the error message.
            log.error("artifact download failed: %s", e)
            raise APIError(f"artifact download failed: {e}") from e
        dt = time.monotonic() - t0
        rate = got / dt / (1 << 20) if dt > 0 else 0
        log.info("download: %s in %.1fs (%.0f MB/s)", _human_bytes(got), dt, rate)
@@ -0,0 +1,49 @@
1
+ """Exceptions raised by wavefront.
2
+
3
+ All inherit from :class:`WavefrontError`, so callers can catch the whole family
4
+ with a single ``except WavefrontError``.
5
+ """
6
+ from __future__ import annotations
7
+
8
+
9
class WavefrontError(Exception):
    """Root of the wavefront exception hierarchy; catch this to handle them all."""
11
+
12
+
13
class AuthError(WavefrontError):
    """Raised when no API key is available or the server rejects it (HTTP 401/403)."""
15
+
16
+
17
class DatasetNotFoundError(WavefrontError):
    """Raised when the requested dataset version id is not visible to this key (HTTP 404)."""
19
+
20
+
21
class FormatNotAvailableError(WavefrontError):
    """Raised when the requested export format is not ready for this version.

    The dataset version exists, but the requested format is not among its
    available formats. :attr:`available` carries the formats that were
    reported as ready (an empty list when none were supplied).
    """

    def __init__(self, message: str, *, available: list[str] | None = None) -> None:
        super().__init__(message)
        self.available = available if available else []
32
+
33
+
34
class IntegrityError(WavefrontError):
    """Raised when a downloaded artifact does not match the server-declared fingerprint."""
36
+
37
+
38
class APIError(WavefrontError):
    """Raised for an unexpected, non-success outcome of a finwave API call.

    :attr:`status_code` holds the HTTP status when one is known, and
    :attr:`payload` holds the parsed error body (an empty dict when absent).
    """

    def __init__(self, message: str, *, status_code: int | None = None,
                 payload: dict | None = None) -> None:
        super().__init__(message)
        self.status_code = status_code
        self.payload = payload if payload else {}
wavefront/models.py ADDED
@@ -0,0 +1,111 @@
1
+ """Typed views over the finwave dataset-API responses and the local result.
2
+
3
+ These are thin, read-only dataclasses; the wire shapes they mirror are the
4
+ ``/manifest`` and handshake responses of ``/api/datasets-api/{id}``.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ from dataclasses import dataclass, field
10
+ from datetime import datetime
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+
15
def _parse_dt(value: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 timestamp, tolerating a ``Z`` UTC suffix.

    Returns ``None`` for a missing/empty value or one that
    ``datetime.fromisoformat`` rejects.
    """
    if not value:
        return None
    try:
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    except ValueError:
        parsed = None
    return parsed
22
+
23
+
24
@dataclass(frozen=True)
class Manifest:
    """Pre-flight metadata describing one dataset version (nothing is downloaded)."""

    dataset_version_id: str
    parent_dataset_id: str
    name: str
    version_number: int
    fingerprint: str
    sample_count: int
    annotation_count: int
    includes_negatives: bool
    available_formats: list[str]
    frozen_at: Optional[datetime] = None
    # The unmodified response dict; excluded from repr.
    raw: dict = field(default_factory=dict, repr=False)

    @classmethod
    def from_response(cls, d: dict) -> "Manifest":
        """Build a manifest from the camelCase ``/manifest`` response dict.

        Missing text fields become ``""``, missing or null counts become ``0``
        and a missing or null format list becomes ``[]``.
        """
        def text(key: str):
            return d.get(key, "")

        def count(key: str) -> int:
            return int(d.get(key, 0) or 0)

        formats = d.get("availableFormats", []) or []
        return cls(
            dataset_version_id=text("datasetVersionId"),
            parent_dataset_id=text("parentDatasetId"),
            name=text("name"),
            version_number=count("versionNumber"),
            fingerprint=text("fingerprint"),
            sample_count=count("sampleCount"),
            annotation_count=count("annotationCount"),
            includes_negatives=bool(d.get("includesNegatives", False)),
            available_formats=list(formats),
            frozen_at=_parse_dt(d.get("frozenAt")),
            raw=d,
        )

    def has_format(self, fmt: str) -> bool:
        """Case-insensitively test whether ``fmt`` is among the available formats."""
        wanted = fmt.lower()
        known = [f.lower() for f in self.available_formats]
        return wanted in known
58
+
59
+
60
@dataclass(frozen=True)
class Dataset:
    """A fetched, extracted dataset on local disk.

    ``fingerprint`` is the server's content hash for this exact version — record
    it alongside any model you train so the data is traceable. Instances are
    path-like (``os.fspath(ds)`` and ``str(ds)`` both give the extraction
    directory).
    """

    id: str
    name: str
    version: int
    fmt: str
    fingerprint: str
    path: Path
    classes: list[str] = field(default_factory=list)
    num_images: int = 0
    num_labels: int = 0

    def __fspath__(self) -> str:  # usable directly as a path
        return str(self.path)

    def __str__(self) -> str:
        return str(self.path)

    @property
    def images_dir(self) -> Path:
        """``<path>/images`` (not checked for existence)."""
        return self.path / "images"

    @property
    def labels_dir(self) -> Path:
        """``<path>/labels`` (not checked for existence)."""
        return self.path / "labels"

    @classmethod
    def from_extracted(cls, *, root: Path, manifest: Manifest, fmt: str) -> "Dataset":
        """Describe an already-extracted dataset directory.

        Parameters
        ----------
        root:
            Directory the archive was extracted into.
        manifest:
            Source of the id, name, version number and fingerprint.
        fmt:
            Export format name to record on the result.

        Returns
        -------
        Dataset
            With ``num_images`` = files under ``root`` with an image suffix,
            ``num_labels`` = ``.txt`` files whose directory *below* ``root``
            contains "label" in its name, and ``classes`` read from a
            ``classes.txt`` file when one exists.
        """
        root = Path(root)
        exts = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp")
        # Only regular files count; a directory that happens to be named
        # "something.jpg" is not an image.
        imgs = [p for p in root.rglob("*") if p.suffix.lower() in exts and p.is_file()]
        # Match "label" against the path relative to root, so that a cache or
        # destination directory whose own name contains "label" does not turn
        # every .txt file (classes.txt included) into a label file.
        labels = [
            p for p in root.rglob("*.txt")
            if p.is_file() and "label" in p.parent.relative_to(root).as_posix().lower()
        ]
        classes: list[str] = []
        # Prefer the shallowest classes.txt, with a stable tie-break, so the
        # result does not depend on directory traversal order.
        candidates = sorted(
            (p for p in root.rglob("classes.txt") if p.is_file()),
            key=lambda p: (len(p.parts), str(p)),
        )
        if candidates:
            content = candidates[0].read_text(encoding="utf-8", errors="replace")
            classes = [ln.strip() for ln in content.splitlines() if ln.strip()]
        return cls(
            id=manifest.dataset_version_id,
            name=manifest.name,
            version=manifest.version_number,
            fmt=fmt,
            fingerprint=manifest.fingerprint,
            path=root,
            classes=classes,
            num_images=len(imgs),
            num_labels=len(labels),
        )
+ )