cortexforge 0.1.0__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {cortexforge-0.1.0 → cortexforge-0.1.3}/PKG-INFO +1 -1
  2. {cortexforge-0.1.0 → cortexforge-0.1.3}/README.md +5 -10
  3. {cortexforge-0.1.0 → cortexforge-0.1.3}/pyproject.toml +5 -3
  4. cortexforge-0.1.3/src/cortexforge/cli/__init__.py +60 -0
  5. cortexforge-0.1.3/src/cortexforge/cli/datasets.py +56 -0
  6. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/cli/forge.py +8 -8
  7. cortexforge-0.1.3/src/cortexforge/cli/planner.py +88 -0
  8. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/datasets/__init__.py +4 -0
  9. cortexforge-0.1.3/src/cortexforge/datasets/api.py +366 -0
  10. cortexforge-0.1.3/src/cortexforge/datasets/main.py +77 -0
  11. cortexforge-0.1.3/src/cortexforge/datasets/registry.json +15 -0
  12. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/main.py +5 -2
  13. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/planner/generators/experiment_scenario.py +77 -36
  14. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/planner/main.py +15 -6
  15. cortexforge-0.1.3/src/cortexforge/planner/modulations.py +26 -0
  16. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge.egg-info/PKG-INFO +1 -1
  17. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge.egg-info/SOURCES.txt +4 -0
  18. cortexforge-0.1.3/src/cortexforge.egg-info/entry_points.txt +2 -0
  19. cortexforge-0.1.0/src/cortexforge/cli/planner.py +0 -45
  20. cortexforge-0.1.0/src/cortexforge/datasets/api.py +0 -88
  21. cortexforge-0.1.0/src/cortexforge/utils/__init__.py +0 -0
  22. cortexforge-0.1.0/src/cortexforge.egg-info/entry_points.txt +0 -3
  23. {cortexforge-0.1.0 → cortexforge-0.1.3}/setup.cfg +0 -0
  24. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/datasets/hash.py +0 -0
  25. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/datasets/local.py +0 -0
  26. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/datasets/manifest.py +0 -0
  27. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/datasets/types.py +0 -0
  28. {cortexforge-0.1.0/src/cortexforge/cli → cortexforge-0.1.3/src/cortexforge/forge}/__init__.py +0 -0
  29. {cortexforge-0.1.0/src/cortexforge/forge → cortexforge-0.1.3/src/cortexforge/forge/radio}/__init__.py +0 -0
  30. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/rx.py +0 -0
  31. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/rx_recorder.py +0 -0
  32. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/tx.py +0 -0
  33. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/tx_burst.py +0 -0
  34. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/waveforms.py +0 -0
  35. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/waveforms_analog.py +0 -0
  36. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/radio/waveforms_numerique.py +0 -0
  37. {cortexforge-0.1.0/src/cortexforge/forge/radio → cortexforge-0.1.3/src/cortexforge/forge/utils}/__init__.py +0 -0
  38. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/compute_baseline.py +0 -0
  39. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/load_timeline.py +0 -0
  40. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/node_identity.py +0 -0
  41. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/node_layout.py +0 -0
  42. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sigmf/hash.py +0 -0
  43. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sigmf/sigmf_annotations.py +0 -0
  44. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sigmf/sigmf_captures.py +0 -0
  45. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sigmf/sigmf_global.py +0 -0
  46. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sigmf_writer.py +0 -0
  47. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sync_barrier/rx_barrier_server.py +0 -0
  48. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sync_barrier/sync_config.py +0 -0
  49. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/sync_barrier/tx_barrier_client.py +0 -0
  50. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/forge/utils/uhd_time.py +0 -0
  51. {cortexforge-0.1.0/src/cortexforge/forge/utils → cortexforge-0.1.3/src/cortexforge/planner}/__init__.py +0 -0
  52. {cortexforge-0.1.0/src/cortexforge/planner → cortexforge-0.1.3/src/cortexforge/planner/generators}/__init__.py +0 -0
  53. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/planner/generators/cortexlab_scenario.py +0 -0
  54. {cortexforge-0.1.0/src/cortexforge/planner/generators → cortexforge-0.1.3/src/cortexforge/utils}/__init__.py +0 -0
  55. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/utils/loader.py +0 -0
  56. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge/utils/logger.py +0 -0
  57. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge.egg-info/dependency_links.txt +0 -0
  58. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge.egg-info/requires.txt +0 -0
  59. {cortexforge-0.1.0 → cortexforge-0.1.3}/src/cortexforge.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: cortexforge
3
- Version: 0.1.0
3
+ Version: 0.1.3
4
4
  Summary: RF generator dataset based on SLICES/CorteXlab
5
5
  Author-email: Andrea Joly <andrea.joly@inria.fr>
6
6
  Requires-Python: >=3.10
@@ -1,13 +1,14 @@
1
1
  # CorteXForge
2
- This project is a framework designed to automate the generation and execution of radio dataset experiments on the [Slices/CorteXlab](https://www.cortexlab.fr/doku.php?id=start) testbed.
2
+ This project is a framework designed to automate the generation and execution of radio dataset experiments on the [SLICES-RI/CorteXlab](https://www.cortexlab.fr/doku.php?id=start) testbed.
3
3
  It relies on the [GNU Radio](https://www.gnuradio.org) environment to record labeled transmissions of various signals.
4
4
 
5
5
  ## Overview
6
- This project is organized into two main components:
6
+ This project is organized into three main components:
7
7
  - Scenario generation: this part produces configuration files describing the experiment setup. It creates:
8
8
  - a `scenario.yaml` file defining which nodes will be used on CorteXlab;
9
9
  - a `timeline.csv` file orchestrating the role and sequence of these nodes.
10
- - Experiment execution: this part deploys and executes the generates experiment definitions (`timeline.csv`) directly on the [Slices/CorteXlab](https://www.cortexlab.fr/doku.php?id=start) nodes.
10
+ - Experiment execution: this part deploys and executes the generated experiment definitions (`timeline.csv`) directly on the [SLICES-RI/CorteXlab](https://www.cortexlab.fr/doku.php?id=start) nodes.
11
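+ - Dataset API: this part lists, downloads, and loads the published datasets described in the package's embedded registry.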
11
12
 
12
13
  ## Quick start (User Guide) :rocket:
13
14
 
@@ -21,7 +22,7 @@ The scenario generator can be executed locally before deployment in Slices/Cort
21
22
  ### Example usage
22
23
  - ```git clone https://github.com/Andreaj42/CorteXForge.git```
23
24
  - ```python3.13 -m venv .venv```
24
- - ```source .venv/bin/activate```
25
+ - ```. .venv/bin/activate```
25
26
  - ```pip install -e .[planner]```
26
27
  - ```cortexforge-planner --nodes-path configs/nodes.yaml --duration 600 --output-path my/path/on/cortexlab```
27
28
 
@@ -49,12 +50,6 @@ To monitor your experiment, use:
49
50
  - ```minus testbed status```
50
51
  - ```minus log -d```
51
52
 
52
- ## Developer Guide :hammer_and_wrench:
53
-
54
- Now clone the project:
55
- - ```git clone https://github.com/Andreaj42/CorteXForge.git```
56
- - ```cd forge```
57
-
58
53
 
59
54
  ### Docker Images :whale:
60
55
  To simplify deployment and ensure reproducibility, we generated a Docker image.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "cortexforge"
7
- version = "0.1.0"
7
+ version = "0.1.3"
8
8
  description = "RF generator dataset based on SLICES/CorteXlab"
9
9
  authors = [
10
10
  { name = "Andrea Joly", email = "andrea.joly@inria.fr" }
@@ -31,11 +31,13 @@ forge = [
31
31
 
32
32
 
33
33
  [project.scripts]
34
- cortexforge-planner = "cortexforge.planner.main:main"
35
- cortexforge-forge = "cortexforge.forge.main:main"
34
+ cortexforge = "cortexforge.cli:main"
36
35
 
37
36
  [tool.setuptools]
38
37
  package-dir = {"" = "src"}
39
38
 
40
39
  [tool.setuptools.packages.find]
41
40
  where = ["src"]
41
+
42
+ [tool.setuptools.package-data]
43
+ "cortexforge" = ["**/*.json"]
@@ -0,0 +1,60 @@
1
+ """Unified CLI entrypoint for CorteXForge."""
2
+
3
+ from argparse import ArgumentParser, Namespace
4
+
5
+ from cortexforge.cli.datasets import configure_parser as configure_datasets_parser
6
+ from cortexforge.cli.forge import configure_parser as configure_forge_parser
7
+ from cortexforge.cli.planner import configure_parser as configure_planner_parser
8
+
9
+
10
+ def build_parser() -> ArgumentParser:
11
+ """Build the root CLI parser."""
12
+ parser = ArgumentParser(prog="cortexforge", description="CorteXForge command line")
13
+ sub = parser.add_subparsers(
14
+ dest="command",
15
+ required=True,
16
+ help="CorteXForge subcommand",
17
+ )
18
+
19
+ configure_planner_parser(
20
+ sub.add_parser("planner", help="Generate scenarios and experiment files")
21
+ )
22
+ configure_forge_parser(
23
+ sub.add_parser(
24
+ "forge", help="Run transmitter or receiver commands on CorteXlab"
25
+ )
26
+ )
27
+ configure_datasets_parser(
28
+ sub.add_parser("datasets", help="Inspect available datasets")
29
+ )
30
+ return parser
31
+
32
+
33
+ def parse_args(argv: list[str] | None = None) -> Namespace:
34
+ """Parse root CLI arguments."""
35
+ return build_parser().parse_args(argv)
36
+
37
+
38
+ def main(argv: list[str] | None = None) -> None:
39
+ """Dispatch to the selected CorteXForge subcommand."""
40
+ args = parse_args(argv)
41
+
42
+ if args.command == "planner":
43
+ from cortexforge.planner.main import run as planner_run
44
+
45
+ planner_run(args)
46
+ return
47
+
48
+ if args.command == "forge":
49
+ from cortexforge.forge.main import run as forge_run
50
+
51
+ forge_run(args)
52
+ return
53
+
54
+ if args.command == "datasets":
55
+ from cortexforge.datasets.main import run as datasets_run
56
+
57
+ datasets_run(args)
58
+ return
59
+
60
+ raise ValueError(f"Unknown command: {args.command}")
@@ -0,0 +1,56 @@
1
+ """CLI argument parser for CorteXForge datasets."""
2
+
3
+ from argparse import ArgumentParser, Namespace
4
+ from pathlib import Path
5
+
6
+
7
+ def configure_parser(parser: ArgumentParser) -> ArgumentParser:
8
+ """Attach datasets-specific arguments to an existing parser."""
9
+ parser.add_argument(
10
+ "--root",
11
+ type=Path,
12
+ default=Path("datasets"),
13
+ help="Path to the datasets root directory",
14
+ )
15
+
16
+ sub = parser.add_subparsers(
17
+ dest="datasets_command",
18
+ required=True,
19
+ help="Dataset operation to execute",
20
+ )
21
+
22
+ sub.add_parser("list", help="List available datasets and their versions")
23
+
24
+ download = sub.add_parser(
25
+ "download",
26
+ help="Download and extract a dataset version from the embedded registry",
27
+ )
28
+ download.add_argument("name", type=str, help="Dataset name")
29
+ download.add_argument(
30
+ "--version",
31
+ type=str,
32
+ default=None,
33
+ help="Dataset version to download (defaults to latest)",
34
+ )
35
+ download.add_argument(
36
+ "--force",
37
+ action="store_true",
38
+ help="Re-download and overwrite an existing local dataset version",
39
+ )
40
+
41
+ return parser
42
+
43
+
44
+ def build_parser() -> ArgumentParser:
45
+ """Build and configure the standalone datasets parser."""
46
+ parser = ArgumentParser(
47
+ prog="cortexforge datasets",
48
+ description="Inspect available datasets",
49
+ )
50
+ return configure_parser(parser)
51
+
52
+
53
+ def parse_args(argv: list[str] | None = None) -> Namespace:
54
+ """Parse command line arguments."""
55
+ parser = build_parser()
56
+ return parser.parse_args(argv)
@@ -4,14 +4,8 @@ from argparse import ArgumentParser, Namespace
4
4
  from pathlib import Path
5
5
 
6
6
 
7
- def build_parser() -> ArgumentParser:
8
- """
9
- Build and configure the cli argument parser for CorteXForge forge.
10
-
11
- Returns:
12
- ArgumentParser: Configured argument parser instance with
13
- """
14
- parser = ArgumentParser(prog="CorteXForge forge", description="Dataset Generator")
7
+ def configure_parser(parser: ArgumentParser) -> ArgumentParser:
8
+ """Attach forge-specific arguments to an existing parser."""
15
9
  sub = parser.add_subparsers(
16
10
  dest="role",
17
11
  required=True,
@@ -52,6 +46,12 @@ def build_parser() -> ArgumentParser:
52
46
  return parser
53
47
 
54
48
 
49
+ def build_parser() -> ArgumentParser:
50
+ """Build and configure the standalone forge parser."""
51
+ parser = ArgumentParser(prog="cortexforge forge", description="Dataset Generator")
52
+ return configure_parser(parser)
53
+
54
+
55
55
  def parse_args(argv: list[str] | None = None) -> Namespace:
56
56
  """Parse command line arguments."""
57
57
  parser = build_parser()
@@ -0,0 +1,88 @@
1
+ """CLI argument parser for CorteXForge planner."""
2
+
3
+ from argparse import Action, ArgumentParser, Namespace
4
+ from pathlib import Path
5
+
6
+ from cortexforge.planner.modulations import DEFAULT_MODULATIONS
7
+
8
+
9
+ def _split_modulations(value: str) -> list[str]:
10
+ return [item.strip().upper() for item in value.split(",") if item.strip()]
11
+
12
+
13
+ class _ModulationAction(Action):
14
+ """Parse comma-separated and repeated modulation values into a flat list."""
15
+
16
+ def __call__(self, parser, namespace, values, option_string=None):
17
+ modulations = list(getattr(namespace, self.dest, None) or [])
18
+ for value in values:
19
+ modulations.extend(_split_modulations(value))
20
+ unknown = sorted(set(modulations) - set(DEFAULT_MODULATIONS))
21
+ if unknown:
22
+ parser.error(
23
+ f"unsupported modulation(s): {', '.join(unknown)}. "
24
+ f"Supported values are: {', '.join(DEFAULT_MODULATIONS)}"
25
+ )
26
+ setattr(namespace, self.dest, modulations)
27
+
28
+
29
+ def configure_parser(parser: ArgumentParser) -> ArgumentParser:
30
+ """Attach planner-specific arguments to an existing parser."""
31
+ parser.add_argument(
32
+ "--username", required=True, type=str, help="username on CorteXlab"
33
+ )
34
+ parser.add_argument(
35
+ "--duration", type=int, default=60, help="Experiment duration in seconds"
36
+ )
37
+ parser.add_argument(
38
+ "--rx-frequency", type=int, default=2450000000, help="Receiver frequency"
39
+ )
40
+ parser.add_argument("--rx-gain", type=int, default=10, help="Receiver gain")
41
+ parser.add_argument(
42
+ "--rx-sample-rate", type=int, default=250000, help="Receiver sample-rate"
43
+ )
44
+ parser.add_argument(
45
+ "--overlapping",
46
+ action="store_true",
47
+ help="Allow overlapping signals in timeline",
48
+ )
49
+ parser.add_argument(
50
+ "--n-signals",
51
+ type=int,
52
+ default=288,
53
+ help=(
54
+ "Number of signals to generate. Must be a multiple of the selected "
55
+ "modulation count."
56
+ ),
57
+ )
58
+ parser.add_argument(
59
+ "--nodes-path",
60
+ type=Path,
61
+ default="configs/nodes.yaml",
62
+ help="Path to nodes.yaml file",
63
+ )
64
+ parser.add_argument(
65
+ "--modulations",
66
+ action=_ModulationAction,
67
+ nargs="+",
68
+ metavar="MOD",
69
+ default=None,
70
+ help=(
71
+ "Modulations to include in the generated dataset. "
72
+ "Use a space-separated list or comma-separated values. "
73
+ "Defaults to all planner modulations."
74
+ ),
75
+ )
76
+ return parser
77
+
78
+
79
+ def build_parser() -> ArgumentParser:
80
+ """Build and configure the standalone planner parser."""
81
+ parser = ArgumentParser(prog="cortexforge planner", description="Dataset Generator")
82
+ return configure_parser(parser)
83
+
84
+
85
+ def parse_args(argv: list[str] | None = None) -> Namespace:
86
+ """Parse command line arguments."""
87
+ parser = build_parser()
88
+ return parser.parse_args(argv)
@@ -1,6 +1,8 @@
1
1
  from .api import (
2
2
  describe_dataset,
3
+ download_dataset,
3
4
  list_datasets,
5
+ list_registry_datasets,
4
6
  list_datasets_with_versions,
5
7
  list_versions,
6
8
  load_dataset,
@@ -9,7 +11,9 @@ from .api import (
9
11
 
10
12
  __all__ = [
11
13
  "load_dataset",
14
+ "download_dataset",
12
15
  "list_datasets",
16
+ "list_registry_datasets",
13
17
  "list_datasets_with_versions",
14
18
  "describe_dataset",
15
19
  "load_manifest",
@@ -0,0 +1,366 @@
1
+ import hashlib
2
+ import json
3
+ import logging
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ import time
8
+ from pathlib import Path
9
+ from urllib.request import Request, urlopen
10
+
11
+ from .local import load_local_dataset
12
+ from .manifest import load_manifest
13
+
14
+ REGISTRY_PATH = Path(__file__).with_name("registry.json")
15
+ DOWNLOAD_HEADERS = {
16
+ "User-Agent": "CorteXForge/0.1 (+https://github.com/Andreaj42/CorteXForge)",
17
+ "Accept": "*/*",
18
+ }
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ def _load_registry() -> dict:
23
+ with REGISTRY_PATH.open("r", encoding="utf-8") as f:
24
+ return json.load(f)
25
+
26
+
27
+ def _resolve_registry_entry(name: str, version: str | None) -> tuple[str, dict]:
28
+ registry = _load_registry()
29
+
30
+ if name not in registry:
31
+ raise ValueError(f"Unknown dataset: {name}")
32
+
33
+ dataset_entry = registry[name]
34
+ resolved_version = version or dataset_entry["latest"]
35
+ versions = dataset_entry.get("versions", {})
36
+
37
+ if resolved_version not in versions:
38
+ raise ValueError(f"Unknown version '{resolved_version}' for dataset '{name}'")
39
+
40
+ return resolved_version, versions[resolved_version]
41
+
42
+
43
+ def _sha256_hex(path: Path, chunk_size: int = 1024 * 1024) -> str:
44
+ h = hashlib.sha256()
45
+ with path.open("rb") as f:
46
+ for chunk in iter(lambda: f.read(chunk_size), b""):
47
+ h.update(chunk)
48
+ return h.hexdigest()
49
+
50
+
51
+ def _format_size(size_bytes: int | float) -> str:
52
+ value = float(size_bytes)
53
+ units = ["B", "KB", "MB", "GB", "TB"]
54
+
55
+ for unit in units:
56
+ if value < 1024 or unit == units[-1]:
57
+ if unit == "B":
58
+ return f"{int(value)} {unit}"
59
+ return f"{value:.1f} {unit}"
60
+ value /= 1024
61
+
62
+ return f"{int(size_bytes)} B"
63
+
64
+
65
+ def _render_progress_bar(
66
+ downloaded_bytes: int,
67
+ total_bytes: int | None,
68
+ width: int = 28,
69
+ ) -> str:
70
+ if not total_bytes or total_bytes <= 0:
71
+ return f"[{'?' * width}] {_format_size(downloaded_bytes)}"
72
+
73
+ ratio = min(downloaded_bytes / total_bytes, 1.0)
74
+ filled = min(int(ratio * width), width)
75
+ bar = "#" * filled + "-" * (width - filled)
76
+ percent = int(ratio * 100)
77
+ return (
78
+ f"[{bar}] {percent:3d}% "
79
+ f"({_format_size(downloaded_bytes)}/{_format_size(total_bytes)})"
80
+ )
81
+
82
+
83
+ def _update_progress_line(message: str) -> None:
84
+ sys.stderr.write(f"\r{message}")
85
+ sys.stderr.flush()
86
+
87
+
88
+ def _finish_progress_line(message: str) -> None:
89
+ sys.stderr.write(f"\r{message}\n")
90
+ sys.stderr.flush()
91
+
92
+
93
+ def _download_file(url: str, destination: Path, chunk_size: int = 1024 * 1024) -> None:
94
+ request = Request(url, headers=DOWNLOAD_HEADERS)
95
+ with urlopen(request) as response, destination.open("wb") as f:
96
+ total_bytes_header = response.headers.get("Content-Length")
97
+ total_bytes = int(total_bytes_header) if total_bytes_header else None
98
+ downloaded_bytes = 0
99
+ last_logged_percent = -1
100
+ started_at = time.monotonic()
101
+
102
+ if total_bytes:
103
+ _update_progress_line(
104
+ f"Download progress: {_render_progress_bar(0, total_bytes)}"
105
+ )
106
+ else:
107
+ _update_progress_line("Download progress: starting, total size unknown")
108
+
109
+ while True:
110
+ chunk = response.read(chunk_size)
111
+ if not chunk:
112
+ break
113
+
114
+ f.write(chunk)
115
+ downloaded_bytes += len(chunk)
116
+
117
+ if total_bytes:
118
+ percent = int((downloaded_bytes / total_bytes) * 100)
119
+ if percent >= last_logged_percent + 5 or downloaded_bytes == total_bytes:
120
+ elapsed = max(time.monotonic() - started_at, 1e-9)
121
+ speed = downloaded_bytes / elapsed
122
+ _update_progress_line(
123
+ "Download progress: "
124
+ f"{_render_progress_bar(downloaded_bytes, total_bytes)} "
125
+ f"at {_format_size(speed)}/s"
126
+ )
127
+ last_logged_percent = percent
128
+ else:
129
+ if downloaded_bytes == len(chunk) or downloaded_bytes % (50 * chunk_size) == 0:
130
+ elapsed = max(time.monotonic() - started_at, 1e-9)
131
+ speed = downloaded_bytes / elapsed
132
+ _update_progress_line(
133
+ "Download progress: "
134
+ f"{_format_size(downloaded_bytes)} downloaded "
135
+ f"at {_format_size(speed)}/s"
136
+ )
137
+
138
+ elapsed = max(time.monotonic() - started_at, 1e-9)
139
+ speed = downloaded_bytes / elapsed
140
+ _finish_progress_line(
141
+ "Download completed: "
142
+ f"{_format_size(downloaded_bytes)} in {elapsed:.1f}s "
143
+ f"at {_format_size(speed)}/s"
144
+ )
145
+
146
+
147
+ def _extract_archive_with_tar(archive_path: Path, root: Path) -> None:
148
+ subprocess.run(
149
+ ["tar", "-xf", str(archive_path.resolve()), "-C", str(root.resolve())],
150
+ check=True,
151
+ capture_output=True,
152
+ text=True,
153
+ )
154
+
155
+
156
+ def _extract_archive_with_external_zstd(archive_path: Path, root: Path) -> None:
157
+ decompressor = shutil.which("unzstd")
158
+ command = None
159
+
160
+ if decompressor:
161
+ command = [decompressor, "-c", str(archive_path.resolve())]
162
+ else:
163
+ zstd = shutil.which("zstd")
164
+ if zstd:
165
+ command = [zstd, "-d", "-c", str(archive_path.resolve())]
166
+
167
+ if command is None:
168
+ raise RuntimeError(
169
+ "Could not extract .tar.zst archive because no compatible zstd "
170
+ "decompressor was found in PATH. Install 'zstd' or 'unzstd'."
171
+ )
172
+
173
+ decompress_process = subprocess.Popen(
174
+ command,
175
+ stdout=subprocess.PIPE,
176
+ stderr=subprocess.PIPE,
177
+ text=False,
178
+ )
179
+ tar_process = subprocess.Popen(
180
+ ["tar", "-xf", "-", "-C", str(root.resolve())],
181
+ stdin=decompress_process.stdout,
182
+ stdout=subprocess.PIPE,
183
+ stderr=subprocess.PIPE,
184
+ text=False,
185
+ )
186
+
187
+ if decompress_process.stdout is not None:
188
+ decompress_process.stdout.close()
189
+
190
+ tar_stdout, tar_stderr = tar_process.communicate()
191
+ _, decompress_stderr = decompress_process.communicate()
192
+
193
+ if decompress_process.returncode != 0:
194
+ stderr_text = decompress_stderr.decode("utf-8", errors="replace").strip()
195
+ raise RuntimeError(
196
+ "Failed to decompress dataset archive with zstd. "
197
+ f"Command: {' '.join(command)}. Error: {stderr_text or 'unknown error'}"
198
+ )
199
+
200
+ if tar_process.returncode != 0:
201
+ stderr_text = tar_stderr.decode("utf-8", errors="replace").strip()
202
+ raise RuntimeError(
203
+ "Failed to extract decompressed tar archive. "
204
+ f"Error: {stderr_text or 'unknown error'}"
205
+ )
206
+
207
+
208
+ def _extract_archive(archive_path: Path, root: Path) -> None:
209
+ try:
210
+ _extract_archive_with_tar(archive_path, root)
211
+ return
212
+ except subprocess.CalledProcessError as exc:
213
+ tar_error = (exc.stderr or exc.stdout or "").strip()
214
+ logger.warning(
215
+ "Direct tar extraction failed for %s: %s",
216
+ archive_path.resolve(),
217
+ tar_error or f"exit code {exc.returncode}",
218
+ )
219
+
220
+ if archive_path.suffix == ".zst" and archive_path.name.endswith(".tar.zst"):
221
+ logger.info("Retrying archive extraction with external zstd decompressor")
222
+ _extract_archive_with_external_zstd(archive_path, root)
223
+ return
224
+
225
+ raise RuntimeError(f"Could not extract archive: {archive_path.resolve()}")
226
+
227
+
228
+ def load_dataset(
229
+ name: str,
230
+ version: str,
231
+ split: str,
232
+ root: str | Path,
233
+ verify: bool = True,
234
+ ):
235
+ dataset_dir = Path(root) / name / version
236
+ return load_local_dataset(dataset_dir=dataset_dir, split=split, verify=verify)
237
+
238
+
239
+ def describe_dataset(
240
+ name: str,
241
+ version: str,
242
+ root: str | Path,
243
+ ):
244
+ dataset_dir = Path(root) / name / version
245
+ return load_manifest(dataset_dir / "manifest.json")
246
+
247
+
248
+ def download_dataset(
249
+ name: str,
250
+ root: str | Path,
251
+ version: str | None = None,
252
+ force: bool = False,
253
+ ):
254
+ root = Path(root)
255
+ resolved_version, archive_entry = _resolve_registry_entry(name, version)
256
+ dataset_dir = root / archive_entry["root_dir"]
257
+ manifest_path = dataset_dir / "manifest.json"
258
+ resolved_root = root.resolve()
259
+ resolved_dataset_dir = dataset_dir.resolve()
260
+
261
+ if manifest_path.exists() and not force:
262
+ logger.info(
263
+ "Dataset %s:%s already available at %s",
264
+ name,
265
+ resolved_version,
266
+ resolved_dataset_dir,
267
+ )
268
+ return load_manifest(manifest_path)
269
+
270
+ archive_name = archive_entry["archive_name"]
271
+ archive_path = root / archive_name
272
+ url = archive_entry["url"]
273
+ expected_sha256 = archive_entry["sha256"]
274
+
275
+ root.mkdir(parents=True, exist_ok=True)
276
+
277
+ if dataset_dir.exists() and force:
278
+ logger.info(
279
+ "Removing existing dataset %s:%s from %s",
280
+ name,
281
+ resolved_version,
282
+ resolved_dataset_dir,
283
+ )
284
+ shutil.rmtree(dataset_dir)
285
+
286
+ logger.info("Download timeline for %s:%s", name, resolved_version)
287
+ logger.info("1/4 Resolve target directory: %s", resolved_root)
288
+ logger.info("2/4 Download archive from %s", url)
289
+
290
+ try:
291
+ _download_file(url, archive_path)
292
+
293
+ logger.info("3/4 Verify archive checksum: %s", archive_path.resolve())
294
+ actual_sha256 = _sha256_hex(archive_path)
295
+ if actual_sha256 != expected_sha256:
296
+ raise ValueError(
297
+ "Downloaded archive checksum mismatch for "
298
+ f"{name}:{resolved_version}. Expected {expected_sha256}, got {actual_sha256}"
299
+ )
300
+
301
+ logger.info("4/4 Extract archive into %s", resolved_root)
302
+ _extract_archive(archive_path, root)
303
+ finally:
304
+ archive_path.unlink(missing_ok=True)
305
+
306
+ if not manifest_path.exists():
307
+ raise FileNotFoundError(
308
+ f"Dataset archive for {name}:{resolved_version} did not contain {manifest_path}"
309
+ )
310
+
311
+ logger.info(
312
+ "Dataset %s:%s installed in %s",
313
+ name,
314
+ resolved_version,
315
+ resolved_dataset_dir,
316
+ )
317
+
318
+ return load_manifest(manifest_path)
319
+
320
+
321
+ def list_datasets(root: str | Path | None = None) -> list[str]:
322
+ del root
323
+ registry = _load_registry()
324
+ return sorted(registry.keys())
325
+
326
+
327
+ def list_versions(name: str, root: str | Path | None = None) -> list[str]:
328
+ del root
329
+ registry = _load_registry()
330
+
331
+ if name not in registry:
332
+ raise ValueError(f"Unknown dataset: {name}")
333
+
334
+ versions = registry[name].get("versions", {})
335
+ return sorted(versions.keys())
336
+
337
+
338
+ def list_datasets_with_versions(root: str | Path | None = None) -> dict[str, list[str]]:
339
+ del root
340
+ registry = _load_registry()
341
+ return {
342
+ dataset_name: sorted(dataset_entry.get("versions", {}).keys())
343
+ for dataset_name, dataset_entry in sorted(registry.items())
344
+ }
345
+
346
+
347
+ def list_registry_datasets(root: str | Path | None = None) -> list[dict]:
348
+ del root
349
+ registry = _load_registry()
350
+ result = []
351
+
352
+ for dataset_name, dataset_entry in sorted(registry.items()):
353
+ latest = dataset_entry["latest"]
354
+ latest_entry = dataset_entry.get("versions", {}).get(latest, {})
355
+ result.append(
356
+ {
357
+ "name": dataset_name,
358
+ "latest": latest,
359
+ "versions": sorted(dataset_entry.get("versions", {}).keys()),
360
+ "description": latest_entry.get("description", ""),
361
+ "size_bytes": latest_entry.get("size_bytes"),
362
+ "root_dir": latest_entry.get("root_dir"),
363
+ }
364
+ )
365
+
366
+ return result