atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +30 -0
  3. atdata/_exceptions.py +168 -0
  4. atdata/_helpers.py +29 -15
  5. atdata/_hf_api.py +63 -11
  6. atdata/_logging.py +70 -0
  7. atdata/_protocols.py +19 -62
  8. atdata/_schema_codec.py +5 -4
  9. atdata/_type_utils.py +28 -2
  10. atdata/atmosphere/__init__.py +19 -9
  11. atdata/atmosphere/records.py +3 -2
  12. atdata/atmosphere/schema.py +2 -2
  13. atdata/cli/__init__.py +157 -171
  14. atdata/cli/inspect.py +69 -0
  15. atdata/cli/local.py +1 -1
  16. atdata/cli/preview.py +63 -0
  17. atdata/cli/schema.py +109 -0
  18. atdata/dataset.py +428 -326
  19. atdata/lens.py +9 -2
  20. atdata/local/__init__.py +71 -0
  21. atdata/local/_entry.py +157 -0
  22. atdata/local/_index.py +940 -0
  23. atdata/local/_repo_legacy.py +218 -0
  24. atdata/local/_s3.py +349 -0
  25. atdata/local/_schema.py +380 -0
  26. atdata/manifest/__init__.py +28 -0
  27. atdata/manifest/_aggregates.py +156 -0
  28. atdata/manifest/_builder.py +163 -0
  29. atdata/manifest/_fields.py +154 -0
  30. atdata/manifest/_manifest.py +146 -0
  31. atdata/manifest/_query.py +150 -0
  32. atdata/manifest/_writer.py +74 -0
  33. atdata/promote.py +4 -4
  34. atdata/providers/__init__.py +25 -0
  35. atdata/providers/_base.py +140 -0
  36. atdata/providers/_factory.py +69 -0
  37. atdata/providers/_postgres.py +214 -0
  38. atdata/providers/_redis.py +171 -0
  39. atdata/providers/_sqlite.py +191 -0
  40. atdata/repository.py +323 -0
  41. atdata/testing.py +337 -0
  42. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
  43. atdata-0.3.0b1.dist-info/RECORD +54 -0
  44. atdata/local.py +0 -1720
  45. atdata-0.2.3b1.dist-info/RECORD +0 -28
  46. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  47. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  48. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/_schema_codec.py CHANGED
@@ -28,13 +28,14 @@ import hashlib
28
28
 
29
29
  from numpy.typing import NDArray
30
30
 
31
- # Import PackableSample for inheritance
31
+ # Import PackableSample for inheritance in dynamic class generation
32
32
  from .dataset import PackableSample
33
+ from ._protocols import Packable
33
34
 
34
35
 
35
36
  # Type cache to avoid regenerating identical types
36
37
  # Uses insertion order (Python 3.7+) for simple FIFO eviction
37
- _type_cache: dict[str, Type[PackableSample]] = {}
38
+ _type_cache: dict[str, Type[Packable]] = {}
38
39
  _TYPE_CACHE_MAX_SIZE = 256
39
40
 
40
41
 
@@ -130,7 +131,7 @@ def schema_to_type(
130
131
  schema: dict,
131
132
  *,
132
133
  use_cache: bool = True,
133
- ) -> Type[PackableSample]:
134
+ ) -> Type[Packable]:
134
135
  """Generate a PackableSample subclass from a schema record.
135
136
 
136
137
  This function dynamically creates a dataclass that inherits from PackableSample,
@@ -420,7 +421,7 @@ def clear_type_cache() -> None:
420
421
  _type_cache.clear()
421
422
 
422
423
 
423
- def get_cached_types() -> dict[str, Type[PackableSample]]:
424
+ def get_cached_types() -> dict[str, Type[Packable]]:
424
425
  """Get a copy of the current type cache.
425
426
 
426
427
  Returns:
atdata/_type_utils.py CHANGED
@@ -45,9 +45,13 @@ def numpy_dtype_to_string(dtype: Any) -> str:
45
45
  Schema dtype string (e.g., "float32", "int64"). Defaults to "float32".
46
46
  """
47
47
  dtype_str = str(dtype)
48
- for key, value in NUMPY_DTYPE_MAP.items():
48
+ # Exact match first (handles "float32", "int64", etc.)
49
+ if dtype_str in NUMPY_DTYPE_MAP:
50
+ return NUMPY_DTYPE_MAP[dtype_str]
51
+ # Substring match, longest keys first to avoid "int8" matching "uint8"
52
+ for key in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True):
49
53
  if key in dtype_str:
50
- return value
54
+ return NUMPY_DTYPE_MAP[key]
51
55
  return "float32"
52
56
 
53
57
 
@@ -102,3 +106,25 @@ def extract_ndarray_dtype(python_type: Any) -> str:
102
106
  if dtype_arg is not None:
103
107
  return numpy_dtype_to_string(dtype_arg)
104
108
  return "float32"
109
+
110
+
111
+ def parse_semver(version: str) -> tuple[int, int, int]:
112
+ """Parse a semantic version string into a comparable tuple.
113
+
114
+ Args:
115
+ version: A ``"major.minor.patch"`` version string.
116
+
117
+ Returns:
118
+ Tuple of (major, minor, patch) integers.
119
+
120
+ Raises:
121
+ ValueError: If the version string is not valid semver.
122
+
123
+ Examples:
124
+ >>> parse_semver("1.2.3")
125
+ (1, 2, 3)
126
+ """
127
+ parts = version.split(".")
128
+ if len(parts) != 3:
129
+ raise ValueError(f"Invalid semver: {version}")
130
+ return int(parts[0]), int(parts[1]), int(parts[2])
@@ -99,23 +99,25 @@ class AtmosphereIndexEntry:
99
99
  class AtmosphereIndex:
100
100
  """ATProto index implementing AbstractIndex protocol.
101
101
 
102
+ .. deprecated::
103
+ Use ``atdata.Index(atmosphere=client)`` instead. ``AtmosphereIndex``
104
+ is retained for backwards compatibility and will be removed in a
105
+ future release.
106
+
102
107
  Wraps SchemaPublisher/Loader and DatasetPublisher/Loader to provide
103
- a unified interface compatible with LocalIndex.
108
+ a unified interface compatible with Index.
104
109
 
105
110
  Optionally accepts a ``PDSBlobStore`` for writing dataset shards as
106
111
  ATProto blobs, enabling fully decentralized dataset storage.
107
112
 
108
113
  Examples:
109
- >>> client = AtmosphereClient()
110
- >>> client.login("handle.bsky.social", "app-password")
114
+ >>> # Preferred: use unified Index
115
+ >>> from atdata.local import Index
116
+ >>> from atdata.atmosphere import AtmosphereClient
117
+ >>> index = Index(atmosphere=client)
111
118
  >>>
112
- >>> # Without blob storage (external URLs only)
119
+ >>> # Legacy (deprecated)
113
120
  >>> index = AtmosphereIndex(client)
114
- >>>
115
- >>> # With PDS blob storage
116
- >>> store = PDSBlobStore(client)
117
- >>> index = AtmosphereIndex(client, data_store=store)
118
- >>> entry = index.insert_dataset(dataset, name="my-data")
119
121
  """
120
122
 
121
123
  def __init__(
@@ -131,6 +133,14 @@ class AtmosphereIndex:
131
133
  data_store: Optional PDSBlobStore for writing shards as blobs.
132
134
  If provided, insert_dataset will upload shards to PDS.
133
135
  """
136
+ import warnings
137
+
138
+ warnings.warn(
139
+ "AtmosphereIndex is deprecated. Use atdata.Index(atmosphere=client) "
140
+ "instead for unified index access.",
141
+ DeprecationWarning,
142
+ stacklevel=2,
143
+ )
134
144
  self.client = client
135
145
  self._schema_publisher = SchemaPublisher(client)
136
146
  self._schema_loader = SchemaLoader(client)
@@ -21,9 +21,10 @@ from ._types import (
21
21
  from typing import TYPE_CHECKING
22
22
 
23
23
  if TYPE_CHECKING:
24
- from ..dataset import PackableSample, Dataset
24
+ from ..dataset import Dataset
25
+ from .._protocols import Packable
25
26
 
26
- ST = TypeVar("ST", bound="PackableSample")
27
+ ST = TypeVar("ST", bound="Packable")
27
28
 
28
29
 
29
30
  class DatasetPublisher:
@@ -26,9 +26,9 @@ from .._type_utils import (
26
26
  from typing import TYPE_CHECKING
27
27
 
28
28
  if TYPE_CHECKING:
29
- from ..dataset import PackableSample
29
+ from .._protocols import Packable
30
30
 
31
- ST = TypeVar("ST", bound="PackableSample")
31
+ ST = TypeVar("ST", bound="Packable")
32
32
 
33
33
 
34
34
  class SchemaPublisher:
atdata/cli/__init__.py CHANGED
@@ -1,221 +1,207 @@
1
1
  """Command-line interface for atdata.
2
2
 
3
- This module provides CLI commands for managing local development infrastructure
4
- and diagnosing configuration issues.
3
+ This module provides CLI commands for managing local development infrastructure,
4
+ inspecting datasets, and diagnosing configuration issues.
5
5
 
6
6
  Commands:
7
- atdata local up Start Redis and MinIO containers for local development
8
- atdata local down Stop local development containers
9
- atdata diagnose Check Redis configuration and connectivity
10
- atdata version Show version information
11
-
12
- Example:
13
- $ atdata local up
14
- Starting Redis on port 6379...
15
- Starting MinIO on port 9000...
16
- Local infrastructure ready.
17
-
18
- $ atdata diagnose
19
- Checking Redis configuration...
20
- ✓ Redis connected
21
- ✓ Persistence enabled (AOF)
22
- ✓ Memory policy: noeviction
7
+ atdata local up Start Redis and MinIO containers for local development
8
+ atdata local down Stop local development containers
9
+ atdata local status Show status of local infrastructure
10
+ atdata diagnose Check Redis configuration and connectivity
11
+ atdata inspect Show dataset summary information
12
+ atdata schema show Display dataset schema
13
+ atdata schema diff Compare two dataset schemas
14
+ atdata preview Preview first N samples of a dataset
15
+ atdata version Show version information
23
16
  """
24
17
 
25
- import argparse
26
18
  import sys
27
- from typing import Sequence
28
19
 
20
+ import typer
29
21
 
30
- def main(argv: Sequence[str] | None = None) -> int:
31
- """Main entry point for the atdata CLI.
22
+ # ---------------------------------------------------------------------------
23
+ # App hierarchy
24
+ # ---------------------------------------------------------------------------
32
25
 
33
- Args:
34
- argv: Command-line arguments. If None, uses sys.argv[1:].
26
+ app = typer.Typer(
27
+ name="atdata",
28
+ help="A loose federation of distributed, typed datasets.",
29
+ add_completion=False,
30
+ no_args_is_help=True,
31
+ )
35
32
 
36
- Returns:
37
- Exit code (0 for success, non-zero for errors).
38
- """
39
- parser = argparse.ArgumentParser(
40
- prog="atdata",
41
- description="A loose federation of distributed, typed datasets",
42
- formatter_class=argparse.RawDescriptionHelpFormatter,
43
- )
44
- parser.add_argument(
45
- "--version",
46
- "-v",
47
- action="store_true",
48
- help="Show version information",
49
- )
33
+ local_app = typer.Typer(
34
+ name="local",
35
+ help="Manage local development infrastructure.",
36
+ no_args_is_help=True,
37
+ )
38
+ app.add_typer(local_app, name="local")
50
39
 
51
- subparsers = parser.add_subparsers(dest="command", help="Available commands")
40
+ schema_app = typer.Typer(
41
+ name="schema",
42
+ help="Show or compare dataset schemas.",
43
+ no_args_is_help=True,
44
+ )
45
+ app.add_typer(schema_app, name="schema")
52
46
 
53
- # 'local' command group
54
- local_parser = subparsers.add_parser(
55
- "local",
56
- help="Manage local development infrastructure",
57
- )
58
- local_subparsers = local_parser.add_subparsers(
59
- dest="local_command",
60
- help="Local infrastructure commands",
61
- )
62
47
 
63
- # 'local up' command
64
- up_parser = local_subparsers.add_parser(
65
- "up",
66
- help="Start Redis and MinIO containers",
67
- )
68
- up_parser.add_argument(
69
- "--redis-port",
70
- type=int,
71
- default=6379,
72
- help="Redis port (default: 6379)",
73
- )
74
- up_parser.add_argument(
75
- "--minio-port",
76
- type=int,
77
- default=9000,
78
- help="MinIO API port (default: 9000)",
79
- )
80
- up_parser.add_argument(
81
- "--minio-console-port",
82
- type=int,
83
- default=9001,
84
- help="MinIO console port (default: 9001)",
85
- )
86
- up_parser.add_argument(
87
- "--detach",
88
- "-d",
89
- action="store_true",
90
- default=True,
91
- help="Run containers in detached mode (default: True)",
92
- )
48
+ # ---------------------------------------------------------------------------
49
+ # Top-level commands
50
+ # ---------------------------------------------------------------------------
93
51
 
94
- # 'local down' command
95
- down_parser = local_subparsers.add_parser(
96
- "down",
97
- help="Stop local development containers",
98
- )
99
- down_parser.add_argument(
100
- "--volumes",
101
- "-v",
102
- action="store_true",
103
- help="Also remove volumes (deletes all data)",
104
- )
105
52
 
106
- # 'local status' command
107
- local_subparsers.add_parser(
108
- "status",
109
- help="Show status of local infrastructure",
110
- )
53
+ @app.command()
54
+ def version() -> None:
55
+ """Show version information."""
56
+ try:
57
+ from atdata import __version__
111
58
 
112
- # 'diagnose' command
113
- diagnose_parser = subparsers.add_parser(
114
- "diagnose",
115
- help="Diagnose Redis configuration and connectivity",
116
- )
117
- diagnose_parser.add_argument(
118
- "--host",
119
- default="localhost",
120
- help="Redis host (default: localhost)",
121
- )
122
- diagnose_parser.add_argument(
123
- "--port",
124
- type=int,
125
- default=6379,
126
- help="Redis port (default: 6379)",
127
- )
59
+ ver = __version__
60
+ except ImportError:
61
+ from importlib.metadata import version as pkg_version
128
62
 
129
- # 'version' command (alternative to --version flag)
130
- subparsers.add_parser(
131
- "version",
132
- help="Show version information",
133
- )
63
+ ver = pkg_version("atdata")
134
64
 
135
- args = parser.parse_args(argv)
136
-
137
- # Handle --version flag
138
- if args.version or args.command == "version":
139
- return _cmd_version()
140
-
141
- # Handle 'local' commands
142
- if args.command == "local":
143
- if args.local_command == "up":
144
- return _cmd_local_up(
145
- redis_port=args.redis_port,
146
- minio_port=args.minio_port,
147
- minio_console_port=args.minio_console_port,
148
- detach=args.detach,
149
- )
150
- elif args.local_command == "down":
151
- return _cmd_local_down(remove_volumes=args.volumes)
152
- elif args.local_command == "status":
153
- return _cmd_local_status()
154
- else:
155
- local_parser.print_help()
156
- return 1
65
+ print(f"atdata {ver}")
157
66
 
158
- # Handle 'diagnose' command
159
- if args.command == "diagnose":
160
- return _cmd_diagnose(host=args.host, port=args.port)
161
67
 
162
- # No command given
163
- parser.print_help()
164
- return 0
68
+ @app.command()
69
+ def inspect(
70
+ url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
71
+ ) -> None:
72
+ """Show dataset summary (sample count, schema, shards)."""
73
+ from .inspect import inspect_dataset
165
74
 
75
+ code = inspect_dataset(url=url)
76
+ raise typer.Exit(code=code)
166
77
 
167
- def _cmd_version() -> int:
168
- """Show version information."""
169
- try:
170
- from atdata import __version__
171
78
 
172
- version = __version__
173
- except ImportError:
174
- # Fallback to package metadata
175
- from importlib.metadata import version as pkg_version
79
+ @app.command()
80
+ def preview(
81
+ url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
82
+ limit: int = typer.Option(5, help="Number of samples to preview."),
83
+ ) -> None:
84
+ """Preview first N samples of a dataset."""
85
+ from .preview import preview_dataset
86
+
87
+ code = preview_dataset(url=url, limit=limit)
88
+ raise typer.Exit(code=code)
89
+
90
+
91
+ @app.command()
92
+ def diagnose(
93
+ host: str = typer.Option("localhost", help="Redis host."),
94
+ port: int = typer.Option(6379, help="Redis port."),
95
+ ) -> None:
96
+ """Diagnose Redis configuration and connectivity."""
97
+ from .diagnose import diagnose_redis
176
98
 
177
- version = pkg_version("atdata")
99
+ code = diagnose_redis(host=host, port=port)
100
+ raise typer.Exit(code=code)
178
101
 
179
- print(f"atdata {version}")
180
- return 0
181
102
 
103
+ # ---------------------------------------------------------------------------
104
+ # local sub-commands
105
+ # ---------------------------------------------------------------------------
182
106
 
183
- def _cmd_local_up(
184
- redis_port: int,
185
- minio_port: int,
186
- minio_console_port: int,
187
- detach: bool,
188
- ) -> int:
189
- """Start local development infrastructure."""
107
+
108
+ @local_app.command()
109
+ def up(
110
+ redis_port: int = typer.Option(6379, help="Redis port."),
111
+ minio_port: int = typer.Option(9000, help="MinIO API port."),
112
+ minio_console_port: int = typer.Option(9001, help="MinIO console port."),
113
+ detach: bool = typer.Option(
114
+ True, "--detach", "-d", help="Run containers in detached mode."
115
+ ),
116
+ ) -> None:
117
+ """Start Redis and MinIO containers."""
190
118
  from .local import local_up
191
119
 
192
- return local_up(
120
+ code = local_up(
193
121
  redis_port=redis_port,
194
122
  minio_port=minio_port,
195
123
  minio_console_port=minio_console_port,
196
124
  detach=detach,
197
125
  )
126
+ raise typer.Exit(code=code)
198
127
 
199
128
 
200
- def _cmd_local_down(remove_volumes: bool) -> int:
201
- """Stop local development infrastructure."""
129
+ @local_app.command()
130
+ def down(
131
+ volumes: bool = typer.Option(
132
+ False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
133
+ ),
134
+ ) -> None:
135
+ """Stop local development containers."""
202
136
  from .local import local_down
203
137
 
204
- return local_down(remove_volumes=remove_volumes)
138
+ code = local_down(remove_volumes=volumes)
139
+ raise typer.Exit(code=code)
205
140
 
206
141
 
207
- def _cmd_local_status() -> int:
142
+ @local_app.command()
143
+ def status() -> None:
208
144
  """Show status of local infrastructure."""
209
145
  from .local import local_status
210
146
 
211
- return local_status()
147
+ code = local_status()
148
+ raise typer.Exit(code=code)
212
149
 
213
150
 
214
- def _cmd_diagnose(host: str, port: int) -> int:
215
- """Diagnose Redis configuration."""
216
- from .diagnose import diagnose_redis
151
+ # ---------------------------------------------------------------------------
152
+ # schema sub-commands
153
+ # ---------------------------------------------------------------------------
154
+
155
+
156
+ @schema_app.command("show")
157
+ def schema_show(
158
+ dataset_ref: str = typer.Argument(
159
+ help="Dataset URL, local path, or index reference."
160
+ ),
161
+ ) -> None:
162
+ """Display dataset schema."""
163
+ from .schema import schema_show as _schema_show
164
+
165
+ code = _schema_show(dataset_ref=dataset_ref)
166
+ raise typer.Exit(code=code)
167
+
168
+
169
+ @schema_app.command("diff")
170
+ def schema_diff(
171
+ url_a: str = typer.Argument(help="First dataset URL."),
172
+ url_b: str = typer.Argument(help="Second dataset URL."),
173
+ ) -> None:
174
+ """Compare two dataset schemas."""
175
+ from .schema import schema_diff as _schema_diff
217
176
 
218
- return diagnose_redis(host=host, port=port)
177
+ code = _schema_diff(url_a=url_a, url_b=url_b)
178
+ raise typer.Exit(code=code)
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Entrypoint
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
+ def main(argv: list[str] | None = None) -> int:
187
+ """Main entry point for the atdata CLI.
188
+
189
+ Args:
190
+ argv: Command-line arguments. If None, uses sys.argv[1:].
191
+
192
+ Returns:
193
+ Exit code (0 for success, non-zero for errors).
194
+ """
195
+ try:
196
+ if argv is not None:
197
+ app(args=argv, standalone_mode=False)
198
+ else:
199
+ app(standalone_mode=False)
200
+ return 0
201
+ except SystemExit as exc:
202
+ return exc.code if isinstance(exc.code, int) else 0
203
+ except Exception:
204
+ return 1
219
205
 
220
206
 
221
207
  if __name__ == "__main__":
atdata/cli/inspect.py ADDED
@@ -0,0 +1,69 @@
1
+ """``atdata inspect`` command — show dataset summary information."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
+ def inspect_dataset(url: str) -> int:
10
+ """Print summary information for a dataset at the given URL.
11
+
12
+ Args:
13
+ url: WebDataset URL, local path, or atmosphere URI.
14
+
15
+ Returns:
16
+ Exit code (0 success, 1 failure).
17
+ """
18
+ try:
19
+ from ..dataset import Dataset, DictSample
20
+
21
+ ds = Dataset[DictSample](url)
22
+ except Exception as exc:
23
+ print(f"Error opening dataset: {exc}", file=sys.stderr)
24
+ return 1
25
+
26
+ try:
27
+ shards = ds.list_shards()
28
+ print(f"URL: {url}")
29
+ print(f"Shards: {len(shards)}")
30
+ for shard in shards:
31
+ print(f" - {shard}")
32
+
33
+ # Read first sample to infer schema
34
+ samples = ds.head(1)
35
+ if samples:
36
+ sample = samples[0]
37
+ print("Schema: (inferred from first sample)")
38
+ for key in sample.keys():
39
+ val = sample[key]
40
+ print(f" {key}: {_describe_value(val)}")
41
+ else:
42
+ print("Schema: (no samples found)")
43
+
44
+ # Count samples — scan all shards
45
+ count = sum(1 for _ in ds.ordered())
46
+ print(f"Samples: {count}")
47
+ except Exception as exc:
48
+ print(f"Error reading dataset: {exc}", file=sys.stderr)
49
+ return 1
50
+
51
+ return 0
52
+
53
+
54
+ def _describe_value(val: Any) -> str:
55
+ """Human-readable type description for a sample field value."""
56
+ import numpy as np
57
+
58
+ if isinstance(val, np.ndarray):
59
+ return f"ndarray dtype={val.dtype} shape={val.shape}"
60
+ if isinstance(val, bytes):
61
+ return f"bytes len={len(val)}"
62
+ if isinstance(val, str):
63
+ truncated = val[:60] + ("..." if len(val) > 60 else "")
64
+ return f'str "{truncated}"'
65
+ if isinstance(val, (int, float, bool)):
66
+ return f"{type(val).__name__} {val}"
67
+ if isinstance(val, list):
68
+ return f"list len={len(val)}"
69
+ return type(val).__name__
atdata/cli/local.py CHANGED
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
114
114
  timeout=5,
115
115
  )
116
116
  return result.returncode == 0 and result.stdout.strip() == "true"
117
- except Exception:
117
+ except (OSError, subprocess.SubprocessError):
118
118
  return False
119
119
 
120
120
 
atdata/cli/preview.py ADDED
@@ -0,0 +1,63 @@
1
+ """``atdata preview`` command — render first N samples of a dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
+ def preview_dataset(url: str, limit: int = 5) -> int:
10
+ """Print a human-readable preview of the first *limit* samples.
11
+
12
+ Args:
13
+ url: Dataset URL, local path, or atmosphere URI.
14
+ limit: Number of samples to show. Default: 5.
15
+
16
+ Returns:
17
+ Exit code (0 success, 1 failure).
18
+ """
19
+ try:
20
+ from ..dataset import Dataset, DictSample
21
+
22
+ ds = Dataset[DictSample](url)
23
+ except Exception as exc:
24
+ print(f"Error opening dataset: {exc}", file=sys.stderr)
25
+ return 1
26
+
27
+ samples = ds.head(limit)
28
+ if not samples:
29
+ print("No samples found.", file=sys.stderr)
30
+ return 1
31
+
32
+ print(f"Preview of {url} ({len(samples)} sample(s)):")
33
+ print()
34
+
35
+ for i, sample in enumerate(samples):
36
+ print(f"--- Sample {i} ---")
37
+ for key in sample.keys():
38
+ val = sample[key]
39
+ print(f" {key}: {_format_value(val)}")
40
+ print()
41
+
42
+ return 0
43
+
44
+
45
+ def _format_value(val: Any) -> str:
46
+ """Format a value for preview, truncating large data."""
47
+ import numpy as np
48
+
49
+ if isinstance(val, np.ndarray):
50
+ return f"ndarray shape={val.shape} dtype={val.dtype}"
51
+ if isinstance(val, bytes):
52
+ if len(val) <= 40:
53
+ return repr(val)
54
+ return f"bytes[{len(val)}] {val[:20]!r}..."
55
+ if isinstance(val, str):
56
+ if len(val) <= 80:
57
+ return repr(val)
58
+ return repr(val[:77] + "...")
59
+ if isinstance(val, list):
60
+ if len(val) <= 5:
61
+ return repr(val)
62
+ return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
63
+ return repr(val)