atdata 0.2.3b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +39 -0
  3. atdata/_cid.py +0 -21
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +41 -15
  6. atdata/_hf_api.py +95 -11
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +77 -238
  9. atdata/_schema_codec.py +7 -6
  10. atdata/_stub_manager.py +5 -25
  11. atdata/_type_utils.py +28 -2
  12. atdata/atmosphere/__init__.py +31 -20
  13. atdata/atmosphere/_types.py +4 -4
  14. atdata/atmosphere/client.py +64 -12
  15. atdata/atmosphere/lens.py +11 -12
  16. atdata/atmosphere/records.py +12 -12
  17. atdata/atmosphere/schema.py +16 -18
  18. atdata/atmosphere/store.py +6 -7
  19. atdata/cli/__init__.py +161 -175
  20. atdata/cli/diagnose.py +2 -2
  21. atdata/cli/{local.py → infra.py} +11 -11
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/preview.py +63 -0
  24. atdata/cli/schema.py +109 -0
  25. atdata/dataset.py +583 -328
  26. atdata/index/__init__.py +54 -0
  27. atdata/index/_entry.py +157 -0
  28. atdata/index/_index.py +1198 -0
  29. atdata/index/_schema.py +380 -0
  30. atdata/lens.py +9 -2
  31. atdata/lexicons/__init__.py +121 -0
  32. atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  33. atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  34. atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
  35. atdata/lexicons/ac.foundation.dataset.record.json +96 -0
  36. atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
  37. atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  38. atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  39. atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  40. atdata/lexicons/ndarray_shim.json +16 -0
  41. atdata/local/__init__.py +70 -0
  42. atdata/local/_repo_legacy.py +218 -0
  43. atdata/manifest/__init__.py +28 -0
  44. atdata/manifest/_aggregates.py +156 -0
  45. atdata/manifest/_builder.py +163 -0
  46. atdata/manifest/_fields.py +154 -0
  47. atdata/manifest/_manifest.py +146 -0
  48. atdata/manifest/_query.py +150 -0
  49. atdata/manifest/_writer.py +74 -0
  50. atdata/promote.py +18 -14
  51. atdata/providers/__init__.py +25 -0
  52. atdata/providers/_base.py +140 -0
  53. atdata/providers/_factory.py +69 -0
  54. atdata/providers/_postgres.py +214 -0
  55. atdata/providers/_redis.py +171 -0
  56. atdata/providers/_sqlite.py +191 -0
  57. atdata/repository.py +323 -0
  58. atdata/stores/__init__.py +23 -0
  59. atdata/stores/_disk.py +123 -0
  60. atdata/stores/_s3.py +349 -0
  61. atdata/testing.py +341 -0
  62. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
  63. atdata-0.3.1b1.dist-info/RECORD +67 -0
  64. atdata/local.py +0 -1720
  65. atdata-0.2.3b1.dist-info/RECORD +0 -28
  66. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
  67. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
  68. {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/__init__.py CHANGED
@@ -1,221 +1,207 @@
1
1
  """Command-line interface for atdata.
2
2
 
3
- This module provides CLI commands for managing local development infrastructure
4
- and diagnosing configuration issues.
3
+ This module provides CLI commands for managing development infrastructure,
4
+ inspecting datasets, and diagnosing configuration issues.
5
5
 
6
6
  Commands:
7
- atdata local up Start Redis and MinIO containers for local development
8
- atdata local down Stop local development containers
9
- atdata diagnose Check Redis configuration and connectivity
10
- atdata version Show version information
11
-
12
- Example:
13
- $ atdata local up
14
- Starting Redis on port 6379...
15
- Starting MinIO on port 9000...
16
- Local infrastructure ready.
17
-
18
- $ atdata diagnose
19
- Checking Redis configuration...
20
- ✓ Redis connected
21
- ✓ Persistence enabled (AOF)
22
- ✓ Memory policy: noeviction
7
+ atdata infra up Start Redis and MinIO containers for development
8
+ atdata infra down Stop development containers
9
+ atdata infra status Show status of infrastructure
10
+ atdata diagnose Check Redis configuration and connectivity
11
+ atdata inspect Show dataset summary information
12
+ atdata schema show Display dataset schema
13
+ atdata schema diff Compare two dataset schemas
14
+ atdata preview Preview first N samples of a dataset
15
+ atdata version Show version information
23
16
  """
24
17
 
25
- import argparse
26
18
  import sys
27
- from typing import Sequence
28
19
 
20
+ import typer
29
21
 
30
- def main(argv: Sequence[str] | None = None) -> int:
31
- """Main entry point for the atdata CLI.
22
+ # ---------------------------------------------------------------------------
23
+ # App hierarchy
24
+ # ---------------------------------------------------------------------------
32
25
 
33
- Args:
34
- argv: Command-line arguments. If None, uses sys.argv[1:].
26
+ app = typer.Typer(
27
+ name="atdata",
28
+ help="A loose federation of distributed, typed datasets.",
29
+ add_completion=False,
30
+ no_args_is_help=True,
31
+ )
35
32
 
36
- Returns:
37
- Exit code (0 for success, non-zero for errors).
38
- """
39
- parser = argparse.ArgumentParser(
40
- prog="atdata",
41
- description="A loose federation of distributed, typed datasets",
42
- formatter_class=argparse.RawDescriptionHelpFormatter,
43
- )
44
- parser.add_argument(
45
- "--version",
46
- "-v",
47
- action="store_true",
48
- help="Show version information",
49
- )
33
+ infra_app = typer.Typer(
34
+ name="infra",
35
+ help="Manage development infrastructure.",
36
+ no_args_is_help=True,
37
+ )
38
+ app.add_typer(infra_app, name="infra")
50
39
 
51
- subparsers = parser.add_subparsers(dest="command", help="Available commands")
40
+ schema_app = typer.Typer(
41
+ name="schema",
42
+ help="Show or compare dataset schemas.",
43
+ no_args_is_help=True,
44
+ )
45
+ app.add_typer(schema_app, name="schema")
52
46
 
53
- # 'local' command group
54
- local_parser = subparsers.add_parser(
55
- "local",
56
- help="Manage local development infrastructure",
57
- )
58
- local_subparsers = local_parser.add_subparsers(
59
- dest="local_command",
60
- help="Local infrastructure commands",
61
- )
62
47
 
63
- # 'local up' command
64
- up_parser = local_subparsers.add_parser(
65
- "up",
66
- help="Start Redis and MinIO containers",
67
- )
68
- up_parser.add_argument(
69
- "--redis-port",
70
- type=int,
71
- default=6379,
72
- help="Redis port (default: 6379)",
73
- )
74
- up_parser.add_argument(
75
- "--minio-port",
76
- type=int,
77
- default=9000,
78
- help="MinIO API port (default: 9000)",
79
- )
80
- up_parser.add_argument(
81
- "--minio-console-port",
82
- type=int,
83
- default=9001,
84
- help="MinIO console port (default: 9001)",
85
- )
86
- up_parser.add_argument(
87
- "--detach",
88
- "-d",
89
- action="store_true",
90
- default=True,
91
- help="Run containers in detached mode (default: True)",
92
- )
48
+ # ---------------------------------------------------------------------------
49
+ # Top-level commands
50
+ # ---------------------------------------------------------------------------
93
51
 
94
- # 'local down' command
95
- down_parser = local_subparsers.add_parser(
96
- "down",
97
- help="Stop local development containers",
98
- )
99
- down_parser.add_argument(
100
- "--volumes",
101
- "-v",
102
- action="store_true",
103
- help="Also remove volumes (deletes all data)",
104
- )
105
52
 
106
- # 'local status' command
107
- local_subparsers.add_parser(
108
- "status",
109
- help="Show status of local infrastructure",
110
- )
53
+ @app.command()
54
+ def version() -> None:
55
+ """Show version information."""
56
+ try:
57
+ from atdata import __version__
111
58
 
112
- # 'diagnose' command
113
- diagnose_parser = subparsers.add_parser(
114
- "diagnose",
115
- help="Diagnose Redis configuration and connectivity",
116
- )
117
- diagnose_parser.add_argument(
118
- "--host",
119
- default="localhost",
120
- help="Redis host (default: localhost)",
121
- )
122
- diagnose_parser.add_argument(
123
- "--port",
124
- type=int,
125
- default=6379,
126
- help="Redis port (default: 6379)",
127
- )
59
+ ver = __version__
60
+ except ImportError:
61
+ from importlib.metadata import version as pkg_version
128
62
 
129
- # 'version' command (alternative to --version flag)
130
- subparsers.add_parser(
131
- "version",
132
- help="Show version information",
133
- )
63
+ ver = pkg_version("atdata")
134
64
 
135
- args = parser.parse_args(argv)
136
-
137
- # Handle --version flag
138
- if args.version or args.command == "version":
139
- return _cmd_version()
140
-
141
- # Handle 'local' commands
142
- if args.command == "local":
143
- if args.local_command == "up":
144
- return _cmd_local_up(
145
- redis_port=args.redis_port,
146
- minio_port=args.minio_port,
147
- minio_console_port=args.minio_console_port,
148
- detach=args.detach,
149
- )
150
- elif args.local_command == "down":
151
- return _cmd_local_down(remove_volumes=args.volumes)
152
- elif args.local_command == "status":
153
- return _cmd_local_status()
154
- else:
155
- local_parser.print_help()
156
- return 1
65
+ print(f"atdata {ver}")
157
66
 
158
- # Handle 'diagnose' command
159
- if args.command == "diagnose":
160
- return _cmd_diagnose(host=args.host, port=args.port)
161
67
 
162
- # No command given
163
- parser.print_help()
164
- return 0
68
+ @app.command()
69
+ def inspect(
70
+ url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
71
+ ) -> None:
72
+ """Show dataset summary (sample count, schema, shards)."""
73
+ from .inspect import inspect_dataset
165
74
 
75
+ code = inspect_dataset(url=url)
76
+ raise typer.Exit(code=code)
166
77
 
167
- def _cmd_version() -> int:
168
- """Show version information."""
169
- try:
170
- from atdata import __version__
171
78
 
172
- version = __version__
173
- except ImportError:
174
- # Fallback to package metadata
175
- from importlib.metadata import version as pkg_version
79
@app.command()
def preview(
    url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
    limit: int = typer.Option(5, help="Number of samples to preview."),
) -> None:
    """Preview first N samples of a dataset."""
    # The implementation is imported only when the command actually runs.
    from .preview import preview_dataset as _run_preview

    # Surface the implementation's integer result as the process exit code.
    raise typer.Exit(code=_run_preview(url=url, limit=limit))
89
+
90
+
91
+ @app.command()
92
+ def diagnose(
93
+ host: str = typer.Option("localhost", help="Redis host."),
94
+ port: int = typer.Option(6379, help="Redis port."),
95
+ ) -> None:
96
+ """Diagnose Redis configuration and connectivity."""
97
+ from .diagnose import diagnose_redis
176
98
 
177
- version = pkg_version("atdata")
99
+ code = diagnose_redis(host=host, port=port)
100
+ raise typer.Exit(code=code)
178
101
 
179
- print(f"atdata {version}")
180
- return 0
181
102
 
103
+ # ---------------------------------------------------------------------------
104
+ # infra sub-commands
105
+ # ---------------------------------------------------------------------------
182
106
 
183
- def _cmd_local_up(
184
- redis_port: int,
185
- minio_port: int,
186
- minio_console_port: int,
187
- detach: bool,
188
- ) -> int:
189
- """Start local development infrastructure."""
190
- from .local import local_up
191
107
 
192
- return local_up(
108
+ @infra_app.command()
109
+ def up(
110
+ redis_port: int = typer.Option(6379, help="Redis port."),
111
+ minio_port: int = typer.Option(9000, help="MinIO API port."),
112
+ minio_console_port: int = typer.Option(9001, help="MinIO console port."),
113
+ detach: bool = typer.Option(
114
+ True, "--detach", "-d", help="Run containers in detached mode."
115
+ ),
116
+ ) -> None:
117
+ """Start Redis and MinIO containers."""
118
+ from .infra import local_up
119
+
120
+ code = local_up(
193
121
  redis_port=redis_port,
194
122
  minio_port=minio_port,
195
123
  minio_console_port=minio_console_port,
196
124
  detach=detach,
197
125
  )
126
+ raise typer.Exit(code=code)
198
127
 
199
128
 
200
- def _cmd_local_down(remove_volumes: bool) -> int:
201
- """Stop local development infrastructure."""
202
- from .local import local_down
129
+ @infra_app.command()
130
+ def down(
131
+ volumes: bool = typer.Option(
132
+ False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
133
+ ),
134
+ ) -> None:
135
+ """Stop local development containers."""
136
+ from .infra import local_down
203
137
 
204
- return local_down(remove_volumes=remove_volumes)
138
+ code = local_down(remove_volumes=volumes)
139
+ raise typer.Exit(code=code)
205
140
 
206
141
 
207
- def _cmd_local_status() -> int:
208
- """Show status of local infrastructure."""
209
- from .local import local_status
142
+ @infra_app.command()
143
+ def status() -> None:
144
+ """Show status of infrastructure."""
145
+ from .infra import local_status
210
146
 
211
- return local_status()
147
+ code = local_status()
148
+ raise typer.Exit(code=code)
212
149
 
213
150
 
214
- def _cmd_diagnose(host: str, port: int) -> int:
215
- """Diagnose Redis configuration."""
216
- from .diagnose import diagnose_redis
151
+ # ---------------------------------------------------------------------------
152
+ # schema sub-commands
153
+ # ---------------------------------------------------------------------------
154
+
155
+
156
@schema_app.command("show")
def schema_show(
    dataset_ref: str = typer.Argument(
        help="Dataset URL, local path, or index reference."
    ),
) -> None:
    """Display dataset schema."""
    # Imported lazily and under an alias so it does not shadow this command.
    from .schema import schema_show as _show_impl

    # Surface the implementation's integer result as the process exit code.
    raise typer.Exit(code=_show_impl(dataset_ref=dataset_ref))
167
+
168
+
169
+ @schema_app.command("diff")
170
+ def schema_diff(
171
+ url_a: str = typer.Argument(help="First dataset URL."),
172
+ url_b: str = typer.Argument(help="Second dataset URL."),
173
+ ) -> None:
174
+ """Compare two dataset schemas."""
175
+ from .schema import schema_diff as _schema_diff
217
176
 
218
- return diagnose_redis(host=host, port=port)
177
+ code = _schema_diff(url_a=url_a, url_b=url_b)
178
+ raise typer.Exit(code=code)
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Entrypoint
183
+ # ---------------------------------------------------------------------------
184
+
185
+
186
def main(argv: list[str] | None = None) -> int:
    """Main entry point for the atdata CLI.

    The Typer app is invoked with ``standalone_mode=False`` so that it does
    not terminate the interpreter itself. In that mode an exit request raised
    by a command (``typer.Exit(code=...)``) is handed back as the *return
    value* of the app call rather than as ``SystemExit``, so the return value
    must be inspected to obtain the real exit status.

    Args:
        argv: Command-line arguments. If None, uses sys.argv[1:].

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    try:
        if argv is not None:
            result = app(args=argv, standalone_mode=False)
        else:
            result = app(standalone_mode=False)
    except SystemExit as exc:
        # ``sys.exit()`` / ``sys.exit(None)`` mean success; an integer is the
        # status itself; anything else (e.g. a message string) is a failure.
        if exc.code is None:
            return 0
        return exc.code if isinstance(exc.code, int) else 1
    except Exception as exc:
        # Non-standalone mode re-raises usage errors and aborts instead of
        # printing them; report the problem rather than failing silently.
        print(f"Error: {exc}", file=sys.stderr)
        return 1

    # Commands themselves return None; an int here is the code carried by an
    # exit request raised inside a command.
    return result if isinstance(result, int) else 0
219
205
 
220
206
 
221
207
  if __name__ == "__main__":
atdata/cli/diagnose.py CHANGED
@@ -51,7 +51,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
51
51
  _print_status("Connection", False, str(e))
52
52
  print()
53
53
  print("Cannot connect to Redis. Make sure Redis is running:")
54
- print(" atdata local up")
54
+ print(" atdata infra up")
55
55
  return 1
56
56
 
57
57
  # Check Redis version
@@ -162,7 +162,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
162
162
  print(" maxmemory-policy noeviction")
163
163
  print()
164
164
  print(" # Or use atdata's preconfigured local setup:")
165
- print(" atdata local up")
165
+ print(" atdata infra up")
166
166
  return 1
167
167
  else:
168
168
  print("All checks passed. Redis is properly configured for atdata.")
@@ -1,6 +1,6 @@
1
- """Local infrastructure management for atdata.
1
+ """Infrastructure management for atdata.
2
2
 
3
- This module provides commands to start and stop local development infrastructure:
3
+ This module provides commands to start and stop development infrastructure:
4
4
  - Redis: For index storage and metadata
5
5
  - MinIO: S3-compatible object storage for dataset files
6
6
 
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
114
114
  timeout=5,
115
115
  )
116
116
  return result.returncode == 0 and result.stdout.strip() == "true"
117
- except Exception:
117
+ except (OSError, subprocess.SubprocessError):
118
118
  return False
119
119
 
120
120
 
@@ -179,7 +179,7 @@ def local_up(
179
179
  if not _check_docker():
180
180
  return 1
181
181
 
182
- print("Starting atdata local infrastructure...")
182
+ print("Starting atdata infrastructure...")
183
183
 
184
184
  compose_content = _get_compose_file(redis_port, minio_port, minio_console_port)
185
185
  command = ["up"]
@@ -202,7 +202,7 @@ def local_up(
202
202
 
203
203
  # Show status
204
204
  print()
205
- print("Local infrastructure started:")
205
+ print("Infrastructure started:")
206
206
  print(f" Redis: localhost:{redis_port}")
207
207
  print(f" MinIO API: http://localhost:{minio_port}")
208
208
  print(f" MinIO Console: http://localhost:{minio_console_port}")
@@ -210,7 +210,7 @@ def local_up(
210
210
  print("MinIO credentials: minioadmin / minioadmin")
211
211
  print()
212
212
  print("Example usage:")
213
- print(" from atdata.local import Index, S3DataStore")
213
+ print(" from atdata.stores import S3DataStore")
214
214
  print(" ")
215
215
  print(" store = S3DataStore.from_credentials({")
216
216
  print(f" 'AWS_ENDPOINT': 'http://localhost:{minio_port}',")
@@ -234,7 +234,7 @@ def local_down(remove_volumes: bool = False) -> int:
234
234
  if not _check_docker():
235
235
  return 1
236
236
 
237
- print("Stopping atdata local infrastructure...")
237
+ print("Stopping atdata infrastructure...")
238
238
 
239
239
  # Use default ports for compose file (actual ports don't matter for down)
240
240
  compose_content = _get_compose_file(6379, 9000, 9001)
@@ -252,7 +252,7 @@ def local_down(remove_volumes: bool = False) -> int:
252
252
  print(f"Error: {e}", file=sys.stderr)
253
253
  return 1
254
254
 
255
- print("Local infrastructure stopped.")
255
+ print("Infrastructure stopped.")
256
256
  return 0
257
257
 
258
258
 
@@ -268,16 +268,16 @@ def local_status() -> int:
268
268
  redis_running = _container_running(REDIS_CONTAINER)
269
269
  minio_running = _container_running(MINIO_CONTAINER)
270
270
 
271
- print("atdata local infrastructure status:")
271
+ print("atdata infrastructure status:")
272
272
  print()
273
273
  print(f" Redis ({REDIS_CONTAINER}): {'running' if redis_running else 'stopped'}")
274
274
  print(f" MinIO ({MINIO_CONTAINER}): {'running' if minio_running else 'stopped'}")
275
275
 
276
276
  if redis_running or minio_running:
277
277
  print()
278
- print("To stop: atdata local down")
278
+ print("To stop: atdata infra down")
279
279
  else:
280
280
  print()
281
- print("To start: atdata local up")
281
+ print("To start: atdata infra up")
282
282
 
283
283
  return 0
atdata/cli/inspect.py ADDED
@@ -0,0 +1,69 @@
1
+ """``atdata inspect`` command — show dataset summary information."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
def inspect_dataset(url: str) -> int:
    """Print a summary of the dataset found at *url*.

    The summary lists the shards, describes the fields of the first sample
    (used as an inferred schema), and reports the total number of samples,
    which requires iterating over the whole dataset.

    Args:
        url: WebDataset URL, local path, or atmosphere URI.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        dataset = Dataset[DictSample](url)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    try:
        shard_names = dataset.list_shards()
        print(f"URL: {url}")
        print(f"Shards: {len(shard_names)}")
        for name in shard_names:
            print(f" - {name}")

        # The first sample stands in for the schema.
        first = dataset.head(1)
        if not first:
            print("Schema: (no samples found)")
        else:
            record = first[0]
            print("Schema: (inferred from first sample)")
            for field in record.keys():
                print(f" {field}: {_describe_value(record[field])}")

        # Counting walks every sample in order.
        total = 0
        for _ in dataset.ordered():
            total += 1
        print(f"Samples: {total}")
    except Exception as err:
        print(f"Error reading dataset: {err}", file=sys.stderr)
        return 1

    return 0
52
+
53
+
54
+ def _describe_value(val: Any) -> str:
55
+ """Human-readable type description for a sample field value."""
56
+ import numpy as np
57
+
58
+ if isinstance(val, np.ndarray):
59
+ return f"ndarray dtype={val.dtype} shape={val.shape}"
60
+ if isinstance(val, bytes):
61
+ return f"bytes len={len(val)}"
62
+ if isinstance(val, str):
63
+ truncated = val[:60] + ("..." if len(val) > 60 else "")
64
+ return f'str "{truncated}"'
65
+ if isinstance(val, (int, float, bool)):
66
+ return f"{type(val).__name__} {val}"
67
+ if isinstance(val, list):
68
+ return f"list len={len(val)}"
69
+ return type(val).__name__
atdata/cli/preview.py ADDED
@@ -0,0 +1,63 @@
1
+ """``atdata preview`` command — render first N samples of a dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
def preview_dataset(url: str, limit: int = 5) -> int:
    """Print a human-readable preview of the first *limit* samples.

    Failures while opening or reading the dataset are reported on stderr and
    turned into exit code 1 instead of propagating as a traceback.

    Args:
        url: Dataset URL, local path, or atmosphere URI.
        limit: Number of samples to show. Default: 5.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Fetching and rendering samples both touch the underlying data, so both
    # are guarded; this mirrors how ``atdata inspect`` reports read errors.
    try:
        samples = ds.head(limit)
        if not samples:
            print("No samples found.", file=sys.stderr)
            return 1

        print(f"Preview of {url} ({len(samples)} sample(s)):")
        print()

        for i, sample in enumerate(samples):
            print(f"--- Sample {i} ---")
            for key in sample.keys():
                val = sample[key]
                print(f" {key}: {_format_value(val)}")
            print()
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    return 0
43
+
44
+
45
+ def _format_value(val: Any) -> str:
46
+ """Format a value for preview, truncating large data."""
47
+ import numpy as np
48
+
49
+ if isinstance(val, np.ndarray):
50
+ return f"ndarray shape={val.shape} dtype={val.dtype}"
51
+ if isinstance(val, bytes):
52
+ if len(val) <= 40:
53
+ return repr(val)
54
+ return f"bytes[{len(val)}] {val[:20]!r}..."
55
+ if isinstance(val, str):
56
+ if len(val) <= 80:
57
+ return repr(val)
58
+ return repr(val[:77] + "...")
59
+ if isinstance(val, list):
60
+ if len(val) <= 5:
61
+ return repr(val)
62
+ return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
63
+ return repr(val)