atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +30 -0
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +29 -15
- atdata/_hf_api.py +63 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +19 -62
- atdata/_schema_codec.py +5 -4
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +19 -9
- atdata/atmosphere/records.py +3 -2
- atdata/atmosphere/schema.py +2 -2
- atdata/cli/__init__.py +157 -171
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +1 -1
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +428 -326
- atdata/lens.py +9 -2
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +4 -4
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/_schema_codec.py
CHANGED
|
@@ -28,13 +28,14 @@ import hashlib
|
|
|
28
28
|
|
|
29
29
|
from numpy.typing import NDArray
|
|
30
30
|
|
|
31
|
-
# Import PackableSample for inheritance
|
|
31
|
+
# Import PackableSample for inheritance in dynamic class generation
|
|
32
32
|
from .dataset import PackableSample
|
|
33
|
+
from ._protocols import Packable
|
|
33
34
|
|
|
34
35
|
|
|
35
36
|
# Type cache to avoid regenerating identical types
|
|
36
37
|
# Uses insertion order (Python 3.7+) for simple FIFO eviction
|
|
37
|
-
_type_cache: dict[str, Type[
|
|
38
|
+
_type_cache: dict[str, Type[Packable]] = {}
|
|
38
39
|
_TYPE_CACHE_MAX_SIZE = 256
|
|
39
40
|
|
|
40
41
|
|
|
@@ -130,7 +131,7 @@ def schema_to_type(
|
|
|
130
131
|
schema: dict,
|
|
131
132
|
*,
|
|
132
133
|
use_cache: bool = True,
|
|
133
|
-
) -> Type[
|
|
134
|
+
) -> Type[Packable]:
|
|
134
135
|
"""Generate a PackableSample subclass from a schema record.
|
|
135
136
|
|
|
136
137
|
This function dynamically creates a dataclass that inherits from PackableSample,
|
|
@@ -420,7 +421,7 @@ def clear_type_cache() -> None:
|
|
|
420
421
|
_type_cache.clear()
|
|
421
422
|
|
|
422
423
|
|
|
423
|
-
def get_cached_types() -> dict[str, Type[
|
|
424
|
+
def get_cached_types() -> dict[str, Type[Packable]]:
|
|
424
425
|
"""Get a copy of the current type cache.
|
|
425
426
|
|
|
426
427
|
Returns:
|
atdata/_type_utils.py
CHANGED
|
@@ -45,9 +45,13 @@ def numpy_dtype_to_string(dtype: Any) -> str:
|
|
|
45
45
|
Schema dtype string (e.g., "float32", "int64"). Defaults to "float32".
|
|
46
46
|
"""
|
|
47
47
|
dtype_str = str(dtype)
|
|
48
|
-
|
|
48
|
+
# Exact match first (handles "float32", "int64", etc.)
|
|
49
|
+
if dtype_str in NUMPY_DTYPE_MAP:
|
|
50
|
+
return NUMPY_DTYPE_MAP[dtype_str]
|
|
51
|
+
# Substring match, longest keys first to avoid "int8" matching "uint8"
|
|
52
|
+
for key in sorted(NUMPY_DTYPE_MAP, key=len, reverse=True):
|
|
49
53
|
if key in dtype_str:
|
|
50
|
-
return
|
|
54
|
+
return NUMPY_DTYPE_MAP[key]
|
|
51
55
|
return "float32"
|
|
52
56
|
|
|
53
57
|
|
|
@@ -102,3 +106,25 @@ def extract_ndarray_dtype(python_type: Any) -> str:
|
|
|
102
106
|
if dtype_arg is not None:
|
|
103
107
|
return numpy_dtype_to_string(dtype_arg)
|
|
104
108
|
return "float32"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def parse_semver(version: str) -> tuple[int, int, int]:
    """Parse a semantic version string into a comparable tuple.

    The string must consist of exactly three dot-separated components, each
    made up solely of ASCII digits. Leading zeros are tolerated (``"01.2.3"``
    parses as ``(1, 2, 3)``); pre-release and build suffixes are rejected.

    Args:
        version: A ``"major.minor.patch"`` version string.

    Returns:
        Tuple of (major, minor, patch) integers.

    Raises:
        ValueError: If the version string is not valid semver.

    Examples:
        >>> parse_semver("1.2.3")
        (1, 2, 3)
    """
    parts = version.split(".")
    # ``int()`` on its own is too lenient for version components: it accepts
    # signs, surrounding whitespace and underscores ("-2", " 2", "1_0").
    # Checking for plain ASCII digits first also guarantees every rejection
    # carries the same "Invalid semver" message.
    if len(parts) != 3 or not all(p.isascii() and p.isdigit() for p in parts):
        raise ValueError(f"Invalid semver: {version}")
    major, minor, patch = (int(p) for p in parts)
    return major, minor, patch
|
atdata/atmosphere/__init__.py
CHANGED
|
@@ -99,23 +99,25 @@ class AtmosphereIndexEntry:
|
|
|
99
99
|
class AtmosphereIndex:
|
|
100
100
|
"""ATProto index implementing AbstractIndex protocol.
|
|
101
101
|
|
|
102
|
+
.. deprecated::
|
|
103
|
+
Use ``atdata.Index(atmosphere=client)`` instead. ``AtmosphereIndex``
|
|
104
|
+
is retained for backwards compatibility and will be removed in a
|
|
105
|
+
future release.
|
|
106
|
+
|
|
102
107
|
Wraps SchemaPublisher/Loader and DatasetPublisher/Loader to provide
|
|
103
|
-
a unified interface compatible with
|
|
108
|
+
a unified interface compatible with Index.
|
|
104
109
|
|
|
105
110
|
Optionally accepts a ``PDSBlobStore`` for writing dataset shards as
|
|
106
111
|
ATProto blobs, enabling fully decentralized dataset storage.
|
|
107
112
|
|
|
108
113
|
Examples:
|
|
109
|
-
>>>
|
|
110
|
-
>>>
|
|
114
|
+
>>> # Preferred: use unified Index
|
|
115
|
+
>>> from atdata.local import Index
|
|
116
|
+
>>> from atdata.atmosphere import AtmosphereClient
|
|
117
|
+
>>> index = Index(atmosphere=client)
|
|
111
118
|
>>>
|
|
112
|
-
>>> #
|
|
119
|
+
>>> # Legacy (deprecated)
|
|
113
120
|
>>> index = AtmosphereIndex(client)
|
|
114
|
-
>>>
|
|
115
|
-
>>> # With PDS blob storage
|
|
116
|
-
>>> store = PDSBlobStore(client)
|
|
117
|
-
>>> index = AtmosphereIndex(client, data_store=store)
|
|
118
|
-
>>> entry = index.insert_dataset(dataset, name="my-data")
|
|
119
121
|
"""
|
|
120
122
|
|
|
121
123
|
def __init__(
|
|
@@ -131,6 +133,14 @@ class AtmosphereIndex:
|
|
|
131
133
|
data_store: Optional PDSBlobStore for writing shards as blobs.
|
|
132
134
|
If provided, insert_dataset will upload shards to PDS.
|
|
133
135
|
"""
|
|
136
|
+
import warnings
|
|
137
|
+
|
|
138
|
+
warnings.warn(
|
|
139
|
+
"AtmosphereIndex is deprecated. Use atdata.Index(atmosphere=client) "
|
|
140
|
+
"instead for unified index access.",
|
|
141
|
+
DeprecationWarning,
|
|
142
|
+
stacklevel=2,
|
|
143
|
+
)
|
|
134
144
|
self.client = client
|
|
135
145
|
self._schema_publisher = SchemaPublisher(client)
|
|
136
146
|
self._schema_loader = SchemaLoader(client)
|
atdata/atmosphere/records.py
CHANGED
|
@@ -21,9 +21,10 @@ from ._types import (
|
|
|
21
21
|
from typing import TYPE_CHECKING
|
|
22
22
|
|
|
23
23
|
if TYPE_CHECKING:
|
|
24
|
-
from ..dataset import
|
|
24
|
+
from ..dataset import Dataset
|
|
25
|
+
from .._protocols import Packable
|
|
25
26
|
|
|
26
|
-
ST = TypeVar("ST", bound="
|
|
27
|
+
ST = TypeVar("ST", bound="Packable")
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
class DatasetPublisher:
|
atdata/atmosphere/schema.py
CHANGED
|
@@ -26,9 +26,9 @@ from .._type_utils import (
|
|
|
26
26
|
from typing import TYPE_CHECKING
|
|
27
27
|
|
|
28
28
|
if TYPE_CHECKING:
|
|
29
|
-
from ..
|
|
29
|
+
from .._protocols import Packable
|
|
30
30
|
|
|
31
|
-
ST = TypeVar("ST", bound="
|
|
31
|
+
ST = TypeVar("ST", bound="Packable")
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class SchemaPublisher:
|
atdata/cli/__init__.py
CHANGED
|
@@ -1,221 +1,207 @@
|
|
|
1
1
|
"""Command-line interface for atdata.
|
|
2
2
|
|
|
3
|
-
This module provides CLI commands for managing local development infrastructure
|
|
4
|
-
and diagnosing configuration issues.
|
|
3
|
+
This module provides CLI commands for managing local development infrastructure,
|
|
4
|
+
inspecting datasets, and diagnosing configuration issues.
|
|
5
5
|
|
|
6
6
|
Commands:
|
|
7
|
-
atdata local up
|
|
8
|
-
atdata local down
|
|
9
|
-
atdata
|
|
10
|
-
atdata
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
Local infrastructure ready.
|
|
17
|
-
|
|
18
|
-
$ atdata diagnose
|
|
19
|
-
Checking Redis configuration...
|
|
20
|
-
✓ Redis connected
|
|
21
|
-
✓ Persistence enabled (AOF)
|
|
22
|
-
✓ Memory policy: noeviction
|
|
7
|
+
atdata local up Start Redis and MinIO containers for local development
|
|
8
|
+
atdata local down Stop local development containers
|
|
9
|
+
atdata local status Show status of local infrastructure
|
|
10
|
+
atdata diagnose Check Redis configuration and connectivity
|
|
11
|
+
atdata inspect Show dataset summary information
|
|
12
|
+
atdata schema show Display dataset schema
|
|
13
|
+
atdata schema diff Compare two dataset schemas
|
|
14
|
+
atdata preview Preview first N samples of a dataset
|
|
15
|
+
atdata version Show version information
|
|
23
16
|
"""
|
|
24
17
|
|
|
25
|
-
import argparse
|
|
26
18
|
import sys
|
|
27
|
-
from typing import Sequence
|
|
28
19
|
|
|
20
|
+
import typer
|
|
29
21
|
|
|
30
|
-
|
|
31
|
-
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# App hierarchy
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
32
25
|
|
|
33
|
-
|
|
34
|
-
|
|
26
|
+
app = typer.Typer(
|
|
27
|
+
name="atdata",
|
|
28
|
+
help="A loose federation of distributed, typed datasets.",
|
|
29
|
+
add_completion=False,
|
|
30
|
+
no_args_is_help=True,
|
|
31
|
+
)
|
|
35
32
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
""
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
43
|
-
)
|
|
44
|
-
parser.add_argument(
|
|
45
|
-
"--version",
|
|
46
|
-
"-v",
|
|
47
|
-
action="store_true",
|
|
48
|
-
help="Show version information",
|
|
49
|
-
)
|
|
33
|
+
local_app = typer.Typer(
|
|
34
|
+
name="local",
|
|
35
|
+
help="Manage local development infrastructure.",
|
|
36
|
+
no_args_is_help=True,
|
|
37
|
+
)
|
|
38
|
+
app.add_typer(local_app, name="local")
|
|
50
39
|
|
|
51
|
-
|
|
40
|
+
schema_app = typer.Typer(
|
|
41
|
+
name="schema",
|
|
42
|
+
help="Show or compare dataset schemas.",
|
|
43
|
+
no_args_is_help=True,
|
|
44
|
+
)
|
|
45
|
+
app.add_typer(schema_app, name="schema")
|
|
52
46
|
|
|
53
|
-
# 'local' command group
|
|
54
|
-
local_parser = subparsers.add_parser(
|
|
55
|
-
"local",
|
|
56
|
-
help="Manage local development infrastructure",
|
|
57
|
-
)
|
|
58
|
-
local_subparsers = local_parser.add_subparsers(
|
|
59
|
-
dest="local_command",
|
|
60
|
-
help="Local infrastructure commands",
|
|
61
|
-
)
|
|
62
47
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
help="Start Redis and MinIO containers",
|
|
67
|
-
)
|
|
68
|
-
up_parser.add_argument(
|
|
69
|
-
"--redis-port",
|
|
70
|
-
type=int,
|
|
71
|
-
default=6379,
|
|
72
|
-
help="Redis port (default: 6379)",
|
|
73
|
-
)
|
|
74
|
-
up_parser.add_argument(
|
|
75
|
-
"--minio-port",
|
|
76
|
-
type=int,
|
|
77
|
-
default=9000,
|
|
78
|
-
help="MinIO API port (default: 9000)",
|
|
79
|
-
)
|
|
80
|
-
up_parser.add_argument(
|
|
81
|
-
"--minio-console-port",
|
|
82
|
-
type=int,
|
|
83
|
-
default=9001,
|
|
84
|
-
help="MinIO console port (default: 9001)",
|
|
85
|
-
)
|
|
86
|
-
up_parser.add_argument(
|
|
87
|
-
"--detach",
|
|
88
|
-
"-d",
|
|
89
|
-
action="store_true",
|
|
90
|
-
default=True,
|
|
91
|
-
help="Run containers in detached mode (default: True)",
|
|
92
|
-
)
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Top-level commands
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
93
51
|
|
|
94
|
-
# 'local down' command
|
|
95
|
-
down_parser = local_subparsers.add_parser(
|
|
96
|
-
"down",
|
|
97
|
-
help="Stop local development containers",
|
|
98
|
-
)
|
|
99
|
-
down_parser.add_argument(
|
|
100
|
-
"--volumes",
|
|
101
|
-
"-v",
|
|
102
|
-
action="store_true",
|
|
103
|
-
help="Also remove volumes (deletes all data)",
|
|
104
|
-
)
|
|
105
52
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
53
|
+
@app.command()
def version() -> None:
    """Show version information."""
    # Prefer the attribute exported by the package itself; if that import
    # fails, ask the installed distribution metadata instead.
    try:
        from atdata import __version__ as ver
    except ImportError:
        from importlib.metadata import version as pkg_version

        ver = pkg_version("atdata")

    print(f"atdata {ver}")
|
|
157
66
|
|
|
158
|
-
# Handle 'diagnose' command
|
|
159
|
-
if args.command == "diagnose":
|
|
160
|
-
return _cmd_diagnose(host=args.host, port=args.port)
|
|
161
67
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
68
|
+
@app.command()
def inspect(
    url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
) -> None:
    """Show dataset summary (sample count, schema, shards)."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .inspect import inspect_dataset

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = inspect_dataset(url=url)
    raise typer.Exit(code=exit_code)
|
|
166
77
|
|
|
167
|
-
def _cmd_version() -> int:
|
|
168
|
-
"""Show version information."""
|
|
169
|
-
try:
|
|
170
|
-
from atdata import __version__
|
|
171
78
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
79
|
+
@app.command()
def preview(
    url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
    limit: int = typer.Option(5, help="Number of samples to preview."),
) -> None:
    """Preview first N samples of a dataset."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .preview import preview_dataset

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = preview_dataset(url=url, limit=limit)
    raise typer.Exit(code=exit_code)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@app.command()
def diagnose(
    host: str = typer.Option("localhost", help="Redis host."),
    port: int = typer.Option(6379, help="Redis port."),
) -> None:
    """Diagnose Redis configuration and connectivity."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .diagnose import diagnose_redis

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = diagnose_redis(host=host, port=port)
    raise typer.Exit(code=exit_code)
|
|
178
101
|
|
|
179
|
-
print(f"atdata {version}")
|
|
180
|
-
return 0
|
|
181
102
|
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# local sub-commands
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
182
106
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
107
|
+
|
|
108
|
+
@local_app.command()
def up(
    redis_port: int = typer.Option(6379, help="Redis port."),
    minio_port: int = typer.Option(9000, help="MinIO API port."),
    minio_console_port: int = typer.Option(9001, help="MinIO console port."),
    # The flag defaults to True, so it needs an explicit negative form;
    # with only "--detach" declared there was no way to ever pass False.
    detach: bool = typer.Option(
        True,
        "--detach/--no-detach",
        "-d",
        help="Run containers in detached mode.",
    ),
) -> None:
    """Start Redis and MinIO containers."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .local import local_up

    # local_up returns a process exit code; hand it to Typer via Exit.
    code = local_up(
        redis_port=redis_port,
        minio_port=minio_port,
        minio_console_port=minio_console_port,
        detach=detach,
    )
    raise typer.Exit(code=code)
|
|
198
127
|
|
|
199
128
|
|
|
200
|
-
|
|
201
|
-
|
|
129
|
+
@local_app.command()
def down(
    volumes: bool = typer.Option(
        False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
    ),
) -> None:
    """Stop local development containers."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .local import local_down

    # The CLI flag is called "volumes"; the helper names it remove_volumes.
    exit_code = local_down(remove_volumes=volumes)
    raise typer.Exit(code=exit_code)
|
|
205
140
|
|
|
206
141
|
|
|
207
|
-
|
|
142
|
+
@local_app.command()
def status() -> None:
    """Show status of local infrastructure."""
    # Imported lazily so the implementation module is only loaded when this
    # command actually runs.
    from .local import local_status

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = local_status()
    raise typer.Exit(code=exit_code)
|
|
212
149
|
|
|
213
150
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
# schema sub-commands
|
|
153
|
+
# ---------------------------------------------------------------------------
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@schema_app.command("show")
def schema_show(
    dataset_ref: str = typer.Argument(
        help="Dataset URL, local path, or index reference."
    ),
) -> None:
    """Display dataset schema."""
    # The implementation shares this command's name, so it is imported under
    # an alias to avoid shadowing the wrapper inside its own body.
    from .schema import schema_show as _schema_show

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = _schema_show(dataset_ref=dataset_ref)
    raise typer.Exit(code=exit_code)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@schema_app.command("diff")
def schema_diff(
    url_a: str = typer.Argument(help="First dataset URL."),
    url_b: str = typer.Argument(help="Second dataset URL."),
) -> None:
    """Compare two dataset schemas."""
    # The implementation shares this command's name, so it is imported under
    # an alias to avoid shadowing the wrapper inside its own body.
    from .schema import schema_diff as _schema_diff

    # The helper returns a process exit code; hand it to Typer via Exit.
    exit_code = _schema_diff(url_a=url_a, url_b=url_b)
    raise typer.Exit(code=exit_code)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# ---------------------------------------------------------------------------
|
|
182
|
+
# Entrypoint
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def main(argv: list[str] | None = None) -> int:
    """Main entry point for the atdata CLI.

    Runs the Typer application with ``standalone_mode=False`` so that the
    outcome is returned to the caller instead of terminating the process.

    Args:
        argv: Command-line arguments. If None, uses sys.argv[1:].

    Returns:
        Exit code (0 for success, non-zero for errors).
    """
    try:
        # Passing ``args=None`` makes Click fall back to ``sys.argv[1:]``, so
        # a single call covers both the explicit and the implicit case.
        result = app(args=argv, standalone_mode=False)
    except SystemExit as exc:
        if exc.code is None:
            return 0
        if isinstance(exc.code, int):
            return exc.code
        # A non-integer code is an error message; the interpreter would print
        # it and exit with status 1, so do the same here.
        print(exc.code, file=sys.stderr)
        return 1
    except Exception as exc:
        # In non-standalone mode usage errors propagate to the caller instead
        # of being printed. Click-style exceptions know how to render
        # themselves via ``show()``; anything else gets a plain message so the
        # failure is never silent.
        show = getattr(exc, "show", None)
        if callable(show):
            show()
        else:
            print(f"Error: {str(exc) or type(exc).__name__}", file=sys.stderr)
        return 1

    # In non-standalone mode an Exit raised by a command (every command here
    # finishes with ``raise typer.Exit(code=...)``) is not re-raised as
    # SystemExit: its exit code comes back as the return value. Ignoring it
    # would report success for failed commands.
    return result if isinstance(result, int) else 0
|
|
219
205
|
|
|
220
206
|
|
|
221
207
|
if __name__ == "__main__":
|
atdata/cli/inspect.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""``atdata inspect`` command — show dataset summary information."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def inspect_dataset(url: str) -> int:
    """Print a summary of the dataset at *url*: shards, schema, sample count.

    The schema is inferred from the first sample, and the sample count is
    obtained by iterating over the whole dataset.

    Args:
        url: WebDataset URL, local path, or atmosphere URI.

    Returns:
        Exit code (0 success, 1 failure).
    """
    # Phase 1: open the dataset. Any failure (including the import) is
    # reported on stderr and mapped to exit code 1.
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Phase 2: read from it and print the summary.
    try:
        shard_list = ds.list_shards()
        print(f"URL: {url}")
        print(f"Shards: {len(shard_list)}")
        for shard_name in shard_list:
            print(f" - {shard_name}")

        # A single sample is enough to describe the fields.
        first_batch = ds.head(1)
        if not first_batch:
            print("Schema: (no samples found)")
        else:
            first = first_batch[0]
            print("Schema: (inferred from first sample)")
            for field in first.keys():
                print(f" {field}: {_describe_value(first[field])}")

        # Counting requires walking every sample in order.
        total = 0
        for _ in ds.ordered():
            total += 1
        print(f"Samples: {total}")
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1
    else:
        return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _describe_value(val: Any) -> str:
|
|
55
|
+
"""Human-readable type description for a sample field value."""
|
|
56
|
+
import numpy as np
|
|
57
|
+
|
|
58
|
+
if isinstance(val, np.ndarray):
|
|
59
|
+
return f"ndarray dtype={val.dtype} shape={val.shape}"
|
|
60
|
+
if isinstance(val, bytes):
|
|
61
|
+
return f"bytes len={len(val)}"
|
|
62
|
+
if isinstance(val, str):
|
|
63
|
+
truncated = val[:60] + ("..." if len(val) > 60 else "")
|
|
64
|
+
return f'str "{truncated}"'
|
|
65
|
+
if isinstance(val, (int, float, bool)):
|
|
66
|
+
return f"{type(val).__name__} {val}"
|
|
67
|
+
if isinstance(val, list):
|
|
68
|
+
return f"list len={len(val)}"
|
|
69
|
+
return type(val).__name__
|
atdata/cli/local.py
CHANGED
atdata/cli/preview.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""``atdata preview`` command — render first N samples of a dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preview_dataset(url: str, limit: int = 5) -> int:
    """Print a human-readable preview of the first *limit* samples.

    Errors while opening or reading the dataset are reported on stderr and
    turned into a non-zero exit code rather than propagating as exceptions.

    Args:
        url: Dataset URL, local path, or atmosphere URI.
        limit: Number of samples to show. Default: 5.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Fetching samples can fail independently of opening the dataset; report
    # it the same way instead of surfacing a traceback to the CLI user.
    try:
        samples = ds.head(limit)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    if not samples:
        print("No samples found.", file=sys.stderr)
        return 1

    print(f"Preview of {url} ({len(samples)} sample(s)):")
    print()

    for i, sample in enumerate(samples):
        print(f"--- Sample {i} ---")
        for key in sample.keys():
            val = sample[key]
            print(f" {key}: {_format_value(val)}")
        print()

    return 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format_value(val: Any) -> str:
|
|
46
|
+
"""Format a value for preview, truncating large data."""
|
|
47
|
+
import numpy as np
|
|
48
|
+
|
|
49
|
+
if isinstance(val, np.ndarray):
|
|
50
|
+
return f"ndarray shape={val.shape} dtype={val.dtype}"
|
|
51
|
+
if isinstance(val, bytes):
|
|
52
|
+
if len(val) <= 40:
|
|
53
|
+
return repr(val)
|
|
54
|
+
return f"bytes[{len(val)}] {val[:20]!r}..."
|
|
55
|
+
if isinstance(val, str):
|
|
56
|
+
if len(val) <= 80:
|
|
57
|
+
return repr(val)
|
|
58
|
+
return repr(val[:77] + "...")
|
|
59
|
+
if isinstance(val, list):
|
|
60
|
+
if len(val) <= 5:
|
|
61
|
+
return repr(val)
|
|
62
|
+
return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
|
|
63
|
+
return repr(val)
|