atdata 0.2.3b1__py3-none-any.whl → 0.3.1b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +39 -0
- atdata/_cid.py +0 -21
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +41 -15
- atdata/_hf_api.py +95 -11
- atdata/_logging.py +70 -0
- atdata/_protocols.py +77 -238
- atdata/_schema_codec.py +7 -6
- atdata/_stub_manager.py +5 -25
- atdata/_type_utils.py +28 -2
- atdata/atmosphere/__init__.py +31 -20
- atdata/atmosphere/_types.py +4 -4
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +11 -12
- atdata/atmosphere/records.py +12 -12
- atdata/atmosphere/schema.py +16 -18
- atdata/atmosphere/store.py +6 -7
- atdata/cli/__init__.py +161 -175
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +11 -11
- atdata/cli/inspect.py +69 -0
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +583 -328
- atdata/index/__init__.py +54 -0
- atdata/index/_entry.py +157 -0
- atdata/index/_index.py +1198 -0
- atdata/index/_schema.py +380 -0
- atdata/lens.py +9 -2
- atdata/lexicons/__init__.py +121 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +70 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +18 -14
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +123 -0
- atdata/stores/_s3.py +349 -0
- atdata/testing.py +341 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/METADATA +5 -2
- atdata-0.3.1b1.dist-info/RECORD +67 -0
- atdata/local.py +0 -1720
- atdata-0.2.3b1.dist-info/RECORD +0 -28
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.3b1.dist-info → atdata-0.3.1b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/__init__.py
CHANGED
|
@@ -1,221 +1,207 @@
|
|
|
1
1
|
"""Command-line interface for atdata.
|
|
2
2
|
|
|
3
|
-
This module provides CLI commands for managing
|
|
4
|
-
and diagnosing configuration issues.
|
|
3
|
+
This module provides CLI commands for managing development infrastructure,
|
|
4
|
+
inspecting datasets, and diagnosing configuration issues.
|
|
5
5
|
|
|
6
6
|
Commands:
|
|
7
|
-
atdata
|
|
8
|
-
atdata
|
|
9
|
-
atdata
|
|
10
|
-
atdata
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
Local infrastructure ready.
|
|
17
|
-
|
|
18
|
-
$ atdata diagnose
|
|
19
|
-
Checking Redis configuration...
|
|
20
|
-
✓ Redis connected
|
|
21
|
-
✓ Persistence enabled (AOF)
|
|
22
|
-
✓ Memory policy: noeviction
|
|
7
|
+
atdata infra up Start Redis and MinIO containers for development
|
|
8
|
+
atdata infra down Stop development containers
|
|
9
|
+
atdata infra status Show status of infrastructure
|
|
10
|
+
atdata diagnose Check Redis configuration and connectivity
|
|
11
|
+
atdata inspect Show dataset summary information
|
|
12
|
+
atdata schema show Display dataset schema
|
|
13
|
+
atdata schema diff Compare two dataset schemas
|
|
14
|
+
atdata preview Preview first N samples of a dataset
|
|
15
|
+
atdata version Show version information
|
|
23
16
|
"""
|
|
24
17
|
|
|
25
|
-
import argparse
|
|
26
18
|
import sys
|
|
27
|
-
from typing import Sequence
|
|
28
19
|
|
|
20
|
+
import typer
|
|
29
21
|
|
|
30
|
-
|
|
31
|
-
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
# App hierarchy
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
32
25
|
|
|
33
|
-
|
|
34
|
-
|
|
26
|
+
app = typer.Typer(
|
|
27
|
+
name="atdata",
|
|
28
|
+
help="A loose federation of distributed, typed datasets.",
|
|
29
|
+
add_completion=False,
|
|
30
|
+
no_args_is_help=True,
|
|
31
|
+
)
|
|
35
32
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
""
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
43
|
-
)
|
|
44
|
-
parser.add_argument(
|
|
45
|
-
"--version",
|
|
46
|
-
"-v",
|
|
47
|
-
action="store_true",
|
|
48
|
-
help="Show version information",
|
|
49
|
-
)
|
|
33
|
+
infra_app = typer.Typer(
|
|
34
|
+
name="infra",
|
|
35
|
+
help="Manage development infrastructure.",
|
|
36
|
+
no_args_is_help=True,
|
|
37
|
+
)
|
|
38
|
+
app.add_typer(infra_app, name="infra")
|
|
50
39
|
|
|
51
|
-
|
|
40
|
+
schema_app = typer.Typer(
|
|
41
|
+
name="schema",
|
|
42
|
+
help="Show or compare dataset schemas.",
|
|
43
|
+
no_args_is_help=True,
|
|
44
|
+
)
|
|
45
|
+
app.add_typer(schema_app, name="schema")
|
|
52
46
|
|
|
53
|
-
# 'local' command group
|
|
54
|
-
local_parser = subparsers.add_parser(
|
|
55
|
-
"local",
|
|
56
|
-
help="Manage local development infrastructure",
|
|
57
|
-
)
|
|
58
|
-
local_subparsers = local_parser.add_subparsers(
|
|
59
|
-
dest="local_command",
|
|
60
|
-
help="Local infrastructure commands",
|
|
61
|
-
)
|
|
62
47
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
help="Start Redis and MinIO containers",
|
|
67
|
-
)
|
|
68
|
-
up_parser.add_argument(
|
|
69
|
-
"--redis-port",
|
|
70
|
-
type=int,
|
|
71
|
-
default=6379,
|
|
72
|
-
help="Redis port (default: 6379)",
|
|
73
|
-
)
|
|
74
|
-
up_parser.add_argument(
|
|
75
|
-
"--minio-port",
|
|
76
|
-
type=int,
|
|
77
|
-
default=9000,
|
|
78
|
-
help="MinIO API port (default: 9000)",
|
|
79
|
-
)
|
|
80
|
-
up_parser.add_argument(
|
|
81
|
-
"--minio-console-port",
|
|
82
|
-
type=int,
|
|
83
|
-
default=9001,
|
|
84
|
-
help="MinIO console port (default: 9001)",
|
|
85
|
-
)
|
|
86
|
-
up_parser.add_argument(
|
|
87
|
-
"--detach",
|
|
88
|
-
"-d",
|
|
89
|
-
action="store_true",
|
|
90
|
-
default=True,
|
|
91
|
-
help="Run containers in detached mode (default: True)",
|
|
92
|
-
)
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Top-level commands
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
93
51
|
|
|
94
|
-
# 'local down' command
|
|
95
|
-
down_parser = local_subparsers.add_parser(
|
|
96
|
-
"down",
|
|
97
|
-
help="Stop local development containers",
|
|
98
|
-
)
|
|
99
|
-
down_parser.add_argument(
|
|
100
|
-
"--volumes",
|
|
101
|
-
"-v",
|
|
102
|
-
action="store_true",
|
|
103
|
-
help="Also remove volumes (deletes all data)",
|
|
104
|
-
)
|
|
105
52
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
53
|
+
@app.command()
|
|
54
|
+
def version() -> None:
|
|
55
|
+
"""Show version information."""
|
|
56
|
+
try:
|
|
57
|
+
from atdata import __version__
|
|
111
58
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
help="Diagnose Redis configuration and connectivity",
|
|
116
|
-
)
|
|
117
|
-
diagnose_parser.add_argument(
|
|
118
|
-
"--host",
|
|
119
|
-
default="localhost",
|
|
120
|
-
help="Redis host (default: localhost)",
|
|
121
|
-
)
|
|
122
|
-
diagnose_parser.add_argument(
|
|
123
|
-
"--port",
|
|
124
|
-
type=int,
|
|
125
|
-
default=6379,
|
|
126
|
-
help="Redis port (default: 6379)",
|
|
127
|
-
)
|
|
59
|
+
ver = __version__
|
|
60
|
+
except ImportError:
|
|
61
|
+
from importlib.metadata import version as pkg_version
|
|
128
62
|
|
|
129
|
-
|
|
130
|
-
subparsers.add_parser(
|
|
131
|
-
"version",
|
|
132
|
-
help="Show version information",
|
|
133
|
-
)
|
|
63
|
+
ver = pkg_version("atdata")
|
|
134
64
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
# Handle --version flag
|
|
138
|
-
if args.version or args.command == "version":
|
|
139
|
-
return _cmd_version()
|
|
140
|
-
|
|
141
|
-
# Handle 'local' commands
|
|
142
|
-
if args.command == "local":
|
|
143
|
-
if args.local_command == "up":
|
|
144
|
-
return _cmd_local_up(
|
|
145
|
-
redis_port=args.redis_port,
|
|
146
|
-
minio_port=args.minio_port,
|
|
147
|
-
minio_console_port=args.minio_console_port,
|
|
148
|
-
detach=args.detach,
|
|
149
|
-
)
|
|
150
|
-
elif args.local_command == "down":
|
|
151
|
-
return _cmd_local_down(remove_volumes=args.volumes)
|
|
152
|
-
elif args.local_command == "status":
|
|
153
|
-
return _cmd_local_status()
|
|
154
|
-
else:
|
|
155
|
-
local_parser.print_help()
|
|
156
|
-
return 1
|
|
65
|
+
print(f"atdata {ver}")
|
|
157
66
|
|
|
158
|
-
# Handle 'diagnose' command
|
|
159
|
-
if args.command == "diagnose":
|
|
160
|
-
return _cmd_diagnose(host=args.host, port=args.port)
|
|
161
67
|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
68
|
+
@app.command()
|
|
69
|
+
def inspect(
|
|
70
|
+
url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
|
|
71
|
+
) -> None:
|
|
72
|
+
"""Show dataset summary (sample count, schema, shards)."""
|
|
73
|
+
from .inspect import inspect_dataset
|
|
165
74
|
|
|
75
|
+
code = inspect_dataset(url=url)
|
|
76
|
+
raise typer.Exit(code=code)
|
|
166
77
|
|
|
167
|
-
def _cmd_version() -> int:
|
|
168
|
-
"""Show version information."""
|
|
169
|
-
try:
|
|
170
|
-
from atdata import __version__
|
|
171
78
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
79
|
+
@app.command()
|
|
80
|
+
def preview(
|
|
81
|
+
url: str = typer.Argument(help="Dataset URL, local path, or atmosphere URI"),
|
|
82
|
+
limit: int = typer.Option(5, help="Number of samples to preview."),
|
|
83
|
+
) -> None:
|
|
84
|
+
"""Preview first N samples of a dataset."""
|
|
85
|
+
from .preview import preview_dataset
|
|
86
|
+
|
|
87
|
+
code = preview_dataset(url=url, limit=limit)
|
|
88
|
+
raise typer.Exit(code=code)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@app.command()
|
|
92
|
+
def diagnose(
|
|
93
|
+
host: str = typer.Option("localhost", help="Redis host."),
|
|
94
|
+
port: int = typer.Option(6379, help="Redis port."),
|
|
95
|
+
) -> None:
|
|
96
|
+
"""Diagnose Redis configuration and connectivity."""
|
|
97
|
+
from .diagnose import diagnose_redis
|
|
176
98
|
|
|
177
|
-
|
|
99
|
+
code = diagnose_redis(host=host, port=port)
|
|
100
|
+
raise typer.Exit(code=code)
|
|
178
101
|
|
|
179
|
-
print(f"atdata {version}")
|
|
180
|
-
return 0
|
|
181
102
|
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
# infra sub-commands
|
|
105
|
+
# ---------------------------------------------------------------------------
|
|
182
106
|
|
|
183
|
-
def _cmd_local_up(
|
|
184
|
-
redis_port: int,
|
|
185
|
-
minio_port: int,
|
|
186
|
-
minio_console_port: int,
|
|
187
|
-
detach: bool,
|
|
188
|
-
) -> int:
|
|
189
|
-
"""Start local development infrastructure."""
|
|
190
|
-
from .local import local_up
|
|
191
107
|
|
|
192
|
-
|
|
108
|
+
@infra_app.command()
|
|
109
|
+
def up(
|
|
110
|
+
redis_port: int = typer.Option(6379, help="Redis port."),
|
|
111
|
+
minio_port: int = typer.Option(9000, help="MinIO API port."),
|
|
112
|
+
minio_console_port: int = typer.Option(9001, help="MinIO console port."),
|
|
113
|
+
detach: bool = typer.Option(
|
|
114
|
+
True, "--detach", "-d", help="Run containers in detached mode."
|
|
115
|
+
),
|
|
116
|
+
) -> None:
|
|
117
|
+
"""Start Redis and MinIO containers."""
|
|
118
|
+
from .infra import local_up
|
|
119
|
+
|
|
120
|
+
code = local_up(
|
|
193
121
|
redis_port=redis_port,
|
|
194
122
|
minio_port=minio_port,
|
|
195
123
|
minio_console_port=minio_console_port,
|
|
196
124
|
detach=detach,
|
|
197
125
|
)
|
|
126
|
+
raise typer.Exit(code=code)
|
|
198
127
|
|
|
199
128
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
129
|
+
@infra_app.command()
|
|
130
|
+
def down(
|
|
131
|
+
volumes: bool = typer.Option(
|
|
132
|
+
False, "--volumes", "-v", help="Also remove volumes (deletes all data)."
|
|
133
|
+
),
|
|
134
|
+
) -> None:
|
|
135
|
+
"""Stop local development containers."""
|
|
136
|
+
from .infra import local_down
|
|
203
137
|
|
|
204
|
-
|
|
138
|
+
code = local_down(remove_volumes=volumes)
|
|
139
|
+
raise typer.Exit(code=code)
|
|
205
140
|
|
|
206
141
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
142
|
+
@infra_app.command()
|
|
143
|
+
def status() -> None:
|
|
144
|
+
"""Show status of infrastructure."""
|
|
145
|
+
from .infra import local_status
|
|
210
146
|
|
|
211
|
-
|
|
147
|
+
code = local_status()
|
|
148
|
+
raise typer.Exit(code=code)
|
|
212
149
|
|
|
213
150
|
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
# schema sub-commands
|
|
153
|
+
# ---------------------------------------------------------------------------
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@schema_app.command("show")
|
|
157
|
+
def schema_show(
|
|
158
|
+
dataset_ref: str = typer.Argument(
|
|
159
|
+
help="Dataset URL, local path, or index reference."
|
|
160
|
+
),
|
|
161
|
+
) -> None:
|
|
162
|
+
"""Display dataset schema."""
|
|
163
|
+
from .schema import schema_show as _schema_show
|
|
164
|
+
|
|
165
|
+
code = _schema_show(dataset_ref=dataset_ref)
|
|
166
|
+
raise typer.Exit(code=code)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
@schema_app.command("diff")
|
|
170
|
+
def schema_diff(
|
|
171
|
+
url_a: str = typer.Argument(help="First dataset URL."),
|
|
172
|
+
url_b: str = typer.Argument(help="Second dataset URL."),
|
|
173
|
+
) -> None:
|
|
174
|
+
"""Compare two dataset schemas."""
|
|
175
|
+
from .schema import schema_diff as _schema_diff
|
|
217
176
|
|
|
218
|
-
|
|
177
|
+
code = _schema_diff(url_a=url_a, url_b=url_b)
|
|
178
|
+
raise typer.Exit(code=code)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
# ---------------------------------------------------------------------------
|
|
182
|
+
# Entrypoint
|
|
183
|
+
# ---------------------------------------------------------------------------
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def main(argv: list[str] | None = None) -> int:
|
|
187
|
+
"""Main entry point for the atdata CLI.
|
|
188
|
+
|
|
189
|
+
Args:
|
|
190
|
+
argv: Command-line arguments. If None, uses sys.argv[1:].
|
|
191
|
+
|
|
192
|
+
Returns:
|
|
193
|
+
Exit code (0 for success, non-zero for errors).
|
|
194
|
+
"""
|
|
195
|
+
try:
|
|
196
|
+
if argv is not None:
|
|
197
|
+
app(args=argv, standalone_mode=False)
|
|
198
|
+
else:
|
|
199
|
+
app(standalone_mode=False)
|
|
200
|
+
return 0
|
|
201
|
+
except SystemExit as exc:
|
|
202
|
+
return exc.code if isinstance(exc.code, int) else 0
|
|
203
|
+
except Exception:
|
|
204
|
+
return 1
|
|
219
205
|
|
|
220
206
|
|
|
221
207
|
if __name__ == "__main__":
|
atdata/cli/diagnose.py
CHANGED
|
@@ -51,7 +51,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
51
51
|
_print_status("Connection", False, str(e))
|
|
52
52
|
print()
|
|
53
53
|
print("Cannot connect to Redis. Make sure Redis is running:")
|
|
54
|
-
print(" atdata
|
|
54
|
+
print(" atdata infra up")
|
|
55
55
|
return 1
|
|
56
56
|
|
|
57
57
|
# Check Redis version
|
|
@@ -162,7 +162,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
162
162
|
print(" maxmemory-policy noeviction")
|
|
163
163
|
print()
|
|
164
164
|
print(" # Or use atdata's preconfigured local setup:")
|
|
165
|
-
print(" atdata
|
|
165
|
+
print(" atdata infra up")
|
|
166
166
|
return 1
|
|
167
167
|
else:
|
|
168
168
|
print("All checks passed. Redis is properly configured for atdata.")
|
atdata/cli/{local.py → infra.py}
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""Infrastructure management for atdata.
|
|
2
2
|
|
|
3
|
-
This module provides commands to start and stop
|
|
3
|
+
This module provides commands to start and stop development infrastructure:
|
|
4
4
|
- Redis: For index storage and metadata
|
|
5
5
|
- MinIO: S3-compatible object storage for dataset files
|
|
6
6
|
|
|
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
|
|
|
114
114
|
timeout=5,
|
|
115
115
|
)
|
|
116
116
|
return result.returncode == 0 and result.stdout.strip() == "true"
|
|
117
|
-
except
|
|
117
|
+
except (OSError, subprocess.SubprocessError):
|
|
118
118
|
return False
|
|
119
119
|
|
|
120
120
|
|
|
@@ -179,7 +179,7 @@ def local_up(
|
|
|
179
179
|
if not _check_docker():
|
|
180
180
|
return 1
|
|
181
181
|
|
|
182
|
-
print("Starting atdata
|
|
182
|
+
print("Starting atdata infrastructure...")
|
|
183
183
|
|
|
184
184
|
compose_content = _get_compose_file(redis_port, minio_port, minio_console_port)
|
|
185
185
|
command = ["up"]
|
|
@@ -202,7 +202,7 @@ def local_up(
|
|
|
202
202
|
|
|
203
203
|
# Show status
|
|
204
204
|
print()
|
|
205
|
-
print("
|
|
205
|
+
print("Infrastructure started:")
|
|
206
206
|
print(f" Redis: localhost:{redis_port}")
|
|
207
207
|
print(f" MinIO API: http://localhost:{minio_port}")
|
|
208
208
|
print(f" MinIO Console: http://localhost:{minio_console_port}")
|
|
@@ -210,7 +210,7 @@ def local_up(
|
|
|
210
210
|
print("MinIO credentials: minioadmin / minioadmin")
|
|
211
211
|
print()
|
|
212
212
|
print("Example usage:")
|
|
213
|
-
print(" from atdata.
|
|
213
|
+
print(" from atdata.stores import S3DataStore")
|
|
214
214
|
print(" ")
|
|
215
215
|
print(" store = S3DataStore.from_credentials({")
|
|
216
216
|
print(f" 'AWS_ENDPOINT': 'http://localhost:{minio_port}',")
|
|
@@ -234,7 +234,7 @@ def local_down(remove_volumes: bool = False) -> int:
|
|
|
234
234
|
if not _check_docker():
|
|
235
235
|
return 1
|
|
236
236
|
|
|
237
|
-
print("Stopping atdata
|
|
237
|
+
print("Stopping atdata infrastructure...")
|
|
238
238
|
|
|
239
239
|
# Use default ports for compose file (actual ports don't matter for down)
|
|
240
240
|
compose_content = _get_compose_file(6379, 9000, 9001)
|
|
@@ -252,7 +252,7 @@ def local_down(remove_volumes: bool = False) -> int:
|
|
|
252
252
|
print(f"Error: {e}", file=sys.stderr)
|
|
253
253
|
return 1
|
|
254
254
|
|
|
255
|
-
print("
|
|
255
|
+
print("Infrastructure stopped.")
|
|
256
256
|
return 0
|
|
257
257
|
|
|
258
258
|
|
|
@@ -268,16 +268,16 @@ def local_status() -> int:
|
|
|
268
268
|
redis_running = _container_running(REDIS_CONTAINER)
|
|
269
269
|
minio_running = _container_running(MINIO_CONTAINER)
|
|
270
270
|
|
|
271
|
-
print("atdata
|
|
271
|
+
print("atdata infrastructure status:")
|
|
272
272
|
print()
|
|
273
273
|
print(f" Redis ({REDIS_CONTAINER}): {'running' if redis_running else 'stopped'}")
|
|
274
274
|
print(f" MinIO ({MINIO_CONTAINER}): {'running' if minio_running else 'stopped'}")
|
|
275
275
|
|
|
276
276
|
if redis_running or minio_running:
|
|
277
277
|
print()
|
|
278
|
-
print("To stop: atdata
|
|
278
|
+
print("To stop: atdata infra down")
|
|
279
279
|
else:
|
|
280
280
|
print()
|
|
281
|
-
print("To start: atdata
|
|
281
|
+
print("To start: atdata infra up")
|
|
282
282
|
|
|
283
283
|
return 0
|
atdata/cli/inspect.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""``atdata inspect`` command — show dataset summary information."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def inspect_dataset(url: str) -> int:
|
|
10
|
+
"""Print summary information for a dataset at the given URL.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
url: WebDataset URL, local path, or atmosphere URI.
|
|
14
|
+
|
|
15
|
+
Returns:
|
|
16
|
+
Exit code (0 success, 1 failure).
|
|
17
|
+
"""
|
|
18
|
+
try:
|
|
19
|
+
from ..dataset import Dataset, DictSample
|
|
20
|
+
|
|
21
|
+
ds = Dataset[DictSample](url)
|
|
22
|
+
except Exception as exc:
|
|
23
|
+
print(f"Error opening dataset: {exc}", file=sys.stderr)
|
|
24
|
+
return 1
|
|
25
|
+
|
|
26
|
+
try:
|
|
27
|
+
shards = ds.list_shards()
|
|
28
|
+
print(f"URL: {url}")
|
|
29
|
+
print(f"Shards: {len(shards)}")
|
|
30
|
+
for shard in shards:
|
|
31
|
+
print(f" - {shard}")
|
|
32
|
+
|
|
33
|
+
# Read first sample to infer schema
|
|
34
|
+
samples = ds.head(1)
|
|
35
|
+
if samples:
|
|
36
|
+
sample = samples[0]
|
|
37
|
+
print("Schema: (inferred from first sample)")
|
|
38
|
+
for key in sample.keys():
|
|
39
|
+
val = sample[key]
|
|
40
|
+
print(f" {key}: {_describe_value(val)}")
|
|
41
|
+
else:
|
|
42
|
+
print("Schema: (no samples found)")
|
|
43
|
+
|
|
44
|
+
# Count samples — scan all shards
|
|
45
|
+
count = sum(1 for _ in ds.ordered())
|
|
46
|
+
print(f"Samples: {count}")
|
|
47
|
+
except Exception as exc:
|
|
48
|
+
print(f"Error reading dataset: {exc}", file=sys.stderr)
|
|
49
|
+
return 1
|
|
50
|
+
|
|
51
|
+
return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _describe_value(val: Any) -> str:
|
|
55
|
+
"""Human-readable type description for a sample field value."""
|
|
56
|
+
import numpy as np
|
|
57
|
+
|
|
58
|
+
if isinstance(val, np.ndarray):
|
|
59
|
+
return f"ndarray dtype={val.dtype} shape={val.shape}"
|
|
60
|
+
if isinstance(val, bytes):
|
|
61
|
+
return f"bytes len={len(val)}"
|
|
62
|
+
if isinstance(val, str):
|
|
63
|
+
truncated = val[:60] + ("..." if len(val) > 60 else "")
|
|
64
|
+
return f'str "{truncated}"'
|
|
65
|
+
if isinstance(val, (int, float, bool)):
|
|
66
|
+
return f"{type(val).__name__} {val}"
|
|
67
|
+
if isinstance(val, list):
|
|
68
|
+
return f"list len={len(val)}"
|
|
69
|
+
return type(val).__name__
|
atdata/cli/preview.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""``atdata preview`` command — render first N samples of a dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preview_dataset(url: str, limit: int = 5) -> int:
|
|
10
|
+
"""Print a human-readable preview of the first *limit* samples.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
url: Dataset URL, local path, or atmosphere URI.
|
|
14
|
+
limit: Number of samples to show. Default: 5.
|
|
15
|
+
|
|
16
|
+
Returns:
|
|
17
|
+
Exit code (0 success, 1 failure).
|
|
18
|
+
"""
|
|
19
|
+
try:
|
|
20
|
+
from ..dataset import Dataset, DictSample
|
|
21
|
+
|
|
22
|
+
ds = Dataset[DictSample](url)
|
|
23
|
+
except Exception as exc:
|
|
24
|
+
print(f"Error opening dataset: {exc}", file=sys.stderr)
|
|
25
|
+
return 1
|
|
26
|
+
|
|
27
|
+
samples = ds.head(limit)
|
|
28
|
+
if not samples:
|
|
29
|
+
print("No samples found.", file=sys.stderr)
|
|
30
|
+
return 1
|
|
31
|
+
|
|
32
|
+
print(f"Preview of {url} ({len(samples)} sample(s)):")
|
|
33
|
+
print()
|
|
34
|
+
|
|
35
|
+
for i, sample in enumerate(samples):
|
|
36
|
+
print(f"--- Sample {i} ---")
|
|
37
|
+
for key in sample.keys():
|
|
38
|
+
val = sample[key]
|
|
39
|
+
print(f" {key}: {_format_value(val)}")
|
|
40
|
+
print()
|
|
41
|
+
|
|
42
|
+
return 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format_value(val: Any) -> str:
|
|
46
|
+
"""Format a value for preview, truncating large data."""
|
|
47
|
+
import numpy as np
|
|
48
|
+
|
|
49
|
+
if isinstance(val, np.ndarray):
|
|
50
|
+
return f"ndarray shape={val.shape} dtype={val.dtype}"
|
|
51
|
+
if isinstance(val, bytes):
|
|
52
|
+
if len(val) <= 40:
|
|
53
|
+
return repr(val)
|
|
54
|
+
return f"bytes[{len(val)}] {val[:20]!r}..."
|
|
55
|
+
if isinstance(val, str):
|
|
56
|
+
if len(val) <= 80:
|
|
57
|
+
return repr(val)
|
|
58
|
+
return repr(val[:77] + "...")
|
|
59
|
+
if isinstance(val, list):
|
|
60
|
+
if len(val) <= 5:
|
|
61
|
+
return repr(val)
|
|
62
|
+
return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
|
|
63
|
+
return repr(val)
|