atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +31 -1
- atdata/_cid.py +29 -35
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +33 -17
- atdata/_hf_api.py +109 -59
- atdata/_logging.py +70 -0
- atdata/_protocols.py +74 -132
- atdata/_schema_codec.py +38 -41
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +47 -7
- atdata/atmosphere/__init__.py +31 -24
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +34 -39
- atdata/atmosphere/schema.py +35 -31
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +163 -168
- atdata/cli/diagnose.py +12 -8
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +5 -2
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +678 -533
- atdata/lens.py +85 -83
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +20 -24
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1707
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/diagnose.py
CHANGED
|
@@ -5,7 +5,6 @@ and other infrastructure components.
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import sys
|
|
8
|
-
from typing import Any
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def _print_status(label: str, ok: bool, detail: str = "") -> None:
|
|
@@ -41,6 +40,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
41
40
|
# Try to connect
|
|
42
41
|
try:
|
|
43
42
|
from redis import Redis
|
|
43
|
+
|
|
44
44
|
redis = Redis(host=host, port=port, socket_connect_timeout=5)
|
|
45
45
|
redis.ping()
|
|
46
46
|
_print_status("Connection", True, "connected")
|
|
@@ -70,7 +70,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
70
70
|
_print_status(
|
|
71
71
|
"AOF Persistence",
|
|
72
72
|
aof_ok,
|
|
73
|
-
"enabled" if aof_ok else "DISABLED - data may be lost on restart!"
|
|
73
|
+
"enabled" if aof_ok else "DISABLED - data may be lost on restart!",
|
|
74
74
|
)
|
|
75
75
|
if not aof_ok:
|
|
76
76
|
issues_found = True
|
|
@@ -85,7 +85,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
85
85
|
_print_status(
|
|
86
86
|
"RDB Persistence",
|
|
87
87
|
rdb_ok,
|
|
88
|
-
f"configured ({save_config})" if rdb_ok else "DISABLED"
|
|
88
|
+
f"configured ({save_config})" if rdb_ok else "DISABLED",
|
|
89
89
|
)
|
|
90
90
|
# RDB disabled is only a warning if AOF is enabled
|
|
91
91
|
except Exception as e:
|
|
@@ -95,7 +95,13 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
95
95
|
try:
|
|
96
96
|
policy = redis.config_get("maxmemory-policy").get("maxmemory-policy", "unknown")
|
|
97
97
|
# Safe policies that won't evict index data
|
|
98
|
-
safe_policies = {
|
|
98
|
+
safe_policies = {
|
|
99
|
+
"noeviction",
|
|
100
|
+
"volatile-lru",
|
|
101
|
+
"volatile-lfu",
|
|
102
|
+
"volatile-ttl",
|
|
103
|
+
"volatile-random",
|
|
104
|
+
}
|
|
99
105
|
policy_ok = policy in safe_policies
|
|
100
106
|
|
|
101
107
|
if policy_ok:
|
|
@@ -104,7 +110,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
104
110
|
_print_status(
|
|
105
111
|
"Memory Policy",
|
|
106
112
|
False,
|
|
107
|
-
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'"
|
|
113
|
+
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'",
|
|
108
114
|
)
|
|
109
115
|
issues_found = True
|
|
110
116
|
except Exception as e:
|
|
@@ -141,9 +147,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
141
147
|
for key in redis.scan_iter(match="LocalSchema:*", count=100):
|
|
142
148
|
schema_count += 1
|
|
143
149
|
_print_status(
|
|
144
|
-
"atdata Keys",
|
|
145
|
-
True,
|
|
146
|
-
f"{dataset_count} datasets, {schema_count} schemas"
|
|
150
|
+
"atdata Keys", True, f"{dataset_count} datasets, {schema_count} schemas"
|
|
147
151
|
)
|
|
148
152
|
except Exception as e:
|
|
149
153
|
_print_status("atdata Keys", False, f"check failed: {e}")
|
atdata/cli/inspect.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""``atdata inspect`` command — show dataset summary information."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def inspect_dataset(url: str) -> int:
    """Print summary information for a dataset at the given URL.

    Args:
        url: WebDataset URL, local path, or atmosphere URI.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    try:
        shard_list = ds.list_shards()
        print(f"URL: {url}")
        print(f"Shards: {len(shard_list)}")
        for entry in shard_list:
            print(f" - {entry}")

        # A one-sample peek is enough to report the field layout.
        head = ds.head(1)
        if head:
            first = head[0]
            print("Schema: (inferred from first sample)")
            for field in first.keys():
                print(f" {field}: {_describe_value(first[field])}")
        else:
            print("Schema: (no samples found)")

        # Full pass over every shard to obtain the exact sample count.
        total = sum(1 for _ in ds.ordered())
        print(f"Samples: {total}")
    except Exception as err:
        print(f"Error reading dataset: {err}", file=sys.stderr)
        return 1

    return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _describe_value(val: Any) -> str:
|
|
55
|
+
"""Human-readable type description for a sample field value."""
|
|
56
|
+
import numpy as np
|
|
57
|
+
|
|
58
|
+
if isinstance(val, np.ndarray):
|
|
59
|
+
return f"ndarray dtype={val.dtype} shape={val.shape}"
|
|
60
|
+
if isinstance(val, bytes):
|
|
61
|
+
return f"bytes len={len(val)}"
|
|
62
|
+
if isinstance(val, str):
|
|
63
|
+
truncated = val[:60] + ("..." if len(val) > 60 else "")
|
|
64
|
+
return f'str "{truncated}"'
|
|
65
|
+
if isinstance(val, (int, float, bool)):
|
|
66
|
+
return f"{type(val).__name__} {val}"
|
|
67
|
+
if isinstance(val, list):
|
|
68
|
+
return f"list len={len(val)}"
|
|
69
|
+
return type(val).__name__
|
atdata/cli/local.py
CHANGED
|
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
|
|
|
114
114
|
timeout=5,
|
|
115
115
|
)
|
|
116
116
|
return result.returncode == 0 and result.stdout.strip() == "true"
|
|
117
|
-
except
|
|
117
|
+
except (OSError, subprocess.SubprocessError):
|
|
118
118
|
return False
|
|
119
119
|
|
|
120
120
|
|
|
@@ -144,7 +144,9 @@ def _run_compose(
|
|
|
144
144
|
elif shutil.which("docker-compose"):
|
|
145
145
|
base_cmd = ["docker-compose"]
|
|
146
146
|
else:
|
|
147
|
-
raise RuntimeError(
|
|
147
|
+
raise RuntimeError(
|
|
148
|
+
"Neither 'docker compose' nor 'docker-compose' available"
|
|
149
|
+
)
|
|
148
150
|
else:
|
|
149
151
|
raise RuntimeError("Docker not found")
|
|
150
152
|
|
|
@@ -195,6 +197,7 @@ def local_up(
|
|
|
195
197
|
|
|
196
198
|
# Wait a moment for containers to be healthy
|
|
197
199
|
import time
|
|
200
|
+
|
|
198
201
|
time.sleep(2)
|
|
199
202
|
|
|
200
203
|
# Show status
|
atdata/cli/preview.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""``atdata preview`` command — render first N samples of a dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preview_dataset(url: str, limit: int = 5) -> int:
    """Print a human-readable preview of the first *limit* samples.

    Args:
        url: Dataset URL, local path, or atmosphere URI.
        limit: Number of samples to show. Default: 5.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    batch = ds.head(limit)
    if not batch:
        print("No samples found.", file=sys.stderr)
        return 1

    print(f"Preview of {url} ({len(batch)} sample(s)):")
    print()

    for idx, record in enumerate(batch):
        print(f"--- Sample {idx} ---")
        for field in record.keys():
            print(f" {field}: {_format_value(record[field])}")
        print()

    return 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format_value(val: Any) -> str:
|
|
46
|
+
"""Format a value for preview, truncating large data."""
|
|
47
|
+
import numpy as np
|
|
48
|
+
|
|
49
|
+
if isinstance(val, np.ndarray):
|
|
50
|
+
return f"ndarray shape={val.shape} dtype={val.dtype}"
|
|
51
|
+
if isinstance(val, bytes):
|
|
52
|
+
if len(val) <= 40:
|
|
53
|
+
return repr(val)
|
|
54
|
+
return f"bytes[{len(val)}] {val[:20]!r}..."
|
|
55
|
+
if isinstance(val, str):
|
|
56
|
+
if len(val) <= 80:
|
|
57
|
+
return repr(val)
|
|
58
|
+
return repr(val[:77] + "...")
|
|
59
|
+
if isinstance(val, list):
|
|
60
|
+
if len(val) <= 5:
|
|
61
|
+
return repr(val)
|
|
62
|
+
return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
|
|
63
|
+
return repr(val)
|
atdata/cli/schema.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""``atdata schema`` commands — show and diff dataset schemas."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def schema_show(dataset_ref: str) -> int:
    """Display the schema of a dataset.

    Args:
        dataset_ref: Dataset URL, local path, or index reference
            (e.g. ``@local/my-dataset``).

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](dataset_ref)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    head = ds.head(1)
    if not head:
        print("No samples found — cannot infer schema.", file=sys.stderr)
        return 1

    record = head[0]
    print(f"Schema for: {dataset_ref}")
    print(f"Fields ({len(record.keys())}):")
    for field in record.keys():
        print(f" {field}: {_type_label(record[field])}")

    return 0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def schema_diff(url_a: str, url_b: str) -> int:
    """Compare schemas of two datasets and print differences.

    Args:
        url_a: First dataset URL / path.
        url_b: Second dataset URL / path.

    Returns:
        Exit code (0 identical, 1 different, 2 error).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds_a = Dataset[DictSample](url_a)
        ds_b = Dataset[DictSample](url_b)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 2

    head_a = ds_a.head(1)
    head_b = ds_b.head(1)

    if not head_a:
        print(f"No samples in {url_a}", file=sys.stderr)
        return 2
    if not head_b:
        print(f"No samples in {url_b}", file=sys.stderr)
        return 2

    # Field name -> type label, inferred from the first sample of each side.
    schema_a = {key: _type_label(head_a[0][key]) for key in head_a[0].keys()}
    schema_b = {key: _type_label(head_b[0][key]) for key in head_b[0].keys()}

    added = sorted(set(schema_b) - set(schema_a))
    removed = sorted(set(schema_a) - set(schema_b))
    changed = [
        key
        for key in sorted(set(schema_a) & set(schema_b))
        if schema_a[key] != schema_b[key]
    ]

    if not (added or removed or changed):
        print("Schemas are identical.")
        return 0

    if added:
        print("Added:")
        for key in added:
            print(f" + {key}: {schema_b[key]}")
    if removed:
        print("Removed:")
        for key in removed:
            print(f" - {key}: {schema_a[key]}")
    if changed:
        print("Changed:")
        for key in changed:
            print(f" ~ {key}: {schema_a[key]} -> {schema_b[key]}")

    return 1
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _type_label(val: object) -> str:
|
|
102
|
+
"""Short type label for schema display."""
|
|
103
|
+
import numpy as np
|
|
104
|
+
|
|
105
|
+
if isinstance(val, np.ndarray):
|
|
106
|
+
return f"ndarray[{val.dtype}]"
|
|
107
|
+
if isinstance(val, bytes):
|
|
108
|
+
return "bytes"
|
|
109
|
+
return type(val).__name__
|