atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +31 -1
  3. atdata/_cid.py +29 -35
  4. atdata/_exceptions.py +168 -0
  5. atdata/_helpers.py +33 -17
  6. atdata/_hf_api.py +109 -59
  7. atdata/_logging.py +70 -0
  8. atdata/_protocols.py +74 -132
  9. atdata/_schema_codec.py +38 -41
  10. atdata/_sources.py +57 -64
  11. atdata/_stub_manager.py +31 -26
  12. atdata/_type_utils.py +47 -7
  13. atdata/atmosphere/__init__.py +31 -24
  14. atdata/atmosphere/_types.py +11 -11
  15. atdata/atmosphere/client.py +11 -8
  16. atdata/atmosphere/lens.py +27 -30
  17. atdata/atmosphere/records.py +34 -39
  18. atdata/atmosphere/schema.py +35 -31
  19. atdata/atmosphere/store.py +16 -20
  20. atdata/cli/__init__.py +163 -168
  21. atdata/cli/diagnose.py +12 -8
  22. atdata/cli/inspect.py +69 -0
  23. atdata/cli/local.py +5 -2
  24. atdata/cli/preview.py +63 -0
  25. atdata/cli/schema.py +109 -0
  26. atdata/dataset.py +678 -533
  27. atdata/lens.py +85 -83
  28. atdata/local/__init__.py +71 -0
  29. atdata/local/_entry.py +157 -0
  30. atdata/local/_index.py +940 -0
  31. atdata/local/_repo_legacy.py +218 -0
  32. atdata/local/_s3.py +349 -0
  33. atdata/local/_schema.py +380 -0
  34. atdata/manifest/__init__.py +28 -0
  35. atdata/manifest/_aggregates.py +156 -0
  36. atdata/manifest/_builder.py +163 -0
  37. atdata/manifest/_fields.py +154 -0
  38. atdata/manifest/_manifest.py +146 -0
  39. atdata/manifest/_query.py +150 -0
  40. atdata/manifest/_writer.py +74 -0
  41. atdata/promote.py +20 -24
  42. atdata/providers/__init__.py +25 -0
  43. atdata/providers/_base.py +140 -0
  44. atdata/providers/_factory.py +69 -0
  45. atdata/providers/_postgres.py +214 -0
  46. atdata/providers/_redis.py +171 -0
  47. atdata/providers/_sqlite.py +191 -0
  48. atdata/repository.py +323 -0
  49. atdata/testing.py +337 -0
  50. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
  51. atdata-0.3.0b1.dist-info/RECORD +54 -0
  52. atdata/local.py +0 -1707
  53. atdata-0.2.2b1.dist-info/RECORD +0 -28
  54. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  55. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  56. {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/diagnose.py CHANGED
@@ -5,7 +5,6 @@ and other infrastructure components.
5
5
  """
6
6
 
7
7
  import sys
8
- from typing import Any
9
8
 
10
9
 
11
10
  def _print_status(label: str, ok: bool, detail: str = "") -> None:
@@ -41,6 +40,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
41
40
  # Try to connect
42
41
  try:
43
42
  from redis import Redis
43
+
44
44
  redis = Redis(host=host, port=port, socket_connect_timeout=5)
45
45
  redis.ping()
46
46
  _print_status("Connection", True, "connected")
@@ -70,7 +70,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
70
70
  _print_status(
71
71
  "AOF Persistence",
72
72
  aof_ok,
73
- "enabled" if aof_ok else "DISABLED - data may be lost on restart!"
73
+ "enabled" if aof_ok else "DISABLED - data may be lost on restart!",
74
74
  )
75
75
  if not aof_ok:
76
76
  issues_found = True
@@ -85,7 +85,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
85
85
  _print_status(
86
86
  "RDB Persistence",
87
87
  rdb_ok,
88
- f"configured ({save_config})" if rdb_ok else "DISABLED"
88
+ f"configured ({save_config})" if rdb_ok else "DISABLED",
89
89
  )
90
90
  # RDB disabled is only a warning if AOF is enabled
91
91
  except Exception as e:
@@ -95,7 +95,13 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
95
95
  try:
96
96
  policy = redis.config_get("maxmemory-policy").get("maxmemory-policy", "unknown")
97
97
  # Safe policies that won't evict index data
98
- safe_policies = {"noeviction", "volatile-lru", "volatile-lfu", "volatile-ttl", "volatile-random"}
98
+ safe_policies = {
99
+ "noeviction",
100
+ "volatile-lru",
101
+ "volatile-lfu",
102
+ "volatile-ttl",
103
+ "volatile-random",
104
+ }
99
105
  policy_ok = policy in safe_policies
100
106
 
101
107
  if policy_ok:
@@ -104,7 +110,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
104
110
  _print_status(
105
111
  "Memory Policy",
106
112
  False,
107
- f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'"
113
+ f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'",
108
114
  )
109
115
  issues_found = True
110
116
  except Exception as e:
@@ -141,9 +147,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
141
147
  for key in redis.scan_iter(match="LocalSchema:*", count=100):
142
148
  schema_count += 1
143
149
  _print_status(
144
- "atdata Keys",
145
- True,
146
- f"{dataset_count} datasets, {schema_count} schemas"
150
+ "atdata Keys", True, f"{dataset_count} datasets, {schema_count} schemas"
147
151
  )
148
152
  except Exception as e:
149
153
  _print_status("atdata Keys", False, f"check failed: {e}")
atdata/cli/inspect.py ADDED
@@ -0,0 +1,69 @@
1
+ """``atdata inspect`` command — show dataset summary information."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
def inspect_dataset(url: str) -> int:
    """Print summary information for a dataset at the given URL.

    Args:
        url: WebDataset URL, local path, or atmosphere URI.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        dataset = Dataset[DictSample](url)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    try:
        shard_list = dataset.list_shards()
        print(f"URL: {url}")
        print(f"Shards: {len(shard_list)}")
        for entry in shard_list:
            print(f"  - {entry}")

        # Infer the schema from the first sample, if one exists.
        first = dataset.head(1)
        if not first:
            print("Schema: (no samples found)")
        else:
            head_sample = first[0]
            print("Schema: (inferred from first sample)")
            for field in head_sample.keys():
                print(f"  {field}: {_describe_value(head_sample[field])}")

        # Exact sample count requires a full scan over every shard.
        total = 0
        for _ in dataset.ordered():
            total += 1
        print(f"Samples: {total}")
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    return 0
52
+
53
+
54
+ def _describe_value(val: Any) -> str:
55
+ """Human-readable type description for a sample field value."""
56
+ import numpy as np
57
+
58
+ if isinstance(val, np.ndarray):
59
+ return f"ndarray dtype={val.dtype} shape={val.shape}"
60
+ if isinstance(val, bytes):
61
+ return f"bytes len={len(val)}"
62
+ if isinstance(val, str):
63
+ truncated = val[:60] + ("..." if len(val) > 60 else "")
64
+ return f'str "{truncated}"'
65
+ if isinstance(val, (int, float, bool)):
66
+ return f"{type(val).__name__} {val}"
67
+ if isinstance(val, list):
68
+ return f"list len={len(val)}"
69
+ return type(val).__name__
atdata/cli/local.py CHANGED
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
114
114
  timeout=5,
115
115
  )
116
116
  return result.returncode == 0 and result.stdout.strip() == "true"
117
- except Exception:
117
+ except (OSError, subprocess.SubprocessError):
118
118
  return False
119
119
 
120
120
 
@@ -144,7 +144,9 @@ def _run_compose(
144
144
  elif shutil.which("docker-compose"):
145
145
  base_cmd = ["docker-compose"]
146
146
  else:
147
- raise RuntimeError("Neither 'docker compose' nor 'docker-compose' available")
147
+ raise RuntimeError(
148
+ "Neither 'docker compose' nor 'docker-compose' available"
149
+ )
148
150
  else:
149
151
  raise RuntimeError("Docker not found")
150
152
 
@@ -195,6 +197,7 @@ def local_up(
195
197
 
196
198
  # Wait a moment for containers to be healthy
197
199
  import time
200
+
198
201
  time.sleep(2)
199
202
 
200
203
  # Show status
atdata/cli/preview.py ADDED
@@ -0,0 +1,63 @@
1
+ """``atdata preview`` command — render first N samples of a dataset."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from typing import Any
7
+
8
+
9
def preview_dataset(url: str, limit: int = 5) -> int:
    """Print a human-readable preview of the first *limit* samples.

    Args:
        url: Dataset URL, local path, or atmosphere URI.
        limit: Number of samples to show. Default: 5.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Reading can fail independently of opening (missing shards, I/O or
    # network errors); report it as a CLI error instead of letting an
    # unhandled traceback escape, matching inspect_dataset's behavior.
    try:
        samples = ds.head(limit)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    if not samples:
        print("No samples found.", file=sys.stderr)
        return 1

    print(f"Preview of {url} ({len(samples)} sample(s)):")
    print()

    for i, sample in enumerate(samples):
        print(f"--- Sample {i} ---")
        for key in sample.keys():
            val = sample[key]
            print(f"  {key}: {_format_value(val)}")
        print()

    return 0
43
+
44
+
45
+ def _format_value(val: Any) -> str:
46
+ """Format a value for preview, truncating large data."""
47
+ import numpy as np
48
+
49
+ if isinstance(val, np.ndarray):
50
+ return f"ndarray shape={val.shape} dtype={val.dtype}"
51
+ if isinstance(val, bytes):
52
+ if len(val) <= 40:
53
+ return repr(val)
54
+ return f"bytes[{len(val)}] {val[:20]!r}..."
55
+ if isinstance(val, str):
56
+ if len(val) <= 80:
57
+ return repr(val)
58
+ return repr(val[:77] + "...")
59
+ if isinstance(val, list):
60
+ if len(val) <= 5:
61
+ return repr(val)
62
+ return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
63
+ return repr(val)
atdata/cli/schema.py ADDED
@@ -0,0 +1,109 @@
1
+ """``atdata schema`` commands — show and diff dataset schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+
8
def schema_show(dataset_ref: str) -> int:
    """Display the schema of a dataset.

    Args:
        dataset_ref: Dataset URL, local path, or index reference
            (e.g. ``@local/my-dataset``).

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](dataset_ref)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Reading the first sample can fail even when opening succeeded
    # (missing shards, I/O errors); surface that as a CLI error rather
    # than an unhandled traceback.
    try:
        samples = ds.head(1)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    if not samples:
        print("No samples found — cannot infer schema.", file=sys.stderr)
        return 1

    sample = samples[0]
    print(f"Schema for: {dataset_ref}")
    print(f"Fields ({len(sample.keys())}):")
    for key in sample.keys():
        val = sample[key]
        print(f"  {key}: {_type_label(val)}")

    return 0
39
+
40
+
41
def schema_diff(url_a: str, url_b: str) -> int:
    """Compare schemas of two datasets and print differences.

    Args:
        url_a: First dataset URL / path.
        url_b: Second dataset URL / path.

    Returns:
        Exit code (0 identical, 1 different, 2 error).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds_a = Dataset[DictSample](url_a)
        ds_b = Dataset[DictSample](url_b)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 2

    # Reading can fail after a successful open; treat that as an error
    # (exit code 2, like the other error paths) instead of letting an
    # unhandled traceback escape.
    try:
        samples_a = ds_a.head(1)
        samples_b = ds_b.head(1)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 2

    if not samples_a:
        print(f"No samples in {url_a}", file=sys.stderr)
        return 2
    if not samples_b:
        print(f"No samples in {url_b}", file=sys.stderr)
        return 2

    # Field name -> short type label, inferred from each first sample.
    fields_a = {k: _type_label(samples_a[0][k]) for k in samples_a[0].keys()}
    fields_b = {k: _type_label(samples_b[0][k]) for k in samples_b[0].keys()}

    keys_a = set(fields_a)
    keys_b = set(fields_b)

    added = sorted(keys_b - keys_a)
    removed = sorted(keys_a - keys_b)
    common = sorted(keys_a & keys_b)
    changed = [k for k in common if fields_a[k] != fields_b[k]]

    if not added and not removed and not changed:
        print("Schemas are identical.")
        return 0

    if added:
        print("Added:")
        for k in added:
            print(f"  + {k}: {fields_b[k]}")
    if removed:
        print("Removed:")
        for k in removed:
            print(f"  - {k}: {fields_a[k]}")
    if changed:
        print("Changed:")
        for k in changed:
            print(f"  ~ {k}: {fields_a[k]} -> {fields_b[k]}")

    return 1
99
+
100
+
101
+ def _type_label(val: object) -> str:
102
+ """Short type label for schema display."""
103
+ import numpy as np
104
+
105
+ if isinstance(val, np.ndarray):
106
+ return f"ndarray[{val.dtype}]"
107
+ if isinstance(val, bytes):
108
+ return "bytes"
109
+ return type(val).__name__