atdata 0.2.2b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/.gitignore +1 -0
- atdata/__init__.py +31 -1
- atdata/_cid.py +29 -35
- atdata/_exceptions.py +168 -0
- atdata/_helpers.py +33 -17
- atdata/_hf_api.py +109 -59
- atdata/_logging.py +70 -0
- atdata/_protocols.py +74 -132
- atdata/_schema_codec.py +38 -41
- atdata/_sources.py +57 -64
- atdata/_stub_manager.py +31 -26
- atdata/_type_utils.py +47 -7
- atdata/atmosphere/__init__.py +31 -24
- atdata/atmosphere/_types.py +11 -11
- atdata/atmosphere/client.py +11 -8
- atdata/atmosphere/lens.py +27 -30
- atdata/atmosphere/records.py +34 -39
- atdata/atmosphere/schema.py +35 -31
- atdata/atmosphere/store.py +16 -20
- atdata/cli/__init__.py +163 -168
- atdata/cli/diagnose.py +12 -8
- atdata/cli/inspect.py +69 -0
- atdata/cli/local.py +5 -2
- atdata/cli/preview.py +63 -0
- atdata/cli/schema.py +109 -0
- atdata/dataset.py +678 -533
- atdata/lens.py +85 -83
- atdata/local/__init__.py +71 -0
- atdata/local/_entry.py +157 -0
- atdata/local/_index.py +940 -0
- atdata/local/_repo_legacy.py +218 -0
- atdata/local/_s3.py +349 -0
- atdata/local/_schema.py +380 -0
- atdata/manifest/__init__.py +28 -0
- atdata/manifest/_aggregates.py +156 -0
- atdata/manifest/_builder.py +163 -0
- atdata/manifest/_fields.py +154 -0
- atdata/manifest/_manifest.py +146 -0
- atdata/manifest/_query.py +150 -0
- atdata/manifest/_writer.py +74 -0
- atdata/promote.py +20 -24
- atdata/providers/__init__.py +25 -0
- atdata/providers/_base.py +140 -0
- atdata/providers/_factory.py +69 -0
- atdata/providers/_postgres.py +214 -0
- atdata/providers/_redis.py +171 -0
- atdata/providers/_sqlite.py +191 -0
- atdata/repository.py +323 -0
- atdata/testing.py +337 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +5 -1
- atdata-0.3.0b1.dist-info/RECORD +54 -0
- atdata/local.py +0 -1707
- atdata-0.2.2b1.dist-info/RECORD +0 -28
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.2.2b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/diagnose.py
CHANGED
|
@@ -5,7 +5,6 @@ and other infrastructure components.
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import sys
|
|
8
|
-
from typing import Any
|
|
9
8
|
|
|
10
9
|
|
|
11
10
|
def _print_status(label: str, ok: bool, detail: str = "") -> None:
|
|
@@ -41,6 +40,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
41
40
|
# Try to connect
|
|
42
41
|
try:
|
|
43
42
|
from redis import Redis
|
|
43
|
+
|
|
44
44
|
redis = Redis(host=host, port=port, socket_connect_timeout=5)
|
|
45
45
|
redis.ping()
|
|
46
46
|
_print_status("Connection", True, "connected")
|
|
@@ -70,7 +70,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
70
70
|
_print_status(
|
|
71
71
|
"AOF Persistence",
|
|
72
72
|
aof_ok,
|
|
73
|
-
"enabled" if aof_ok else "DISABLED - data may be lost on restart!"
|
|
73
|
+
"enabled" if aof_ok else "DISABLED - data may be lost on restart!",
|
|
74
74
|
)
|
|
75
75
|
if not aof_ok:
|
|
76
76
|
issues_found = True
|
|
@@ -85,7 +85,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
85
85
|
_print_status(
|
|
86
86
|
"RDB Persistence",
|
|
87
87
|
rdb_ok,
|
|
88
|
-
f"configured ({save_config})" if rdb_ok else "DISABLED"
|
|
88
|
+
f"configured ({save_config})" if rdb_ok else "DISABLED",
|
|
89
89
|
)
|
|
90
90
|
# RDB disabled is only a warning if AOF is enabled
|
|
91
91
|
except Exception as e:
|
|
@@ -95,7 +95,13 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
95
95
|
try:
|
|
96
96
|
policy = redis.config_get("maxmemory-policy").get("maxmemory-policy", "unknown")
|
|
97
97
|
# Safe policies that won't evict index data
|
|
98
|
-
safe_policies = {
|
|
98
|
+
safe_policies = {
|
|
99
|
+
"noeviction",
|
|
100
|
+
"volatile-lru",
|
|
101
|
+
"volatile-lfu",
|
|
102
|
+
"volatile-ttl",
|
|
103
|
+
"volatile-random",
|
|
104
|
+
}
|
|
99
105
|
policy_ok = policy in safe_policies
|
|
100
106
|
|
|
101
107
|
if policy_ok:
|
|
@@ -104,7 +110,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
104
110
|
_print_status(
|
|
105
111
|
"Memory Policy",
|
|
106
112
|
False,
|
|
107
|
-
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'"
|
|
113
|
+
f"{policy} - may evict index data! Use 'noeviction' or 'volatile-*'",
|
|
108
114
|
)
|
|
109
115
|
issues_found = True
|
|
110
116
|
except Exception as e:
|
|
@@ -141,9 +147,7 @@ def diagnose_redis(host: str = "localhost", port: int = 6379) -> int:
|
|
|
141
147
|
for key in redis.scan_iter(match="LocalSchema:*", count=100):
|
|
142
148
|
schema_count += 1
|
|
143
149
|
_print_status(
|
|
144
|
-
"atdata Keys",
|
|
145
|
-
True,
|
|
146
|
-
f"{dataset_count} datasets, {schema_count} schemas"
|
|
150
|
+
"atdata Keys", True, f"{dataset_count} datasets, {schema_count} schemas"
|
|
147
151
|
)
|
|
148
152
|
except Exception as e:
|
|
149
153
|
_print_status("atdata Keys", False, f"check failed: {e}")
|
atdata/cli/inspect.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""``atdata inspect`` command — show dataset summary information."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def inspect_dataset(url: str) -> int:
    """Print summary information for a dataset at the given URL.

    Args:
        url: WebDataset URL, local path, or atmosphere URI.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    try:
        shard_list = ds.list_shards()
        print(f"URL: {url}")
        print(f"Shards: {len(shard_list)}")
        for entry in shard_list:
            print(f" - {entry}")

        # A one-sample peek is enough to report the field layout.
        head = ds.head(1)
        if head:
            first = head[0]
            print("Schema: (inferred from first sample)")
            for field in first.keys():
                print(f" {field}: {_describe_value(first[field])}")
        else:
            print("Schema: (no samples found)")

        # Full pass over every shard to obtain the exact sample count.
        total = sum(1 for _ in ds.ordered())
        print(f"Samples: {total}")
    except Exception as err:
        print(f"Error reading dataset: {err}", file=sys.stderr)
        return 1

    return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _describe_value(val: Any) -> str:
|
|
55
|
+
"""Human-readable type description for a sample field value."""
|
|
56
|
+
import numpy as np
|
|
57
|
+
|
|
58
|
+
if isinstance(val, np.ndarray):
|
|
59
|
+
return f"ndarray dtype={val.dtype} shape={val.shape}"
|
|
60
|
+
if isinstance(val, bytes):
|
|
61
|
+
return f"bytes len={len(val)}"
|
|
62
|
+
if isinstance(val, str):
|
|
63
|
+
truncated = val[:60] + ("..." if len(val) > 60 else "")
|
|
64
|
+
return f'str "{truncated}"'
|
|
65
|
+
if isinstance(val, (int, float, bool)):
|
|
66
|
+
return f"{type(val).__name__} {val}"
|
|
67
|
+
if isinstance(val, list):
|
|
68
|
+
return f"list len={len(val)}"
|
|
69
|
+
return type(val).__name__
|
atdata/cli/local.py
CHANGED
|
@@ -114,7 +114,7 @@ def _container_running(name: str) -> bool:
|
|
|
114
114
|
timeout=5,
|
|
115
115
|
)
|
|
116
116
|
return result.returncode == 0 and result.stdout.strip() == "true"
|
|
117
|
-
except
|
|
117
|
+
except (OSError, subprocess.SubprocessError):
|
|
118
118
|
return False
|
|
119
119
|
|
|
120
120
|
|
|
@@ -144,7 +144,9 @@ def _run_compose(
|
|
|
144
144
|
elif shutil.which("docker-compose"):
|
|
145
145
|
base_cmd = ["docker-compose"]
|
|
146
146
|
else:
|
|
147
|
-
raise RuntimeError(
|
|
147
|
+
raise RuntimeError(
|
|
148
|
+
"Neither 'docker compose' nor 'docker-compose' available"
|
|
149
|
+
)
|
|
148
150
|
else:
|
|
149
151
|
raise RuntimeError("Docker not found")
|
|
150
152
|
|
|
@@ -195,6 +197,7 @@ def local_up(
|
|
|
195
197
|
|
|
196
198
|
# Wait a moment for containers to be healthy
|
|
197
199
|
import time
|
|
200
|
+
|
|
198
201
|
time.sleep(2)
|
|
199
202
|
|
|
200
203
|
# Show status
|
atdata/cli/preview.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""``atdata preview`` command — render first N samples of a dataset."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def preview_dataset(url: str, limit: int = 5) -> int:
    """Print a human-readable preview of the first *limit* samples.

    Args:
        url: Dataset URL, local path, or atmosphere URI.
        limit: Number of samples to show. Default: 5.

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](url)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    batch = ds.head(limit)
    if not batch:
        print("No samples found.", file=sys.stderr)
        return 1

    print(f"Preview of {url} ({len(batch)} sample(s)):")
    print()

    for idx, record in enumerate(batch):
        print(f"--- Sample {idx} ---")
        for field in record.keys():
            print(f" {field}: {_format_value(record[field])}")
        print()

    return 0
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format_value(val: Any) -> str:
|
|
46
|
+
"""Format a value for preview, truncating large data."""
|
|
47
|
+
import numpy as np
|
|
48
|
+
|
|
49
|
+
if isinstance(val, np.ndarray):
|
|
50
|
+
return f"ndarray shape={val.shape} dtype={val.dtype}"
|
|
51
|
+
if isinstance(val, bytes):
|
|
52
|
+
if len(val) <= 40:
|
|
53
|
+
return repr(val)
|
|
54
|
+
return f"bytes[{len(val)}] {val[:20]!r}..."
|
|
55
|
+
if isinstance(val, str):
|
|
56
|
+
if len(val) <= 80:
|
|
57
|
+
return repr(val)
|
|
58
|
+
return repr(val[:77] + "...")
|
|
59
|
+
if isinstance(val, list):
|
|
60
|
+
if len(val) <= 5:
|
|
61
|
+
return repr(val)
|
|
62
|
+
return f"[{val[0]!r}, {val[1]!r}, ... ({len(val)} items)]"
|
|
63
|
+
return repr(val)
|
atdata/cli/schema.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""``atdata schema`` commands — show and diff dataset schemas."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def schema_show(dataset_ref: str) -> int:
    """Display the schema of a dataset.

    Args:
        dataset_ref: Dataset URL, local path, or index reference
            (e.g. ``@local/my-dataset``).

    Returns:
        Exit code (0 success, 1 failure).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](dataset_ref)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 1

    head = ds.head(1)
    if not head:
        print("No samples found — cannot infer schema.", file=sys.stderr)
        return 1

    record = head[0]
    print(f"Schema for: {dataset_ref}")
    print(f"Fields ({len(record.keys())}):")
    for field in record.keys():
        print(f" {field}: {_type_label(record[field])}")

    return 0
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def schema_diff(url_a: str, url_b: str) -> int:
    """Compare schemas of two datasets and print differences.

    Args:
        url_a: First dataset URL / path.
        url_b: Second dataset URL / path.

    Returns:
        Exit code (0 identical, 1 different, 2 error).
    """
    try:
        from ..dataset import Dataset, DictSample

        ds_a = Dataset[DictSample](url_a)
        ds_b = Dataset[DictSample](url_b)
    except Exception as err:
        print(f"Error opening dataset: {err}", file=sys.stderr)
        return 2

    head_a = ds_a.head(1)
    head_b = ds_b.head(1)

    if not head_a:
        print(f"No samples in {url_a}", file=sys.stderr)
        return 2
    if not head_b:
        print(f"No samples in {url_b}", file=sys.stderr)
        return 2

    # Field name -> type label, inferred from the first sample of each side.
    schema_a = {key: _type_label(head_a[0][key]) for key in head_a[0].keys()}
    schema_b = {key: _type_label(head_b[0][key]) for key in head_b[0].keys()}

    added = sorted(set(schema_b) - set(schema_a))
    removed = sorted(set(schema_a) - set(schema_b))
    changed = [
        key
        for key in sorted(set(schema_a) & set(schema_b))
        if schema_a[key] != schema_b[key]
    ]

    if not (added or removed or changed):
        print("Schemas are identical.")
        return 0

    if added:
        print("Added:")
        for key in added:
            print(f" + {key}: {schema_b[key]}")
    if removed:
        print("Removed:")
        for key in removed:
            print(f" - {key}: {schema_a[key]}")
    if changed:
        print("Changed:")
        for key in changed:
            print(f" ~ {key}: {schema_a[key]} -> {schema_b[key]}")

    return 1
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _type_label(val: object) -> str:
|
|
102
|
+
"""Short type label for schema display."""
|
|
103
|
+
import numpy as np
|
|
104
|
+
|
|
105
|
+
if isinstance(val, np.ndarray):
|
|
106
|
+
return f"ndarray[{val.dtype}]"
|
|
107
|
+
if isinstance(val, bytes):
|
|
108
|
+
return "bytes"
|
|
109
|
+
return type(val).__name__
|