atdata 0.2.0a1__py3-none-any.whl → 0.2.3b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/promote.py ADDED
@@ -0,0 +1,195 @@
+ """Promotion workflow for migrating datasets from local to atmosphere.
+
+ This module provides functionality to promote locally-indexed datasets to the
+ ATProto atmosphere network. This enables sharing datasets with the broader
+ federation while maintaining schema consistency.
+
+ Examples:
+     >>> from atdata.local import LocalIndex, Repo
+     >>> from atdata.atmosphere import AtmosphereClient, AtmosphereIndex
+     >>> from atdata.promote import promote_to_atmosphere
+     >>>
+     >>> # Setup
+     >>> local_index = LocalIndex()
+     >>> client = AtmosphereClient()
+     >>> client.login("handle.bsky.social", "app-password")
+     >>>
+     >>> # Promote a dataset
+     >>> entry = local_index.get_dataset("my-dataset")
+     >>> at_uri = promote_to_atmosphere(entry, local_index, client)
+ """
+
+ from typing import TYPE_CHECKING, Type
+
+ if TYPE_CHECKING:
+     from .local import LocalDatasetEntry, Index as LocalIndex
+     from .atmosphere import AtmosphereClient
+     from ._protocols import AbstractDataStore, Packable
+
+
+ def _find_existing_schema(
+     client: "AtmosphereClient",
+     name: str,
+     version: str,
+ ) -> str | None:
+     """Check if a schema with the given name and version already exists.
+
+     Args:
+         client: Authenticated atmosphere client.
+         name: Schema name to search for.
+         version: Schema version to match.
+
+     Returns:
+         AT URI of existing schema if found, None otherwise.
+     """
+     from .atmosphere import SchemaLoader
+
+     loader = SchemaLoader(client)
+     for record in loader.list_all():
+         rec_value = record.get("value", record)
+         if rec_value.get("name") == name and rec_value.get("version") == version:
+             return record.get("uri", "")
+     return None
+
+
+ def _find_or_publish_schema(
+     sample_type: "Type[Packable]",
+     version: str,
+     client: "AtmosphereClient",
+     description: str | None = None,
+ ) -> str:
+     """Find existing schema or publish a new one.
+
+     Checks if a schema with the same name and version already exists on the
+     user's atmosphere repository. If found, returns the existing URI to avoid
+     duplicates. Otherwise, publishes a new schema record.
+
+     Args:
+         sample_type: The PackableSample subclass to publish.
+         version: Semantic version string.
+         client: Authenticated atmosphere client.
+         description: Optional schema description.
+
+     Returns:
+         AT URI of the schema (existing or newly published).
+     """
+     from .atmosphere import SchemaPublisher
+
+     schema_name = f"{sample_type.__module__}.{sample_type.__name__}"
+
+     # Check for existing schema
+     existing = _find_existing_schema(client, schema_name, version)
+     if existing:
+         return existing
+
+     # Publish new schema
+     publisher = SchemaPublisher(client)
+     uri = publisher.publish(
+         sample_type,
+         version=version,
+         description=description,
+     )
+     return str(uri)
+
+
+ def promote_to_atmosphere(
+     local_entry: "LocalDatasetEntry",
+     local_index: "LocalIndex",
+     atmosphere_client: "AtmosphereClient",
+     *,
+     data_store: "AbstractDataStore | None" = None,
+     name: str | None = None,
+     description: str | None = None,
+     tags: list[str] | None = None,
+     license: str | None = None,
+ ) -> str:
+     """Promote a local dataset to the atmosphere network.
+
+     This function takes a locally-indexed dataset and publishes it to ATProto,
+     making it discoverable on the federated atmosphere network.
+
+     Args:
+         local_entry: The LocalDatasetEntry to promote.
+         local_index: Local index containing the schema for this entry.
+         atmosphere_client: Authenticated AtmosphereClient.
+         data_store: Optional data store for copying data to a new location.
+             If None, the existing data_urls are used as-is.
+         name: Override name for the atmosphere record. Defaults to local name.
+         description: Optional description for the dataset.
+         tags: Optional tags for discovery.
+         license: Optional license identifier.
+
+     Returns:
+         AT URI of the created atmosphere dataset record.
+
+     Raises:
+         KeyError: If schema not found in local index.
+         ValueError: If local entry has no data URLs.
+
+     Examples:
+         >>> entry = local_index.get_dataset("mnist-train")
+         >>> uri = promote_to_atmosphere(entry, local_index, client)
+         >>> print(uri)
+         at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
+     """
+     from .atmosphere import DatasetPublisher
+     from ._schema_codec import schema_to_type
+
+     # Validate entry has data
+     if not local_entry.data_urls:
+         raise ValueError(f"Local entry '{local_entry.name}' has no data URLs")
+
+     # Get schema from local index
+     schema_ref = local_entry.schema_ref
+     schema_record = local_index.get_schema(schema_ref)
+
+     # Reconstruct sample type from schema
+     sample_type = schema_to_type(schema_record)
+     schema_version = schema_record.get("version", "1.0.0")
+
+     # Find or publish schema on atmosphere (deduplication)
+     atmosphere_schema_uri = _find_or_publish_schema(
+         sample_type,
+         schema_version,
+         atmosphere_client,
+         description=schema_record.get("description"),
+     )
+
+     # Determine data URLs
+     if data_store is not None:
+         # Copy data to new storage location
+         # Create a temporary Dataset to write through the data store
+         from .dataset import Dataset
+
+         # Build WDS URL from data_urls
+         if len(local_entry.data_urls) == 1:
+             wds_url = local_entry.data_urls[0]
+         else:
+             # Join multiple URLs into a space-separated list
+             wds_url = " ".join(local_entry.data_urls)
+
+         ds = Dataset[sample_type](wds_url)
+         prefix = f"promoted/{local_entry.name}"
+         data_urls = data_store.write_shards(ds, prefix=prefix)
+     else:
+         # Use existing URLs as-is
+         data_urls = local_entry.data_urls
+
+     # Publish dataset record to atmosphere
+     publisher = DatasetPublisher(atmosphere_client)
+     uri = publisher.publish_with_urls(
+         urls=data_urls,
+         schema_uri=atmosphere_schema_uri,
+         name=name or local_entry.name,
+         description=description,
+         tags=tags,
+         license=license,
+         metadata=local_entry.metadata,
+     )
+
+     return str(uri)
+
+
+ __all__ = [
+     "promote_to_atmosphere",
+ ]
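
A minimal end-to-end sketch of the new workflow, assembled from the docstrings above; the handle, dataset name, description, and tags are illustrative, not values from this diff:

```python
from atdata.local import LocalIndex
from atdata.atmosphere import AtmosphereClient
from atdata.promote import promote_to_atmosphere

# Look up a locally-indexed dataset
local_index = LocalIndex()
entry = local_index.get_dataset("my-dataset")

# Authenticate against the atmosphere network
client = AtmosphereClient()
client.login("handle.bsky.social", "app-password")

# Publish: the schema is deduplicated by (name, version) on the user's
# repository, then a dataset record pointing at the entry's existing
# data URLs is created (no data_store passed, so nothing is copied)
at_uri = promote_to_atmosphere(
    entry,
    local_index,
    client,
    description="Promoted from the local index",
    tags=["example"],
)
print(at_uri)  # at://did:plc:.../ac.foundation.dataset.datasetIndex/...
```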
atdata-0.2.3b1.dist-info/METADATA CHANGED
@@ -1,12 +1,14 @@
  Metadata-Version: 2.4
  Name: atdata
- Version: 0.2.0a1
+ Version: 0.2.3b1
  Summary: A loose federation of distributed, typed datasets
- Author-email: Maxine Levesque <hello@maxine.science>
+ Author-email: Maxine Levesque <hello@maxine.science>, "Maxine @ Forecast Bio" <maxine@forecast.bio>
  License-File: LICENSE
  Requires-Python: >=3.12
  Requires-Dist: atproto>=0.0.65
+ Requires-Dist: boto3>=1.41.5
  Requires-Dist: fastparquet>=2024.11.0
+ Requires-Dist: libipld>=3.3.2
  Requires-Dist: msgpack>=1.1.2
  Requires-Dist: numpy>=2.3.4
  Requires-Dist: ormsgpack>=1.11.0
@@ -34,9 +36,13 @@ A loose federation of distributed, typed datasets built on WebDataset.
  ## Features
 
  - **Typed Samples** - Define dataset schemas using Python dataclasses with automatic msgpack serialization
+ - **Schema-free Exploration** - Load datasets without defining a schema first using `DictSample`
  - **Lens Transformations** - Bidirectional, composable transformations between different dataset views
  - **Automatic Batching** - Smart batch aggregation with numpy array stacking
  - **WebDataset Integration** - Efficient storage and streaming for large-scale datasets
+ - **Flexible Data Sources** - Stream from local files, HTTP URLs, or S3-compatible storage
+ - **HuggingFace-style API** - `load_dataset()` with path resolution and split handling
+ - **Local & Atmosphere Storage** - Index datasets locally with Redis or publish to ATProto network
 
  ## Installation
 
@@ -48,9 +54,27 @@ Requires Python 3.12 or later.
 
  ## Quick Start
 
- ### Defining Sample Types
+ ### Loading Datasets
 
- Use the `@packable` decorator to create typed dataset samples:
+ The primary way to load datasets is with `load_dataset()`:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without specifying a type - returns Dataset[DictSample]
+ ds = load_dataset("path/to/data.tar", split="train")
+
+ # Explore the data
+ for sample in ds.ordered():
+     print(sample.keys())   # See available fields
+     print(sample["text"])  # Dict-style access
+     print(sample.label)    # Attribute access
+     break
+ ```
+
+ ### Defining Typed Schemas
+
+ Once you understand your data, define a typed schema with `@packable`:
 
  ```python
  import atdata
@@ -63,18 +87,21 @@ class ImageSample:
      metadata: dict
  ```
 
- ### Creating Datasets
+ ### Loading with Types
 
  ```python
- # Create a dataset
- dataset = atdata.Dataset[ImageSample]("path/to/data-{000000..000009}.tar")
+ # Load with explicit type
+ ds = load_dataset("path/to/data-{000000..000009}.tar", ImageSample, split="train")
+
+ # Or convert from DictSample
+ ds = load_dataset("path/to/data.tar", split="train").as_type(ImageSample)
 
- # Iterate over samples in order
- for sample in dataset.ordered(batch_size=None):
+ # Iterate over samples
+ for sample in ds.ordered():
      print(f"Label: {sample.label}, Image shape: {sample.image.shape}")
 
  # Iterate with shuffling and batching
- for batch in dataset.shuffled(batch_size=32):
+ for batch in ds.shuffled(batch_size=32):
      # batch.image is automatically stacked into shape (32, ...)
      # batch.label is a list of 32 labels
      process_batch(batch.image, batch.label)
@@ -105,9 +132,28 @@ for sample in processed_ds.ordered(batch_size=None):
 
  ## Core Concepts
 
+ ### DictSample
+
+ The default sample type for schema-free exploration. Provides both attribute and dict-style access:
+
+ ```python
+ ds = load_dataset("data.tar", split="train")
+
+ for sample in ds.ordered():
+     # Dict-style access
+     print(sample["field_name"])
+
+     # Attribute access
+     print(sample.field_name)
+
+     # Introspection
+     print(sample.keys())
+     print(sample.to_dict())
+ ```
+
  ### PackableSample
 
- Base class for serializable samples. Fields annotated as `NDArray` are automatically handled:
+ Base class for typed, serializable samples. Fields annotated as `NDArray` are automatically handled:
 
  ```python
  @atdata.packable
@@ -117,6 +163,8 @@ class MySample:
      regular_field: str
  ```
 
+ Every `@packable` class automatically registers a lens from `DictSample`, enabling seamless conversion via `.as_type()`.
+
  ### Lens
 
  Bidirectional transformations with getter/putter semantics:
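
Only the closing lines of this section's code block appear as context in the next hunk. For orientation, a sketch of the full getter/putter pair in the README's own placeholder notation (`my_lens_get` is assumed; only `my_lens_put` is visible in this diff):

```python
# Getter: project a source sample into the view type (assumed counterpart)
def my_lens_get(source: SourceType) -> ViewType:
    return ViewType(...)

# Putter: write an edited view back into the source (visible as context below)
def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
    return SourceType(...)
```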
@@ -133,6 +181,25 @@ def my_lens_put(view: ViewType, source: SourceType) -> SourceType:
      return SourceType(...)
  ```
 
+ ### Data Sources
+
+ Datasets support multiple backends via the `DataSource` protocol:
+
+ ```python
+ # String URLs (most common) - automatically wrapped in URLSource
+ dataset = atdata.Dataset[ImageSample]("data-{000000..000009}.tar")
+
+ # S3 with authentication (private buckets, Cloudflare R2, MinIO)
+ source = atdata.S3Source(
+     bucket="my-bucket",
+     keys=["data-000000.tar", "data-000001.tar"],
+     endpoint="https://my-account.r2.cloudflarestorage.com",
+     access_key="...",
+     secret_key="...",
+ )
+ dataset = atdata.Dataset[ImageSample](source)
+ ```
+
  ### Dataset URLs
 
  Uses WebDataset brace expansion for sharded datasets:
@@ -141,6 +208,31 @@ Uses WebDataset brace expansion for sharded datasets:
  - Multiple shards: `"data/dataset-{000000..000099}.tar"`
  - Multiple patterns: `"data/{train,val}/dataset-{000000..000009}.tar"`
 
+ ### HuggingFace-style API
+
+ Load datasets with a familiar interface:
+
+ ```python
+ from atdata import load_dataset
+
+ # Load without type for exploration (returns Dataset[DictSample])
+ ds = load_dataset("./data/train-*.tar", split="train")
+
+ # Load with explicit type
+ ds = load_dataset("./data/train-*.tar", ImageSample, split="train")
+
+ # Load from S3 with brace notation
+ ds = load_dataset("s3://bucket/data-{000000..000099}.tar", ImageSample, split="train")
+
+ # Load all splits (returns DatasetDict)
+ ds_dict = load_dataset("./data", ImageSample)
+ train_ds = ds_dict["train"]
+ test_ds = ds_dict["test"]
+
+ # Convert DictSample to typed schema
+ ds = load_dataset("./data/train.tar", split="train").as_type(ImageSample)
+ ```
+
  ## Development
 
  ### Setup
@@ -157,13 +249,13 @@ uv sync
 
  ```bash
  # Run all tests with coverage
- pytest
+ uv run pytest
 
  # Run specific test file
- pytest tests/test_dataset.py
+ uv run pytest tests/test_dataset.py
 
  # Run single test
- pytest tests/test_lens.py::test_lens
+ uv run pytest tests/test_lens.py::test_lens
  ```
 
  ### Building
atdata-0.2.3b1.dist-info/RECORD ADDED
@@ -0,0 +1,28 @@
+ atdata/__init__.py,sha256=yMp3NFDIerlv0U0ltXnTg2CvbUY-9C_etNYA1JAaf88,2452
+ atdata/_cid.py,sha256=6wLV_dcQJy5Eb-wld7_h7Kcp7QoVixIqUDIIoSwpQms,3992
+ atdata/_helpers.py,sha256=zoo9tKs_soM9n_gTQ_DRgA3iPi8i8W01L819UmzVcwo,1553
+ atdata/_hf_api.py,sha256=cG8JIZAOcKEVCS0XGthe-5YPMUsxHnOFpe_HeYN-WEs,22948
+ atdata/_protocols.py,sha256=TkBnA4mosvelsxSGnzUUf8DAcspt-zwmSLbJPGuxkRE,15764
+ atdata/_schema_codec.py,sha256=I2cjXuICpdP1cMsG7Vpj6T7Kz0zEwYO780pAmnpjGj8,14352
+ atdata/_sources.py,sha256=A7HMkS_dqN5Sx7rG1nZsO1Laxozt3C65_P2Hiv41VXk,16624
+ atdata/_stub_manager.py,sha256=Heh0HAYjVjnkUQcPWAEOrkkEKN3Mi9vTJPA-ZRseFw8,19141
+ atdata/_type_utils.py,sha256=p8pdo_Ujtds1F_G816DsPKPY9JxI8Aha6iFruvn11ro,2947
+ atdata/dataset.py,sha256=VTdK6rssSIHJH9GzDGLJYO8PJKNkxWW8g-U2ZQTxB_U,36773
+ atdata/lens.py,sha256=vyoSRMEyqk9npKmm8vfhMsO-TOfpakDNJyD_GqnqmDM,9670
+ atdata/local.py,sha256=S9uAsxrTm8kBwWF7VjrNaSMJvIllyFYP6a75oEyJljA,57352
+ atdata/promote.py,sha256=fPLVNkwukX5rjvR4z24K4kQbXlWRtHzHbQYQ3P8dcy8,6303
+ atdata/atmosphere/__init__.py,sha256=pm6nskOZguhnFiDbKK99uHHQW3c7v3Qe2OJmDfFSjaY,9778
+ atdata/atmosphere/_types.py,sha256=MRhXnmAuQLJPdoq1skrBGXCsaQYdtKG_nA3YlSjwJXY,9595
+ atdata/atmosphere/client.py,sha256=acw82w3_cxWbWDtIRvH1VDHGJSroGqhSenFFostXTXo,16210
+ atdata/atmosphere/lens.py,sha256=EnrddTD-SAnyxU0XF__QkePLUhb4lPwfFLaseL_STDc,9260
+ atdata/atmosphere/records.py,sha256=esEm8Lz2zUi4CS9udHTAeNLCHwilHM3VljhY202fdMk,15844
+ atdata/atmosphere/schema.py,sha256=6V_lL-aFtgt56cbJioYygOrUFdzn6Hj5gqgBch__HMw,7767
+ atdata/atmosphere/store.py,sha256=NR4tGS9u3_ogvnyyOHDVF0tRKChruj_NE9Df4qrZiDU,6324
+ atdata/cli/__init__.py,sha256=R8GvGfbLhdGTStBgaD4nUGkInNE_pY60z_hA-rKPWH4,5728
+ atdata/cli/diagnose.py,sha256=Det9ozOvxXKd8Abu-xEsMjaXR34H_cuSX9MJJIlhnsA,5483
+ atdata/cli/local.py,sha256=7yatEQ61ipdtWtlMcIeMgcQLfu3ysCqOrcYBCyG3ivA,8077
+ atdata-0.2.3b1.dist-info/METADATA,sha256=beXT0CgUFSG9heYgNS92KnvaGYimC4T7ZtspWuIzYl8,7299
+ atdata-0.2.3b1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ atdata-0.2.3b1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
+ atdata-0.2.3b1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
+ atdata-0.2.3b1.dist-info/RECORD,,
atdata-0.2.0a1.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
- atdata/__init__.py,sha256=6RYvy9GJwqtSQbCS81HaQyOyAVgLxm63kBt0SH5Qapo,1642
- atdata/_helpers.py,sha256=RvA-Xlj3AvgSWuiPdS8YTBp8AJT-u32BaLpxsu4PIIA,1564
- atdata/dataset.py,sha256=O2j1_ABvTFcs83_y-GGDRROD9zRe-237O2OiI1NhySg,24173
- atdata/lens.py,sha256=lFFVeuKXa17KYjfz3VFqE9Xf0vy3C6puSiF78hyIaAI,9673
- atdata/local.py,sha256=IdNOTA0nvszG-XRkRMkT_zkMivIx93WKh3bpgIx_u_o,15458
- atdata/atmosphere/__init__.py,sha256=8tPDziazrQWdyvetWTVV1eWRt6JBy86WfnvAeyh8iJE,1743
- atdata/atmosphere/_types.py,sha256=0606wb2c8Ty7cmZWTh5mb_qwJmAwYf5oaJU_wk9moa8,9564
- atdata/atmosphere/client.py,sha256=tihVBlhPCz3TZBHs_Ce7uYwE70IzKyeXNpDKsN_qc5U,11358
- atdata/atmosphere/lens.py,sha256=BzUdagItYsyzYHtK1jqppJJ1VUHJVQRw0hi7LuvJG5Q,9267
- atdata/atmosphere/records.py,sha256=-9hhSLsr6sDHkzCVWDudZtxTMHXcVyUHeVojlNcGdL4,10672
- atdata/atmosphere/schema.py,sha256=6gQMGSRjgESaXZzBYMfO51qL9JMiyNGrqJe4iWarO7w,9872
- atdata-0.2.0a1.dist-info/METADATA,sha256=EBwfarL5lmzP2lMdn7Z9yfZBjP6TwTalnTDC8cc7cdY,4471
- atdata-0.2.0a1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- atdata-0.2.0a1.dist-info/entry_points.txt,sha256=6-iQr1veSTq-ac94bLyfcyGHprrZWevPEd12BWX37tQ,39
- atdata-0.2.0a1.dist-info/licenses/LICENSE,sha256=Pz2eACSxkhsGfW9_iN60pgy-enjnbGTj8df8O3ebnQQ,16726
- atdata-0.2.0a1.dist-info/RECORD,,