atdata 0.3.1b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
atdata/__init__.py CHANGED
@@ -90,6 +90,7 @@ from ._schema_codec import (
 from ._logging import (
     configure_logging as configure_logging,
     get_logger as get_logger,
+    log_operation as log_operation,
 )

 from .repository import (
@@ -121,6 +122,7 @@ from .manifest import (
     ManifestWriter as ManifestWriter,
     QueryExecutor as QueryExecutor,
     SampleLocation as SampleLocation,
+    query_fields as query_fields,
 )

 # ATProto integration (lazy import to avoid requiring atproto package)
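
Both names added above are re-exported at the package top level, so after this release they can be imported directly; a minimal sketch (query_fields comes from atdata.manifest and its signature is not shown in this diff):

    from atdata import log_operation, query_fields  # new top-level re-exports in 0.3.2b1

    # log_operation is the timing context manager introduced in atdata/_logging.py below.
    with log_operation("demo"):
        pass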
atdata/_hf_api.py CHANGED
@@ -714,12 +714,23 @@ def load_dataset(
     >>> index = Index()
     >>> ds = load_dataset("@local/my-dataset", index=index, split="train")
     """
+    from ._logging import get_logger
+
+    log = get_logger()
+    log.info(
+        "load_dataset: path=%s, split=%s, sample_type=%s",
+        path,
+        split,
+        sample_type.__name__ if sample_type is not None else "None",
+    )
+
     # Handle @handle/dataset indexed path resolution
     if _is_indexed_path(path):
         if index is None:
             index = get_default_index()

         source, schema_ref = _resolve_indexed_path(path, index)
+        log.debug("load_dataset: resolved indexed path, schema_ref=%s", schema_ref)

         # Resolve sample_type from schema if not provided
         resolved_type: Type = (
@@ -746,6 +757,8 @@ def load_dataset(
     if not splits_shards:
         raise FileNotFoundError(f"No data files found at path: {path}")

+    log.debug("load_dataset: resolved %d split(s) from path", len(splits_shards))
+
     # Build Dataset for each split
     datasets: dict[str, Dataset] = {}
     for split_name, shards in splits_shards.items():
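
A sketch of how the new messages surface, assuming configure_logging accepts a standard logging.Logger-style object (the diff only shows that a LoggerProtocol exists; the exact signature is an assumption):

    import logging
    from atdata import configure_logging, load_dataset

    logging.basicConfig(level=logging.DEBUG)
    # Assumption: configure_logging takes any object satisfying LoggerProtocol,
    # such as a stdlib logging.Logger.
    configure_logging(logging.getLogger("atdata"))

    ds = load_dataset("@local/my-dataset", split="train")
    # Per the format strings above, this emits roughly:
    #   INFO  load_dataset: path=@local/my-dataset, split=train, sample_type=None
    #   DEBUG load_dataset: resolved indexed path, schema_ref=...
    #   DEBUG load_dataset: resolved <n> split(s) from path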
atdata/_logging.py CHANGED
@@ -22,7 +22,10 @@ custom logger implementations.

 from __future__ import annotations

+import contextlib
 import logging
+import time
+from collections.abc import Generator
 from typing import Any, Protocol, runtime_checkable


@@ -68,3 +71,43 @@ def get_logger() -> LoggerProtocol:
     whatever was last set via :func:`configure_logging`.
     """
     return _logger
+
+
+@contextlib.contextmanager
+def log_operation(op_name: str, **context: Any) -> Generator[None, None, None]:
+    """Log the start, completion, and duration of an operation.
+
+    Emits an ``info`` message on entry and on successful completion
+    (with elapsed time), or an ``error`` message if an exception
+    propagates out.
+
+    Args:
+        op_name: Short label for the operation (e.g. ``"write_samples"``).
+        **context: Arbitrary key-value pairs included in every log message.
+
+    Examples:
+        >>> with log_operation("write_samples", shard_count=10):
+        ...     do_work()
+    """
+    log = get_logger()
+    ctx_str = ", ".join(f"{k}={v}" for k, v in context.items())
+    if ctx_str:
+        log.info("%s: started (%s)", op_name, ctx_str)
+    else:
+        log.info("%s: started", op_name)
+    t0 = time.monotonic()
+    try:
+        yield
+    except Exception:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.error("%s: failed after %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.error("%s: failed after %.2fs", op_name, elapsed)
+        raise
+    else:
+        elapsed = time.monotonic() - t0
+        if ctx_str:
+            log.info("%s: completed in %.2fs (%s)", op_name, elapsed, ctx_str)
+        else:
+            log.info("%s: completed in %.2fs", op_name, elapsed)
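
Beyond the doctest in the docstring, a hedged sketch of the message shapes log_operation produces on the success and failure paths (upload_shard is a hypothetical workload):

    from atdata import log_operation

    def upload_shard() -> None:   # hypothetical workload
        ...

    # Success: logs "upload: started (shard=3)" then "upload: completed in <t>s (shard=3)".
    with log_operation("upload", shard=3):
        upload_shard()

    # Failure: logs "upload: failed after <t>s (shard=4)" and re-raises the exception.
    try:
        with log_operation("upload", shard=4):
            raise RuntimeError("network error")
    except RuntimeError:
        pass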
atdata/_protocols.py CHANGED
@@ -151,12 +151,14 @@ class AbstractIndex(Protocol):

     # Dataset operations

-    def write(
+    def write_samples(
         self,
         samples: Iterable,
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Write samples and create an index entry in one step.
@@ -164,10 +166,15 @@ class AbstractIndex(Protocol):
         Serializes samples to WebDataset tar files, stores them via the
         appropriate backend, and creates an index entry.

+        For atmosphere targets, data is uploaded as PDS blobs by default
+        with size guards (50 MB per shard, 1 GB total).
+
         Args:
             samples: Iterable of Packable samples. Must be non-empty.
             name: Dataset name, optionally prefixed with target backend.
             schema_ref: Optional schema reference.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
             **kwargs: Backend-specific options (maxcount, description, etc.).

         Returns:
@@ -181,14 +188,24 @@ class AbstractIndex(Protocol):
         *,
         name: str,
         schema_ref: Optional[str] = None,
+        data_store: Optional["AbstractDataStore"] = None,
+        force: bool = False,
+        copy: bool = False,
         **kwargs,
     ) -> IndexEntry:
         """Register an existing dataset in the index.

+        For atmosphere targets, local sources are uploaded via
+        *data_store* (defaults to PDS blobs). Credentialed sources
+        require ``copy=True``.
+
         Args:
             ds: The Dataset to register.
             name: Human-readable name.
             schema_ref: Explicit schema ref; auto-published if ``None``.
+            data_store: Explicit data store for shard storage.
+            force: Bypass PDS size limits.
+            copy: Copy data to destination store even for remote sources.
             **kwargs: Backend-specific options.
         """
         ...
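
A sketch of a call against the renamed protocol method; index stands for any concrete AbstractIndex implementation and samples for a non-empty iterable of Packable samples (both assumed for illustration). The second method in this hunk, which registers an existing Dataset, gains the same data_store/force parameters plus copy:

    entry = index.write_samples(
        samples,
        name="my-dataset",      # optionally prefixed with a target backend
        schema_ref=None,        # optional schema reference
        data_store=None,        # atmosphere default: PDS blobs
        force=False,            # keep the 50 MB/shard and 1 GB total guards
    )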
atdata/_sources.py CHANGED
@@ -64,10 +64,20 @@ class URLSource:
         """Expand brace pattern and return list of shard URLs."""
         return list(braceexpand.braceexpand(self.url))

-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Expand brace pattern and return list of shard URLs (deprecated, use list_shards())."""
+        """Expand brace pattern and return list of shard URLs.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()

     @property
@@ -178,10 +188,20 @@ class S3Source:
         """Return list of S3 URIs for the shards."""
         return [f"s3://{self.bucket}/{key}" for key in self.keys]

-    # Legacy alias for backwards compatibility
     @property
     def shard_list(self) -> list[str]:
-        """Return list of S3 URIs for the shards (deprecated, use list_shards())."""
+        """Return list of S3 URIs for the shards.
+
+        .. deprecated::
+            Use :meth:`list_shards` instead.
+        """
+        import warnings
+
+        warnings.warn(
+            "shard_list is deprecated, use list_shards()",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         return self.list_shards()

     @property
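
Illustrative only: the new runtime behavior of the deprecated alias, assuming URLSource can be constructed with a brace-pattern url (its constructor is not shown in this diff):

    import warnings
    from atdata._sources import URLSource

    src = URLSource(url="https://example.com/data-{000000..000002}.tar")  # assumed constructor

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        _ = src.shard_list                 # property access now emits DeprecationWarning
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)

    urls = src.list_shards()               # preferred, warning-free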
@@ -34,11 +34,23 @@ from .schema import SchemaPublisher, SchemaLoader
 from .records import DatasetPublisher, DatasetLoader
 from .lens import LensPublisher, LensLoader
 from .store import PDSBlobStore
-from ._types import (
-    AtUri,
-    SchemaRecord,
-    DatasetRecord,
-    LensRecord,
+from ._types import AtUri, LEXICON_NAMESPACE
+from ._lexicon_types import (
+    LexSchemaRecord,
+    LexDatasetRecord,
+    LexLensRecord,
+    LexCodeReference,
+    JsonSchemaFormat,
+    StorageHttp,
+    StorageS3,
+    StorageBlobs,
+    ShardChecksum,
+    HttpShardEntry,
+    S3ShardEntry,
+    BlobEntry,
+    DatasetSize,
+    StorageUnion,
+    storage_from_record,
 )

 if TYPE_CHECKING:
@@ -70,11 +82,23 @@ class AtmosphereIndexEntry:

     @property
     def data_urls(self) -> list[str]:
-        """WebDataset URLs from external storage."""
+        """WebDataset URLs from storage.
+
+        Handles storageHttp (shard URLs), storageS3 (s3:// URLs),
+        storageExternal (legacy), and storageBlobs (PDS blob URLs).
+        """
         storage = self._record.get("storage", {})
         storage_type = storage.get("$type", "")
+        if "storageHttp" in storage_type:
+            return [s["url"] for s in storage.get("shards", [])]
+        if "storageS3" in storage_type:
+            bucket = storage.get("bucket", "")
+            return [f"s3://{bucket}/{s['key']}" for s in storage.get("shards", [])]
         if "storageExternal" in storage_type:
             return storage.get("urls", [])
+        if "storageBlobs" in storage_type:
+            # Blob URLs must be resolved via PDS; return empty for now
+            return []
         return []

     @property
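
For reference, the storage record shapes the rewritten property distinguishes; the "$type" values below are placeholders that merely contain the substrings the code checks, not real lexicon NSIDs:

    http_storage = {
        "$type": "...storageHttp",
        "shards": [{"url": "https://cdn.example/shard-000000.tar"}],
    }   # data_urls -> ["https://cdn.example/shard-000000.tar"]

    s3_storage = {
        "$type": "...storageS3",
        "bucket": "my-bucket",
        "shards": [{"key": "ds/shard-000000.tar"}],
    }   # data_urls -> ["s3://my-bucket/ds/shard-000000.tar"]

    blob_storage = {"$type": "...storageBlobs"}   # data_urls -> [] (blob URLs need PDS resolution)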
@@ -332,9 +356,23 @@ __all__ = [
     # Lens operations
     "LensPublisher",
     "LensLoader",
-    # Types
+    # Core types
     "AtUri",
-    "SchemaRecord",
-    "DatasetRecord",
-    "LensRecord",
+    "LEXICON_NAMESPACE",
+    # Lexicon-mirror types (Tier 1)
+    "LexSchemaRecord",
+    "LexDatasetRecord",
+    "LexLensRecord",
+    "LexCodeReference",
+    "JsonSchemaFormat",
+    "StorageHttp",
+    "StorageS3",
+    "StorageBlobs",
+    "StorageUnion",
+    "storage_from_record",
+    "ShardChecksum",
+    "HttpShardEntry",
+    "S3ShardEntry",
+    "BlobEntry",
+    "DatasetSize",
 ]