atdata-0.3.0b1-py3-none-any.whl → atdata-0.3.2b1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +11 -0
- atdata/_cid.py +0 -21
- atdata/_helpers.py +12 -0
- atdata/_hf_api.py +46 -1
- atdata/_logging.py +43 -0
- atdata/_protocols.py +81 -182
- atdata/_schema_codec.py +2 -2
- atdata/_sources.py +24 -4
- atdata/_stub_manager.py +5 -25
- atdata/atmosphere/__init__.py +60 -21
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +73 -245
- atdata/atmosphere/client.py +64 -12
- atdata/atmosphere/lens.py +60 -53
- atdata/atmosphere/records.py +291 -100
- atdata/atmosphere/schema.py +91 -65
- atdata/atmosphere/store.py +68 -66
- atdata/cli/__init__.py +16 -16
- atdata/cli/diagnose.py +2 -2
- atdata/cli/{local.py → infra.py} +10 -10
- atdata/dataset.py +266 -47
- atdata/index/__init__.py +54 -0
- atdata/{local → index}/_entry.py +6 -2
- atdata/{local → index}/_index.py +617 -72
- atdata/{local → index}/_schema.py +5 -5
- atdata/lexicons/__init__.py +127 -0
- atdata/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata/lexicons/ac.foundation.dataset.lens.json +101 -0
- atdata/lexicons/ac.foundation.dataset.record.json +117 -0
- atdata/lexicons/ac.foundation.dataset.schema.json +107 -0
- atdata/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +46 -0
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/lexicons/ndarray_shim.json +16 -0
- atdata/local/__init__.py +12 -13
- atdata/local/_repo_legacy.py +3 -3
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/promote.py +14 -10
- atdata/repository.py +66 -16
- atdata/stores/__init__.py +23 -0
- atdata/stores/_disk.py +131 -0
- atdata/{local → stores}/_s3.py +134 -112
- atdata/testing.py +12 -8
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +2 -2
- atdata-0.3.2b1.dist-info/RECORD +71 -0
- atdata-0.3.0b1.dist-info/RECORD +0 -54
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.0b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/lexicons/ac.foundation.dataset.storageHttp.json
ADDED

```diff
@@ -0,0 +1,45 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageHttp",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "HTTP/HTTPS storage for WebDataset tar archives. Each shard is listed individually with a checksum for integrity verification. Consumers build brace-expansion patterns on the fly when needed.",
+      "required": [
+        "shards"
+      ],
+      "properties": {
+        "shards": {
+          "type": "array",
+          "description": "Array of shard entries with URL and integrity checksum",
+          "items": {
+            "type": "ref",
+            "ref": "#shardEntry"
+          },
+          "minLength": 1
+        }
+      }
+    },
+    "shardEntry": {
+      "type": "object",
+      "description": "A single HTTP-accessible shard with integrity checksum",
+      "required": [
+        "url",
+        "checksum"
+      ],
+      "properties": {
+        "url": {
+          "type": "string",
+          "format": "uri",
+          "description": "HTTP/HTTPS URL for this WebDataset tar shard",
+          "maxLength": 2000
+        },
+        "checksum": {
+          "type": "ref",
+          "ref": "ac.foundation.dataset.record#shardChecksum",
+          "description": "Content hash for integrity verification"
+        }
+      }
+    }
+  }
+}
```
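For orientation, here is a minimal sketch of a record matching this lexicon and of how a consumer might verify a downloaded shard. The `algo`/`digest` field names inside `checksum` are assumptions: the referenced `ac.foundation.dataset.record#shardChecksum` def is not part of this diff.

```python
import hashlib

# Hypothetical record following the storageHttp lexicon above. The shape of
# "checksum" (algorithm name + hex digest) is an assumption, not shown here.
storage = {
    "shards": [
        {
            "url": "https://example.com/datasets/mnist/shard-000000.tar",
            "checksum": {"algo": "sha256", "digest": "ab12cd34..."},
        },
    ],
}


def verify_shard(payload: bytes, checksum: dict) -> bool:
    """Recompute the shard digest and compare it to the recorded value."""
    digest = hashlib.new(checksum["algo"], payload).hexdigest()
    return digest == checksum["digest"]
```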
atdata/lexicons/ac.foundation.dataset.storageS3.json
ADDED

```diff
@@ -0,0 +1,61 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageS3",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "S3 or S3-compatible storage for WebDataset tar archives. Supports custom endpoints for MinIO, Cloudflare R2, and other S3-compatible services.",
+      "required": [
+        "bucket",
+        "shards"
+      ],
+      "properties": {
+        "bucket": {
+          "type": "string",
+          "description": "S3 bucket name",
+          "maxLength": 255
+        },
+        "region": {
+          "type": "string",
+          "description": "AWS region (e.g., 'us-east-1'). Optional for S3-compatible services.",
+          "maxLength": 50
+        },
+        "endpoint": {
+          "type": "string",
+          "format": "uri",
+          "description": "Custom S3-compatible endpoint URL (e.g., for MinIO, Cloudflare R2). Omit for standard AWS S3.",
+          "maxLength": 500
+        },
+        "shards": {
+          "type": "array",
+          "description": "Array of shard entries with object key and integrity checksum",
+          "items": {
+            "type": "ref",
+            "ref": "#shardEntry"
+          },
+          "minLength": 1
+        }
+      }
+    },
+    "shardEntry": {
+      "type": "object",
+      "description": "A single S3 object shard with integrity checksum",
+      "required": [
+        "key",
+        "checksum"
+      ],
+      "properties": {
+        "key": {
+          "type": "string",
+          "description": "S3 object key for this WebDataset tar shard",
+          "maxLength": 1024
+        },
+        "checksum": {
+          "type": "ref",
+          "ref": "ac.foundation.dataset.record#shardChecksum",
+          "description": "Content hash for integrity verification"
+        }
+      }
+    }
+  }
+}
```
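A sketch of how a consumer might turn such a record into fetchable URLs. Field names follow the lexicon above; the path-style URL layout for custom endpoints (common for MinIO and R2) is an assumption, not something the schema specifies.

```python
# Derive fetchable URLs from a hypothetical storageS3 record.
def shard_urls(storage: dict) -> list[str]:
    bucket = storage["bucket"]
    endpoint = storage.get("endpoint")
    region = storage.get("region", "us-east-1")
    if endpoint:
        base = f"{endpoint.rstrip('/')}/{bucket}"  # path-style addressing (assumed)
    else:
        base = f"https://{bucket}.s3.{region}.amazonaws.com"
    return [f"{base}/{shard['key']}" for shard in storage["shards"]]


print(shard_urls({
    "bucket": "my-datasets",
    "endpoint": "https://minio.example.com",
    "shards": [{"key": "mnist/shard-000000.tar", "checksum": {}}],
}))
# ['https://minio.example.com/my-datasets/mnist/shard-000000.tar']
```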
atdata/lexicons/ndarray_shim.json
ADDED

```diff
@@ -0,0 +1,16 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://foundation.ac/schemas/atdata-ndarray-bytes/1.0.0",
+  "title": "ATDataNDArrayBytes",
+  "description": "Standard definition for numpy NDArray types in JSON Schema, compatible with atdata WebDataset serialization. This type's contents are interpreted as containing the raw bytes data for a serialized numpy NDArray, and serve as a marker for atdata-based code generation to use standard numpy types, rather than generated dataclasses.",
+  "version": "1.0.0",
+  "$defs": {
+    "ndarray": {
+      "type": "string",
+      "format": "byte",
+      "description": "Numpy array serialized using numpy `.npy` format via `np.save` (includes dtype and shape in binary header). When represented in JSON, this is a base64-encoded string. In msgpack, this is raw bytes.",
+      "contentEncoding": "base64",
+      "contentMediaType": "application/octet-stream"
+    }
+  }
+}
```
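The round-trip this shim describes can be checked directly with numpy and the standard library:

```python
import base64
import io

import numpy as np

# np.save emits .npy bytes (dtype and shape travel in the binary header);
# the JSON representation base64-encodes them, while msgpack carries raw bytes.
arr = np.arange(6, dtype=np.float32).reshape(2, 3)

buf = io.BytesIO()
np.save(buf, arr)
encoded = base64.b64encode(buf.getvalue()).decode("ascii")  # JSON form

decoded = np.load(io.BytesIO(base64.b64decode(encoded)))
assert np.array_equal(arr, decoded)
```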
atdata/local/__init__.py
CHANGED
```diff
@@ -1,24 +1,22 @@
-"""
+"""Backward-compatibility shim for atdata.local.
 
-
+.. deprecated::
+    Import from ``atdata.index`` and ``atdata.stores`` instead::
 
-
-
-- ``LocalDatasetEntry``: Index entry with ATProto-compatible CIDs.
-- ``S3DataStore``: S3-compatible shard storage.
+        from atdata.index import Index, LocalDatasetEntry
+        from atdata.stores import S3DataStore, LocalDiskStore
 """
 
-from atdata.
+from atdata.index import (
+    Index,
     LocalDatasetEntry,
     BasicIndexEntry,
-    REDIS_KEY_DATASET_ENTRY,
-    REDIS_KEY_SCHEMA,
-)
-from atdata.local._schema import (
     SchemaNamespace,
     SchemaFieldType,
     SchemaField,
     LocalSchemaRecord,
+    REDIS_KEY_DATASET_ENTRY,
+    REDIS_KEY_SCHEMA,
     _ATDATA_URI_PREFIX,
     _LEGACY_URI_PREFIX,
     _kind_str_for_sample_type,
@@ -29,8 +27,8 @@ from atdata.local._schema import (
     _python_type_to_field_type,
     _build_schema_record,
 )
-from atdata.
-
+from atdata.stores import (
+    LocalDiskStore,
     S3DataStore,
     _s3_env,
     _s3_from_credentials,
@@ -44,6 +42,7 @@ from s3fs import S3FileSystem  # noqa: F401 — re-exported for backward compat
 
 __all__ = [
     # Public API
+    "LocalDiskStore",
     "Index",
     "LocalDatasetEntry",
     "BasicIndexEntry",
```
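Since the shim re-exports rather than redefining, both import paths should hand back identical objects; a quick sanity check against the imports shown above:

```python
# Assumes only the re-exports shown in this diff; a DeprecationWarning may
# still be emitted on the legacy path.
from atdata.local import Index as LegacyIndex  # deprecated path
from atdata.index import Index                 # new home

assert LegacyIndex is Index
```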
atdata/local/_repo_legacy.py
CHANGED
```diff
@@ -2,8 +2,8 @@
 
 from atdata import Dataset
 
-from atdata.
-from atdata.
+from atdata.index._entry import LocalDatasetEntry
+from atdata.stores._s3 import _s3_env, _s3_from_credentials, _create_s3_write_callbacks
 
 from pathlib import Path
 from uuid import uuid4
@@ -97,7 +97,7 @@ class Repo:
 
         #
 
-        from atdata.
+        from atdata.index._index import Index
 
         self.index = Index(redis=redis)
 
```
atdata/manifest/__init__.py
CHANGED
```diff
@@ -26,3 +26,7 @@ from ._manifest import MANIFEST_FORMAT_VERSION as MANIFEST_FORMAT_VERSION
 from ._writer import ManifestWriter as ManifestWriter
 from ._query import QueryExecutor as QueryExecutor
 from ._query import SampleLocation as SampleLocation
+from ._proxy import FieldProxy as FieldProxy
+from ._proxy import Predicate as Predicate
+from ._proxy import query_fields as query_fields
+from ._proxy import F as F
```
atdata/manifest/_proxy.py
ADDED

```diff
@@ -0,0 +1,321 @@
+"""Typed proxy DSL for manifest queries.
+
+Provides ``FieldProxy`` and ``Predicate`` classes that build pandas
+filter expressions with IDE autocomplete and type safety.
+
+Components:
+
+- ``FieldProxy``: Wraps a field name; comparison operators return ``Predicate``
+- ``Predicate``: Composable boolean expression tree; compiles to pandas ops
+- ``query_fields()``: Factory that creates a typed proxy from a sample type
+- ``F``: Untyped convenience proxy (Django-style F expressions)
+
+Examples:
+    >>> Q = query_fields(MySample)
+    >>> pred = (Q.confidence > 0.9) & (Q.label == "dog")
+
+    >>> from atdata.manifest import F
+    >>> pred = (F.confidence > 0.9)
+"""
+
+from __future__ import annotations
+
+import functools
+from typing import Any, Callable, Sequence, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+from ._fields import resolve_manifest_fields
+
+
+class Predicate:
+    """A composable boolean predicate over manifest fields.
+
+    Constructed by comparison operators on ``FieldProxy`` objects.
+    Supports ``&`` (AND), ``|`` (OR), and ``~`` (NOT).
+
+    Call the predicate directly on a DataFrame to evaluate it,
+    or pass it as the ``where`` argument to ``QueryExecutor.query()``.
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = (F.confidence > 0.9) & (F.label == "dog")
+        >>> pred = (F.score >= 0.5) | (F.label.isin(["cat", "dog"]))
+    """
+
+    __slots__ = ("_kind", "_field", "_op", "_value", "_children", "_child", "_compiled")
+    __hash__ = None  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        kind: str,
+        *,
+        field: str | None = None,
+        op: str | None = None,
+        value: Any = None,
+        children: list[Predicate] | None = None,
+        child: Predicate | None = None,
+    ) -> None:
+        self._kind = kind
+        self._field = field
+        self._op = op
+        self._value = value
+        self._children = children
+        self._child = child
+        self._compiled: Callable[[pd.DataFrame], pd.Series] | None = None
+
+    def __and__(self, other: Predicate) -> Predicate:
+        if not isinstance(other, Predicate):
+            return NotImplemented
+        # Flatten nested ANDs for a cleaner tree
+        left = self._children if self._kind == "and" else [self]
+        right = other._children if other._kind == "and" else [other]
+        return Predicate("and", children=[*left, *right])
+
+    def __or__(self, other: Predicate) -> Predicate:
+        if not isinstance(other, Predicate):
+            return NotImplemented
+        left = self._children if self._kind == "or" else [self]
+        right = other._children if other._kind == "or" else [other]
+        return Predicate("or", children=[*left, *right])
+
+    def __invert__(self) -> Predicate:
+        return Predicate("not", child=self)
+
+    def compile(self) -> Callable[[pd.DataFrame], pd.Series]:
+        """Compile this predicate tree into a callable DataFrame filter.
+
+        Returns:
+            A callable that accepts a ``pd.DataFrame`` and returns a
+            boolean ``pd.Series``.
+        """
+        if self._compiled is not None:
+            return self._compiled
+
+        self._compiled = self._build()
+        return self._compiled
+
+    def _build(self) -> Callable[[pd.DataFrame], pd.Series]:
+        """Recursively build the pandas filter closure."""
+
+        if self._kind == "comparison":
+            field = self._field
+            op = self._op
+            value = self._value
+            return _make_comparison(field, op, value)
+
+        if self._kind == "and":
+            compiled_children = [c.compile() for c in self._children]  # type: ignore[union-attr]
+            return _make_and(compiled_children)
+
+        if self._kind == "or":
+            compiled_children = [c.compile() for c in self._children]  # type: ignore[union-attr]
+            return _make_or(compiled_children)
+
+        if self._kind == "not":
+            compiled_child = self._child.compile()  # type: ignore[union-attr]
+            return _make_not(compiled_child)
+
+        raise ValueError(f"Unknown predicate kind: {self._kind!r}")
+
+    def __call__(self, df: pd.DataFrame) -> pd.Series:
+        """Evaluate this predicate against a DataFrame.
+
+        This makes ``Predicate`` directly usable as a ``where`` argument
+        to ``QueryExecutor.query()`` without any adapter code.
+        """
+        return self.compile()(df)
+
+    def __repr__(self) -> str:
+        if self._kind == "comparison":
+            return f"Predicate({self._field!r} {self._op} {self._value!r})"
+        if self._kind == "not":
+            return f"~{self._child!r}"
+        sep = " & " if self._kind == "and" else " | "
+        parts = sep.join(repr(c) for c in self._children)  # type: ignore[union-attr]
+        return f"({parts})"
+
+
+def _make_comparison(
+    field: str | None, op: str | None, value: Any
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure for a single comparison operation."""
+    if op == "gt":
+        return lambda df: df[field] > value
+    if op == "lt":
+        return lambda df: df[field] < value
+    if op == "ge":
+        return lambda df: df[field] >= value
+    if op == "le":
+        return lambda df: df[field] <= value
+    if op == "eq":
+        return lambda df: df[field] == value
+    if op == "ne":
+        return lambda df: df[field] != value
+    if op == "isin":
+        return lambda df: df[field].isin(value)
+    raise ValueError(f"Unknown operator: {op!r}")
+
+
+def _make_and(
+    children: list[Callable[[pd.DataFrame], pd.Series]],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that ANDs multiple child predicates."""
+
+    def _and(df: pd.DataFrame) -> pd.Series:
+        return functools.reduce(lambda a, b: a & b, (c(df) for c in children))
+
+    return _and
+
+
+def _make_or(
+    children: list[Callable[[pd.DataFrame], pd.Series]],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that ORs multiple child predicates."""
+
+    def _or(df: pd.DataFrame) -> pd.Series:
+        return functools.reduce(lambda a, b: a | b, (c(df) for c in children))
+
+    return _or
+
+
+def _make_not(
+    child: Callable[[pd.DataFrame], pd.Series],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that negates a child predicate."""
+    return lambda df: ~child(df)
+
+
+class FieldProxy:
+    """Proxy for a single manifest field.
+
+    Comparison operators return ``Predicate`` objects for composable queries.
+
+    Args:
+        name: The manifest field name (column name in the parquet DataFrame).
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = F.confidence > 0.9
+        >>> pred = F.label.isin(["dog", "cat"])
+    """
+
+    __slots__ = ("_name",)
+
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def __gt__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="gt", value=value)
+
+    def __lt__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="lt", value=value)
+
+    def __ge__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="ge", value=value)
+
+    def __le__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="le", value=value)
+
+    def __eq__(self, value: Any) -> Predicate:  # type: ignore[override]
+        return Predicate("comparison", field=self._name, op="eq", value=value)
+
+    def __ne__(self, value: Any) -> Predicate:  # type: ignore[override]
+        return Predicate("comparison", field=self._name, op="ne", value=value)
+
+    def isin(self, values: Sequence[Any]) -> Predicate:
+        """Check membership in a set of values.
+
+        Args:
+            values: Collection of values to test membership against.
+
+        Returns:
+            A ``Predicate`` that filters for rows where this field's
+            value is in *values*.
+
+        Examples:
+            >>> pred = F.label.isin(["dog", "cat", "bird"])
+        """
+        return Predicate("comparison", field=self._name, op="isin", value=values)
+
+    def between(self, low: Any, high: Any) -> Predicate:
+        """Check that the field value is within a closed range.
+
+        Shorthand for ``(field >= low) & (field <= high)``.
+
+        Args:
+            low: Lower bound (inclusive).
+            high: Upper bound (inclusive).
+
+        Returns:
+            A ``Predicate`` that filters for rows where this field's
+            value is between *low* and *high* inclusive.
+
+        Examples:
+            >>> pred = F.confidence.between(0.5, 0.9)
+        """
+        return (self >= low) & (self <= high)
+
+    def __repr__(self) -> str:
+        return f"FieldProxy({self._name!r})"
+
+
+def query_fields(sample_type: type) -> Any:
+    """Create a typed field proxy for querying a sample type.
+
+    Returns an object whose attributes are ``FieldProxy`` instances for
+    each manifest-eligible field of *sample_type*. Provides IDE
+    autocomplete when the return type is inferred.
+
+    Args:
+        sample_type: A ``@packable`` or ``PackableSample`` subclass.
+
+    Returns:
+        A proxy object with one ``FieldProxy`` attribute per manifest field.
+
+    Raises:
+        TypeError: If *sample_type* is not a dataclass.
+
+    Examples:
+        >>> Q = query_fields(MySample)
+        >>> pred = (Q.confidence > 0.9) & (Q.label == "dog")
+    """
+    fields = resolve_manifest_fields(sample_type)
+    attrs: dict[str, Any] = {}
+    annotations: dict[str, type] = {}
+    for name in fields:
+        attrs[name] = FieldProxy(name)
+        annotations[name] = FieldProxy
+    attrs["__annotations__"] = annotations
+    attrs["__slots__"] = ()
+    attrs["__repr__"] = (
+        lambda self: f"{sample_type.__name__}Fields({', '.join(annotations)})"
+    )
+
+    proxy_cls = type(f"{sample_type.__name__}Fields", (), attrs)
+    return proxy_cls()
+
+
+class _UntypedFieldProxy:
+    """Untyped convenience proxy for quick field access.
+
+    Attribute access returns a ``FieldProxy`` for any name, without
+    requiring a sample type. Useful for ad-hoc queries where IDE
+    autocomplete is not needed.
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = (F.confidence > 0.9) & (F.label == "dog")
+    """
+
+    def __getattr__(self, name: str) -> FieldProxy:
+        if name.startswith("_"):
+            raise AttributeError(name)
+        return FieldProxy(name)
+
+    def __repr__(self) -> str:
+        return "F"
+
+
+F = _UntypedFieldProxy()
```
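Because a compiled ``Predicate`` is just a DataFrame-to-boolean-Series callable, it can be exercised on any manifest-shaped frame without the rest of the query machinery. The column names below are hypothetical manifest fields:

```python
import pandas as pd

from atdata.manifest import F

# Build a predicate and apply it directly; pred(df) compiles the tree to
# plain pandas comparisons and returns a boolean mask.
df = pd.DataFrame(
    {"label": ["dog", "cat", "dog"], "confidence": [0.95, 0.80, 0.40]}
)

pred = (F.confidence > 0.9) & (F.label == "dog")
print(df[pred(df)])  # only the first row survives the filter
```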
atdata/promote.py
CHANGED
```diff
@@ -6,29 +6,28 @@ federation while maintaining schema consistency.
 
 Examples:
     >>> from atdata.local import Index, Repo
-    >>> from atdata.atmosphere import
+    >>> from atdata.atmosphere import Atmosphere
     >>> from atdata.promote import promote_to_atmosphere
     >>>
     >>> # Setup
     >>> local_index = Index()
-    >>>
-    >>> client.login("handle.bsky.social", "app-password")
+    >>> atmo = Atmosphere.login("handle.bsky.social", "app-password")
     >>>
     >>> # Promote a dataset
    >>> entry = local_index.get_dataset("my-dataset")
-    >>> at_uri = promote_to_atmosphere(entry, local_index,
+    >>> at_uri = promote_to_atmosphere(entry, local_index, atmo)
 """
 
 from typing import TYPE_CHECKING, Type
 
 if TYPE_CHECKING:
     from .local import LocalDatasetEntry, Index
-    from .atmosphere import
+    from .atmosphere import Atmosphere
     from ._protocols import AbstractDataStore, Packable
 
 
 def _find_existing_schema(
-    client: "
+    client: "Atmosphere",
     name: str,
     version: str,
 ) -> str | None:
@@ -55,7 +54,7 @@ def _find_existing_schema(
 def _find_or_publish_schema(
     sample_type: "Type[Packable]",
     version: str,
-    client: "
+    client: "Atmosphere",
     description: str | None = None,
 ) -> str:
     """Find existing schema or publish a new one.
@@ -95,7 +94,7 @@ def _find_or_publish_schema(
 def promote_to_atmosphere(
     local_entry: "LocalDatasetEntry",
     local_index: "Index",
-    atmosphere_client: "
+    atmosphere_client: "Atmosphere",
     *,
     data_store: "AbstractDataStore | None" = None,
     name: str | None = None,
@@ -108,10 +107,15 @@ def promote_to_atmosphere(
     This function takes a locally-indexed dataset and publishes it to ATProto,
     making it discoverable on the federated atmosphere network.
 
+    .. deprecated::
+        Prefer ``Index.promote_entry()`` or ``Index.promote_dataset()``
+        which provide the same functionality through the unified Index
+        interface without requiring separate client and index arguments.
+
     Args:
         local_entry: The LocalDatasetEntry to promote.
         local_index: Local index containing the schema for this entry.
-        atmosphere_client: Authenticated
+        atmosphere_client: Authenticated Atmosphere.
         data_store: Optional data store for copying data to new location.
             If None, the existing data_urls are used as-is.
         name: Override name for the atmosphere record. Defaults to local name.
@@ -128,7 +132,7 @@ def promote_to_atmosphere(
 
     Examples:
         >>> entry = local_index.get_dataset("mnist-train")
-        >>> uri = promote_to_atmosphere(entry, local_index,
+        >>> uri = promote_to_atmosphere(entry, local_index, atmo)
        >>> print(uri)
         at://did:plc:abc123/ac.foundation.dataset.datasetIndex/...
     """
```