atdata 0.3.1b1__py3-none-any.whl → 0.3.2b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata/__init__.py +2 -0
- atdata/_hf_api.py +13 -0
- atdata/_logging.py +43 -0
- atdata/_protocols.py +18 -1
- atdata/_sources.py +24 -4
- atdata/atmosphere/__init__.py +48 -10
- atdata/atmosphere/_lexicon_types.py +595 -0
- atdata/atmosphere/_types.py +71 -243
- atdata/atmosphere/lens.py +49 -41
- atdata/atmosphere/records.py +282 -90
- atdata/atmosphere/schema.py +78 -50
- atdata/atmosphere/store.py +62 -59
- atdata/dataset.py +201 -135
- atdata/index/_entry.py +6 -2
- atdata/index/_index.py +396 -109
- atdata/lexicons/__init__.py +9 -3
- atdata/lexicons/ac.foundation.dataset.lens.json +2 -0
- atdata/lexicons/ac.foundation.dataset.record.json +22 -1
- atdata/lexicons/ac.foundation.dataset.storageBlobs.json +26 -4
- atdata/lexicons/ac.foundation.dataset.storageExternal.json +1 -1
- atdata/lexicons/ac.foundation.dataset.storageHttp.json +45 -0
- atdata/lexicons/ac.foundation.dataset.storageS3.json +61 -0
- atdata/manifest/__init__.py +4 -0
- atdata/manifest/_proxy.py +321 -0
- atdata/repository.py +59 -9
- atdata/stores/_disk.py +19 -11
- atdata/stores/_s3.py +134 -112
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/METADATA +1 -1
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/RECORD +37 -33
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/WHEEL +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/entry_points.txt +0 -0
- {atdata-0.3.1b1.dist-info → atdata-0.3.2b1.dist-info}/licenses/LICENSE +0 -0
atdata/lexicons/__init__.py
CHANGED
@@ -16,10 +16,14 @@ Lexicons:
     Extensible token for schema format identifiers.
 ac.foundation.dataset.arrayFormat
     Extensible token for array serialization formats.
-ac.foundation.dataset.storageExternal
-    External URL-based storage.
+ac.foundation.dataset.storageHttp
+    HTTP/HTTPS URL-based storage with per-shard checksums.
+ac.foundation.dataset.storageS3
+    S3/S3-compatible object storage with per-shard checksums.
 ac.foundation.dataset.storageBlobs
     ATProto PDS blob-based storage.
+ac.foundation.dataset.storageExternal
+    (Deprecated) External URL-based storage.
 ac.foundation.dataset.getLatestSchema
     XRPC query for fetching the latest schema version.

@@ -47,8 +51,10 @@ LEXICON_IDS = (
     f"{NAMESPACE}.lens",
     f"{NAMESPACE}.schemaType",
     f"{NAMESPACE}.arrayFormat",
-    f"{NAMESPACE}.storageExternal",
+    f"{NAMESPACE}.storageHttp",
+    f"{NAMESPACE}.storageS3",
     f"{NAMESPACE}.storageBlobs",
+    f"{NAMESPACE}.storageExternal",  # deprecated
     f"{NAMESPACE}.getLatestSchema",
 )

atdata/lexicons/ac.foundation.dataset.lens.json
CHANGED

@@ -24,11 +24,13 @@
       },
       "sourceSchema": {
         "type": "string",
+        "format": "at-uri",
        "description": "AT-URI reference to source schema",
         "maxLength": 500
       },
       "targetSchema": {
         "type": "string",
+        "format": "at-uri",
         "description": "AT-URI reference to target schema",
         "maxLength": 500
       },
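
With the new "format": "at-uri" constraints, lexicon-aware validators can reject lens records whose schema references are not syntactically valid AT-URIs (at://<authority>/<collection>/<rkey>). A rough sketch of a conforming fragment; the DID, collection, and record keys below are placeholders, not values taken from this package:

    # Hypothetical lens record fragment with syntactically valid AT-URI references.
    lens_fragment = {
        "sourceSchema": "at://did:plc:exampleauthor/some.schema.collection/3kexamplerkey1",
        "targetSchema": "at://did:plc:exampleauthor/some.schema.collection/3kexamplerkey2",
    }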

atdata/lexicons/ac.foundation.dataset.record.json
CHANGED

@@ -30,7 +30,8 @@
       "type": "union",
       "description": "Storage location for dataset files (WebDataset tar archives)",
       "refs": [
-        "ac.foundation.dataset.storageExternal",
+        "ac.foundation.dataset.storageHttp",
+        "ac.foundation.dataset.storageS3",
         "ac.foundation.dataset.storageBlobs"
       ]
     },

@@ -71,6 +72,26 @@
         }
       }
     },
+    "shardChecksum": {
+      "type": "object",
+      "description": "Content hash for shard integrity verification. Algorithm is flexible to allow SHA-256, BLAKE3, or other hash functions.",
+      "required": [
+        "algorithm",
+        "digest"
+      ],
+      "properties": {
+        "algorithm": {
+          "type": "string",
+          "description": "Hash algorithm identifier (e.g., 'sha256', 'blake3')",
+          "maxLength": 20
+        },
+        "digest": {
+          "type": "string",
+          "description": "Hex-encoded hash digest",
+          "maxLength": 128
+        }
+      }
+    },
     "datasetSize": {
       "type": "object",
       "description": "Information about dataset size",
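
The new shardChecksum object is what the storage lexicons below reference. For illustration only, a value satisfying this definition might look like the following (the digest here is simply the SHA-256 of the string "test"):

    # Hypothetical shardChecksum value; "algorithm" is an open identifier (maxLength 20),
    # "digest" is a hex-encoded hash (maxLength 128).
    shard_checksum = {
        "algorithm": "sha256",
        "digest": "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08",
    }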

atdata/lexicons/ac.foundation.dataset.storageBlobs.json
CHANGED

@@ -4,21 +4,43 @@
   "defs": {
     "main": {
       "type": "object",
-      "description": "Storage via ATProto PDS blobs for WebDataset tar archives.
+      "description": "Storage via ATProto PDS blobs for WebDataset tar archives. Used in ac.foundation.dataset.record storage union for maximum decentralization.",
       "required": [
         "blobs"
       ],
       "properties": {
         "blobs": {
           "type": "array",
-          "description": "Array of blob
+          "description": "Array of blob entries for WebDataset tar files",
           "items": {
-            "type": "
-            "
+            "type": "ref",
+            "ref": "#blobEntry"
           },
           "minLength": 1
         }
       }
+    },
+    "blobEntry": {
+      "type": "object",
+      "description": "A single PDS blob shard with optional integrity checksum",
+      "required": [
+        "blob"
+      ],
+      "properties": {
+        "blob": {
+          "type": "blob",
+          "accept": [
+            "application/x-tar"
+          ],
+          "maxSize": 52428800,
+          "description": "Blob reference to a WebDataset tar archive"
+        },
+        "checksum": {
+          "type": "ref",
+          "ref": "ac.foundation.dataset.record#shardChecksum",
+          "description": "Content hash for integrity verification (optional since PDS blobs have built-in CID integrity)"
+        }
+      }
     }
   }
 }
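
For illustration, a blobEntry might look like the sketch below once a tar shard has been uploaded to the PDS; the CID, size, and digest are placeholders, and the blob shape follows the usual ATProto JSON blob representation rather than anything specific to this diff:

    # Hypothetical blobEntry for the storageBlobs "blobs" array.
    blob_entry = {
        "blob": {
            "$type": "blob",
            "ref": {"$link": "bafyreiexampleplaceholdercid"},  # CID assigned by the PDS on upload
            "mimeType": "application/x-tar",
            "size": 10485760,  # bytes; must stay under maxSize 52428800 (50 MiB)
        },
        "checksum": {  # optional; PDS blobs already carry CID-based integrity
            "algorithm": "sha256",
            "digest": "<hex digest of the tar shard>",
        },
    }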

atdata/lexicons/ac.foundation.dataset.storageExternal.json
CHANGED

@@ -4,7 +4,7 @@
   "defs": {
     "main": {
       "type": "object",
-      "description": "External storage via URLs
+      "description": "(Deprecated: use storageHttp or storageS3 instead.) External storage via URLs for WebDataset tar archives. URLs support brace notation for sharding (e.g., 'data-{000000..000099}.tar').",
       "required": [
         "urls"
       ],
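
The deprecated lexicon keeps the brace-notation convention in its description. Consumers of such URLs typically expand the pattern client-side; a generic sketch of that expansion (not atdata's own implementation) follows:

    # Expand one '{start..end}' range in a WebDataset-style URL pattern.
    import re

    def expand_braces(pattern: str) -> list[str]:
        m = re.search(r"\{(\d+)\.\.(\d+)\}", pattern)
        if m is None:
            return [pattern]
        start, end = m.group(1), m.group(2)
        width = len(start)
        return [
            pattern[: m.start()] + str(i).zfill(width) + pattern[m.end():]
            for i in range(int(start), int(end) + 1)
        ]

    urls = expand_braces("https://example.com/data-{000000..000002}.tar")
    # -> ['https://example.com/data-000000.tar',
    #     'https://example.com/data-000001.tar',
    #     'https://example.com/data-000002.tar']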

atdata/lexicons/ac.foundation.dataset.storageHttp.json
ADDED

@@ -0,0 +1,45 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageHttp",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "HTTP/HTTPS storage for WebDataset tar archives. Each shard is listed individually with a checksum for integrity verification. Consumers build brace-expansion patterns on the fly when needed.",
+      "required": [
+        "shards"
+      ],
+      "properties": {
+        "shards": {
+          "type": "array",
+          "description": "Array of shard entries with URL and integrity checksum",
+          "items": {
+            "type": "ref",
+            "ref": "#shardEntry"
+          },
+          "minLength": 1
+        }
+      }
+    },
+    "shardEntry": {
+      "type": "object",
+      "description": "A single HTTP-accessible shard with integrity checksum",
+      "required": [
+        "url",
+        "checksum"
+      ],
+      "properties": {
+        "url": {
+          "type": "string",
+          "format": "uri",
+          "description": "HTTP/HTTPS URL for this WebDataset tar shard",
+          "maxLength": 2000
+        },
+        "checksum": {
+          "type": "ref",
+          "ref": "ac.foundation.dataset.record#shardChecksum",
+          "description": "Content hash for integrity verification"
+        }
+      }
+    }
+  }
+}
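
Since every storageHttp shard carries a checksum, a consumer can verify a download end to end. A minimal sketch using only the standard library; the shard dict mirrors the shardEntry shape above, and the URL and digest are placeholders:

    # Sketch: stream an HTTP shard and compare it against its recorded checksum.
    import hashlib
    import urllib.request

    def verify_http_shard(shard: dict) -> bool:
        algorithm = shard["checksum"]["algorithm"]  # e.g. "sha256"; blake3 would need a third-party lib
        expected = shard["checksum"]["digest"]
        hasher = hashlib.new(algorithm)
        with urllib.request.urlopen(shard["url"]) as resp:
            for chunk in iter(lambda: resp.read(1 << 20), b""):
                hasher.update(chunk)
        return hasher.hexdigest() == expected

    shard = {
        "url": "https://example.com/data-000000.tar",
        "checksum": {"algorithm": "sha256", "digest": "<expected hex digest>"},
    }
    # verify_http_shard(shard)  # True when the downloaded bytes match the digest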

atdata/lexicons/ac.foundation.dataset.storageS3.json
ADDED

@@ -0,0 +1,61 @@
+{
+  "lexicon": 1,
+  "id": "ac.foundation.dataset.storageS3",
+  "defs": {
+    "main": {
+      "type": "object",
+      "description": "S3 or S3-compatible storage for WebDataset tar archives. Supports custom endpoints for MinIO, Cloudflare R2, and other S3-compatible services.",
+      "required": [
+        "bucket",
+        "shards"
+      ],
+      "properties": {
+        "bucket": {
+          "type": "string",
+          "description": "S3 bucket name",
+          "maxLength": 255
+        },
+        "region": {
+          "type": "string",
+          "description": "AWS region (e.g., 'us-east-1'). Optional for S3-compatible services.",
+          "maxLength": 50
+        },
+        "endpoint": {
+          "type": "string",
+          "format": "uri",
+          "description": "Custom S3-compatible endpoint URL (e.g., for MinIO, Cloudflare R2). Omit for standard AWS S3.",
+          "maxLength": 500
+        },
+        "shards": {
+          "type": "array",
+          "description": "Array of shard entries with object key and integrity checksum",
+          "items": {
+            "type": "ref",
+            "ref": "#shardEntry"
+          },
+          "minLength": 1
+        }
+      }
+    },
+    "shardEntry": {
+      "type": "object",
+      "description": "A single S3 object shard with integrity checksum",
+      "required": [
+        "key",
+        "checksum"
+      ],
+      "properties": {
+        "key": {
+          "type": "string",
+          "description": "S3 object key for this WebDataset tar shard",
+          "maxLength": 1024
+        },
+        "checksum": {
+          "type": "ref",
+          "ref": "ac.foundation.dataset.record#shardChecksum",
+          "description": "Content hash for integrity verification"
+        }
+      }
+    }
+  }
+}
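
As an illustration, a storageS3 member of the record's storage union pointing at an S3-compatible endpoint might look like the sketch below; the bucket, endpoint, keys, and digests are placeholders, and the "$type" value assumes the usual lexicon-union convention:

    # Hypothetical storageS3 object for a record's storage union.
    storage_s3 = {
        "$type": "ac.foundation.dataset.storageS3",
        "bucket": "my-datasets",
        "endpoint": "https://<account-id>.r2.cloudflarestorage.com",  # omit for standard AWS S3
        "shards": [
            {
                "key": "demo/data-000000.tar",
                "checksum": {"algorithm": "sha256", "digest": "<hex digest>"},
            },
            {
                "key": "demo/data-000001.tar",
                "checksum": {"algorithm": "sha256", "digest": "<hex digest>"},
            },
        ],
    }
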
atdata/manifest/__init__.py
CHANGED

@@ -26,3 +26,7 @@ from ._manifest import MANIFEST_FORMAT_VERSION as MANIFEST_FORMAT_VERSION
 from ._writer import ManifestWriter as ManifestWriter
 from ._query import QueryExecutor as QueryExecutor
 from ._query import SampleLocation as SampleLocation
+from ._proxy import FieldProxy as FieldProxy
+from ._proxy import Predicate as Predicate
+from ._proxy import query_fields as query_fields
+from ._proxy import F as F

atdata/manifest/_proxy.py
ADDED

@@ -0,0 +1,321 @@
+"""Typed proxy DSL for manifest queries.
+
+Provides ``FieldProxy`` and ``Predicate`` classes that build pandas
+filter expressions with IDE autocomplete and type safety.
+
+Components:
+
+- ``FieldProxy``: Wraps a field name; comparison operators return ``Predicate``
+- ``Predicate``: Composable boolean expression tree; compiles to pandas ops
+- ``query_fields()``: Factory that creates a typed proxy from a sample type
+- ``F``: Untyped convenience proxy (Django-style F expressions)
+
+Examples:
+    >>> Q = query_fields(MySample)
+    >>> pred = (Q.confidence > 0.9) & (Q.label == "dog")
+
+    >>> from atdata.manifest import F
+    >>> pred = (F.confidence > 0.9)
+"""
+
+from __future__ import annotations
+
+import functools
+from typing import Any, Callable, Sequence, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+from ._fields import resolve_manifest_fields
+
+
+class Predicate:
+    """A composable boolean predicate over manifest fields.
+
+    Constructed by comparison operators on ``FieldProxy`` objects.
+    Supports ``&`` (AND), ``|`` (OR), and ``~`` (NOT).
+
+    Call the predicate directly on a DataFrame to evaluate it,
+    or pass it as the ``where`` argument to ``QueryExecutor.query()``.
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = (F.confidence > 0.9) & (F.label == "dog")
+        >>> pred = (F.score >= 0.5) | (F.label.isin(["cat", "dog"]))
+    """
+
+    __slots__ = ("_kind", "_field", "_op", "_value", "_children", "_child", "_compiled")
+    __hash__ = None  # type: ignore[assignment]
+
+    def __init__(
+        self,
+        kind: str,
+        *,
+        field: str | None = None,
+        op: str | None = None,
+        value: Any = None,
+        children: list[Predicate] | None = None,
+        child: Predicate | None = None,
+    ) -> None:
+        self._kind = kind
+        self._field = field
+        self._op = op
+        self._value = value
+        self._children = children
+        self._child = child
+        self._compiled: Callable[[pd.DataFrame], pd.Series] | None = None
+
+    def __and__(self, other: Predicate) -> Predicate:
+        if not isinstance(other, Predicate):
+            return NotImplemented
+        # Flatten nested ANDs for a cleaner tree
+        left = self._children if self._kind == "and" else [self]
+        right = other._children if other._kind == "and" else [other]
+        return Predicate("and", children=[*left, *right])
+
+    def __or__(self, other: Predicate) -> Predicate:
+        if not isinstance(other, Predicate):
+            return NotImplemented
+        left = self._children if self._kind == "or" else [self]
+        right = other._children if other._kind == "or" else [other]
+        return Predicate("or", children=[*left, *right])
+
+    def __invert__(self) -> Predicate:
+        return Predicate("not", child=self)
+
+    def compile(self) -> Callable[[pd.DataFrame], pd.Series]:
+        """Compile this predicate tree into a callable DataFrame filter.
+
+        Returns:
+            A callable that accepts a ``pd.DataFrame`` and returns a
+            boolean ``pd.Series``.
+        """
+        if self._compiled is not None:
+            return self._compiled
+
+        self._compiled = self._build()
+        return self._compiled
+
+    def _build(self) -> Callable[[pd.DataFrame], pd.Series]:
+        """Recursively build the pandas filter closure."""
+
+        if self._kind == "comparison":
+            field = self._field
+            op = self._op
+            value = self._value
+            return _make_comparison(field, op, value)
+
+        if self._kind == "and":
+            compiled_children = [c.compile() for c in self._children]  # type: ignore[union-attr]
+            return _make_and(compiled_children)
+
+        if self._kind == "or":
+            compiled_children = [c.compile() for c in self._children]  # type: ignore[union-attr]
+            return _make_or(compiled_children)
+
+        if self._kind == "not":
+            compiled_child = self._child.compile()  # type: ignore[union-attr]
+            return _make_not(compiled_child)
+
+        raise ValueError(f"Unknown predicate kind: {self._kind!r}")
+
+    def __call__(self, df: pd.DataFrame) -> pd.Series:
+        """Evaluate this predicate against a DataFrame.
+
+        This makes ``Predicate`` directly usable as a ``where`` argument
+        to ``QueryExecutor.query()`` without any adapter code.
+        """
+        return self.compile()(df)
+
+    def __repr__(self) -> str:
+        if self._kind == "comparison":
+            return f"Predicate({self._field!r} {self._op} {self._value!r})"
+        if self._kind == "not":
+            return f"~{self._child!r}"
+        sep = " & " if self._kind == "and" else " | "
+        parts = sep.join(repr(c) for c in self._children)  # type: ignore[union-attr]
+        return f"({parts})"
+
+
+def _make_comparison(
+    field: str | None, op: str | None, value: Any
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure for a single comparison operation."""
+    if op == "gt":
+        return lambda df: df[field] > value
+    if op == "lt":
+        return lambda df: df[field] < value
+    if op == "ge":
+        return lambda df: df[field] >= value
+    if op == "le":
+        return lambda df: df[field] <= value
+    if op == "eq":
+        return lambda df: df[field] == value
+    if op == "ne":
+        return lambda df: df[field] != value
+    if op == "isin":
+        return lambda df: df[field].isin(value)
+    raise ValueError(f"Unknown operator: {op!r}")
+
+
+def _make_and(
+    children: list[Callable[[pd.DataFrame], pd.Series]],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that ANDs multiple child predicates."""
+
+    def _and(df: pd.DataFrame) -> pd.Series:
+        return functools.reduce(lambda a, b: a & b, (c(df) for c in children))
+
+    return _and
+
+
+def _make_or(
+    children: list[Callable[[pd.DataFrame], pd.Series]],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that ORs multiple child predicates."""
+
+    def _or(df: pd.DataFrame) -> pd.Series:
+        return functools.reduce(lambda a, b: a | b, (c(df) for c in children))
+
+    return _or
+
+
+def _make_not(
+    child: Callable[[pd.DataFrame], pd.Series],
+) -> Callable[[pd.DataFrame], pd.Series]:
+    """Create a closure that negates a child predicate."""
+    return lambda df: ~child(df)
+
+
+class FieldProxy:
+    """Proxy for a single manifest field.
+
+    Comparison operators return ``Predicate`` objects for composable queries.
+
+    Args:
+        name: The manifest field name (column name in the parquet DataFrame).
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = F.confidence > 0.9
+        >>> pred = F.label.isin(["dog", "cat"])
+    """
+
+    __slots__ = ("_name",)
+
+    def __init__(self, name: str) -> None:
+        self._name = name
+
+    def __gt__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="gt", value=value)
+
+    def __lt__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="lt", value=value)
+
+    def __ge__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="ge", value=value)
+
+    def __le__(self, value: Any) -> Predicate:
+        return Predicate("comparison", field=self._name, op="le", value=value)
+
+    def __eq__(self, value: Any) -> Predicate:  # type: ignore[override]
+        return Predicate("comparison", field=self._name, op="eq", value=value)
+
+    def __ne__(self, value: Any) -> Predicate:  # type: ignore[override]
+        return Predicate("comparison", field=self._name, op="ne", value=value)
+
+    def isin(self, values: Sequence[Any]) -> Predicate:
+        """Check membership in a set of values.
+
+        Args:
+            values: Collection of values to test membership against.
+
+        Returns:
+            A ``Predicate`` that filters for rows where this field's
+            value is in *values*.
+
+        Examples:
+            >>> pred = F.label.isin(["dog", "cat", "bird"])
+        """
+        return Predicate("comparison", field=self._name, op="isin", value=values)
+
+    def between(self, low: Any, high: Any) -> Predicate:
+        """Check that the field value is within a closed range.
+
+        Shorthand for ``(field >= low) & (field <= high)``.
+
+        Args:
+            low: Lower bound (inclusive).
+            high: Upper bound (inclusive).
+
+        Returns:
+            A ``Predicate`` that filters for rows where this field's
+            value is between *low* and *high* inclusive.
+
+        Examples:
+            >>> pred = F.confidence.between(0.5, 0.9)
+        """
+        return (self >= low) & (self <= high)
+
+    def __repr__(self) -> str:
+        return f"FieldProxy({self._name!r})"
+
+
+def query_fields(sample_type: type) -> Any:
+    """Create a typed field proxy for querying a sample type.
+
+    Returns an object whose attributes are ``FieldProxy`` instances for
+    each manifest-eligible field of *sample_type*. Provides IDE
+    autocomplete when the return type is inferred.
+
+    Args:
+        sample_type: A ``@packable`` or ``PackableSample`` subclass.
+
+    Returns:
+        A proxy object with one ``FieldProxy`` attribute per manifest field.
+
+    Raises:
+        TypeError: If *sample_type* is not a dataclass.
+
+    Examples:
+        >>> Q = query_fields(MySample)
+        >>> pred = (Q.confidence > 0.9) & (Q.label == "dog")
+    """
+    fields = resolve_manifest_fields(sample_type)
+    attrs: dict[str, Any] = {}
+    annotations: dict[str, type] = {}
+    for name in fields:
+        attrs[name] = FieldProxy(name)
+        annotations[name] = FieldProxy
+    attrs["__annotations__"] = annotations
+    attrs["__slots__"] = ()
+    attrs["__repr__"] = (
+        lambda self: f"{sample_type.__name__}Fields({', '.join(annotations)})"
+    )
+
+    proxy_cls = type(f"{sample_type.__name__}Fields", (), attrs)
+    return proxy_cls()
+
+
+class _UntypedFieldProxy:
+    """Untyped convenience proxy for quick field access.
+
+    Attribute access returns a ``FieldProxy`` for any name, without
+    requiring a sample type. Useful for ad-hoc queries where IDE
+    autocomplete is not needed.
+
+    Examples:
+        >>> from atdata.manifest import F
+        >>> pred = (F.confidence > 0.9) & (F.label == "dog")
+    """
+
+    def __getattr__(self, name: str) -> FieldProxy:
+        if name.startswith("_"):
+            raise AttributeError(name)
+        return FieldProxy(name)
+
+    def __repr__(self) -> str:
+        return "F"
+
+
+F = _UntypedFieldProxy()
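
Putting the new DSL together with the existing query layer might look like the following sketch; MySample is hypothetical, and it relies on the docstrings above, which state that a Predicate can be passed as the ``where`` argument of ``QueryExecutor.query()``:

    # Sketch: combining the new proxy DSL with the existing query layer.
    from atdata.manifest import F, QueryExecutor, query_fields

    # Untyped, ad-hoc style:
    pred = (F.confidence > 0.9) & (F.label.isin(["dog", "cat"]))

    # Typed style with IDE autocomplete (MySample is a hypothetical @packable class):
    # Q = query_fields(MySample)
    # pred = (Q.confidence > 0.9) & (Q.label == "dog")

    # A Predicate is itself callable on a DataFrame, so it can be passed directly
    # as the `where` argument of QueryExecutor.query():
    # executor = QueryExecutor(...)        # construction not shown in this diff
    # locations = executor.query(where=pred)
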
atdata/repository.py
CHANGED

@@ -210,14 +210,26 @@ class _AtmosphereBackend:
         *,
         name: str,
         schema_ref: str | None = None,
+        data_urls: list[str] | None = None,
+        blob_refs: list[dict] | None = None,
         **kwargs: Any,
     ) -> Any:
         """Insert a dataset into ATProto.

+        When *blob_refs* is provided the record uses ``storageBlobs`` with
+        embedded blob reference objects so the PDS retains the uploaded blobs.
+
+        When *data_urls* is provided (without *blob_refs*) the record uses
+        ``storageExternal`` with those URLs.
+
         Args:
             ds: The Dataset to publish.
             name: Human-readable name.
             schema_ref: Optional schema AT URI. If None, auto-publishes schema.
+            data_urls: Explicit shard URLs to store in the record. When
+                provided, these replace whatever ``ds.url`` contains.
+            blob_refs: Pre-uploaded blob reference dicts from
+                ``PDSBlobStore``. Takes precedence over *data_urls*.
             **kwargs: Additional options (description, tags, license).

         Returns:

@@ -226,15 +238,53 @@
         self._ensure_loaders()
         from .atmosphere import AtmosphereIndexEntry

-
-
-
-
-
-
-
-
-
+        if blob_refs is not None or data_urls is not None:
+            # Ensure schema is published first
+            if schema_ref is None:
+                from .atmosphere import SchemaPublisher
+
+                sp = SchemaPublisher(self.client)
+                schema_uri_obj = sp.publish(
+                    ds.sample_type,
+                    version=kwargs.get("schema_version", "1.0.0"),
+                )
+                schema_ref = str(schema_uri_obj)
+
+            metadata = kwargs.get("metadata")
+            if metadata is None and hasattr(ds, "_metadata"):
+                metadata = ds._metadata
+
+            if blob_refs is not None:
+                uri = self._dataset_publisher.publish_with_blob_refs(
+                    blob_refs=blob_refs,
+                    schema_uri=schema_ref,
+                    name=name,
+                    description=kwargs.get("description"),
+                    tags=kwargs.get("tags"),
+                    license=kwargs.get("license"),
+                    metadata=metadata,
+                )
+            else:
+                uri = self._dataset_publisher.publish_with_urls(
+                    urls=data_urls,
+                    schema_uri=schema_ref,
+                    name=name,
+                    description=kwargs.get("description"),
+                    tags=kwargs.get("tags"),
+                    license=kwargs.get("license"),
+                    metadata=metadata,
+                )
+        else:
+            uri = self._dataset_publisher.publish(
+                ds,
+                name=name,
+                schema_uri=schema_ref,
+                description=kwargs.get("description"),
+                tags=kwargs.get("tags"),
+                license=kwargs.get("license"),
+                auto_publish_schema=(schema_ref is None),
+            )
+
         record = self._dataset_loader.get(uri)
         return AtmosphereIndexEntry(str(uri), record)

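
From the caller's side, the new parameters split publishing into three paths; a hedged sketch, with backend construction and blob upload elided since they are not part of this diff:

    # Sketch of the three publishing paths through _AtmosphereBackend.insert()
    # after this change. `backend`, `ds`, and `blob_refs` are assumed to exist
    # already; in normal use the backend is reached through the repository layer
    # rather than constructed directly.

    # 1. Default path: publish from the dataset's own storage, auto-publishing the schema.
    entry = backend.insert(ds, name="my-dataset")

    # 2. Explicit shard URLs -> record uses the (deprecated) storageExternal union member.
    entry = backend.insert(
        ds,
        name="my-dataset",
        data_urls=["https://example.com/data-000000.tar"],
    )

    # 3. Pre-uploaded PDS blob refs -> record uses storageBlobs; takes precedence over data_urls.
    entry = backend.insert(ds, name="my-dataset", blob_refs=blob_refs)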