atdata 0.1.3b3__tar.gz → 0.2.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- atdata-0.2.0a1/.github/workflows/uv-test.yml +58 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/.gitignore +3 -0
- atdata-0.2.0a1/.planning/01_overview.md +204 -0
- atdata-0.2.0a1/.planning/02_lexicon_design.md +576 -0
- atdata-0.2.0a1/.planning/03_python_client.md +690 -0
- atdata-0.2.0a1/.planning/04_appview.md +578 -0
- atdata-0.2.0a1/.planning/05_codegen.md +799 -0
- atdata-0.2.0a1/.planning/README.md +195 -0
- atdata-0.2.0a1/.planning/atproto_integration.md +19 -0
- atdata-0.2.0a1/.planning/decisions/01_schema_representation_format.md +239 -0
- atdata-0.2.0a1/.planning/decisions/02_lens_code_storage.md +352 -0
- atdata-0.2.0a1/.planning/decisions/03_webdataset_storage.md +366 -0
- atdata-0.2.0a1/.planning/decisions/04_schema_evolution.md +509 -0
- atdata-0.2.0a1/.planning/decisions/05_lexicon_namespace.md +388 -0
- atdata-0.2.0a1/.planning/decisions/06_lexicon_validation.md +459 -0
- atdata-0.2.0a1/.planning/decisions/README.md +158 -0
- atdata-0.2.0a1/.planning/decisions/assessment.md +313 -0
- atdata-0.2.0a1/.planning/decisions/record_lexicon_assessment.md +468 -0
- atdata-0.2.0a1/.planning/decisions/sampleSchema_design_questions.md +166 -0
- atdata-0.2.0a1/.planning/examples/code/ndarray_roundtrip.py +252 -0
- atdata-0.2.0a1/.planning/examples/code/validate_ndarray_shim.py +316 -0
- atdata-0.2.0a1/.planning/examples/dataset_blob_storage.json +39 -0
- atdata-0.2.0a1/.planning/examples/dataset_external_storage.json +26 -0
- atdata-0.2.0a1/.planning/examples/lens_example.json +27 -0
- atdata-0.2.0a1/.planning/examples/sampleSchema_example.json +53 -0
- atdata-0.2.0a1/.planning/lexicons/README.md +259 -0
- atdata-0.2.0a1/.planning/lexicons/README_ARRAY_FORMATS.md +178 -0
- atdata-0.2.0a1/.planning/lexicons/README_SCHEMA_TYPES.md +150 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.lens.json +99 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.record.json +96 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.sampleSchema.json +107 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.schemaType.json +16 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
- atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
- atdata-0.2.0a1/.planning/lexicons/ndarray_shim.json +16 -0
- atdata-0.2.0a1/.planning/ndarray_shim_spec.md +386 -0
- atdata-0.2.0a1/.reference/atproto_lexicon_guide.md +336 -0
- atdata-0.2.0a1/.reference/atproto_lexicon_spec.md +230 -0
- atdata-0.2.0a1/.reference/python_atproto_sdk.md +347 -0
- atdata-0.2.0a1/.vscode/settings.json +25 -0
- atdata-0.2.0a1/CHANGELOG.md +17 -0
- atdata-0.2.0a1/CLAUDE.md +190 -0
- atdata-0.2.0a1/PKG-INFO +181 -0
- atdata-0.2.0a1/README.md +156 -0
- atdata-0.2.0a1/examples/atmosphere_demo.py +368 -0
- atdata-0.2.0a1/prototyping/.credentials/.gitignore +1 -0
- atdata-0.2.0a1/prototyping/data/.gitignore +1 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/pyproject.toml +15 -1
- atdata-0.2.0a1/src/atdata/__init__.py +58 -0
- atdata-0.2.0a1/src/atdata/_helpers.py +58 -0
- atdata-0.2.0a1/src/atdata/atmosphere/__init__.py +61 -0
- atdata-0.2.0a1/src/atdata/atmosphere/_types.py +329 -0
- atdata-0.2.0a1/src/atdata/atmosphere/client.py +393 -0
- atdata-0.2.0a1/src/atdata/atmosphere/lens.py +280 -0
- atdata-0.2.0a1/src/atdata/atmosphere/records.py +342 -0
- atdata-0.2.0a1/src/atdata/atmosphere/schema.py +296 -0
- atdata-0.2.0a1/src/atdata/dataset.py +730 -0
- atdata-0.2.0a1/src/atdata/lens.py +300 -0
- atdata-0.2.0a1/src/atdata/local.py +492 -0
- atdata-0.2.0a1/tests/conftest.py +1 -0
- atdata-0.2.0a1/tests/test_atmosphere.py +1363 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/tests/test_dataset.py +213 -35
- atdata-0.2.0a1/tests/test_helpers.py +94 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/tests/test_lens.py +81 -7
- atdata-0.2.0a1/tests/test_local.py +1032 -0
- atdata-0.1.3b3/.github/workflows/uv-test.yml +0 -40
- atdata-0.1.3b3/PKG-INFO +0 -18
- atdata-0.1.3b3/README.md +0 -2
- atdata-0.1.3b3/src/atdata/__init__.py +0 -20
- atdata-0.1.3b3/src/atdata/_helpers.py +0 -22
- atdata-0.1.3b3/src/atdata/dataset.py +0 -597
- atdata-0.1.3b3/src/atdata/lens.py +0 -200
- {atdata-0.1.3b3 → atdata-0.2.0a1}/.github/workflows/uv-publish-pypi.yml +0 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/.python-version +0 -0
- {atdata-0.1.3b3 → atdata-0.2.0a1}/LICENSE +0 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#
|
|
2
|
+
|
|
3
|
+
name: Run tests with `uv`
|
|
4
|
+
|
|
5
|
+
on:
|
|
6
|
+
push:
|
|
7
|
+
branches:
|
|
8
|
+
- main
|
|
9
|
+
- release/*
|
|
10
|
+
pull_request:
|
|
11
|
+
branches:
|
|
12
|
+
- main
|
|
13
|
+
|
|
14
|
+
jobs:
|
|
15
|
+
uv-test:
|
|
16
|
+
name: Run tests
|
|
17
|
+
runs-on: ubuntu-latest
|
|
18
|
+
environment:
|
|
19
|
+
name: test
|
|
20
|
+
strategy:
|
|
21
|
+
matrix:
|
|
22
|
+
python-version: [3.12, 3.13, 3.14]
|
|
23
|
+
redis-version: [6, 7]
|
|
24
|
+
|
|
25
|
+
steps:
|
|
26
|
+
- uses: actions/checkout@v5
|
|
27
|
+
|
|
28
|
+
- name: "Set up Python"
|
|
29
|
+
uses: actions/setup-python@v5
|
|
30
|
+
with:
|
|
31
|
+
python-version: ${{ matrix.python-version }}
|
|
32
|
+
# python-version-file: "pyproject.toml"
|
|
33
|
+
|
|
34
|
+
- name: Install uv
|
|
35
|
+
uses: astral-sh/setup-uv@v6
|
|
36
|
+
|
|
37
|
+
- name: Install the project
|
|
38
|
+
run: uv sync --all-extras --dev
|
|
39
|
+
# TODO Better to use --locked for author control over versions?
|
|
40
|
+
# run: uv sync --locked --all-extras --dev
|
|
41
|
+
|
|
42
|
+
- name: Start Redis
|
|
43
|
+
uses: supercharge/redis-github-action@1.8.1
|
|
44
|
+
with:
|
|
45
|
+
redis-version: ${{ matrix.redis-version }}
|
|
46
|
+
|
|
47
|
+
- name: Run tests with coverage
|
|
48
|
+
run: uv run pytest --cov=atdata --cov-report=xml --cov-report=term
|
|
49
|
+
|
|
50
|
+
- name: Upload coverage to Codecov
|
|
51
|
+
uses: codecov/codecov-action@v5
|
|
52
|
+
with:
|
|
53
|
+
# file: ./coverage.xml # Claude hallucination -- fascinating!
|
|
54
|
+
fail_ci_if_error: false
|
|
55
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
#
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# ATProto Integration - Overview
|
|
2
|
+
|
|
3
|
+
## Vision
|
|
4
|
+
|
|
5
|
+
Transform `atdata` from a local/centralized dataset library into a **distributed dataset federation** built on AT Protocol. Datasets, schemas, and transformations become discoverable, versioned records on the ATProto network, enabling:
|
|
6
|
+
|
|
7
|
+
- **Decentralized dataset publishing**: Anyone can publish datasets without centralized infrastructure
|
|
8
|
+
- **Schema sharing & reuse**: Sample type definitions become reusable records with automatic code generation
|
|
9
|
+
- **Discoverable transformations**: Lens transformations are published as bidirectional mappings between schemas
|
|
10
|
+
- **Interoperability**: Different tools and languages can consume the same datasets using generated code
|
|
11
|
+
- **Versioning & provenance**: Immutable records provide audit trails for dataset evolution
|
|
12
|
+
|
|
13
|
+
## High-Level Architecture
|
|
14
|
+
|
|
15
|
+
```
|
|
16
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
17
|
+
│ AT Protocol Network │
|
|
18
|
+
│ ┌──────────────────┐ ┌──────────────────┐ ┌───────────────┐ │
|
|
19
|
+
│ │ Schema Records │ │ Dataset Records │ │ Lens Records │ │
|
|
20
|
+
│ │ (Lexicon) │ │ (Lexicon) │ │ (Lexicon) │ │
|
|
21
|
+
│ └──────────────────┘ └──────────────────┘ └───────────────┘ │
|
|
22
|
+
│ ▲ ▲ ▲ │
|
|
23
|
+
│ │ │ │ │
|
|
24
|
+
└─────────┼──────────────────────┼─────────────────────┼──────────┘
|
|
25
|
+
│ │ │
|
|
26
|
+
│ publish/query │ │
|
|
27
|
+
│ │ │
|
|
28
|
+
┌─────┴──────────────────────┴─────────────────────┴─────┐
|
|
29
|
+
│ Python Client Library (atdata) │
|
|
30
|
+
│ │
|
|
31
|
+
│ ┌────────────┐ ┌────────────┐ ┌──────────────────┐ │
|
|
32
|
+
│ │ ATProto │ │ Schema │ │ Dataset │ │
|
|
33
|
+
│ │ Auth │ │ Publisher │ │ Loader │ │
|
|
34
|
+
│ └────────────┘ └────────────┘ └──────────────────┘ │
|
|
35
|
+
│ │
|
|
36
|
+
│ Existing: │
|
|
37
|
+
│ - PackableSample, Dataset, Lens │
|
|
38
|
+
│ - WebDataset integration │
|
|
39
|
+
└──────────────────────────────────────────────────────────┘
|
|
40
|
+
│
|
|
41
|
+
│ queries (optional)
|
|
42
|
+
▼
|
|
43
|
+
┌─────────────────────┐
|
|
44
|
+
│ AppView Service │
|
|
45
|
+
│ (Index Aggregator) │
|
|
46
|
+
│ │
|
|
47
|
+
│ - Fast search │
|
|
48
|
+
│ - Schema browser │
|
|
49
|
+
│ - Metadata cache │
|
|
50
|
+
└─────────────────────┘
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Core Concepts
|
|
54
|
+
|
|
55
|
+
### 1. Schema Records (PackableSample definitions)
|
|
56
|
+
|
|
57
|
+
Published ATProto records containing:
|
|
58
|
+
- Field names and types (with special handling for NDArray)
|
|
59
|
+
- Serialization metadata
|
|
60
|
+
- Version information
|
|
61
|
+
- Author/provenance
|
|
62
|
+
|
|
63
|
+
These become the **source of truth** for sample types across the network.
|
|
64
|
+
|
|
65
|
+
### 2. Dataset Index Records
|
|
66
|
+
|
|
67
|
+
Published ATProto records containing:
|
|
68
|
+
- Reference to schema record (the sample type)
|
|
69
|
+
- WebDataset URL(s) using brace notation (e.g., `s3://bucket/data-{000000..000099}.tar`)
|
|
70
|
+
- Msgpack-encoded metadata (arbitrary key-value pairs)
|
|
71
|
+
- Dataset description, tags, author
|
|
72
|
+
|
|
73
|
+
Users discover datasets by querying these records, then load them using the existing `Dataset` class.
|
|
74
|
+
|
|
75
|
+
### 3. Lens Transformation Records
|
|
76
|
+
|
|
77
|
+
Published ATProto records containing:
|
|
78
|
+
- Source schema reference
|
|
79
|
+
- Target schema reference
|
|
80
|
+
- Transformation code (or reference to code)
|
|
81
|
+
- Bidirectional mapping metadata (getter/putter)
|
|
82
|
+
|
|
83
|
+
Enables building a **network of transformations** between schemas.
|
|
84
|
+
|
|
85
|
+
## Integration with Existing `atdata`
|
|
86
|
+
|
|
87
|
+
The ATProto integration is **additive**:
|
|
88
|
+
|
|
89
|
+
1. **Existing functionality unchanged**: `PackableSample`, `Dataset`, `Lens` continue to work as-is
|
|
90
|
+
2. **New methods added**:
|
|
91
|
+
- `sample_type.publish_to_atproto(client)` - Publish schema
|
|
92
|
+
- `dataset.publish_to_atproto(client)` - Publish index record
|
|
93
|
+
- `Dataset.from_atproto(client, record_uri)` - Load from published record
|
|
94
|
+
- `lens.publish_to_atproto(client)` - Publish transformation
|
|
95
|
+
3. **Optional AppView**: Query service for faster discovery (like Bluesky's AppView)
|
|
96
|
+
|
|
97
|
+
## Development Phases
|
|
98
|
+
|
|
99
|
+
### Phase 1: Lexicon Design (Issues #17, #22-25)
|
|
100
|
+
- Design three Lexicon schemas (sample, dataset, lens)
|
|
101
|
+
- Evaluate schema representation formats
|
|
102
|
+
- Create reference documentation
|
|
103
|
+
|
|
104
|
+
**Deliverable**: Lexicon JSON definitions ready for use
|
|
105
|
+
|
|
106
|
+
### Phase 2: Python Client Library (Issues #18, #26-31)
|
|
107
|
+
- ATProto SDK integration (auth, session management)
|
|
108
|
+
- Publishing implementations for all three record types
|
|
109
|
+
- Query/discovery functionality
|
|
110
|
+
- Extend `Dataset` class with `from_atproto()` method
|
|
111
|
+
|
|
112
|
+
**Deliverable**: Working Python library that can publish/load from ATProto
|
|
113
|
+
|
|
114
|
+
### Phase 3: AppView Service (Issues #19, #32-35)
|
|
115
|
+
- Optional aggregation service
|
|
116
|
+
- Firehose ingestion
|
|
117
|
+
- Search/query API
|
|
118
|
+
- Performance optimization
|
|
119
|
+
|
|
120
|
+
**Deliverable**: Hosted service for fast dataset discovery
|
|
121
|
+
|
|
122
|
+
### Phase 4: Code Generation (Issues #20, #36-39)
|
|
123
|
+
- Template system for Python codegen
|
|
124
|
+
- CLI tool for generating classes from schema records
|
|
125
|
+
- Type validation and compatibility checking
|
|
126
|
+
|
|
127
|
+
**Deliverable**: Tool to generate Python code from published schemas
|
|
128
|
+
|
|
129
|
+
### Phase 5: Integration & Testing (Issues #21, #40-43)
|
|
130
|
+
- End-to-end workflows and examples
|
|
131
|
+
- Integration test suite
|
|
132
|
+
- Documentation and guides
|
|
133
|
+
- Performance benchmarks
|
|
134
|
+
|
|
135
|
+
**Deliverable**: Production-ready feature with complete documentation
|
|
136
|
+
|
|
137
|
+
## Open Design Questions
|
|
138
|
+
|
|
139
|
+
### Schema Representation Format
|
|
140
|
+
**Question**: How should we represent `PackableSample` schemas in Lexicon records?
|
|
141
|
+
|
|
142
|
+
**Options**:
|
|
143
|
+
1. **JSON Schema** - Standard, well-supported, validation tools exist
|
|
144
|
+
2. **Protobuf** - Compact, has codegen ecosystem, good for cross-language
|
|
145
|
+
3. **Custom format** - Tailored to `PackableSample` specifics (NDArray handling, msgpack serialization)
|
|
146
|
+
|
|
147
|
+
**Considerations**:
|
|
148
|
+
- Need to represent `NDArray` types specially (dtype, shape constraints?)
|
|
149
|
+
- Should support future extensions (constraints, validation rules)
|
|
150
|
+
- Must be human-readable and machine-processable
|
|
151
|
+
- Codegen tooling needs to parse it
|
|
152
|
+
|
|
153
|
+
**Decision needed**: See Issue #25
|
|
154
|
+
|
|
155
|
+
### WebDataset Storage Location
|
|
156
|
+
**Question**: Should actual WebDataset `.tar` files be stored on ATProto, or just references to external storage?
|
|
157
|
+
|
|
158
|
+
**Current approach**: References only (S3, HTTP URLs, etc.)
|
|
159
|
+
- Pros: No storage limits, existing infrastructure works
|
|
160
|
+
- Cons: Centralization risk — published records become dangling references if the externally hosted datasets disappear
|
|
161
|
+
|
|
162
|
+
**Future consideration**: ATProto blob storage for datasets
|
|
163
|
+
- Pros: Truly decentralized
|
|
164
|
+
- Cons: Storage costs, size limits, performance
|
|
165
|
+
|
|
166
|
+
### Lens Code Storage
|
|
167
|
+
**Question**: How should Lens transformation code be stored?
|
|
168
|
+
|
|
169
|
+
**Options**:
|
|
170
|
+
1. Python code as string in record (security concerns!)
|
|
171
|
+
2. Reference to GitHub/GitLab repo + commit hash
|
|
172
|
+
3. Bytecode or AST representation
|
|
173
|
+
4. Only store metadata, expect manual implementation
|
|
174
|
+
|
|
175
|
+
**Decision needed**: See Phase 1 planning (`decisions/02_lens_code_storage.md`)
|
|
176
|
+
|
|
177
|
+
## Success Metrics
|
|
178
|
+
|
|
179
|
+
- **Functionality**: Can publish schema, publish dataset, discover, load end-to-end
|
|
180
|
+
- **Performance**: Dataset discovery <100ms (with AppView), load time unchanged
|
|
181
|
+
- **Adoption**: Easy enough that external users publish datasets
|
|
182
|
+
- **Interop**: Schema records usable from other languages (future)
|
|
183
|
+
|
|
184
|
+
## Timeline & Dependencies
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
Phase 1 (Lexicon Design)
|
|
188
|
+
↓
|
|
189
|
+
Phase 2 (Python Client) ← CRITICAL PATH
|
|
190
|
+
↓
|
|
191
|
+
├── Phase 3 (AppView) [parallel, optional]
|
|
192
|
+
└── Phase 4 (Codegen) [parallel]
|
|
193
|
+
↓
|
|
194
|
+
Phase 5 (Integration & Testing)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Phase 2 is the critical path. Phases 3 & 4 can proceed in parallel once Phase 2 foundations are in place.
|
|
198
|
+
|
|
199
|
+
## Related Documents
|
|
200
|
+
|
|
201
|
+
- `02_lexicon_design.md` - Detailed Lexicon schema specifications
|
|
202
|
+
- `03_python_client.md` - Python library architecture and API design
|
|
203
|
+
- `04_appview.md` - AppView service architecture
|
|
204
|
+
- `05_codegen.md` - Code generation approach and templates
|