atdata 0.1.3b3__tar.gz → 0.2.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. atdata-0.2.0a1/.github/workflows/uv-test.yml +58 -0
  2. {atdata-0.1.3b3 → atdata-0.2.0a1}/.gitignore +3 -0
  3. atdata-0.2.0a1/.planning/01_overview.md +204 -0
  4. atdata-0.2.0a1/.planning/02_lexicon_design.md +576 -0
  5. atdata-0.2.0a1/.planning/03_python_client.md +690 -0
  6. atdata-0.2.0a1/.planning/04_appview.md +578 -0
  7. atdata-0.2.0a1/.planning/05_codegen.md +799 -0
  8. atdata-0.2.0a1/.planning/README.md +195 -0
  9. atdata-0.2.0a1/.planning/atproto_integration.md +19 -0
  10. atdata-0.2.0a1/.planning/decisions/01_schema_representation_format.md +239 -0
  11. atdata-0.2.0a1/.planning/decisions/02_lens_code_storage.md +352 -0
  12. atdata-0.2.0a1/.planning/decisions/03_webdataset_storage.md +366 -0
  13. atdata-0.2.0a1/.planning/decisions/04_schema_evolution.md +509 -0
  14. atdata-0.2.0a1/.planning/decisions/05_lexicon_namespace.md +388 -0
  15. atdata-0.2.0a1/.planning/decisions/06_lexicon_validation.md +459 -0
  16. atdata-0.2.0a1/.planning/decisions/README.md +158 -0
  17. atdata-0.2.0a1/.planning/decisions/assessment.md +313 -0
  18. atdata-0.2.0a1/.planning/decisions/record_lexicon_assessment.md +468 -0
  19. atdata-0.2.0a1/.planning/decisions/sampleSchema_design_questions.md +166 -0
  20. atdata-0.2.0a1/.planning/examples/code/ndarray_roundtrip.py +252 -0
  21. atdata-0.2.0a1/.planning/examples/code/validate_ndarray_shim.py +316 -0
  22. atdata-0.2.0a1/.planning/examples/dataset_blob_storage.json +39 -0
  23. atdata-0.2.0a1/.planning/examples/dataset_external_storage.json +26 -0
  24. atdata-0.2.0a1/.planning/examples/lens_example.json +27 -0
  25. atdata-0.2.0a1/.planning/examples/sampleSchema_example.json +53 -0
  26. atdata-0.2.0a1/.planning/lexicons/README.md +259 -0
  27. atdata-0.2.0a1/.planning/lexicons/README_ARRAY_FORMATS.md +178 -0
  28. atdata-0.2.0a1/.planning/lexicons/README_SCHEMA_TYPES.md +150 -0
  29. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.arrayFormat.json +16 -0
  30. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.getLatestSchema.json +78 -0
  31. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.lens.json +99 -0
  32. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.record.json +96 -0
  33. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.sampleSchema.json +107 -0
  34. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.schemaType.json +16 -0
  35. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.storageBlobs.json +24 -0
  36. atdata-0.2.0a1/.planning/lexicons/ac.foundation.dataset.storageExternal.json +25 -0
  37. atdata-0.2.0a1/.planning/lexicons/ndarray_shim.json +16 -0
  38. atdata-0.2.0a1/.planning/ndarray_shim_spec.md +386 -0
  39. atdata-0.2.0a1/.reference/atproto_lexicon_guide.md +336 -0
  40. atdata-0.2.0a1/.reference/atproto_lexicon_spec.md +230 -0
  41. atdata-0.2.0a1/.reference/python_atproto_sdk.md +347 -0
  42. atdata-0.2.0a1/.vscode/settings.json +25 -0
  43. atdata-0.2.0a1/CHANGELOG.md +17 -0
  44. atdata-0.2.0a1/CLAUDE.md +190 -0
  45. atdata-0.2.0a1/PKG-INFO +181 -0
  46. atdata-0.2.0a1/README.md +156 -0
  47. atdata-0.2.0a1/examples/atmosphere_demo.py +368 -0
  48. atdata-0.2.0a1/prototyping/.credentials/.gitignore +1 -0
  49. atdata-0.2.0a1/prototyping/data/.gitignore +1 -0
  50. {atdata-0.1.3b3 → atdata-0.2.0a1}/pyproject.toml +15 -1
  51. atdata-0.2.0a1/src/atdata/__init__.py +58 -0
  52. atdata-0.2.0a1/src/atdata/_helpers.py +58 -0
  53. atdata-0.2.0a1/src/atdata/atmosphere/__init__.py +61 -0
  54. atdata-0.2.0a1/src/atdata/atmosphere/_types.py +329 -0
  55. atdata-0.2.0a1/src/atdata/atmosphere/client.py +393 -0
  56. atdata-0.2.0a1/src/atdata/atmosphere/lens.py +280 -0
  57. atdata-0.2.0a1/src/atdata/atmosphere/records.py +342 -0
  58. atdata-0.2.0a1/src/atdata/atmosphere/schema.py +296 -0
  59. atdata-0.2.0a1/src/atdata/dataset.py +730 -0
  60. atdata-0.2.0a1/src/atdata/lens.py +300 -0
  61. atdata-0.2.0a1/src/atdata/local.py +492 -0
  62. atdata-0.2.0a1/tests/conftest.py +1 -0
  63. atdata-0.2.0a1/tests/test_atmosphere.py +1363 -0
  64. {atdata-0.1.3b3 → atdata-0.2.0a1}/tests/test_dataset.py +213 -35
  65. atdata-0.2.0a1/tests/test_helpers.py +94 -0
  66. {atdata-0.1.3b3 → atdata-0.2.0a1}/tests/test_lens.py +81 -7
  67. atdata-0.2.0a1/tests/test_local.py +1032 -0
  68. atdata-0.1.3b3/.github/workflows/uv-test.yml +0 -40
  69. atdata-0.1.3b3/PKG-INFO +0 -18
  70. atdata-0.1.3b3/README.md +0 -2
  71. atdata-0.1.3b3/src/atdata/__init__.py +0 -20
  72. atdata-0.1.3b3/src/atdata/_helpers.py +0 -22
  73. atdata-0.1.3b3/src/atdata/dataset.py +0 -597
  74. atdata-0.1.3b3/src/atdata/lens.py +0 -200
  75. {atdata-0.1.3b3 → atdata-0.2.0a1}/.github/workflows/uv-publish-pypi.yml +0 -0
  76. {atdata-0.1.3b3 → atdata-0.2.0a1}/.python-version +0 -0
  77. {atdata-0.1.3b3 → atdata-0.2.0a1}/LICENSE +0 -0
@@ -0,0 +1,58 @@
1
+ #
2
+
3
+ name: Run tests with `uv`
4
+
5
+ on:
6
+ push:
7
+ branches:
8
+ - main
9
+ - release/*
10
+ pull_request:
11
+ branches:
12
+ - main
13
+
14
+ jobs:
15
+ uv-test:
16
+ name: Run tests
17
+ runs-on: ubuntu-latest
18
+ environment:
19
+ name: test
20
+ strategy:
21
+ matrix:
22
+ python-version: [3.12, 3.13, 3.14]
23
+ redis-version: [6, 7]
24
+
25
+ steps:
26
+ - uses: actions/checkout@v5
27
+
28
+ - name: "Set up Python"
29
+ uses: actions/setup-python@v5
30
+ with:
31
+ python-version: ${{ matrix.python-version }}
32
+ # python-version-file: "pyproject.toml"
33
+
34
+ - name: Install uv
35
+ uses: astral-sh/setup-uv@v6
36
+
37
+ - name: Install the project
38
+ run: uv sync --all-extras --dev
39
+ # TODO: Would `--locked` be better, so the author controls the resolved dependency versions?
40
+ # run: uv sync --locked --all-extras --dev
41
+
42
+ - name: Start Redis
43
+ uses: supercharge/redis-github-action@1.8.1
44
+ with:
45
+ redis-version: ${{ matrix.redis-version }}
46
+
47
+ - name: Run tests with coverage
48
+ run: uv run pytest --cov=atdata --cov-report=xml --cov-report=term
49
+
50
+ - name: Upload coverage to Codecov
51
+ uses: codecov/codecov-action@v5
52
+ with:
53
+ # file: ./coverage.xml # Claude hallucination -- fascinating!
54
+ fail_ci_if_error: false
55
+ token: ${{ secrets.CODECOV_TOKEN }}
56
+
57
+
58
+ #
@@ -6,6 +6,9 @@
6
6
  **/*.env
7
7
  # Don't commit `uv` lockfiles
8
8
  **/uv.lock
9
+ # Development tooling (keep local, not in upstream)
10
+ .chainlink/
11
+ .claude/
9
12
 
10
13
  ##
11
14
 
@@ -0,0 +1,204 @@
1
+ # ATProto Integration - Overview
2
+
3
+ ## Vision
4
+
5
+ Transform `atdata` from a local/centralized dataset library into a **distributed dataset federation** built on AT Protocol. Datasets, schemas, and transformations become discoverable, versioned records on the ATProto network, enabling:
6
+
7
+ - **Decentralized dataset publishing**: Anyone can publish datasets without centralized infrastructure
8
+ - **Schema sharing & reuse**: Sample type definitions become reusable records with automatic code generation
9
+ - **Discoverable transformations**: Lens transformations are published as bidirectional mappings between schemas
10
+ - **Interoperability**: Different tools and languages can consume the same datasets using generated code
11
+ - **Versioning & provenance**: Immutable records provide audit trails for dataset evolution
12
+
13
+ ## High-Level Architecture
14
+
15
+ ```
16
+ ┌─────────────────────────────────────────────────────────────────┐
17
+ │ AT Protocol Network │
18
+ │ ┌──────────────────┐ ┌──────────────────┐ ┌───────────────┐ │
19
+ │ │ Schema Records │ │ Dataset Records │ │ Lens Records │ │
20
+ │ │ (Lexicon) │ │ (Lexicon) │ │ (Lexicon) │ │
21
+ │ └──────────────────┘ └──────────────────┘ └───────────────┘ │
22
+ │ ▲ ▲ ▲ │
23
+ │ │ │ │ │
24
+ └─────────┼──────────────────────┼─────────────────────┼──────────┘
25
+ │ │ │
26
+ │ publish/query │ │
27
+ │ │ │
28
+ ┌─────┴──────────────────────┴─────────────────────┴─────┐
29
+ │ Python Client Library (atdata) │
30
+ │ │
31
+ │ ┌────────────┐ ┌────────────┐ ┌──────────────────┐ │
32
+ │ │ ATProto │ │ Schema │ │ Dataset │ │
33
+ │ │ Auth │ │ Publisher │ │ Loader │ │
34
+ │ └────────────┘ └────────────┘ └──────────────────┘ │
35
+ │ │
36
+ │ Existing: │
37
+ │ - PackableSample, Dataset, Lens │
38
+ │ - WebDataset integration │
39
+ └──────────────────────────────────────────────────────────┘
40
+
41
+ │ queries (optional)
42
+
43
+ ┌─────────────────────┐
44
+ │ AppView Service │
45
+ │ (Index Aggregator) │
46
+ │ │
47
+ │ - Fast search │
48
+ │ - Schema browser │
49
+ │ - Metadata cache │
50
+ └─────────────────────┘
51
+ ```
52
+
53
+ ## Core Concepts
54
+
55
+ ### 1. Schema Records (PackableSample definitions)
56
+
57
+ Published ATProto records containing:
58
+ - Field names and types (with special handling for NDArray)
59
+ - Serialization metadata
60
+ - Version information
61
+ - Author/provenance
62
+
63
+ These become the **source of truth** for sample types across the network.
64
+
65
+ ### 2. Dataset Index Records
66
+
67
+ Published ATProto records containing:
68
+ - Reference to schema record (the sample type)
69
+ - WebDataset URL(s) using brace notation (e.g., `s3://bucket/data-{000000..000099}.tar`)
70
+ - Msgpack-encoded metadata (arbitrary key-value pairs)
71
+ - Dataset description, tags, author
72
+
73
+ Users discover datasets by querying these records, then load them using the existing `Dataset` class.
74
+
75
+ ### 3. Lens Transformation Records
76
+
77
+ Published ATProto records containing:
78
+ - Source schema reference
79
+ - Target schema reference
80
+ - Transformation code (or reference to code)
81
+ - Bidirectional mapping metadata (getter/putter)
82
+
83
+ Enables building a **network of transformations** between schemas.
84
+
85
+ ## Integration with Existing `atdata`
86
+
87
+ The ATProto integration is **additive**:
88
+
89
+ 1. **Existing functionality unchanged**: `PackableSample`, `Dataset`, `Lens` continue to work as-is
90
+ 2. **New methods added**:
91
+ - `sample_type.publish_to_atproto(client)` - Publish schema
92
+ - `dataset.publish_to_atproto(client)` - Publish index record
93
+ - `Dataset.from_atproto(client, record_uri)` - Load from published record
94
+ - `lens.publish_to_atproto(client)` - Publish transformation
95
+ 3. **Optional AppView**: Query service for faster discovery (like Bluesky's AppView)
96
+
97
+ ## Development Phases
98
+
99
+ ### Phase 1: Lexicon Design (Issues #17, #22-25)
100
+ - Design three Lexicon schemas (sample, dataset, lens)
101
+ - Evaluate schema representation formats
102
+ - Create reference documentation
103
+
104
+ **Deliverable**: Lexicon JSON definitions ready for use
105
+
106
+ ### Phase 2: Python Client Library (Issues #18, #26-31)
107
+ - ATProto SDK integration (auth, session management)
108
+ - Publishing implementations for all three record types
109
+ - Query/discovery functionality
110
+ - Extend `Dataset` class with `from_atproto()` method
111
+
112
+ **Deliverable**: Working Python library that can publish/load from ATProto
113
+
114
+ ### Phase 3: AppView Service (Issues #19, #32-35)
115
+ - Optional aggregation service
116
+ - Firehose ingestion
117
+ - Search/query API
118
+ - Performance optimization
119
+
120
+ **Deliverable**: Hosted service for fast dataset discovery
121
+
122
+ ### Phase 4: Code Generation (Issues #20, #36-39)
123
+ - Template system for Python codegen
124
+ - CLI tool for generating classes from schema records
125
+ - Type validation and compatibility checking
126
+
127
+ **Deliverable**: Tool to generate Python code from published schemas
128
+
129
+ ### Phase 5: Integration & Testing (Issues #21, #40-43)
130
+ - End-to-end workflows and examples
131
+ - Integration test suite
132
+ - Documentation and guides
133
+ - Performance benchmarks
134
+
135
+ **Deliverable**: Production-ready feature with complete documentation
136
+
137
+ ## Open Design Questions
138
+
139
+ ### Schema Representation Format
140
+ **Question**: How should we represent `PackableSample` schemas in Lexicon records?
141
+
142
+ **Options**:
143
+ 1. **JSON Schema** - Standard, well-supported, validation tools exist
144
+ 2. **Protobuf** - Compact, has codegen ecosystem, good for cross-language
145
+ 3. **Custom format** - Tailored to `PackableSample` specifics (NDArray handling, msgpack serialization)
146
+
147
+ **Considerations**:
148
+ - Need to represent `NDArray` types specially (dtype, shape constraints?)
149
+ - Should support future extensions (constraints, validation rules)
150
+ - Must be human-readable and machine-processable
151
+ - Codegen tooling needs to parse it
152
+
153
+ **Decision needed**: See Issue #25
154
+
155
+ ### WebDataset Storage Location
156
+ **Question**: Should actual WebDataset `.tar` files be stored on ATProto, or just references to external storage?
157
+
158
+ **Current approach**: References only (S3, HTTP URLs, etc.)
159
+ - Pros: No storage limits, existing infrastructure works
160
+ - Cons: Centralization risk — a dataset becomes unavailable if its external host removes or relocates the files
161
+
162
+ **Future consideration**: ATProto blob storage for datasets
163
+ - Pros: Truly decentralized
164
+ - Cons: Storage costs, size limits, performance
165
+
166
+ ### Lens Code Storage
167
+ **Question**: How should Lens transformation code be stored?
168
+
169
+ **Options**:
170
+ 1. Python code as a string in the record (security concerns!)
171
+ 2. Reference to GitHub/GitLab repo + commit hash
172
+ 3. Bytecode or AST representation
173
+ 4. Only store metadata, expect manual implementation
174
+
175
+ **Decision needed**: See Phase 1 planning
176
+
177
+ ## Success Metrics
178
+
179
+ - **Functionality**: Can publish schema, publish dataset, discover, load end-to-end
180
+ - **Performance**: Dataset discovery <100ms (with AppView), load time unchanged
181
+ - **Adoption**: Easy enough that external users publish datasets
182
+ - **Interop**: Schema records usable from other languages (future)
183
+
184
+ ## Timeline & Dependencies
185
+
186
+ ```
187
+ Phase 1 (Lexicon Design)
188
+
189
+ Phase 2 (Python Client) ← CRITICAL PATH
190
+
191
+ ├── Phase 3 (AppView) [parallel, optional]
192
+ └── Phase 4 (Codegen) [parallel]
193
+
194
+ Phase 5 (Integration & Testing)
195
+ ```
196
+
197
+ Phase 2 is the critical path. Phases 3 & 4 can proceed in parallel once Phase 2 foundations are in place.
198
+
199
+ ## Related Documents
200
+
201
+ - `02_lexicon_design.md` - Detailed Lexicon schema specifications
202
+ - `03_python_client.md` - Python library architecture and API design
203
+ - `04_appview.md` - AppView service architecture
204
+ - `05_codegen.md` - Code generation approach and templates