pio-arch 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. pio_arch-0.1.0/.gitignore +51 -0
  2. pio_arch-0.1.0/.pre-commit-config.yaml +17 -0
  3. pio_arch-0.1.0/AGENT_GUIDE.shared.md +313 -0
  4. pio_arch-0.1.0/LEARNING_GUIDE.md +659 -0
  5. pio_arch-0.1.0/LICENSE +21 -0
  6. pio_arch-0.1.0/PKG-INFO +234 -0
  7. pio_arch-0.1.0/README.md +193 -0
  8. pio_arch-0.1.0/context.md +500 -0
  9. pio_arch-0.1.0/docs/Makefile +24 -0
  10. pio_arch-0.1.0/docs/api/embedder.rst +7 -0
  11. pio_arch-0.1.0/docs/conf.py +39 -0
  12. pio_arch-0.1.0/docs/index.rst +12 -0
  13. pio_arch-0.1.0/notebooks/full_pio_walkthrough.ipynb +687 -0
  14. pio_arch-0.1.0/notebooks/generate_pio_synthetic_data.ipynb +396 -0
  15. pio_arch-0.1.0/notebooks/optuna_sweep.ipynb +578 -0
  16. pio_arch-0.1.0/notebooks/pio_data_engineering.ipynb +521 -0
  17. pio_arch-0.1.0/notebooks/pio_encoder_design.md +141 -0
  18. pio_arch-0.1.0/notebooks/pio_prototype.ipynb +97 -0
  19. pio_arch-0.1.0/notebooks/pio_sentence_model_prototype.ipynb +487 -0
  20. pio_arch-0.1.0/notebooks/pio_synthetic_50k.parquet +0 -0
  21. pio_arch-0.1.0/notebooks/pio_text_embedding_plan.md +338 -0
  22. pio_arch-0.1.0/notebooks/sentence_only_pio.ipynb +588 -0
  23. pio_arch-0.1.0/notebooks/universal_feature_transformer_demo.ipynb +873 -0
  24. pio_arch-0.1.0/pio_arch/__init__.py +15 -0
  25. pio_arch-0.1.0/pio_arch/models/__init__.py +22 -0
  26. pio_arch-0.1.0/pio_arch/models/context_encoder.py +178 -0
  27. pio_arch-0.1.0/pio_arch/models/embedder.py +132 -0
  28. pio_arch-0.1.0/pio_arch/models/pio_attention.py +368 -0
  29. pio_arch-0.1.0/pio_arch/models/sentence_encoder.py +237 -0
  30. pio_arch-0.1.0/pio_arch/utils/__init__.py +0 -0
  31. pio_arch-0.1.0/pio_arch/utils/collate.py +109 -0
  32. pio_arch-0.1.0/pio_arch/utils/data.py +159 -0
  33. pio_arch-0.1.0/pio_arch/utils/rff.py +63 -0
  34. pio_arch-0.1.0/pio_arch/utils/sentence_pool.py +115 -0
  35. pio_arch-0.1.0/pio_arch/utils/text_dropout.py +62 -0
  36. pio_arch-0.1.0/pio_arch/utils/train.py +242 -0
  37. pio_arch-0.1.0/pyproject.toml +183 -0
  38. pio_arch-0.1.0/scripts/build_walkthrough_notebooks.py +1610 -0
  39. pio_arch-0.1.0/scripts/smoke_test_notebooks.py +494 -0
  40. pio_arch-0.1.0/tests/__init__.py +0 -0
  41. pio_arch-0.1.0/tests/conftest.py +42 -0
  42. pio_arch-0.1.0/tests/integration/__init__.py +0 -0
  43. pio_arch-0.1.0/tests/integration/test_embedder_integration.py +48 -0
  44. pio_arch-0.1.0/tests/integration/test_pio_attention_integration.py +110 -0
  45. pio_arch-0.1.0/tests/integration/test_uft_integration.py +47 -0
  46. pio_arch-0.1.0/tests/test_collate.py +116 -0
  47. pio_arch-0.1.0/tests/test_context_encoder.py +242 -0
  48. pio_arch-0.1.0/tests/test_data.py +174 -0
  49. pio_arch-0.1.0/tests/test_embedder.py +413 -0
  50. pio_arch-0.1.0/tests/test_hello.py +14 -0
  51. pio_arch-0.1.0/tests/test_pio_attention.py +422 -0
  52. pio_arch-0.1.0/tests/test_rff.py +91 -0
  53. pio_arch-0.1.0/tests/test_sentence_encoder.py +251 -0
  54. pio_arch-0.1.0/tests/test_sentence_pool.py +165 -0
  55. pio_arch-0.1.0/tests/test_text_dropout.py +87 -0
  56. pio_arch-0.1.0/tests/test_train.py +188 -0
@@ -0,0 +1,51 @@
1
+ # Python
2
+ .venv/
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .eggs/
11
+ *.whl
12
+ *.tar.gz
13
+
14
+ # Testing & coverage
15
+ .coverage
16
+ .coverage.*
17
+ htmlcov/
18
+ .pytest_cache/
19
+
20
+ # Type checking & linting
21
+ .mypy_cache/
22
+ .ruff_cache/
23
+
24
+ # uv
25
+ uv.lock
26
+
27
+ # PyTorch artifacts
28
+ *.pt
29
+ *.pth
30
+ *.ckpt
31
+ checkpoints/
32
+ runs/
33
+ outputs/
34
+
35
+ # Jupyter
36
+ .ipynb_checkpoints/
37
+
38
+ # Sphinx docs build output
39
+ docs/_build/
40
+
41
+ # IDE
42
+ .vscode/
43
+ .idea/
44
+ *.swp
45
+
46
+ # macOS
47
+ .DS_Store
48
+
49
+ # Secrets
50
+ .env
51
+ *.key
@@ -0,0 +1,17 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.15.11
4
+ hooks:
5
+ - id: ruff
6
+ args: [--fix]
7
+ - id: ruff-format
8
+
9
+ - repo: https://github.com/pre-commit/pre-commit-hooks
10
+ rev: v4.6.0
11
+ hooks:
12
+ - id: trailing-whitespace
13
+ - id: end-of-file-fixer
14
+ - id: check-yaml
15
+ - id: check-toml
16
+ - id: check-merge-conflict
17
+ - id: debug-statements
@@ -0,0 +1,313 @@
1
+ # pio-arch — Shared Agent Guide
2
+
3
+ This is the shared project guide for non-Claude coding agents. `AGENTS.md` and
4
+ `GEMINI.md` are symlinks to this file so Codex, Gemini CLI, and similar tools
5
+ read the same durable project guidance with no duplicated body text.
6
+
7
+ Keep `.claude/CLAUDE.md` as the Claude-specific guide for Claude Code features,
8
+ then mirror durable cross-agent rules here when they should also apply outside
9
+ Claude.
10
+
11
+ For deeper role-specific guidance, read:
12
+
13
+ - `.claude/CLAUDE.md` - primary project guide and architecture overview
14
+ - `context.md` - architecture reference material
15
+ - `.claude/agents/ml-engineer.md` - model implementation and PyTorch guidance
16
+ - `.claude/agents/qa-reviewer.md` - mathematical/model review checklist
17
+ - `.claude/agents/unit-tester.md` - pytest, coverage, and ruff workflow
18
+ - `.claude/agents/integration-tester.md` - end-to-end synthetic validation
19
+ - `.claude/agents/python-expert.md` - packaging, dependencies, and tooling
20
+ - `.claude/agents/data-scientist.md` - end-user notebooks and examples
21
+ - `.claude/skills/implement-model/SKILL.md` - Claude-specific model pipeline
22
+ - `.claude/uv-guide.md` - uv reference for conda users
23
+
24
+ ## Project Summary
25
+
26
+ This repo contains PyTorch implementations of permutation-invariant neural
27
+ network architectures for tabular, mixed-row, and text/set-structured data.
28
+ Models consume variable-length unordered rows and produce fixed-size
29
+ representations or per-task predictions.
30
+
31
+ Core constraint: `f(X) = f(pi(X))` for any permutation `pi`.
32
+
33
+ ## Architectures
34
+
35
+ Implemented model files:
36
+
37
+ - `pio_arch/models/embedder.py` - `UniversalFeatureTransformer`, type-aware scalar and
38
+ discrete feature embedding.
39
+ - `pio_arch/models/context_encoder.py` - `ContextRowEncoder`, mixed numeric/discrete/text
40
+ row encoder with feature-type embeddings.
41
+ - `pio_arch/models/sentence_encoder.py` - sentence-set baseline, task heads, `TaskSpec`,
42
+ and masked multi-task loss utilities.
43
+ - `pio_arch/models/pio_attention.py` - latent-bottleneck attention model for context
44
+ sets, with optional context self-attention, latent self-attention, task-query
45
+ pooling, and per-task or shared heads. Structurally Perceiver IO
46
+ (Jaegle et al., 2022).
47
+
48
+ Utilities under `pio_arch/utils/`:
49
+
50
+ - `pio_arch/utils/rff.py` - Random Fourier Feature helpers shared by embedders.
51
+ - `pio_arch/utils/collate.py` - `sentence_set_collate` (tensor-only).
52
+ - `pio_arch/utils/data.py` - `SentenceSetDataset` and `collate_sentence_set_batch`
53
+ for trainer-ready dict batches.
54
+ - `pio_arch/utils/train.py` - `Trainer`, `train_one_epoch`, `evaluate`,
55
+ `make_masked_multitask_loss_fn`, `move_batch`.
56
+ - `pio_arch/utils/text_dropout.py` - `TextRowDropout` (training-time row dropout).
57
+ - `pio_arch/utils/sentence_pool.py` - `pool_sentence_embeddings`, a DeepSets-style
58
+ aggregator that collapses ``[B, N, text_dim]`` to ``[B, text_dim]``.
59
+
60
+ Historical architecture notes and unimplemented reference designs live in
61
+ `context.md`; do not treat those snippets as current source files.
62
+
63
+ ## Input Embedding
64
+
65
+ `pio_arch/models/embedder.py` contains `UniversalFeatureTransformer`, the shared raw
66
+ feature embedding layer.
67
+
68
+ ```python
69
+ model = UniversalFeatureTransformer(
70
+ feature_vocab_size=500,
71
+ dim=64, # must be even
72
+ rff_sigma=1.0,
73
+ )
74
+
75
+ # forward(feature_ids, feature_values, is_numerical=None) -> [B, n, dim]
76
+ ```
77
+
78
+ Arguments:
79
+
80
+ - `feature_ids: [B, n]` int tensor; used for categorical/sentinel/missing values
81
+ - `feature_values: [B, n]` float tensor; used for numerical scalar values
82
+ - `is_numerical: [B, n]` bool tensor or `None`; `None` means all numerical
83
+
84
+ Conventions:
85
+
86
+ - Numerical positions use fixed Random Fourier Features.
87
+ - Categorical, sentinel, and missing positions use `nn.Embedding`.
88
+ - ID `0` is reserved for missing/padding and returns a zero vector.
89
+ - All categorical/sentinel/missing indexing is the caller's responsibility.
90
+
91
+ ## Mixed Context Rows
92
+
93
+ `pio_arch/models/context_encoder.py` contains `ContextRowEncoder`, the current mixed-row
94
+ input path for PIO-style models.
95
+
96
+ ```python
97
+ model = ContextRowEncoder(
98
+ discrete_vocab_size=1000,
99
+ feature_type_vocab_size=64,
100
+ value_dim=32, # must be even
101
+ feature_type_dim=32,
102
+ text_input_dim=384, # optional; required only for text rows
103
+ )
104
+
105
+ # forward(value_ids, scalar_values, row_kinds, feature_type_ids, padding_mask,
106
+ # text_values=None) -> [B, N, value_dim + feature_type_dim]
107
+ ```
108
+
109
+ Arguments:
110
+
111
+ - `value_ids: [B, N]` long tensor for discrete rows.
112
+ - `scalar_values: [B, N]` float tensor for numeric rows.
113
+ - `row_kinds: [B, N]` long tensor using `ContextRowKind.NUMERIC`,
114
+ `ContextRowKind.DISCRETE`, or `ContextRowKind.TEXT`.
115
+ - `feature_type_ids: [B, N]` long tensor identifying the row's feature type.
116
+ - `padding_mask: [B, N]` bool tensor with `True` at padded rows.
117
+ - `text_values: [B, N, text_input_dim]` optional precomputed text embeddings.
118
+
119
+ Conventions:
120
+
121
+ - Numeric rows use fixed RFFs.
122
+ - Discrete rows use `nn.Embedding` with ID `0` reserved for padding/unknown.
123
+ - Text rows require `text_input_dim` at construction and `text_values` at
124
+ forward time.
125
+ - Padding rows are zeroed before returning.
126
+ - The row encoder is permutation-equivariant; downstream pooling/attention is
127
+ responsible for producing invariant predictions.
128
+
129
+ ## Sentence and PIO Models
130
+
131
+ `pio_arch/models/sentence_encoder.py` provides a sentence-only baseline:
132
+
133
+ - `SentenceSetEncoder(sentence_embeddings, padding_mask) -> [B, model_dim]`
134
+ projects externally generated sentence embeddings, masks padded rows, and
135
+ pools with a masked mean.
136
+ - `SentenceSetMultiTaskModel` wraps the encoder with task-specific heads.
137
+ - `TaskSpec(name, kind, weight=1.0)` supports `kind="binary"` and
138
+ `kind="regression"`.
139
+ - `masked_multitask_loss(predictions, targets, target_mask, tasks)` computes
140
+ weighted BCE-with-logits or MSE over observed targets only.
141
+
142
+ `pio_arch/models/pio_attention.py` contains the current attention architecture:
143
+
144
+ ```python
145
+ model = PIOAttentionModel(
146
+ input_dim=context_dim,
147
+ tasks=[TaskSpec("target", "binary")],
148
+ model_dim=64, # context-stage dim
149
+ num_latents=8,
150
+ latent_dim=None, # defaults to model_dim
151
+ task_dim=None, # defaults to latent_dim
152
+ num_context_self_attn_blocks=0,
153
+ num_latent_self_attn_blocks=1,
154
+ num_heads=4,
155
+ head_mode="per_task", # or "shared"
156
+ )
157
+
158
+ # forward(context, padding_mask) -> dict[str, [B, 1]]
159
+ # encode_latents(context, padding_mask) -> [B, num_latents, latent_dim]
160
+ ```
161
+
162
+ Each of ``model_dim``, ``latent_dim``, and ``task_dim`` must be divisible by
163
+ ``num_heads``. Set them independently to shrink or grow the latent bottleneck
164
+ relative to the context and task queries.
165
+
166
+ PIO processing order:
167
+
168
+ 1. Project context rows to `model_dim`.
169
+ 2. Optionally apply context self-attention blocks.
170
+ 3. Cross-attend learnable latent queries to context rows.
171
+ 4. Optionally apply latent self-attention blocks.
172
+ 5. Cross-attend learnable task queries to latents.
173
+ 6. Predict one `[B, 1]` tensor per task.
174
+
175
+ The model handles all-padded samples with a safe attention mask so PyTorch
176
+ `MultiheadAttention` does not produce NaNs.
177
+
178
+ ## Development Rules
179
+
180
+ - Python 3.12+, PyTorch only for neural network code.
181
+ - Use the project venv: `.venv/bin/python`, `.venv/bin/pytest`,
182
+ `.venv/bin/ruff`.
183
+ - Each architecture lives in its own file under `pio_arch/models/`.
184
+ - Shared utilities live in `pio_arch/utils/`.
185
+ - Most set-model forwards use explicit padding masks where
186
+ `padding_mask: [B, N]` is bool with `True` at padded positions.
187
+ - `PIOAttentionModel.forward(context, padding_mask)` returns a dictionary from
188
+ task name to `[B, 1]` prediction tensor.
189
+ - `SentenceSetEncoder.forward(sentence_embeddings, padding_mask)` returns
190
+ `[B, model_dim]`.
191
+ - `ContextRowEncoder.forward()` and `UniversalFeatureTransformer.forward()` have
192
+ their own signatures; do not force them into the downstream model interface.
193
+ - Use `batch_first=True` in every `nn.MultiheadAttention`.
194
+ - Prefer pre-norm attention blocks: normalize query/key/value inputs before
195
+ attention, then use residual feed-forward updates.
196
+ - Add type hints to public methods.
197
+ - Do not add positional encodings to set models; that breaks permutation
198
+ invariance.
199
+ - Apply masks before aggregation so padded values cannot leak into outputs.
200
+ - In attention modules, pass padded rows through `key_padding_mask` and re-zero
201
+ masked context rows after updates where those rows remain in the set.
202
+ - Keep list-valued input semantics distinct: `None` means upstream field
203
+ missing, `[]` means present but empty, and non-empty lists mean observed
204
+ values. Do not use empty strings as missing markers.
205
+
206
+ ## Testing Expectations
207
+
208
+ Run tests through the venv:
209
+
210
+ ```bash
211
+ .venv/bin/pytest
212
+ .venv/bin/pytest tests/test_embedder.py -v
213
+ .venv/bin/pytest tests/test_context_encoder.py tests/test_pio_attention.py -v
214
+ ```
215
+
216
+ Implemented coverage currently includes:
217
+
218
+ - `tests/test_embedder.py` - `UniversalFeatureTransformer` shape, RFF,
219
+ categorical path, and permutation-equivariance checks.
220
+ - `tests/test_context_encoder.py` - row-kind paths, padding zeroing, validation,
221
+ and permutation-equivariance checks.
222
+ - `tests/test_sentence_encoder.py` - sentence-set pooling, multi-task heads, and
223
+ masked loss checks.
224
+ - `tests/test_pio_attention.py` - attention block shapes, PIO task outputs,
225
+ all-padded sample safety, permutation invariance, and masked-context
226
+ corruption checks.
227
+ - `tests/integration/` - UFT/embedder and PIO attention integration tests.
228
+
229
+ Each downstream set model should have:
230
+
231
+ - Shape test for its public output contract.
232
+ - Permutation-invariance test: shuffled rows and mask produce the same output.
233
+ - Masking test: corrupting padded positions does not affect outputs.
234
+ - All-padded sample test when attention is involved.
235
+
236
+ For substantial changes, also run:
237
+
238
+ ```bash
239
+ .venv/bin/ruff format --check pio_arch/ tests/
240
+ .venv/bin/ruff check pio_arch/ tests/
241
+ env PRE_COMMIT_HOME=/private/tmp/pinn_models_precommit_cache .venv/bin/pre-commit run --all-files
242
+ ```
243
+
244
+ ## Model Review Checklist
245
+
246
+ When reviewing or implementing model code, check:
247
+
248
+ - No positional encoding or order-dependent feature injection.
249
+ - Aggregation is order-agnostic: sum, mean, PMA seeds, CLS latent, or slot mean.
250
+ - Padded positions are excluded from attention via `key_padding_mask`.
251
+ - Masked positions are excluded before pooling/aggregation.
252
+ - Shapes are consistently `[B, N, dim]` internally.
253
+ - Invariant encoders return fixed-size `[B, dim]` tensors or dictionaries of
254
+ per-task `[B, 1]` predictions.
255
+ - Standard attention softmax is over keys.
256
+ - Residual connections and normalization are present in transformer-style blocks.
257
+ - Learnable latents or task queries are not initialized to all zeros.
258
+ - Masked multi-task losses normalize each task over observed targets only.
259
+
260
+ ## Dependencies
261
+
262
+ - Runtime dependencies are declared inline in `[project] dependencies` in
263
+ `pyproject.toml` and should stay minimal: `torch>=2.0` and `numpy>=1.24`.
264
+ The torch constraint is forked with an environment marker because PyTorch
265
+ dropped macOS x86_64 wheels at torch 2.3.0; Intel Mac users are capped at
266
+ `<2.3`, everyone else stays free to upgrade.
267
+ - Model-development extras live under `[project.optional-dependencies]` as
268
+ `model-dev`, including notebook/prototype tooling such as `polars`,
269
+ `pyarrow`, `sentence-transformers`, `transformers`, and `optuna`
270
+ (hyperparameter sweeps).
271
+ - Test, lint, docs, and dev tooling live in PEP 735 `[dependency-groups]`.
272
+ - See `.claude/agents/python-expert.md` and `.claude/uv-guide.md` before
273
+ changing packaging or dependency configuration.
274
+
275
+ ## Documentation
276
+
277
+ Sphinx docs live in `docs/`:
278
+
279
+ ```bash
280
+ .venv/bin/sphinx-build -b html docs docs/_build/html
281
+ ```
282
+
283
+ Example notebooks live in `notebooks/`.
284
+
285
+ PIO prototype notes live in:
286
+
287
+ - `PIO_HANDOFF.md` - current handoff and recommended next steps.
288
+ - `notebooks/pio_encoder_design.md` - implemented encoder/attention design.
289
+ - `notebooks/pio_sentence_model_prototype.ipynb` - sentence-only prototype.
290
+ - `notebooks/pio_text_embedding_plan.md` - text embedding and deployment notes.
291
+
292
+ ## Symlink / Redundancy Layout
293
+
294
+ Current layout:
295
+
296
+ - `AGENT_GUIDE.shared.md` - single maintained body of cross-agent guidance
297
+ - `AGENTS.md -> AGENT_GUIDE.shared.md` - Codex and other AGENTS-aware tools
298
+ - `GEMINI.md -> AGENT_GUIDE.shared.md` - Gemini CLI project context
299
+ - `.claude/CLAUDE.md` - Claude Code-specific guide and imports
300
+
301
+ Do not make `AGENTS.md` or `GEMINI.md` direct symlinks to `.claude/CLAUDE.md` unless every
302
+ agent that reads this repo understands Claude's `@file` import syntax and
303
+ Claude-specific conventions. A direct symlink would minimize files but would
304
+ couple Codex and Gemini CLI to Claude Code mechanics such as slash skills,
305
+ subagent frontmatter, and settings.
306
+
307
+ When updating guidance:
308
+
309
+ 1. Put tool-neutral project rules in this file.
310
+ 2. Put Claude-specific orchestration, skills, permissions, or subagent details
311
+ under `.claude/`.
312
+ 3. If a tool cannot follow symlinks, replace its symlink with a tiny wrapper that
313
+ imports this file using that tool's supported include syntax.