json2vec 0.4.9__tar.gz → 0.4.10__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (76)
  1. {json2vec-0.4.9/src/json2vec.egg-info → json2vec-0.4.10}/PKG-INFO +60 -68
  2. {json2vec-0.4.9 → json2vec-0.4.10}/README.md +58 -60
  3. {json2vec-0.4.9 → json2vec-0.4.10}/pyproject.toml +2 -8
  4. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/__init__.py +30 -10
  5. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/checkpoint.py +7 -7
  6. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/contracts.py +30 -13
  7. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/encoder.py +14 -14
  8. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/graph.py +12 -12
  9. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/mutations.py +19 -19
  10. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/node.py +9 -9
  11. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/root.py +44 -41
  12. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/runtime.py +52 -42
  13. json2vec-0.4.10/src/json2vec/data/__init__.py +33 -0
  14. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/datasets/base.py +29 -14
  15. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/datasets/custom.py +19 -27
  16. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/datasets/polars.py +19 -27
  17. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/datasets/streaming.py +20 -28
  18. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/iterables.py +38 -48
  19. json2vec-0.4.9/src/json2vec/data/processing.py → json2vec-0.4.10/src/json2vec/data/nested.py +25 -78
  20. json2vec-0.4.10/src/json2vec/data/processors.py +410 -0
  21. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/helpers/inference.py +24 -27
  22. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/inference/__init__.py +2 -1
  23. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/inference/callback.py +15 -18
  24. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/inference/deployment.py +105 -24
  25. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/logging/throughput.py +7 -10
  26. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/enums.py +1 -0
  27. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/experiment.py +107 -67
  28. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/selectors.py +2 -2
  29. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/structure.py +36 -17
  30. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/tree.py +14 -9
  31. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/base.py +58 -44
  32. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/category.py +52 -38
  33. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/dateparts.py +21 -21
  34. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/entity.py +46 -39
  35. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/number.py +18 -18
  36. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/set.py +52 -30
  37. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/text.py +138 -219
  38. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/vector.py +21 -21
  39. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/counter.py +1 -1
  40. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/vocabulary.py +11 -11
  41. {json2vec-0.4.9 → json2vec-0.4.10/src/json2vec.egg-info}/PKG-INFO +60 -68
  42. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec.egg-info/SOURCES.txt +2 -5
  43. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec.egg-info/requires.txt +1 -7
  44. {json2vec-0.4.9 → json2vec-0.4.10}/tests/test_callbacks.py +1 -1
  45. {json2vec-0.4.9 → json2vec-0.4.10}/tests/test_public_api.py +8 -3
  46. {json2vec-0.4.9 → json2vec-0.4.10}/tests/test_schema_inference.py +30 -30
  47. json2vec-0.4.9/src/json2vec/preprocessors/__init__.py +0 -14
  48. json2vec-0.4.9/src/json2vec/preprocessors/base.py +0 -158
  49. json2vec-0.4.9/src/json2vec/preprocessors/extensions/__init__.py +0 -1
  50. json2vec-0.4.9/src/json2vec/preprocessors/spec.py +0 -8
  51. json2vec-0.4.9/src/json2vec/structs/__init__.py +0 -0
  52. {json2vec-0.4.9 → json2vec-0.4.10}/LICENSE +0 -0
  53. {json2vec-0.4.9 → json2vec-0.4.10}/NOTICE +0 -0
  54. {json2vec-0.4.9 → json2vec-0.4.10}/setup.cfg +0 -0
  55. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/__init__.py +0 -0
  56. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/attention.py +0 -0
  57. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/pool.py +0 -0
  58. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/architecture/rotary.py +0 -0
  59. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/data/datasets/__init__.py +0 -0
  60. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/distributed.py +0 -0
  61. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/helpers/__init__.py +0 -0
  62. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/helpers/hyperparameters.py +0 -0
  63. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/helpers/optimizers.py +0 -0
  64. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/helpers/trainer.py +0 -0
  65. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/logging/__init__.py +0 -0
  66. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/logging/config.py +0 -0
  67. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/logging/epoch.py +0 -0
  68. {json2vec-0.4.9/src/json2vec/data → json2vec-0.4.10/src/json2vec/structs}/__init__.py +0 -0
  69. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/structs/packages.py +0 -0
  70. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/__init__.py +0 -0
  71. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/__init__.py +0 -0
  72. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/__init__.py +0 -0
  73. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec/tensorfields/spec.py +0 -0
  74. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec.egg-info/dependency_links.txt +0 -0
  75. {json2vec-0.4.9 → json2vec-0.4.10}/src/json2vec.egg-info/top_level.txt +0 -0
  76. {json2vec-0.4.9 → json2vec-0.4.10}/tests/test_optimizers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json2vec
3
- Version: 0.4.9
3
+ Version: 0.4.10
4
4
  Summary: Schema-first PyTorch models for hierarchical / nested / sequence data structures
5
5
  License-Expression: Apache-2.0
6
6
  Requires-Python: >=3.12
@@ -28,13 +28,7 @@ Requires-Dist: uvicorn>=0.38.0; extra == "serving"
28
28
  Provides-Extra: text
29
29
  Requires-Dist: transformers>=4.55.0; extra == "text"
30
30
  Provides-Extra: docs
31
- Requires-Dist: fastapi>=0.124.0; extra == "docs"
32
- Requires-Dist: mkdocs-material>=9.6; extra == "docs"
33
- Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
34
- Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
35
- Requires-Dist: orjson>=3.10.0; extra == "docs"
36
- Requires-Dist: pydantic-settings>=2.10.1; extra == "docs"
37
- Requires-Dist: uvicorn>=0.38.0; extra == "docs"
31
+ Requires-Dist: marimo>=0.23.8; extra == "docs"
38
32
  Dynamic: license-file
39
33
 
40
34
  <h1 align="center"><code>json2vec</code></h1>
@@ -42,7 +36,7 @@ Dynamic: license-file
42
36
  <p align="center">
43
37
  <img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&amp;logoColor=white" />
44
38
  <a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
45
- <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&amp;logoColor=white" /></a>
39
+ <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-Quarto-39729E?logo=quarto&amp;logoColor=white" /></a>
46
40
  <!-- discord-invite:start -->
47
41
  <a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&amp;logoColor=white" /></a>
48
42
  <!-- discord-invite:end -->
@@ -63,7 +57,8 @@ A `json2vec` schema is both a data contract and an architecture blueprint.
63
57
 
64
58
  - Leaf fields such as `Number`, `Category`, `Set`, `Entity`, `Text`, and
65
59
  `Vector` become datatype-specific tensorfields.
66
- - `Array` nodes become local context encoders for repeated child objects.
60
+ - `Branch` nodes define shared contexts for child fields, with optional local
61
+ attention and pooling before the representation flows upward.
67
62
  - Targets, masks, pruning, and embeddings are configured on the same schema
68
63
  tree.
69
64
  - Prediction output is keyed by schema address, so decoded values and
@@ -76,24 +71,23 @@ inference, and serving.
76
71
  ## A Model From A Nested Record
77
72
 
78
73
  ```python
79
- import json2vec as j2v
80
-
81
- model = j2v.Model.from_schema(
82
- j2v.Category("customer_tier", max_vocab_size=16),
83
- j2v.Array(
84
- j2v.Category("sku", max_vocab_size=2048),
85
- j2v.Number("quantity"),
86
- j2v.Number("price"),
87
- name="line_items",
88
- max_length=32,
89
- embed=True,
90
- ),
91
- j2v.Category("returned", target=True, max_vocab_size=2),
74
+ import json2vec as jv
75
+
76
+ model = jv.Model.from_tree(
92
77
  name="order",
93
78
  d_model=64,
94
79
  n_layers=2,
95
80
  n_heads=4,
96
81
  embed=True,
82
+ customer_tier=jv.Category(size=16),
83
+ line_items=jv.Branch(
84
+ length=32,
85
+ embed=True,
86
+ sku=jv.Category(size=2048),
87
+ quantity=jv.Number,
88
+ price=jv.Number,
89
+ ),
90
+ returned=jv.Category(target=True, size=2),
97
91
  )
98
92
  ```
99
93
 
@@ -116,8 +110,8 @@ to emit embeddings at configured addresses.
116
110
 
117
111
  ## Train With Lightning
118
112
 
119
- `j2v.Model` is a LightningModule. `j2v.PolarsDataModule` and
120
- `j2v.StreamingDataModule` are LightningDataModule implementations. The schema
113
+ `jv.Model` is a LightningModule. `jv.PolarsDataModule` and
114
+ `jv.StreamingDataModule` are LightningDataModule implementations. The schema
121
115
  defines the model tree, typed losses, prediction outputs, and embeddings;
122
116
  Lightning runs `fit`, `validate`, `test`, and `predict`.
123
117
 
@@ -126,23 +120,23 @@ import lightning.pytorch as lit
126
120
  import polars as pl
127
121
  import torch
128
122
 
129
- import json2vec as j2v
123
+ import json2vec as jv
130
124
 
131
125
  records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
132
126
 
133
- model = j2v.Model.from_schema(
134
- j2v.Number("sepal_length"),
135
- j2v.Number("petal_length"),
136
- j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
127
+ model = jv.Model.from_tree(
137
128
  d_model=16,
138
129
  n_layers=1,
139
130
  n_heads=4,
140
131
  batch_size=8,
141
132
  embed=True,
142
133
  optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
134
+ sepal_length=jv.Number,
135
+ petal_length=jv.Number,
136
+ species=jv.Category(target=True, size=4, topk=[2]),
143
137
  )
144
138
 
145
- datamodule = j2v.PolarsDataModule(
139
+ datamodule = jv.PolarsDataModule(
146
140
  model=model,
147
141
  train=records,
148
142
  validate=records,
@@ -170,7 +164,7 @@ trainer.fit(model=model, datamodule=datamodule)
170
164
  For larger jobs, the same model can run through normal Lightning callbacks,
171
165
  checkpointing, precision settings, device placement, and distributed
172
166
  strategies. See
173
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/).
167
+ [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html).
174
168
 
175
169
  ## Predict And Embed
176
170
 
@@ -179,8 +173,8 @@ For small interactive batches, call `model.predict(...)` with raw dictionaries.
179
173
  ```python
180
174
  predictions = model.predict(records.to_dicts()[:3])
181
175
 
182
- species = predictions[j2v.Address("record", "species")]
183
- record = predictions[j2v.Address("record")]
176
+ species = predictions[jv.Address("record", "species")]
177
+ record = predictions[jv.Address("record")]
184
178
 
185
179
  print(species["content"]["value"])
186
180
  print(species["content"]["probability"])
@@ -188,10 +182,10 @@ print(record["embedding"])
188
182
  ```
189
183
 
190
184
  For larger offline jobs, configure a `predict` split on a data module and attach
191
- `j2v.Writer` to Lightning's prediction loop.
185
+ `jv.Writer` to Lightning's prediction loop.
192
186
 
193
187
  ```python
194
- writer = j2v.Writer("predictions")
188
+ writer = jv.Writer("predictions")
195
189
 
196
190
  trainer = lit.Trainer(
197
191
  accelerator="cpu",
@@ -199,7 +193,7 @@ trainer = lit.Trainer(
199
193
  logger=False,
200
194
  )
201
195
 
202
- predict_datamodule = j2v.PolarsDataModule(
196
+ predict_datamodule = jv.PolarsDataModule(
203
197
  model=model,
204
198
  predict=records.drop("species"),
205
199
  num_workers=0,
@@ -213,8 +207,8 @@ trainer.predict(model=model, datamodule=predict_datamodule)
213
207
  `Writer` creates rank-partitioned Parquet files such as
214
208
  `predictions/rank-0.parquet`. Use a postprocessor when downstream systems need
215
209
  flat columns, renamed addresses, redacted payloads, or fewer fields. See
216
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference/)
217
- and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors/).
210
+ [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
211
+ and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html).
218
212
 
219
213
  ## Learning Modes
220
214
 
@@ -254,7 +248,7 @@ Choose the data module by where the records live:
254
248
  arguments are compiled regular expressions matched against discovered file
255
249
  paths.
256
250
 
257
- See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules/)
251
+ See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
258
252
  for split configuration, sharding, sampling, buffers, and preprocessors.
259
253
 
260
254
  ## What Makes This Different
@@ -265,7 +259,7 @@ for split configuration, sharding, sampling, buffers, and preprocessors.
265
259
  missing-state handling, masking, decoding, loss, metrics, and output writing.
266
260
  - **Unified training roles:** `target=True`, `p_prune`, and `p_mask` all use the
267
261
  same reconstruction path.
268
- - **Embedding trees:** embeddings can come from the root, arrays, or selected
262
+ - **Embedding trees:** embeddings can come from the root, branches, or selected
269
263
  leaves.
270
264
  - **Schema evolution:** fields can be added, removed, updated, reset, or
271
265
  temporarily overridden after construction.
@@ -318,55 +312,53 @@ uv sync --extra docs
318
312
 
319
313
  The `text` extra installs Hugging Face `transformers`. The `serving` extra
320
314
  installs FastAPI-backed deployment dependencies. The `docs` extra installs the
321
- MkDocs toolchain.
315
+ Python packages used by the Quarto docs.
322
316
 
323
317
  ## Documentation Map
324
318
 
325
319
  Start with:
326
320
 
327
- - [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
328
- - [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
329
- - [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
330
- - [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
331
- - [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
332
- - [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
333
- - [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/)
334
- - [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules/)
335
- - [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference/)
336
- - [API Reference](https://json2vec.github.io/json2vec/reference/api/)
321
+ - [Getting Started](https://json2vec.github.io/json2vec/getting-started.html)
322
+ - [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart.html)
323
+ - [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree.html)
324
+ - [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths.html)
325
+ - [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types.html)
326
+ - [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings.html)
327
+ - [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html)
328
+ - [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
329
+ - [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
337
330
 
338
331
  Tutorials and guides:
339
332
 
340
- - [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
341
- - [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
342
- - [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
343
- - [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
344
- - [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
345
- - [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
346
- - [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors/)
347
- - [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
348
- - [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking/)
349
- - [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
350
- - [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure/)
351
- - [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
333
+ - [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html)
334
+ - [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance.html)
335
+ - [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking.html)
336
+ - [Branch](https://json2vec.github.io/json2vec/data-types/branch.html)
337
+ - [Number](https://json2vec.github.io/json2vec/data-types/number.html)
338
+ - [Category](https://json2vec.github.io/json2vec/data-types/category.html)
339
+ - [Set](https://json2vec.github.io/json2vec/data-types/set.html)
340
+ - [Entity](https://json2vec.github.io/json2vec/data-types/entity.html)
341
+ - [DateParts](https://json2vec.github.io/json2vec/data-types/dateparts.html)
342
+ - [Vector](https://json2vec.github.io/json2vec/data-types/vector.html)
343
+ - [Text](https://json2vec.github.io/json2vec/data-types/text.html)
344
+ - [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure.html)
352
345
 
353
346
  Build the docs locally with:
354
347
 
355
348
  ```bash
356
- uv run --extra docs mkdocs build --strict
349
+ make render
357
350
  ```
358
351
 
359
352
  ## Repository Layout
360
353
 
361
354
  - `src/json2vec/architecture`: model assembly, attention, pooling, and routing
362
- - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
355
+ - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline and preprocessor exports
363
356
  - `src/json2vec/inference`: serving and prediction callbacks
364
357
  - `src/json2vec/logging`: runtime logging callbacks
365
- - `src/json2vec/preprocessors`: preprocessor registry
366
358
  - `src/json2vec/structs`: pydantic config models, enums, and tree nodes
367
359
  - `src/json2vec/tensorfields`: tensorfield plugin system and built-in fields
368
360
  - `tests/`: package test suite
369
- - `docs/`: tutorials, guides, diagrams, and whitepaper source
361
+ - `docs/`: Quarto project, pages, guides, stylesheets, and sample data
370
362
 
371
363
  ## Development
372
364
 
@@ -3,7 +3,7 @@
3
3
  <p align="center">
4
4
  <img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&amp;logoColor=white" />
5
5
  <a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
6
- <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&amp;logoColor=white" /></a>
6
+ <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-Quarto-39729E?logo=quarto&amp;logoColor=white" /></a>
7
7
  <!-- discord-invite:start -->
8
8
  <a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&amp;logoColor=white" /></a>
9
9
  <!-- discord-invite:end -->
@@ -24,7 +24,8 @@ A `json2vec` schema is both a data contract and an architecture blueprint.
24
24
 
25
25
  - Leaf fields such as `Number`, `Category`, `Set`, `Entity`, `Text`, and
26
26
  `Vector` become datatype-specific tensorfields.
27
- - `Array` nodes become local context encoders for repeated child objects.
27
+ - `Branch` nodes define shared contexts for child fields, with optional local
28
+ attention and pooling before the representation flows upward.
28
29
  - Targets, masks, pruning, and embeddings are configured on the same schema
29
30
  tree.
30
31
  - Prediction output is keyed by schema address, so decoded values and
@@ -37,24 +38,23 @@ inference, and serving.
37
38
  ## A Model From A Nested Record
38
39
 
39
40
  ```python
40
- import json2vec as j2v
41
-
42
- model = j2v.Model.from_schema(
43
- j2v.Category("customer_tier", max_vocab_size=16),
44
- j2v.Array(
45
- j2v.Category("sku", max_vocab_size=2048),
46
- j2v.Number("quantity"),
47
- j2v.Number("price"),
48
- name="line_items",
49
- max_length=32,
50
- embed=True,
51
- ),
52
- j2v.Category("returned", target=True, max_vocab_size=2),
41
+ import json2vec as jv
42
+
43
+ model = jv.Model.from_tree(
53
44
  name="order",
54
45
  d_model=64,
55
46
  n_layers=2,
56
47
  n_heads=4,
57
48
  embed=True,
49
+ customer_tier=jv.Category(size=16),
50
+ line_items=jv.Branch(
51
+ length=32,
52
+ embed=True,
53
+ sku=jv.Category(size=2048),
54
+ quantity=jv.Number,
55
+ price=jv.Number,
56
+ ),
57
+ returned=jv.Category(target=True, size=2),
58
58
  )
59
59
  ```
60
60
 
@@ -77,8 +77,8 @@ to emit embeddings at configured addresses.
77
77
 
78
78
  ## Train With Lightning
79
79
 
80
- `j2v.Model` is a LightningModule. `j2v.PolarsDataModule` and
81
- `j2v.StreamingDataModule` are LightningDataModule implementations. The schema
80
+ `jv.Model` is a LightningModule. `jv.PolarsDataModule` and
81
+ `jv.StreamingDataModule` are LightningDataModule implementations. The schema
82
82
  defines the model tree, typed losses, prediction outputs, and embeddings;
83
83
  Lightning runs `fit`, `validate`, `test`, and `predict`.
84
84
 
@@ -87,23 +87,23 @@ import lightning.pytorch as lit
87
87
  import polars as pl
88
88
  import torch
89
89
 
90
- import json2vec as j2v
90
+ import json2vec as jv
91
91
 
92
92
  records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
93
93
 
94
- model = j2v.Model.from_schema(
95
- j2v.Number("sepal_length"),
96
- j2v.Number("petal_length"),
97
- j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
94
+ model = jv.Model.from_tree(
98
95
  d_model=16,
99
96
  n_layers=1,
100
97
  n_heads=4,
101
98
  batch_size=8,
102
99
  embed=True,
103
100
  optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
101
+ sepal_length=jv.Number,
102
+ petal_length=jv.Number,
103
+ species=jv.Category(target=True, size=4, topk=[2]),
104
104
  )
105
105
 
106
- datamodule = j2v.PolarsDataModule(
106
+ datamodule = jv.PolarsDataModule(
107
107
  model=model,
108
108
  train=records,
109
109
  validate=records,
@@ -131,7 +131,7 @@ trainer.fit(model=model, datamodule=datamodule)
131
131
  For larger jobs, the same model can run through normal Lightning callbacks,
132
132
  checkpointing, precision settings, device placement, and distributed
133
133
  strategies. See
134
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/).
134
+ [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html).
135
135
 
136
136
  ## Predict And Embed
137
137
 
@@ -140,8 +140,8 @@ For small interactive batches, call `model.predict(...)` with raw dictionaries.
140
140
  ```python
141
141
  predictions = model.predict(records.to_dicts()[:3])
142
142
 
143
- species = predictions[j2v.Address("record", "species")]
144
- record = predictions[j2v.Address("record")]
143
+ species = predictions[jv.Address("record", "species")]
144
+ record = predictions[jv.Address("record")]
145
145
 
146
146
  print(species["content"]["value"])
147
147
  print(species["content"]["probability"])
@@ -149,10 +149,10 @@ print(record["embedding"])
149
149
  ```
150
150
 
151
151
  For larger offline jobs, configure a `predict` split on a data module and attach
152
- `j2v.Writer` to Lightning's prediction loop.
152
+ `jv.Writer` to Lightning's prediction loop.
153
153
 
154
154
  ```python
155
- writer = j2v.Writer("predictions")
155
+ writer = jv.Writer("predictions")
156
156
 
157
157
  trainer = lit.Trainer(
158
158
  accelerator="cpu",
@@ -160,7 +160,7 @@ trainer = lit.Trainer(
160
160
  logger=False,
161
161
  )
162
162
 
163
- predict_datamodule = j2v.PolarsDataModule(
163
+ predict_datamodule = jv.PolarsDataModule(
164
164
  model=model,
165
165
  predict=records.drop("species"),
166
166
  num_workers=0,
@@ -174,8 +174,8 @@ trainer.predict(model=model, datamodule=predict_datamodule)
174
174
  `Writer` creates rank-partitioned Parquet files such as
175
175
  `predictions/rank-0.parquet`. Use a postprocessor when downstream systems need
176
176
  flat columns, renamed addresses, redacted payloads, or fewer fields. See
177
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference/)
178
- and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors/).
177
+ [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
178
+ and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html).
179
179
 
180
180
  ## Learning Modes
181
181
 
@@ -215,7 +215,7 @@ Choose the data module by where the records live:
215
215
  arguments are compiled regular expressions matched against discovered file
216
216
  paths.
217
217
 
218
- See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules/)
218
+ See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
219
219
  for split configuration, sharding, sampling, buffers, and preprocessors.
220
220
 
221
221
  ## What Makes This Different
@@ -226,7 +226,7 @@ for split configuration, sharding, sampling, buffers, and preprocessors.
226
226
  missing-state handling, masking, decoding, loss, metrics, and output writing.
227
227
  - **Unified training roles:** `target=True`, `p_prune`, and `p_mask` all use the
228
228
  same reconstruction path.
229
- - **Embedding trees:** embeddings can come from the root, arrays, or selected
229
+ - **Embedding trees:** embeddings can come from the root, branches, or selected
230
230
  leaves.
231
231
  - **Schema evolution:** fields can be added, removed, updated, reset, or
232
232
  temporarily overridden after construction.
@@ -279,55 +279,53 @@ uv sync --extra docs
279
279
 
280
280
  The `text` extra installs Hugging Face `transformers`. The `serving` extra
281
281
  installs FastAPI-backed deployment dependencies. The `docs` extra installs the
282
- MkDocs toolchain.
282
+ Python packages used by the Quarto docs.
283
283
 
284
284
  ## Documentation Map
285
285
 
286
286
  Start with:
287
287
 
288
- - [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
289
- - [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
290
- - [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
291
- - [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
292
- - [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
293
- - [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
294
- - [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/)
295
- - [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules/)
296
- - [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference/)
297
- - [API Reference](https://json2vec.github.io/json2vec/reference/api/)
288
+ - [Getting Started](https://json2vec.github.io/json2vec/getting-started.html)
289
+ - [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart.html)
290
+ - [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree.html)
291
+ - [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths.html)
292
+ - [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types.html)
293
+ - [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings.html)
294
+ - [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html)
295
+ - [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
296
+ - [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
298
297
 
299
298
  Tutorials and guides:
300
299
 
301
- - [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
302
- - [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
303
- - [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
304
- - [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
305
- - [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
306
- - [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
307
- - [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors/)
308
- - [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
309
- - [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking/)
310
- - [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
311
- - [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure/)
312
- - [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
300
+ - [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html)
301
+ - [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance.html)
302
+ - [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking.html)
303
+ - [Branch](https://json2vec.github.io/json2vec/data-types/branch.html)
304
+ - [Number](https://json2vec.github.io/json2vec/data-types/number.html)
305
+ - [Category](https://json2vec.github.io/json2vec/data-types/category.html)
306
+ - [Set](https://json2vec.github.io/json2vec/data-types/set.html)
307
+ - [Entity](https://json2vec.github.io/json2vec/data-types/entity.html)
308
+ - [DateParts](https://json2vec.github.io/json2vec/data-types/dateparts.html)
309
+ - [Vector](https://json2vec.github.io/json2vec/data-types/vector.html)
310
+ - [Text](https://json2vec.github.io/json2vec/data-types/text.html)
311
+ - [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure.html)
313
312
 
314
313
  Build the docs locally with:
315
314
 
316
315
  ```bash
317
- uv run --extra docs mkdocs build --strict
316
+ make render
318
317
  ```
319
318
 
320
319
  ## Repository Layout
321
320
 
322
321
  - `src/json2vec/architecture`: model assembly, attention, pooling, and routing
323
- - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
322
+ - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline and preprocessor exports
324
323
  - `src/json2vec/inference`: serving and prediction callbacks
325
324
  - `src/json2vec/logging`: runtime logging callbacks
326
- - `src/json2vec/preprocessors`: preprocessor registry
327
325
  - `src/json2vec/structs`: pydantic config models, enums, and tree nodes
328
326
  - `src/json2vec/tensorfields`: tensorfield plugin system and built-in fields
329
327
  - `tests/`: package test suite
330
- - `docs/`: tutorials, guides, diagrams, and whitepaper source
328
+ - `docs/`: Quarto project, pages, guides, stylesheets, and sample data
331
329
 
332
330
  ## Development
333
331
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "json2vec"
3
- version = "0.4.9"
3
+ version = "0.4.10"
4
4
  description = "Schema-first PyTorch models for hierarchical / nested / sequence data structures"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -32,13 +32,7 @@ text = [
32
32
  "transformers>=4.55.0",
33
33
  ]
34
34
  docs = [
35
- "fastapi>=0.124.0",
36
- "mkdocs-material>=9.6",
37
- "mkdocs-jupyter>=0.26.3",
38
- "mkdocstrings[python]>=0.27",
39
- "orjson>=3.10.0",
40
- "pydantic-settings>=2.10.1",
41
- "uvicorn>=0.38.0",
35
+ "marimo>=0.23.8",
42
36
  ]
43
37
 
44
38
  [dependency-groups]
@@ -1,7 +1,7 @@
1
1
  """Public `json2vec` SDK surface.
2
2
 
3
3
  The top-level package exports the constructors and helpers used by most
4
- applications: `Model.from_schema(...)` for model construction, tensorfield
4
+ applications: `Model.from_tree(...)` for model construction, tensorfield
5
5
  request constructors such as `Category` and `Number`, data modules, schema
6
6
  mutation predicates, and the `@preprocess` decorator with its `postprocess` counterpart.
7
7
  """
@@ -17,9 +17,22 @@ from json2vec.architecture.root import (
17
17
  SchedulerConfig,
18
18
  )
19
19
  from json2vec.data.datasets import CustomDataModule, PolarsDataModule, StreamingDataModule
20
- from json2vec.data.processing import MASK_LITERAL, MaskLiteral
21
- from json2vec.inference.callback import Postprocessor, Writer
22
- from json2vec.preprocessors import PREPROCESSORS, Preprocessor, PreprocessorMode, preprocess
20
+ from json2vec.data.nested import MASK_LITERAL, MaskLiteral
21
+ from json2vec.data.processors import (
22
+ Metadata,
23
+ Observation,
24
+ Postprocessor,
25
+ PostprocessorProvider,
26
+ PostprocessorResult,
27
+ Predictions,
28
+ Preprocessor,
29
+ PreprocessorProvider,
30
+ RawBatch,
31
+ RawObservation,
32
+ postprocess,
33
+ preprocess,
34
+ )
35
+ from json2vec.inference.callback import Writer
23
36
  from json2vec.structs.enums import (
24
37
  AttentionMode,
25
38
  Component,
@@ -32,14 +45,14 @@ from json2vec.structs.enums import (
32
45
  Tokens,
33
46
  )
34
47
  from json2vec.structs.experiment import (
35
- Hyperparameters,
36
48
  NodeAttribute,
37
49
  NodePredicate,
50
+ Schema,
38
51
  SchemaField,
39
52
  predicate,
40
53
  where,
41
54
  )
42
- from json2vec.structs.structure import Array, Mask
55
+ from json2vec.structs.structure import Branch, Mask
43
56
  from json2vec.structs.tree import Address, Leaf
44
57
  from json2vec.tensorfields import TENSORFIELDS, DecoderBase, EmbedderBase, Plugin, RequestBase, TensorFieldBase
45
58
  from json2vec.tensorfields.extensions.category import Request as Category
@@ -96,7 +109,7 @@ def __dir__() -> list[str]:
96
109
  __all__ = [
97
110
  "Address",
98
111
  "Accelerator",
99
- "Array",
112
+ "Branch",
100
113
  "AttentionMode",
101
114
  "Category",
102
115
  "Component",
@@ -107,7 +120,7 @@ __all__ = [
107
120
  "EmbedderBase",
108
121
  "Entity",
109
122
  "helpers",
110
- "Hyperparameters",
123
+ "Schema",
111
124
  "Input",
112
125
  "JSONBackend",
113
126
  "Leaf",
@@ -115,20 +128,26 @@ __all__ = [
115
128
  "MASK_LITERAL",
116
129
  "Mask",
117
130
  "MaskLiteral",
131
+ "Metadata",
118
132
  "Model",
119
133
  "ModelSource",
120
134
  "MutationLockCallback",
121
135
  "NodeAttribute",
122
136
  "NodePredicate",
123
137
  "Number",
138
+ "Observation",
124
139
  "OptimizerConfig",
125
140
  "Overflow",
126
- "PREPROCESSORS",
127
141
  "Plugin",
128
142
  "PolarsDataModule",
129
143
  "Postprocessor",
144
+ "PostprocessorProvider",
145
+ "PostprocessorResult",
146
+ "Predictions",
130
147
  "Preprocessor",
131
- "PreprocessorMode",
148
+ "PreprocessorProvider",
149
+ "RawBatch",
150
+ "RawObservation",
132
151
  "RequestBase",
133
152
  "RollbackCheckpoint",
134
153
  "RuntimePlacementCallback",
@@ -149,6 +168,7 @@ __all__ = [
149
168
  "VocabularySyncCallback",
150
169
  "Writer",
151
170
  "predicate",
171
+ "postprocess",
152
172
  "preprocess",
153
173
  "where",
154
174
  ]