json2vec 0.4.8__tar.gz → 0.4.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json2vec-0.4.8/src/json2vec.egg-info → json2vec-0.4.10}/PKG-INFO +60 -68
- {json2vec-0.4.8 → json2vec-0.4.10}/README.md +58 -60
- {json2vec-0.4.8 → json2vec-0.4.10}/pyproject.toml +2 -8
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/__init__.py +37 -12
- json2vec-0.4.10/src/json2vec/architecture/checkpoint.py +111 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/contracts.py +30 -13
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/encoder.py +14 -14
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/graph.py +12 -12
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/mutations.py +102 -24
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/node.py +9 -9
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/root.py +60 -150
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/runtime.py +63 -44
- json2vec-0.4.10/src/json2vec/data/__init__.py +33 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/datasets/base.py +29 -14
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/datasets/custom.py +19 -29
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/datasets/polars.py +19 -29
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/datasets/streaming.py +20 -30
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/iterables.py +83 -40
- json2vec-0.4.10/src/json2vec/data/nested.py +237 -0
- json2vec-0.4.10/src/json2vec/data/processors.py +410 -0
- json2vec-0.4.10/src/json2vec/helpers/__init__.py +8 -0
- json2vec-0.4.10/src/json2vec/helpers/inference.py +629 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/inference/__init__.py +2 -1
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/inference/callback.py +15 -18
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/inference/deployment.py +105 -24
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/logging/throughput.py +7 -10
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/structs/enums.py +1 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/structs/experiment.py +127 -72
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/structs/selectors.py +2 -2
- json2vec-0.4.10/src/json2vec/structs/structure.py +247 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/structs/tree.py +14 -9
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/base.py +262 -19
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/category.py +83 -63
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/dateparts.py +51 -48
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/entity.py +77 -65
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/number.py +51 -49
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/set.py +81 -54
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/text.py +155 -226
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/vector.py +51 -45
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/counter.py +1 -1
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/vocabulary.py +14 -10
- {json2vec-0.4.8 → json2vec-0.4.10/src/json2vec.egg-info}/PKG-INFO +60 -68
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec.egg-info/SOURCES.txt +5 -6
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec.egg-info/requires.txt +1 -7
- {json2vec-0.4.8 → json2vec-0.4.10}/tests/test_callbacks.py +1 -1
- {json2vec-0.4.8 → json2vec-0.4.10}/tests/test_public_api.py +8 -3
- json2vec-0.4.10/tests/test_schema_inference.py +327 -0
- json2vec-0.4.8/src/json2vec/architecture/checkpoint.py +0 -69
- json2vec-0.4.8/src/json2vec/data/processing.py +0 -203
- json2vec-0.4.8/src/json2vec/helpers/__init__.py +0 -0
- json2vec-0.4.8/src/json2vec/preprocessors/__init__.py +0 -14
- json2vec-0.4.8/src/json2vec/preprocessors/base.py +0 -158
- json2vec-0.4.8/src/json2vec/preprocessors/extensions/__init__.py +0 -1
- json2vec-0.4.8/src/json2vec/preprocessors/spec.py +0 -8
- json2vec-0.4.8/src/json2vec/structs/__init__.py +0 -0
- json2vec-0.4.8/src/json2vec/structs/structure.py +0 -110
- {json2vec-0.4.8 → json2vec-0.4.10}/LICENSE +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/NOTICE +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/setup.cfg +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/attention.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/pool.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/architecture/rotary.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/data/datasets/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/distributed.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/helpers/hyperparameters.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/helpers/optimizers.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/helpers/trainer.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/logging/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/logging/config.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/logging/epoch.py +0 -0
- {json2vec-0.4.8/src/json2vec/data → json2vec-0.4.10/src/json2vec/structs}/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/structs/packages.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/extensions/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/shared/__init__.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec/tensorfields/spec.py +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec.egg-info/dependency_links.txt +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/src/json2vec.egg-info/top_level.txt +0 -0
- {json2vec-0.4.8 → json2vec-0.4.10}/tests/test_optimizers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: json2vec
|
|
3
|
-
Version: 0.4.8
|
|
3
|
+
Version: 0.4.10
|
|
4
4
|
Summary: Schema-first PyTorch models for hierarchical / nested / sequence data structures
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -28,13 +28,7 @@ Requires-Dist: uvicorn>=0.38.0; extra == "serving"
|
|
|
28
28
|
Provides-Extra: text
|
|
29
29
|
Requires-Dist: transformers>=4.55.0; extra == "text"
|
|
30
30
|
Provides-Extra: docs
|
|
31
|
-
Requires-Dist:
|
|
32
|
-
Requires-Dist: mkdocs-material>=9.6; extra == "docs"
|
|
33
|
-
Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
|
|
34
|
-
Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
|
|
35
|
-
Requires-Dist: orjson>=3.10.0; extra == "docs"
|
|
36
|
-
Requires-Dist: pydantic-settings>=2.10.1; extra == "docs"
|
|
37
|
-
Requires-Dist: uvicorn>=0.38.0; extra == "docs"
|
|
31
|
+
Requires-Dist: marimo>=0.23.8; extra == "docs"
|
|
38
32
|
Dynamic: license-file
|
|
39
33
|
|
|
40
34
|
<h1 align="center"><code>json2vec</code></h1>
|
|
@@ -42,7 +36,7 @@ Dynamic: license-file
|
|
|
42
36
|
<p align="center">
|
|
43
37
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
44
38
|
<a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
|
|
45
|
-
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-
|
|
39
|
+
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-Quarto-39729E?logo=quarto&logoColor=white" /></a>
|
|
46
40
|
<!-- discord-invite:start -->
|
|
47
41
|
<a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&logoColor=white" /></a>
|
|
48
42
|
<!-- discord-invite:end -->
|
|
@@ -63,7 +57,8 @@ A `json2vec` schema is both a data contract and an architecture blueprint.
|
|
|
63
57
|
|
|
64
58
|
- Leaf fields such as `Number`, `Category`, `Set`, `Entity`, `Text`, and
|
|
65
59
|
`Vector` become datatype-specific tensorfields.
|
|
66
|
-
- `
|
|
60
|
+
- `Branch` nodes define shared contexts for child fields, with optional local
|
|
61
|
+
attention and pooling before the representation flows upward.
|
|
67
62
|
- Targets, masks, pruning, and embeddings are configured on the same schema
|
|
68
63
|
tree.
|
|
69
64
|
- Prediction output is keyed by schema address, so decoded values and
|
|
@@ -76,24 +71,23 @@ inference, and serving.
|
|
|
76
71
|
## A Model From A Nested Record
|
|
77
72
|
|
|
78
73
|
```python
|
|
79
|
-
import json2vec as j2v
|
|
80
|
-
|
|
81
|
-
model =
|
|
82
|
-
j2v.Category("customer_tier", max_vocab_size=16),
|
|
83
|
-
j2v.Array(
|
|
84
|
-
j2v.Category("sku", max_vocab_size=2048),
|
|
85
|
-
j2v.Number("quantity"),
|
|
86
|
-
j2v.Number("price"),
|
|
87
|
-
name="line_items",
|
|
88
|
-
max_length=32,
|
|
89
|
-
embed=True,
|
|
90
|
-
),
|
|
91
|
-
j2v.Category("returned", target=True, max_vocab_size=2),
|
|
74
|
+
import json2vec as jv
|
|
75
|
+
|
|
76
|
+
model = jv.Model.from_tree(
|
|
92
77
|
name="order",
|
|
93
78
|
d_model=64,
|
|
94
79
|
n_layers=2,
|
|
95
80
|
n_heads=4,
|
|
96
81
|
embed=True,
|
|
82
|
+
customer_tier=jv.Category(size=16),
|
|
83
|
+
line_items=jv.Branch(
|
|
84
|
+
length=32,
|
|
85
|
+
embed=True,
|
|
86
|
+
sku=jv.Category(size=2048),
|
|
87
|
+
quantity=jv.Number,
|
|
88
|
+
price=jv.Number,
|
|
89
|
+
),
|
|
90
|
+
returned=jv.Category(target=True, size=2),
|
|
97
91
|
)
|
|
98
92
|
```
|
|
99
93
|
|
|
@@ -116,8 +110,8 @@ to emit embeddings at configured addresses.
|
|
|
116
110
|
|
|
117
111
|
## Train With Lightning
|
|
118
112
|
|
|
119
|
-
`
|
|
120
|
-
`
|
|
113
|
+
`jv.Model` is a LightningModule. `jv.PolarsDataModule` and
|
|
114
|
+
`jv.StreamingDataModule` are LightningDataModule implementations. The schema
|
|
121
115
|
defines the model tree, typed losses, prediction outputs, and embeddings;
|
|
122
116
|
Lightning runs `fit`, `validate`, `test`, and `predict`.
|
|
123
117
|
|
|
@@ -126,23 +120,23 @@ import lightning.pytorch as lit
|
|
|
126
120
|
import polars as pl
|
|
127
121
|
import torch
|
|
128
122
|
|
|
129
|
-
import json2vec as j2v
|
|
123
|
+
import json2vec as jv
|
|
130
124
|
|
|
131
125
|
records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
|
|
132
126
|
|
|
133
|
-
model =
|
|
134
|
-
j2v.Number("sepal_length"),
|
|
135
|
-
j2v.Number("petal_length"),
|
|
136
|
-
j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
|
|
127
|
+
model = jv.Model.from_tree(
|
|
137
128
|
d_model=16,
|
|
138
129
|
n_layers=1,
|
|
139
130
|
n_heads=4,
|
|
140
131
|
batch_size=8,
|
|
141
132
|
embed=True,
|
|
142
133
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
134
|
+
sepal_length=jv.Number,
|
|
135
|
+
petal_length=jv.Number,
|
|
136
|
+
species=jv.Category(target=True, size=4, topk=[2]),
|
|
143
137
|
)
|
|
144
138
|
|
|
145
|
-
datamodule = j2v.PolarsDataModule(
|
|
139
|
+
datamodule = jv.PolarsDataModule(
|
|
146
140
|
model=model,
|
|
147
141
|
train=records,
|
|
148
142
|
validate=records,
|
|
@@ -170,7 +164,7 @@ trainer.fit(model=model, datamodule=datamodule)
|
|
|
170
164
|
For larger jobs, the same model can run through normal Lightning callbacks,
|
|
171
165
|
checkpointing, precision settings, device placement, and distributed
|
|
172
166
|
strategies. See
|
|
173
|
-
[Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/).
|
|
167
|
+
[Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html).
|
|
174
168
|
|
|
175
169
|
## Predict And Embed
|
|
176
170
|
|
|
@@ -179,8 +173,8 @@ For small interactive batches, call `model.predict(...)` with raw dictionaries.
|
|
|
179
173
|
```python
|
|
180
174
|
predictions = model.predict(records.to_dicts()[:3])
|
|
181
175
|
|
|
182
|
-
species = predictions[
|
|
183
|
-
record = predictions[
|
|
176
|
+
species = predictions[jv.Address("record", "species")]
|
|
177
|
+
record = predictions[jv.Address("record")]
|
|
184
178
|
|
|
185
179
|
print(species["content"]["value"])
|
|
186
180
|
print(species["content"]["probability"])
|
|
@@ -188,10 +182,10 @@ print(record["embedding"])
|
|
|
188
182
|
```
|
|
189
183
|
|
|
190
184
|
For larger offline jobs, configure a `predict` split on a data module and attach
|
|
191
|
-
`
|
|
185
|
+
`jv.Writer` to Lightning's prediction loop.
|
|
192
186
|
|
|
193
187
|
```python
|
|
194
|
-
writer =
|
|
188
|
+
writer = jv.Writer("predictions")
|
|
195
189
|
|
|
196
190
|
trainer = lit.Trainer(
|
|
197
191
|
accelerator="cpu",
|
|
@@ -199,7 +193,7 @@ trainer = lit.Trainer(
|
|
|
199
193
|
logger=False,
|
|
200
194
|
)
|
|
201
195
|
|
|
202
|
-
predict_datamodule = j2v.PolarsDataModule(
|
|
196
|
+
predict_datamodule = jv.PolarsDataModule(
|
|
203
197
|
model=model,
|
|
204
198
|
predict=records.drop("species"),
|
|
205
199
|
num_workers=0,
|
|
@@ -213,8 +207,8 @@ trainer.predict(model=model, datamodule=predict_datamodule)
|
|
|
213
207
|
`Writer` creates rank-partitioned Parquet files such as
|
|
214
208
|
`predictions/rank-0.parquet`. Use a postprocessor when downstream systems need
|
|
215
209
|
flat columns, renamed addresses, redacted payloads, or fewer fields. See
|
|
216
|
-
[Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference
|
|
217
|
-
and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors
|
|
210
|
+
[Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
|
|
211
|
+
and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html).
|
|
218
212
|
|
|
219
213
|
## Learning Modes
|
|
220
214
|
|
|
@@ -254,7 +248,7 @@ Choose the data module by where the records live:
|
|
|
254
248
|
arguments are compiled regular expressions matched against discovered file
|
|
255
249
|
paths.
|
|
256
250
|
|
|
257
|
-
See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules
|
|
251
|
+
See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
|
|
258
252
|
for split configuration, sharding, sampling, buffers, and preprocessors.
|
|
259
253
|
|
|
260
254
|
## What Makes This Different
|
|
@@ -265,7 +259,7 @@ for split configuration, sharding, sampling, buffers, and preprocessors.
|
|
|
265
259
|
missing-state handling, masking, decoding, loss, metrics, and output writing.
|
|
266
260
|
- **Unified training roles:** `target=True`, `p_prune`, and `p_mask` all use the
|
|
267
261
|
same reconstruction path.
|
|
268
|
-
- **Embedding trees:** embeddings can come from the root,
|
|
262
|
+
- **Embedding trees:** embeddings can come from the root, branches, or selected
|
|
269
263
|
leaves.
|
|
270
264
|
- **Schema evolution:** fields can be added, removed, updated, reset, or
|
|
271
265
|
temporarily overridden after construction.
|
|
@@ -318,55 +312,53 @@ uv sync --extra docs
|
|
|
318
312
|
|
|
319
313
|
The `text` extra installs Hugging Face `transformers`. The `serving` extra
|
|
320
314
|
installs FastAPI-backed deployment dependencies. The `docs` extra installs the
|
|
321
|
-
|
|
315
|
+
Python packages used by the Quarto docs.
|
|
322
316
|
|
|
323
317
|
## Documentation Map
|
|
324
318
|
|
|
325
319
|
Start with:
|
|
326
320
|
|
|
327
|
-
- [Getting Started](https://json2vec.github.io/json2vec/getting-started
|
|
328
|
-
- [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart
|
|
329
|
-
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree
|
|
330
|
-
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths
|
|
331
|
-
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types
|
|
332
|
-
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings
|
|
333
|
-
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning
|
|
334
|
-
- [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules
|
|
335
|
-
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference
|
|
336
|
-
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
321
|
+
- [Getting Started](https://json2vec.github.io/json2vec/getting-started.html)
|
|
322
|
+
- [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart.html)
|
|
323
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree.html)
|
|
324
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths.html)
|
|
325
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types.html)
|
|
326
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings.html)
|
|
327
|
+
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html)
|
|
328
|
+
- [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
|
|
329
|
+
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
|
|
337
330
|
|
|
338
331
|
Tutorials and guides:
|
|
339
332
|
|
|
340
|
-
- [
|
|
341
|
-
- [
|
|
342
|
-
- [
|
|
343
|
-
- [
|
|
344
|
-
- [
|
|
345
|
-
- [
|
|
346
|
-
- [
|
|
347
|
-
- [
|
|
348
|
-
- [
|
|
349
|
-
- [
|
|
350
|
-
- [
|
|
351
|
-
- [
|
|
333
|
+
- [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html)
|
|
334
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance.html)
|
|
335
|
+
- [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking.html)
|
|
336
|
+
- [Branch](https://json2vec.github.io/json2vec/data-types/branch.html)
|
|
337
|
+
- [Number](https://json2vec.github.io/json2vec/data-types/number.html)
|
|
338
|
+
- [Category](https://json2vec.github.io/json2vec/data-types/category.html)
|
|
339
|
+
- [Set](https://json2vec.github.io/json2vec/data-types/set.html)
|
|
340
|
+
- [Entity](https://json2vec.github.io/json2vec/data-types/entity.html)
|
|
341
|
+
- [DateParts](https://json2vec.github.io/json2vec/data-types/dateparts.html)
|
|
342
|
+
- [Vector](https://json2vec.github.io/json2vec/data-types/vector.html)
|
|
343
|
+
- [Text](https://json2vec.github.io/json2vec/data-types/text.html)
|
|
344
|
+
- [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure.html)
|
|
352
345
|
|
|
353
346
|
Build the docs locally with:
|
|
354
347
|
|
|
355
348
|
```bash
|
|
356
|
-
|
|
349
|
+
make render
|
|
357
350
|
```
|
|
358
351
|
|
|
359
352
|
## Repository Layout
|
|
360
353
|
|
|
361
354
|
- `src/json2vec/architecture`: model assembly, attention, pooling, and routing
|
|
362
|
-
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
|
|
355
|
+
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline and preprocessor exports
|
|
363
356
|
- `src/json2vec/inference`: serving and prediction callbacks
|
|
364
357
|
- `src/json2vec/logging`: runtime logging callbacks
|
|
365
|
-
- `src/json2vec/preprocessors`: preprocessor registry
|
|
366
358
|
- `src/json2vec/structs`: pydantic config models, enums, and tree nodes
|
|
367
359
|
- `src/json2vec/tensorfields`: tensorfield plugin system and built-in fields
|
|
368
360
|
- `tests/`: package test suite
|
|
369
|
-
- `docs/`:
|
|
361
|
+
- `docs/`: Quarto project, pages, guides, stylesheets, and sample data
|
|
370
362
|
|
|
371
363
|
## Development
|
|
372
364
|
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
<p align="center">
|
|
4
4
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
5
5
|
<a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
|
|
6
|
-
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-
|
|
6
|
+
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-Quarto-39729E?logo=quarto&logoColor=white" /></a>
|
|
7
7
|
<!-- discord-invite:start -->
|
|
8
8
|
<a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&logoColor=white" /></a>
|
|
9
9
|
<!-- discord-invite:end -->
|
|
@@ -24,7 +24,8 @@ A `json2vec` schema is both a data contract and an architecture blueprint.
|
|
|
24
24
|
|
|
25
25
|
- Leaf fields such as `Number`, `Category`, `Set`, `Entity`, `Text`, and
|
|
26
26
|
`Vector` become datatype-specific tensorfields.
|
|
27
|
-
- `
|
|
27
|
+
- `Branch` nodes define shared contexts for child fields, with optional local
|
|
28
|
+
attention and pooling before the representation flows upward.
|
|
28
29
|
- Targets, masks, pruning, and embeddings are configured on the same schema
|
|
29
30
|
tree.
|
|
30
31
|
- Prediction output is keyed by schema address, so decoded values and
|
|
@@ -37,24 +38,23 @@ inference, and serving.
|
|
|
37
38
|
## A Model From A Nested Record
|
|
38
39
|
|
|
39
40
|
```python
|
|
40
|
-
import json2vec as j2v
|
|
41
|
-
|
|
42
|
-
model =
|
|
43
|
-
j2v.Category("customer_tier", max_vocab_size=16),
|
|
44
|
-
j2v.Array(
|
|
45
|
-
j2v.Category("sku", max_vocab_size=2048),
|
|
46
|
-
j2v.Number("quantity"),
|
|
47
|
-
j2v.Number("price"),
|
|
48
|
-
name="line_items",
|
|
49
|
-
max_length=32,
|
|
50
|
-
embed=True,
|
|
51
|
-
),
|
|
52
|
-
j2v.Category("returned", target=True, max_vocab_size=2),
|
|
41
|
+
import json2vec as jv
|
|
42
|
+
|
|
43
|
+
model = jv.Model.from_tree(
|
|
53
44
|
name="order",
|
|
54
45
|
d_model=64,
|
|
55
46
|
n_layers=2,
|
|
56
47
|
n_heads=4,
|
|
57
48
|
embed=True,
|
|
49
|
+
customer_tier=jv.Category(size=16),
|
|
50
|
+
line_items=jv.Branch(
|
|
51
|
+
length=32,
|
|
52
|
+
embed=True,
|
|
53
|
+
sku=jv.Category(size=2048),
|
|
54
|
+
quantity=jv.Number,
|
|
55
|
+
price=jv.Number,
|
|
56
|
+
),
|
|
57
|
+
returned=jv.Category(target=True, size=2),
|
|
58
58
|
)
|
|
59
59
|
```
|
|
60
60
|
|
|
@@ -77,8 +77,8 @@ to emit embeddings at configured addresses.
|
|
|
77
77
|
|
|
78
78
|
## Train With Lightning
|
|
79
79
|
|
|
80
|
-
`
|
|
81
|
-
`
|
|
80
|
+
`jv.Model` is a LightningModule. `jv.PolarsDataModule` and
|
|
81
|
+
`jv.StreamingDataModule` are LightningDataModule implementations. The schema
|
|
82
82
|
defines the model tree, typed losses, prediction outputs, and embeddings;
|
|
83
83
|
Lightning runs `fit`, `validate`, `test`, and `predict`.
|
|
84
84
|
|
|
@@ -87,23 +87,23 @@ import lightning.pytorch as lit
|
|
|
87
87
|
import polars as pl
|
|
88
88
|
import torch
|
|
89
89
|
|
|
90
|
-
import json2vec as j2v
|
|
90
|
+
import json2vec as jv
|
|
91
91
|
|
|
92
92
|
records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
|
|
93
93
|
|
|
94
|
-
model =
|
|
95
|
-
j2v.Number("sepal_length"),
|
|
96
|
-
j2v.Number("petal_length"),
|
|
97
|
-
j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
|
|
94
|
+
model = jv.Model.from_tree(
|
|
98
95
|
d_model=16,
|
|
99
96
|
n_layers=1,
|
|
100
97
|
n_heads=4,
|
|
101
98
|
batch_size=8,
|
|
102
99
|
embed=True,
|
|
103
100
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
101
|
+
sepal_length=jv.Number,
|
|
102
|
+
petal_length=jv.Number,
|
|
103
|
+
species=jv.Category(target=True, size=4, topk=[2]),
|
|
104
104
|
)
|
|
105
105
|
|
|
106
|
-
datamodule = j2v.PolarsDataModule(
|
|
106
|
+
datamodule = jv.PolarsDataModule(
|
|
107
107
|
model=model,
|
|
108
108
|
train=records,
|
|
109
109
|
validate=records,
|
|
@@ -131,7 +131,7 @@ trainer.fit(model=model, datamodule=datamodule)
|
|
|
131
131
|
For larger jobs, the same model can run through normal Lightning callbacks,
|
|
132
132
|
checkpointing, precision settings, device placement, and distributed
|
|
133
133
|
strategies. See
|
|
134
|
-
[Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning/).
|
|
134
|
+
[Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html).
|
|
135
135
|
|
|
136
136
|
## Predict And Embed
|
|
137
137
|
|
|
@@ -140,8 +140,8 @@ For small interactive batches, call `model.predict(...)` with raw dictionaries.
|
|
|
140
140
|
```python
|
|
141
141
|
predictions = model.predict(records.to_dicts()[:3])
|
|
142
142
|
|
|
143
|
-
species = predictions[
|
|
144
|
-
record = predictions[
|
|
143
|
+
species = predictions[jv.Address("record", "species")]
|
|
144
|
+
record = predictions[jv.Address("record")]
|
|
145
145
|
|
|
146
146
|
print(species["content"]["value"])
|
|
147
147
|
print(species["content"]["probability"])
|
|
@@ -149,10 +149,10 @@ print(record["embedding"])
|
|
|
149
149
|
```
|
|
150
150
|
|
|
151
151
|
For larger offline jobs, configure a `predict` split on a data module and attach
|
|
152
|
-
`
|
|
152
|
+
`jv.Writer` to Lightning's prediction loop.
|
|
153
153
|
|
|
154
154
|
```python
|
|
155
|
-
writer =
|
|
155
|
+
writer = jv.Writer("predictions")
|
|
156
156
|
|
|
157
157
|
trainer = lit.Trainer(
|
|
158
158
|
accelerator="cpu",
|
|
@@ -160,7 +160,7 @@ trainer = lit.Trainer(
|
|
|
160
160
|
logger=False,
|
|
161
161
|
)
|
|
162
162
|
|
|
163
|
-
predict_datamodule = j2v.PolarsDataModule(
|
|
163
|
+
predict_datamodule = jv.PolarsDataModule(
|
|
164
164
|
model=model,
|
|
165
165
|
predict=records.drop("species"),
|
|
166
166
|
num_workers=0,
|
|
@@ -174,8 +174,8 @@ trainer.predict(model=model, datamodule=predict_datamodule)
|
|
|
174
174
|
`Writer` creates rank-partitioned Parquet files such as
|
|
175
175
|
`predictions/rank-0.parquet`. Use a postprocessor when downstream systems need
|
|
176
176
|
flat columns, renamed addresses, redacted payloads, or fewer fields. See
|
|
177
|
-
[Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference
|
|
178
|
-
and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors
|
|
177
|
+
[Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
|
|
178
|
+
and [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html).
|
|
179
179
|
|
|
180
180
|
## Learning Modes
|
|
181
181
|
|
|
@@ -215,7 +215,7 @@ Choose the data module by where the records live:
|
|
|
215
215
|
arguments are compiled regular expressions matched against discovered file
|
|
216
216
|
paths.
|
|
217
217
|
|
|
218
|
-
See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules
|
|
218
|
+
See [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
|
|
219
219
|
for split configuration, sharding, sampling, buffers, and preprocessors.
|
|
220
220
|
|
|
221
221
|
## What Makes This Different
|
|
@@ -226,7 +226,7 @@ for split configuration, sharding, sampling, buffers, and preprocessors.
|
|
|
226
226
|
missing-state handling, masking, decoding, loss, metrics, and output writing.
|
|
227
227
|
- **Unified training roles:** `target=True`, `p_prune`, and `p_mask` all use the
|
|
228
228
|
same reconstruction path.
|
|
229
|
-
- **Embedding trees:** embeddings can come from the root,
|
|
229
|
+
- **Embedding trees:** embeddings can come from the root, branches, or selected
|
|
230
230
|
leaves.
|
|
231
231
|
- **Schema evolution:** fields can be added, removed, updated, reset, or
|
|
232
232
|
temporarily overridden after construction.
|
|
@@ -279,55 +279,53 @@ uv sync --extra docs
|
|
|
279
279
|
|
|
280
280
|
The `text` extra installs Hugging Face `transformers`. The `serving` extra
|
|
281
281
|
installs FastAPI-backed deployment dependencies. The `docs` extra installs the
|
|
282
|
-
|
|
282
|
+
Python packages used by the Quarto docs.
|
|
283
283
|
|
|
284
284
|
## Documentation Map
|
|
285
285
|
|
|
286
286
|
Start with:
|
|
287
287
|
|
|
288
|
-
- [Getting Started](https://json2vec.github.io/json2vec/getting-started
|
|
289
|
-
- [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart
|
|
290
|
-
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree
|
|
291
|
-
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths
|
|
292
|
-
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types
|
|
293
|
-
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings
|
|
294
|
-
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning
|
|
295
|
-
- [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules
|
|
296
|
-
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference
|
|
297
|
-
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
288
|
+
- [Getting Started](https://json2vec.github.io/json2vec/getting-started.html)
|
|
289
|
+
- [AI / Expert Quickstart](https://json2vec.github.io/json2vec/ai-quickstart.html)
|
|
290
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree.html)
|
|
291
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths.html)
|
|
292
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types.html)
|
|
293
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings.html)
|
|
294
|
+
- [Training With Lightning](https://json2vec.github.io/json2vec/guides/lightning.html)
|
|
295
|
+
- [Data Modules](https://json2vec.github.io/json2vec/guides/data-modules.html)
|
|
296
|
+
- [Batch Inference](https://json2vec.github.io/json2vec/guides/batch-inference.html)
|
|
298
297
|
|
|
299
298
|
Tutorials and guides:
|
|
300
299
|
|
|
301
|
-
- [
|
|
302
|
-
- [
|
|
303
|
-
- [
|
|
304
|
-
- [
|
|
305
|
-
- [
|
|
306
|
-
- [
|
|
307
|
-
- [
|
|
308
|
-
- [
|
|
309
|
-
- [
|
|
310
|
-
- [
|
|
311
|
-
- [
|
|
312
|
-
- [
|
|
300
|
+
- [Postprocessors](https://json2vec.github.io/json2vec/guides/postprocessors.html)
|
|
301
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance.html)
|
|
302
|
+
- [Field Stacking](https://json2vec.github.io/json2vec/guides/field-stacking.html)
|
|
303
|
+
- [Branch](https://json2vec.github.io/json2vec/data-types/branch.html)
|
|
304
|
+
- [Number](https://json2vec.github.io/json2vec/data-types/number.html)
|
|
305
|
+
- [Category](https://json2vec.github.io/json2vec/data-types/category.html)
|
|
306
|
+
- [Set](https://json2vec.github.io/json2vec/data-types/set.html)
|
|
307
|
+
- [Entity](https://json2vec.github.io/json2vec/data-types/entity.html)
|
|
308
|
+
- [DateParts](https://json2vec.github.io/json2vec/data-types/dateparts.html)
|
|
309
|
+
- [Vector](https://json2vec.github.io/json2vec/data-types/vector.html)
|
|
310
|
+
- [Text](https://json2vec.github.io/json2vec/data-types/text.html)
|
|
311
|
+
- [Device Tenure Case Study](https://json2vec.github.io/json2vec/case-studies/device-tenure.html)
|
|
313
312
|
|
|
314
313
|
Build the docs locally with:
|
|
315
314
|
|
|
316
315
|
```bash
|
|
317
|
-
|
|
316
|
+
make render
|
|
318
317
|
```
|
|
319
318
|
|
|
320
319
|
## Repository Layout
|
|
321
320
|
|
|
322
321
|
- `src/json2vec/architecture`: model assembly, attention, pooling, and routing
|
|
323
|
-
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
|
|
322
|
+
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline and preprocessor exports
|
|
324
323
|
- `src/json2vec/inference`: serving and prediction callbacks
|
|
325
324
|
- `src/json2vec/logging`: runtime logging callbacks
|
|
326
|
-
- `src/json2vec/preprocessors`: preprocessor registry
|
|
327
325
|
- `src/json2vec/structs`: pydantic config models, enums, and tree nodes
|
|
328
326
|
- `src/json2vec/tensorfields`: tensorfield plugin system and built-in fields
|
|
329
327
|
- `tests/`: package test suite
|
|
330
|
-
- `docs/`:
|
|
328
|
+
- `docs/`: Quarto project, pages, guides, stylesheets, and sample data
|
|
331
329
|
|
|
332
330
|
## Development
|
|
333
331
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "json2vec"
|
|
3
|
-
version = "0.4.8"
|
|
3
|
+
version = "0.4.10"
|
|
4
4
|
description = "Schema-first PyTorch models for hierarchical / nested / sequence data structures"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -32,13 +32,7 @@ text = [
|
|
|
32
32
|
"transformers>=4.55.0",
|
|
33
33
|
]
|
|
34
34
|
docs = [
|
|
35
|
-
"
|
|
36
|
-
"mkdocs-material>=9.6",
|
|
37
|
-
"mkdocs-jupyter>=0.26.3",
|
|
38
|
-
"mkdocstrings[python]>=0.27",
|
|
39
|
-
"orjson>=3.10.0",
|
|
40
|
-
"pydantic-settings>=2.10.1",
|
|
41
|
-
"uvicorn>=0.38.0",
|
|
35
|
+
"marimo>=0.23.8",
|
|
42
36
|
]
|
|
43
37
|
|
|
44
38
|
[dependency-groups]
|
|
@@ -1,24 +1,38 @@
|
|
|
1
1
|
"""Public `json2vec` SDK surface.
|
|
2
2
|
|
|
3
3
|
The top-level package exports the constructors and helpers used by most
|
|
4
|
-
applications: `Model.
|
|
4
|
+
applications: `Model.from_tree(...)` for model construction, tensorfield
|
|
5
5
|
request constructors such as `Category` and `Number`, data modules, schema
|
|
6
6
|
mutation predicates, and the `@preprocess` decorator.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
9
|
from typing import TYPE_CHECKING, Any
|
|
10
10
|
|
|
11
|
+
from json2vec import helpers as helpers
|
|
12
|
+
from json2vec.architecture.checkpoint import RollbackCheckpoint
|
|
13
|
+
from json2vec.architecture.mutations import MutationLockCallback, RuntimePlacementCallback
|
|
11
14
|
from json2vec.architecture.root import (
|
|
12
15
|
Model,
|
|
13
|
-
MutationLockCallback,
|
|
14
16
|
OptimizerConfig,
|
|
15
|
-
RollbackCheckpoint,
|
|
16
|
-
RuntimePlacementCallback,
|
|
17
17
|
SchedulerConfig,
|
|
18
18
|
)
|
|
19
19
|
from json2vec.data.datasets import CustomDataModule, PolarsDataModule, StreamingDataModule
|
|
20
|
-
from json2vec.
|
|
21
|
-
from json2vec.
|
|
20
|
+
from json2vec.data.nested import MASK_LITERAL, MaskLiteral
|
|
21
|
+
from json2vec.data.processors import (
|
|
22
|
+
Metadata,
|
|
23
|
+
Observation,
|
|
24
|
+
Postprocessor,
|
|
25
|
+
PostprocessorProvider,
|
|
26
|
+
PostprocessorResult,
|
|
27
|
+
Predictions,
|
|
28
|
+
Preprocessor,
|
|
29
|
+
PreprocessorProvider,
|
|
30
|
+
RawBatch,
|
|
31
|
+
RawObservation,
|
|
32
|
+
postprocess,
|
|
33
|
+
preprocess,
|
|
34
|
+
)
|
|
35
|
+
from json2vec.inference.callback import Writer
|
|
22
36
|
from json2vec.structs.enums import (
|
|
23
37
|
AttentionMode,
|
|
24
38
|
Component,
|
|
@@ -31,14 +45,14 @@ from json2vec.structs.enums import (
|
|
|
31
45
|
Tokens,
|
|
32
46
|
)
|
|
33
47
|
from json2vec.structs.experiment import (
|
|
34
|
-
Hyperparameters,
|
|
35
48
|
NodeAttribute,
|
|
36
49
|
NodePredicate,
|
|
50
|
+
Schema,
|
|
37
51
|
SchemaField,
|
|
38
52
|
predicate,
|
|
39
53
|
where,
|
|
40
54
|
)
|
|
41
|
-
from json2vec.structs.structure import
|
|
55
|
+
from json2vec.structs.structure import Branch, Mask
|
|
42
56
|
from json2vec.structs.tree import Address, Leaf
|
|
43
57
|
from json2vec.tensorfields import TENSORFIELDS, DecoderBase, EmbedderBase, Plugin, RequestBase, TensorFieldBase
|
|
44
58
|
from json2vec.tensorfields.extensions.category import Request as Category
|
|
@@ -95,7 +109,7 @@ def __dir__() -> list[str]:
|
|
|
95
109
|
__all__ = [
|
|
96
110
|
"Address",
|
|
97
111
|
"Accelerator",
|
|
98
|
-
"
|
|
112
|
+
"Branch",
|
|
99
113
|
"AttentionMode",
|
|
100
114
|
"Category",
|
|
101
115
|
"Component",
|
|
@@ -105,25 +119,35 @@ __all__ = [
|
|
|
105
119
|
"Deployment",
|
|
106
120
|
"EmbedderBase",
|
|
107
121
|
"Entity",
|
|
108
|
-
"Hyperparameters",
|
|
122
|
+
"helpers",
|
|
123
|
+
"Schema",
|
|
109
124
|
"Input",
|
|
110
125
|
"JSONBackend",
|
|
111
126
|
"Leaf",
|
|
112
127
|
"Metric",
|
|
128
|
+
"MASK_LITERAL",
|
|
129
|
+
"Mask",
|
|
130
|
+
"MaskLiteral",
|
|
131
|
+
"Metadata",
|
|
113
132
|
"Model",
|
|
114
133
|
"ModelSource",
|
|
115
134
|
"MutationLockCallback",
|
|
116
135
|
"NodeAttribute",
|
|
117
136
|
"NodePredicate",
|
|
118
137
|
"Number",
|
|
138
|
+
"Observation",
|
|
119
139
|
"OptimizerConfig",
|
|
120
140
|
"Overflow",
|
|
121
|
-
"PREPROCESSORS",
|
|
122
141
|
"Plugin",
|
|
123
142
|
"PolarsDataModule",
|
|
124
143
|
"Postprocessor",
|
|
144
|
+
"PostprocessorProvider",
|
|
145
|
+
"PostprocessorResult",
|
|
146
|
+
"Predictions",
|
|
125
147
|
"Preprocessor",
|
|
126
|
-
"
|
|
148
|
+
"PreprocessorProvider",
|
|
149
|
+
"RawBatch",
|
|
150
|
+
"RawObservation",
|
|
127
151
|
"RequestBase",
|
|
128
152
|
"RollbackCheckpoint",
|
|
129
153
|
"RuntimePlacementCallback",
|
|
@@ -144,6 +168,7 @@ __all__ = [
|
|
|
144
168
|
"VocabularySyncCallback",
|
|
145
169
|
"Writer",
|
|
146
170
|
"predicate",
|
|
171
|
+
"postprocess",
|
|
147
172
|
"preprocess",
|
|
148
173
|
"where",
|
|
149
174
|
]
|