json2vec 0.2.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json2vec-0.4.0/PKG-INFO +355 -0
- json2vec-0.4.0/README.md +321 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/pyproject.toml +16 -12
- json2vec-0.4.0/src/json2vec/__init__.py +84 -0
- json2vec-0.4.0/src/json2vec/architecture/attention.py +75 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/encoder.py +36 -20
- json2vec-0.4.0/src/json2vec/architecture/node.py +39 -0
- json2vec-0.4.0/src/json2vec/architecture/plot.py +560 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/pool.py +17 -1
- json2vec-0.4.0/src/json2vec/architecture/root.py +812 -0
- json2vec-0.4.0/src/json2vec/data/datasets/__init__.py +36 -0
- json2vec-0.4.0/src/json2vec/data/datasets/base.py +94 -0
- json2vec-0.4.0/src/json2vec/data/datasets/polars.py +353 -0
- json2vec-0.4.0/src/json2vec/data/datasets/streaming.py +492 -0
- json2vec-0.4.0/src/json2vec/data/iterables.py +284 -0
- json2vec-0.4.0/src/json2vec/distributed.py +53 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/inference/callback.py +48 -25
- json2vec-0.4.0/src/json2vec/inference/deployment.py +396 -0
- json2vec-0.4.0/src/json2vec/logging/__init__.py +4 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/logging/config.py +2 -7
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/logging/epoch.py +2 -2
- json2vec-0.4.0/src/json2vec/logging/throughput.py +60 -0
- json2vec-0.4.0/src/json2vec/preprocessors/__init__.py +14 -0
- json2vec-0.4.0/src/json2vec/preprocessors/base.py +160 -0
- json2vec-0.4.0/src/json2vec/preprocessors/extensions/__init__.py +1 -0
- {json2vec-0.2.0/src/json2vec/processors → json2vec-0.4.0/src/json2vec/preprocessors}/spec.py +1 -1
- json2vec-0.4.0/src/json2vec/structs/enums.py +128 -0
- json2vec-0.4.0/src/json2vec/structs/experiment.py +746 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/structs/packages.py +35 -6
- json2vec-0.4.0/src/json2vec/structs/structure.py +56 -0
- json2vec-0.4.0/src/json2vec/structs/tree.py +210 -0
- json2vec-0.4.0/src/json2vec/tensorfields/__init__.py +25 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/base.py +136 -43
- json2vec-0.4.0/src/json2vec/tensorfields/extensions/__init__.py +17 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/category.py +102 -178
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/dateparts.py +33 -32
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/entity.py +36 -39
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/number.py +72 -47
- json2vec-0.4.0/src/json2vec/tensorfields/extensions/set.py +418 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/text.py +79 -89
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/vector.py +38 -38
- json2vec-0.4.0/src/json2vec/tensorfields/shared/__init__.py +12 -0
- json2vec-0.4.0/src/json2vec/tensorfields/shared/counter.py +175 -0
- json2vec-0.4.0/src/json2vec/tensorfields/shared/vocabulary.py +286 -0
- json2vec-0.4.0/src/json2vec.egg-info/PKG-INFO +355 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/SOURCES.txt +18 -14
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/requires.txt +10 -8
- json2vec-0.4.0/tests/test_callbacks.py +56 -0
- json2vec-0.4.0/tests/test_public_api.py +26 -0
- json2vec-0.2.0/PKG-INFO +0 -264
- json2vec-0.2.0/README.md +0 -230
- json2vec-0.2.0/src/json2vec/__main__.py +0 -32
- json2vec-0.2.0/src/json2vec/architecture/attention.py +0 -64
- json2vec-0.2.0/src/json2vec/architecture/counter.py +0 -37
- json2vec-0.2.0/src/json2vec/architecture/node.py +0 -34
- json2vec-0.2.0/src/json2vec/architecture/root.py +0 -338
- json2vec-0.2.0/src/json2vec/data/datasets.py +0 -518
- json2vec-0.2.0/src/json2vec/entrypoints/__init__.py +0 -3
- json2vec-0.2.0/src/json2vec/entrypoints/pipeline.py +0 -174
- json2vec-0.2.0/src/json2vec/inference/deployment.py +0 -175
- json2vec-0.2.0/src/json2vec/logging/__init__.py +0 -0
- json2vec-0.2.0/src/json2vec/logging/throughput.py +0 -39
- json2vec-0.2.0/src/json2vec/logging/tracking.py +0 -152
- json2vec-0.2.0/src/json2vec/processors/__init__.py +0 -8
- json2vec-0.2.0/src/json2vec/processors/base.py +0 -109
- json2vec-0.2.0/src/json2vec/processors/extensions/__init__.py +0 -0
- json2vec-0.2.0/src/json2vec/processors/extensions/example.py +0 -6
- json2vec-0.2.0/src/json2vec/structs/__init__.py +0 -0
- json2vec-0.2.0/src/json2vec/structs/enums.py +0 -84
- json2vec-0.2.0/src/json2vec/structs/environment.py +0 -138
- json2vec-0.2.0/src/json2vec/structs/experiment.py +0 -330
- json2vec-0.2.0/src/json2vec/structs/structure.py +0 -70
- json2vec-0.2.0/src/json2vec/structs/tree.py +0 -92
- json2vec-0.2.0/src/json2vec/tensorfields/__init__.py +0 -8
- json2vec-0.2.0/src/json2vec/tensorfields/extensions/__init__.py +0 -0
- json2vec-0.2.0/src/json2vec.egg-info/PKG-INFO +0 -264
- json2vec-0.2.0/src/json2vec.egg-info/entry_points.txt +0 -2
- {json2vec-0.2.0 → json2vec-0.4.0}/LICENSE +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/NOTICE +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/setup.cfg +0 -0
- {json2vec-0.2.0/src/json2vec → json2vec-0.4.0/src/json2vec/architecture}/__init__.py +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/rotary.py +0 -0
- {json2vec-0.2.0/src/json2vec/architecture → json2vec-0.4.0/src/json2vec/data}/__init__.py +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/data/processing.py +0 -0
- {json2vec-0.2.0/src/json2vec/data → json2vec-0.4.0/src/json2vec/inference}/__init__.py +0 -0
- {json2vec-0.2.0/src/json2vec/inference → json2vec-0.4.0/src/json2vec/structs}/__init__.py +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/spec.py +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/dependency_links.txt +0 -0
- {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/top_level.txt +0 -0
json2vec-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: json2vec
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: JSON -> [*]
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
License-File: NOTICE
|
|
10
|
+
Requires-Dist: beartype>=0.21.0
|
|
11
|
+
Requires-Dist: pluggy>=1.6.0
|
|
12
|
+
Requires-Dist: rich>=14.0.0
|
|
13
|
+
Requires-Dist: pydantic>=2.11.7
|
|
14
|
+
Requires-Dist: jmespath>=1.0.1
|
|
15
|
+
Requires-Dist: loguru>=0.7.3
|
|
16
|
+
Requires-Dist: anytree>=2.13.0
|
|
17
|
+
Requires-Dist: ordered-set>=4.1.0
|
|
18
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
19
|
+
Requires-Dist: polars>=1.35.2
|
|
20
|
+
Requires-Dist: numpy>=2.2.6
|
|
21
|
+
Requires-Dist: lightning>=2.6.4
|
|
22
|
+
Requires-Dist: tensordict>=0.10.0
|
|
23
|
+
Requires-Dist: torch>=2.7.1
|
|
24
|
+
Provides-Extra: serving
|
|
25
|
+
Requires-Dist: litserve>=0.2.13; extra == "serving"
|
|
26
|
+
Requires-Dist: pydantic-settings>=2.10.1; extra == "serving"
|
|
27
|
+
Provides-Extra: text
|
|
28
|
+
Requires-Dist: transformers>=4.55.0; extra == "text"
|
|
29
|
+
Provides-Extra: docs
|
|
30
|
+
Requires-Dist: mkdocs-material>=9.6; extra == "docs"
|
|
31
|
+
Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
|
|
32
|
+
Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
<p align="center">
|
|
36
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="JSON2Vec logo" width="180">
|
|
37
|
+
</p>
|
|
38
|
+
|
|
39
|
+
<h1 align="center">JSON2Vec</h1>
|
|
40
|
+
|
|
41
|
+
<p align="center">
|
|
42
|
+
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
43
|
+
<a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
|
|
44
|
+
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&logoColor=white" /></a>
|
|
45
|
+
<!-- discord-invite:start -->
|
|
46
|
+
<a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&logoColor=white" /></a>
|
|
47
|
+
<!-- discord-invite:end -->
|
|
48
|
+
</p>
|
|
49
|
+
|
|
50
|
+
`json2vec` is a schema-driven framework for predictive modeling over nested,
|
|
51
|
+
structured records without flattening them into a fixed feature table first.
|
|
52
|
+
|
|
53
|
+
The schema becomes the encoder: leaf tensorfield plugins encode raw values,
|
|
54
|
+
array nodes aggregate child embeddings with transformer layers, and
|
|
55
|
+
datatype-specific decoders reconstruct masked, targeted, or supervised fields
|
|
56
|
+
from the surrounding hierarchy.
|
|
57
|
+
|
|
58
|
+
This supports self-supervised pretraining, supervised targets, embeddings, and
|
|
59
|
+
schema evolution in one model surface. Customer/account/transaction data,
|
|
60
|
+
flight itineraries, order fulfillment events, clickstream sessions, and other
|
|
61
|
+
nested records can use the same machinery while keeping proprietary data,
|
|
62
|
+
schemas, and checkpoints private.
|
|
63
|
+
|
|
64
|
+
## What Makes This Different
|
|
65
|
+
|
|
66
|
+
- **Attributed embeddings.** The model can emit embeddings from any configured
|
|
67
|
+
field or array, not only from the root. That makes branch-level similarity and
|
|
68
|
+
retrieval workflows possible without flattening the record.
|
|
69
|
+
- **Extensible data types for predictive modeling.** Masked values,
|
|
70
|
+
targeted fields, and explicit supervised targets all flow through the same
|
|
71
|
+
datatype-specific heads. A new
|
|
72
|
+
[tensorfield type](https://json2vec.github.io/json2vec/guides/tensorfields/) brings its own embedding,
|
|
73
|
+
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
74
|
+
grow.
|
|
75
|
+
- **Schema evolution is a first-class workflow.** Between training loops
|
|
76
|
+
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
77
|
+
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
78
|
+
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
79
|
+
See the [model update guide](https://json2vec.github.io/json2vec/guides/model-update/).
|
|
80
|
+
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
81
|
+
`valued` are distinct states in the tensorfield type system.
|
|
82
|
+
They are not collapsed into one generic missing-value bucket.
|
|
83
|
+
- **Online state lives with the model.** Stateful components such as category
|
|
84
|
+
vocabularies, counters, and numeric normalization state are learned during
|
|
85
|
+
streaming training and serialized with checkpoints, so deployment does not
|
|
86
|
+
depend on a parallel tokenizer or normalizer artifact.
|
|
87
|
+
- **Training-serving parity.** The same configured graph is used for fitting,
|
|
88
|
+
validation, testing, batch prediction, and LitServe-backed online inference.
|
|
89
|
+
- **Target-trained counterfactuals.** Training can remove whole
|
|
90
|
+
field instances with `target=True` or `p_prune`, not just mask individual
|
|
91
|
+
values. At inference time, schema overrides support ablation questions such
|
|
92
|
+
as "what changes if device data is unavailable?" without retraining a separate
|
|
93
|
+
model for every feature-removal scenario.
|
|
94
|
+
|
|
95
|
+
## Where It Fits
|
|
96
|
+
|
|
97
|
+
Use `json2vec` when the hierarchy is part of the signal:
|
|
98
|
+
|
|
99
|
+
- customer, account, transaction, statement, device, and session records
|
|
100
|
+
- flight itineraries, legs, segments, and events
|
|
101
|
+
- orders, shipments, fulfillment events, and support histories
|
|
102
|
+
- entities with repeated sub-objects, evolving schemas, and mixed datatypes
|
|
103
|
+
- embedding retrieval, anomaly detection, counterfactual ablation, and
|
|
104
|
+
multi-target prediction over nested records
|
|
105
|
+
|
|
106
|
+
For more context on the modeling problem, read
|
|
107
|
+
[Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/).
|
|
108
|
+
|
|
109
|
+
## What It Does Not Do
|
|
110
|
+
|
|
111
|
+
`json2vec` stops at the representation and typed prediction layer. It does not
|
|
112
|
+
try to be a feature store, governance system, rule engine, authorization layer,
|
|
113
|
+
decision-capture system, or audit platform. Those systems can consume
|
|
114
|
+
`json2vec` embeddings and predictions, but their policies and operational
|
|
115
|
+
controls remain separate concerns.
|
|
116
|
+
|
|
117
|
+
It also does not require users to publish data, schemas, checkpoints, or model
|
|
118
|
+
parameters. The open-source layer is the reusable encoder and runtime
|
|
119
|
+
infrastructure. Your data stays yours, and so do your parameters.
|
|
120
|
+
The framework works under the assumption that model parameters will not be shared.
|
|
121
|
+
|
|
122
|
+
## What Is In This Repository
|
|
123
|
+
|
|
124
|
+
This repository currently contains:
|
|
125
|
+
|
|
126
|
+
- the core library under `src/json2vec/`
|
|
127
|
+
- tensorfield plugins for `number`, `category`, `set`, `dateparts`, `entity`, `vector`, and `text`
|
|
128
|
+
- a preprocessor registry for dataset-specific preprocessing
|
|
129
|
+
- a LitServe deployment entrypoint for serving from checkpoints
|
|
130
|
+
- tests covering structure loading, data processing, tensorfields, training helpers, logging, and inference
|
|
131
|
+
- rendered tutorial and guide notebooks under [`docs/`](https://json2vec.github.io/json2vec/)
|
|
132
|
+
- diagrams plus whitepaper in [`docs/`](https://json2vec.github.io/json2vec/)
|
|
133
|
+
|
|
134
|
+
## Install
|
|
135
|
+
|
|
136
|
+
For local development:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
uv sync
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The package requires Python `>=3.12`.
|
|
143
|
+
|
|
144
|
+
## Hello World Notebook
|
|
145
|
+
|
|
146
|
+
The [hello world notebook](https://json2vec.github.io/json2vec/tutorials/hello-world/) trains a tiny model
|
|
147
|
+
from the bundled Iris JSONL buffer. It demonstrates the full loop: create a
|
|
148
|
+
Polars DataFrame, declare a schema, train a supervised category target, then
|
|
149
|
+
call `predict` and `embed`.
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
import lightning.pytorch as lit
|
|
153
|
+
import polars as pl
|
|
154
|
+
import torch
|
|
155
|
+
from rich.pretty import pprint
|
|
156
|
+
|
|
157
|
+
import json2vec as j2v
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
|
|
161
|
+
|
|
162
|
+
model = j2v.Model.from_schema(
|
|
163
|
+
j2v.Number("sepal_length"),
|
|
164
|
+
j2v.Number("petal_length"),
|
|
165
|
+
j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
|
|
166
|
+
d_model=16,
|
|
167
|
+
n_layers=1,
|
|
168
|
+
n_heads=4,
|
|
169
|
+
batch_size=8,
|
|
170
|
+
embed=True,
|
|
171
|
+
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
datamodule = j2v.PolarsDataModule.from_model(
|
|
175
|
+
model,
|
|
176
|
+
train=records,
|
|
177
|
+
validate=records,
|
|
178
|
+
num_workers=0,
|
|
179
|
+
persistent_workers=False,
|
|
180
|
+
pin_memory=False,
|
|
181
|
+
observation_buffer_size=32,
|
|
182
|
+
sample_rate=1.0,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
trainer = lit.Trainer(
|
|
186
|
+
accelerator="cpu",
|
|
187
|
+
max_epochs=1,
|
|
188
|
+
logger=False,
|
|
189
|
+
enable_progress_bar=False,
|
|
190
|
+
enable_model_summary=False,
|
|
191
|
+
enable_checkpointing=False,
|
|
192
|
+
limit_train_batches=1,
|
|
193
|
+
limit_val_batches=1,
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
trainer.fit(model=model, datamodule=datamodule)
|
|
197
|
+
|
|
198
|
+
batch = [[record] for record in records.to_dicts()[:3]]
|
|
199
|
+
|
|
200
|
+
pprint(model.predict(batch))
|
|
201
|
+
pprint(model.embed(batch))
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
The prediction call returns a typed result for `record/species`. The embedding
|
|
205
|
+
call returns the configured `record` embedding for each input observation.
|
|
206
|
+
|
|
207
|
+
## Documentation
|
|
208
|
+
|
|
209
|
+
The tutorial examples live as self-contained notebooks under `docs/` and are
|
|
210
|
+
rendered in the documentation site. Build the site locally with:
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
uv run --extra docs mkdocs build --strict
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
Useful entry points:
|
|
217
|
+
|
|
218
|
+
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
219
|
+
- [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/)
|
|
220
|
+
- [Schemas & Queries](https://json2vec.github.io/json2vec/guides/model-schemas/)
|
|
221
|
+
- [Model Updates](https://json2vec.github.io/json2vec/guides/model-update/)
|
|
222
|
+
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
223
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
224
|
+
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
225
|
+
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
226
|
+
- [Field Ablation](https://json2vec.github.io/json2vec/guides/field-ablation/)
|
|
227
|
+
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
228
|
+
- [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
|
|
229
|
+
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
230
|
+
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
231
|
+
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
232
|
+
|
|
233
|
+
## Core Concepts
|
|
234
|
+
|
|
235
|
+
- `Model.from_schema(...)` builds the model tree plus masking, targeting, and embedding controls.
|
|
236
|
+
- `Array` nodes describe hierarchical grouping and aggregation.
|
|
237
|
+
- Field `Request` nodes declare a `type`, a `query`, and type-specific options.
|
|
238
|
+
- `Address` values are stable paths such as `record/account/transaction/amount`.
|
|
239
|
+
- `jmespath` queries extract values from each observation.
|
|
240
|
+
- `TensorField` instances preserve typed content plus state tokens such as
|
|
241
|
+
`valued`, `null`, `padded`, and `masked`.
|
|
242
|
+
- `Parcel` objects carry embeddings from leaves to parent arrays and then up
|
|
243
|
+
the tree.
|
|
244
|
+
- `heritage` is the path from a leaf to the root; decoders use that path when
|
|
245
|
+
reconstructing masked, targeted, or supervised fields.
|
|
246
|
+
|
|
247
|
+
For large local or cloud-hosted datasets, `StreamingDataModule` supports these
|
|
248
|
+
dataset suffixes:
|
|
249
|
+
|
|
250
|
+
- `ndjson`
|
|
251
|
+
- `parquet`
|
|
252
|
+
- `feather`
|
|
253
|
+
- `avro`
|
|
254
|
+
- `csv`
|
|
255
|
+
- `orc`
|
|
256
|
+
- `json`
|
|
257
|
+
|
|
258
|
+
Supported dataset roots are local paths and `s3://...` URIs.
|
|
259
|
+
|
|
260
|
+
## How The Graph Runs
|
|
261
|
+
|
|
262
|
+
For each batch:
|
|
263
|
+
|
|
264
|
+
1. Each field request extracts values with its `jmespath` query.
|
|
265
|
+
2. The matching tensorfield plugin tensorizes values, updates online state when
|
|
266
|
+
allowed for the current split, and records trainable targets when masking or
|
|
267
|
+
targeting occurs.
|
|
268
|
+
3. Leaf embedders emit parcels to their parent arrays.
|
|
269
|
+
4. Array nodes run bottom-up, aggregate child parcels, and emit parent context.
|
|
270
|
+
5. Leaf decoders consume their context path to reconstruct trainable targets.
|
|
271
|
+
|
|
272
|
+
Random `p_mask` corrupts individual values. Random `p_prune` removes whole
|
|
273
|
+
field instances across an observation. `target=True` is shorthand for
|
|
274
|
+
`p_prune=1.0`; `embed=True` exposes embeddings during prediction.
|
|
275
|
+
|
|
276
|
+
## Preprocessor Model
|
|
277
|
+
|
|
278
|
+
Preprocessors are optional registered Python callables. See the
|
|
279
|
+
[preprocessor guide](https://json2vec.github.io/json2vec/guides/preprocessors/) for examples. If no
|
|
280
|
+
preprocessor is configured, each observation is used as-is without calling a
|
|
281
|
+
default function.
|
|
282
|
+
|
|
283
|
+
Custom preprocessors are registered with `@preprocess(yields=False)` for single-object transformations or `@preprocess(yields=True)` for generators.
|
|
284
|
+
|
|
285
|
+
- transformation preprocessors must return a single `dict`
|
|
286
|
+
- generator preprocessors may yield `dict` objects or return a `list[dict]`
|
|
287
|
+
- every emitted object is wrapped as a single-item root array before tensorization
|
|
288
|
+
|
|
289
|
+
Configured `dataset.kwargs` are passed into the preprocessor, with unsupported keyword arguments automatically ignored.
|
|
290
|
+
|
|
291
|
+
## Tensorfield Plugins
|
|
292
|
+
|
|
293
|
+
Each tensorfield plugin provides a request schema plus the model components
|
|
294
|
+
needed to encode values, decode predictions, compute losses, and optionally
|
|
295
|
+
serialize outputs. See [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
|
|
296
|
+
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
297
|
+
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
298
|
+
`p_mask`, and `p_prune`.
|
|
299
|
+
|
|
300
|
+
| Type | Use It For | Key Options |
|
|
301
|
+
| --- | --- | --- |
|
|
302
|
+
| `number` | Scalar numeric values. Values are padded with explicit state tokens, normalized online during training, embedded with learned Fourier features, and decoded as regression targets. | `jitter`, `n_bands`, `offset`, `alpha`, `objective` (`mae`, `mse`, `huber`) |
|
|
303
|
+
| `category` | Single-label categorical values with an online vocabulary stored in the checkpoint. Unknown or overflow labels route to a reserved unavailable bucket instead of becoming `null`. Prediction output includes label probabilities and optional top-k candidates. | `max_vocab_size`, `n_bands`, `p_unavailable`, `topk` |
|
|
304
|
+
| `set` | Unordered collections of categorical labels, encoded as a multi-hot vector over an online vocabulary. Strings are treated as one-item sets, iterables as many-item sets, and unknown labels use the reserved unavailable bucket. | `max_vocab_size`, `p_unavailable` |
|
|
305
|
+
| `dateparts` | Datetime values represented through selected calendar/time components. Inputs may be native datetimes or strings parsed with a configured pattern. | `dateparts` (`day_of_year`, `week_of_year`, `month_of_year`, `day_of_month`, `week_of_month`, `day_of_week`, `hour_of_day`, `minute_of_hour`), `pattern` |
|
|
306
|
+
| `entity` | Hashable identifiers where the useful signal is equality or co-occurrence within the current observation rather than a global vocabulary. Values are re-indexed locally per observation and require at least two slots per observation. | `topk` |
|
|
307
|
+
| `vector` | Fixed-width numeric embeddings or dense feature vectors supplied by another model or system. Inputs may be lists, tuples, 1D NumPy arrays, or 1D Torch tensors and are projected into `d_model`. | `n_dim`, `objective` (`l1`, `l2`) |
|
|
308
|
+
| `text` | String values encoded by a frozen Hugging Face `AutoModel`, pooled, and projected into `d_model`. Masked or targeted text is trained by reconstructing the encoder representation rather than generating text. | `model_name`, `max_length`, `encoder_batch_size`, `encoder_pooling` (`cls`, `mean`, `pooler`), `objective` (`l1`, `l2`), `revision`, `local_files_only` |
|
|
309
|
+
|
|
310
|
+
The `text` tensorfield requires the optional `transformers` dependency, which is
|
|
311
|
+
not installed by default:
|
|
312
|
+
|
|
313
|
+
```bash
|
|
314
|
+
uv sync --extra text
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
## Community
|
|
318
|
+
|
|
319
|
+
Join the Discord channel for questions, design discussion, and release notes:
|
|
320
|
+
<https://discord.gg/DVyZUkvTFA>
|
|
321
|
+
|
|
322
|
+
## Repository Layout
|
|
323
|
+
|
|
324
|
+
- `src/json2vec/architecture`: model assembly, attention, pooling, and parcel routing
|
|
325
|
+
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
|
|
326
|
+
- `src/json2vec/inference`: serving and prediction callbacks
|
|
327
|
+
- `src/json2vec/logging`: runtime logging callbacks
|
|
328
|
+
- `src/json2vec/preprocessors`: preprocessor registry
|
|
329
|
+
- `src/json2vec/structs`: pydantic config models, enums, and tree nodes
|
|
330
|
+
- `src/json2vec/tensorfields`: tensorfield plugin system and built-in field types
|
|
331
|
+
- `tests/`: package test suite
|
|
332
|
+
- [`docs/whitepaper.typ`](https://json2vec.github.io/json2vec/whitepaper.pdf): longer written documentation
|
|
333
|
+
|
|
334
|
+
## Development
|
|
335
|
+
|
|
336
|
+
Run the test suite with:
|
|
337
|
+
|
|
338
|
+
```bash
|
|
339
|
+
uv run pytest
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
Run lint checks with:
|
|
343
|
+
|
|
344
|
+
```bash
|
|
345
|
+
uv run ruff check
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
## License
|
|
349
|
+
|
|
350
|
+
Licensed under the Apache License, Version 2.0. See `LICENSE` and `NOTICE`.
|
|
351
|
+
|
|
352
|
+
## References
|
|
353
|
+
|
|
354
|
+
- `BIBLIOGRAPHY.md`
|
|
355
|
+
- `CITATION.bib`
|
json2vec-0.4.0/README.md
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="JSON2Vec logo" width="180">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">JSON2Vec</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
9
|
+
<a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
|
|
10
|
+
<a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&logoColor=white" /></a>
|
|
11
|
+
<!-- discord-invite:start -->
|
|
12
|
+
<a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&logoColor=white" /></a>
|
|
13
|
+
<!-- discord-invite:end -->
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
`json2vec` is a schema-driven framework for predictive modeling over nested,
|
|
17
|
+
structured records without flattening them into a fixed feature table first.
|
|
18
|
+
|
|
19
|
+
The schema becomes the encoder: leaf tensorfield plugins encode raw values,
|
|
20
|
+
array nodes aggregate child embeddings with transformer layers, and
|
|
21
|
+
datatype-specific decoders reconstruct masked, targeted, or supervised fields
|
|
22
|
+
from the surrounding hierarchy.
|
|
23
|
+
|
|
24
|
+
This supports self-supervised pretraining, supervised targets, embeddings, and
|
|
25
|
+
schema evolution in one model surface. Customer/account/transaction data,
|
|
26
|
+
flight itineraries, order fulfillment events, clickstream sessions, and other
|
|
27
|
+
nested records can use the same machinery while keeping proprietary data,
|
|
28
|
+
schemas, and checkpoints private.
|
|
29
|
+
|
|
30
|
+
## What Makes This Different
|
|
31
|
+
|
|
32
|
+
- **Attributed embeddings.** The model can emit embeddings from any configured
|
|
33
|
+
field or array, not only from the root. That makes branch-level similarity and
|
|
34
|
+
retrieval workflows possible without flattening the record.
|
|
35
|
+
- **Extensible data types for predictive modeling.** Masked values,
|
|
36
|
+
targeted fields, and explicit supervised targets all flow through the same
|
|
37
|
+
datatype-specific heads. A new
|
|
38
|
+
[tensorfield type](https://json2vec.github.io/json2vec/guides/tensorfields/) brings its own embedding,
|
|
39
|
+
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
40
|
+
grow.
|
|
41
|
+
- **Schema evolution is a first-class workflow.** Between training loops
|
|
42
|
+
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
43
|
+
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
44
|
+
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
45
|
+
See the [model update guide](https://json2vec.github.io/json2vec/guides/model-update/).
|
|
46
|
+
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
47
|
+
`valued` are distinct states in the tensorfield type system.
|
|
48
|
+
They are not collapsed into one generic missing-value bucket.
|
|
49
|
+
- **Online state lives with the model.** Stateful components such as category
|
|
50
|
+
vocabularies, counters, and numeric normalization state are learned during
|
|
51
|
+
streaming training and serialized with checkpoints, so deployment does not
|
|
52
|
+
depend on a parallel tokenizer or normalizer artifact.
|
|
53
|
+
- **Training-serving parity.** The same configured graph is used for fitting,
|
|
54
|
+
validation, testing, batch prediction, and LitServe-backed online inference.
|
|
55
|
+
- **Target-trained counterfactuals.** Training can remove whole
|
|
56
|
+
field instances with `target=True` or `p_prune`, not just mask individual
|
|
57
|
+
values. At inference time, schema overrides support ablation questions such
|
|
58
|
+
as "what changes if device data is unavailable?" without retraining a separate
|
|
59
|
+
model for every feature-removal scenario.
|
|
60
|
+
|
|
61
|
+
## Where It Fits
|
|
62
|
+
|
|
63
|
+
Use `json2vec` when the hierarchy is part of the signal:
|
|
64
|
+
|
|
65
|
+
- customer, account, transaction, statement, device, and session records
|
|
66
|
+
- flight itineraries, legs, segments, and events
|
|
67
|
+
- orders, shipments, fulfillment events, and support histories
|
|
68
|
+
- entities with repeated sub-objects, evolving schemas, and mixed datatypes
|
|
69
|
+
- embedding retrieval, anomaly detection, counterfactual ablation, and
|
|
70
|
+
multi-target prediction over nested records
|
|
71
|
+
|
|
72
|
+
For more context on the modeling problem, read
|
|
73
|
+
[Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/).
|
|
74
|
+
|
|
75
|
+
## What It Does Not Do
|
|
76
|
+
|
|
77
|
+
`json2vec` stops at the representation and typed prediction layer. It does not
|
|
78
|
+
try to be a feature store, governance system, rule engine, authorization layer,
|
|
79
|
+
decision-capture system, or audit platform. Those systems can consume
|
|
80
|
+
`json2vec` embeddings and predictions, but their policies and operational
|
|
81
|
+
controls remain separate concerns.
|
|
82
|
+
|
|
83
|
+
It also does not require users to publish data, schemas, checkpoints, or model
|
|
84
|
+
parameters. The open-source layer is the reusable encoder and runtime
|
|
85
|
+
infrastructure. Your data stays yours, and so do your parameters.
|
|
86
|
+
The framework works under the assumption that model parameters will not be shared.
|
|
87
|
+
|
|
88
|
+
## What Is In This Repository
|
|
89
|
+
|
|
90
|
+
This repository currently contains:
|
|
91
|
+
|
|
92
|
+
- the core library under `src/json2vec/`
|
|
93
|
+
- tensorfield plugins for `number`, `category`, `set`, `dateparts`, `entity`, `vector`, and `text`
|
|
94
|
+
- a preprocessor registry for dataset-specific preprocessing
|
|
95
|
+
- a LitServe deployment entrypoint for serving from checkpoints
|
|
96
|
+
- tests covering structure loading, data processing, tensorfields, training helpers, logging, and inference
|
|
97
|
+
- rendered tutorial and guide notebooks under [`docs/`](https://json2vec.github.io/json2vec/)
|
|
98
|
+
- diagrams plus whitepaper in [`docs/`](https://json2vec.github.io/json2vec/)
|
|
99
|
+
|
|
100
|
+
## Install
|
|
101
|
+
|
|
102
|
+
For local development:
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
uv sync
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
The package requires Python `>=3.12`.
|
|
109
|
+
|
|
110
|
+
## Hello World Notebook
|
|
111
|
+
|
|
112
|
+
The [hello world notebook](https://json2vec.github.io/json2vec/tutorials/hello-world/) trains a tiny model
|
|
113
|
+
from the bundled Iris JSONL buffer. It demonstrates the full loop: create a
|
|
114
|
+
Polars DataFrame, declare a schema, train a supervised category target, then
|
|
115
|
+
call `predict` and `embed`.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import lightning.pytorch as lit
|
|
119
|
+
import polars as pl
|
|
120
|
+
import torch
|
|
121
|
+
from rich.pretty import pprint
|
|
122
|
+
|
|
123
|
+
import json2vec as j2v
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
|
|
127
|
+
|
|
128
|
+
model = j2v.Model.from_schema(
|
|
129
|
+
j2v.Number("sepal_length"),
|
|
130
|
+
j2v.Number("petal_length"),
|
|
131
|
+
j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
|
|
132
|
+
d_model=16,
|
|
133
|
+
n_layers=1,
|
|
134
|
+
n_heads=4,
|
|
135
|
+
batch_size=8,
|
|
136
|
+
embed=True,
|
|
137
|
+
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
datamodule = j2v.PolarsDataModule.from_model(
|
|
141
|
+
model,
|
|
142
|
+
train=records,
|
|
143
|
+
validate=records,
|
|
144
|
+
num_workers=0,
|
|
145
|
+
persistent_workers=False,
|
|
146
|
+
pin_memory=False,
|
|
147
|
+
observation_buffer_size=32,
|
|
148
|
+
sample_rate=1.0,
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
trainer = lit.Trainer(
|
|
152
|
+
accelerator="cpu",
|
|
153
|
+
max_epochs=1,
|
|
154
|
+
logger=False,
|
|
155
|
+
enable_progress_bar=False,
|
|
156
|
+
enable_model_summary=False,
|
|
157
|
+
enable_checkpointing=False,
|
|
158
|
+
limit_train_batches=1,
|
|
159
|
+
limit_val_batches=1,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
trainer.fit(model=model, datamodule=datamodule)
|
|
163
|
+
|
|
164
|
+
batch = [[record] for record in records.to_dicts()[:3]]
|
|
165
|
+
|
|
166
|
+
pprint(model.predict(batch))
|
|
167
|
+
pprint(model.embed(batch))
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
The prediction call returns a typed result for `record/species`. The embedding
|
|
171
|
+
call returns the configured `record` embedding for each input observation.
|
|
172
|
+
|
|
173
|
+
## Documentation
|
|
174
|
+
|
|
175
|
+
The tutorial examples live as self-contained notebooks under `docs/` and are
|
|
176
|
+
rendered in the documentation site. Build the site locally with:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
uv run --extra docs mkdocs build --strict
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
Useful entry points:
|
|
183
|
+
|
|
184
|
+
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
185
|
+
- [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/)
|
|
186
|
+
- [Schemas & Queries](https://json2vec.github.io/json2vec/guides/model-schemas/)
|
|
187
|
+
- [Model Updates](https://json2vec.github.io/json2vec/guides/model-update/)
|
|
188
|
+
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
189
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
190
|
+
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
191
|
+
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
192
|
+
- [Field Ablation](https://json2vec.github.io/json2vec/guides/field-ablation/)
|
|
193
|
+
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
194
|
+
- [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
|
|
195
|
+
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
196
|
+
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
197
|
+
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
198
|
+
|
|
199
|
+
## Core Concepts
|
|
200
|
+
|
|
201
|
+
- `Model.from_schema(...)` builds the model tree plus masking, targeting, and embedding controls.
|
|
202
|
+
- `Array` nodes describe hierarchical grouping and aggregation.
|
|
203
|
+
- Field `Request` nodes declare a `type`, a `query`, and type-specific options.
|
|
204
|
+
- `Address` values are stable paths such as `record/account/transaction/amount`.
|
|
205
|
+
- `jmespath` queries extract values from each observation.
|
|
206
|
+
- `TensorField` instances preserve typed content plus state tokens such as
|
|
207
|
+
`valued`, `null`, `padded`, and `masked`.
|
|
208
|
+
- `Parcel` objects carry embeddings from leaves to parent arrays and then up
|
|
209
|
+
the tree.
|
|
210
|
+
- `heritage` is the path from a leaf to the root; decoders use that path when
|
|
211
|
+
reconstructing masked, targeted, or supervised values.
|
|
212
|
+
|
|
213
|
+
For large local or cloud-hosted datasets, `StreamingDataModule` supports these
|
|
214
|
+
dataset suffixes:
|
|
215
|
+
|
|
216
|
+
- `ndjson`
|
|
217
|
+
- `parquet`
|
|
218
|
+
- `feather`
|
|
219
|
+
- `avro`
|
|
220
|
+
- `csv`
|
|
221
|
+
- `orc`
|
|
222
|
+
- `json`
|
|
223
|
+
|
|
224
|
+
Supported dataset roots are local paths and `s3://...` URIs.
|
|
225
|
+
|
|
226
|
+
## How The Graph Runs
|
|
227
|
+
|
|
228
|
+
For each batch:
|
|
229
|
+
|
|
230
|
+
1. Each field request extracts values with its `jmespath` query.
|
|
231
|
+
2. The matching tensorfield plugin tensorizes values, updates online state when
|
|
232
|
+
allowed for the current split, and records trainable targets when masking or
|
|
233
|
+
targeting occurs.
|
|
234
|
+
3. Leaf embedders emit parcels to their parent arrays.
|
|
235
|
+
4. Array nodes run bottom-up, aggregate child parcels, and emit parent context.
|
|
236
|
+
5. Leaf decoders consume their context path to reconstruct trainable targets.
|
|
237
|
+
|
|
238
|
+
Random `p_mask` corrupts individual values. Random `p_prune` removes whole
|
|
239
|
+
field instances across an observation. `target=True` is shorthand for
|
|
240
|
+
`p_prune=1.0`; `embed=True` exposes embeddings during prediction.
|
|
241
|
+
|
|
242
|
+
## Preprocessor Model
|
|
243
|
+
|
|
244
|
+
Preprocessors are optional registered Python callables. See the
|
|
245
|
+
[preprocessor guide](https://json2vec.github.io/json2vec/guides/preprocessors/) for examples. If no
|
|
246
|
+
preprocessor is configured, each observation is used as-is without calling a
|
|
247
|
+
default function.
|
|
248
|
+
|
|
249
|
+
Custom preprocessors are registered with `@preprocess(yields=False)` for single-object transformations or `@preprocess(yields=True)` for generators.
|
|
250
|
+
|
|
251
|
+
- transformation preprocessors must return a single `dict`
|
|
252
|
+
- generator preprocessors may yield `dict` objects or return a `list[dict]`
|
|
253
|
+
- every emitted object is wrapped as a single-item root array before tensorization
|
|
254
|
+
|
|
255
|
+
Configured `dataset.kwargs` are passed into the preprocessor, with unsupported keyword arguments automatically ignored.
|
|
256
|
+
|
|
257
|
+
## Tensorfield Plugins
|
|
258
|
+
|
|
259
|
+
Each tensorfield plugin provides a request schema plus the model components
|
|
260
|
+
needed to encode values, decode predictions, compute losses, and optionally
|
|
261
|
+
serialize outputs. See [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
|
|
262
|
+
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
263
|
+
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
264
|
+
`p_mask`, and `p_prune`.
|
|
265
|
+
|
|
266
|
+
| Type | Use It For | Key Options |
|
|
267
|
+
| --- | --- | --- |
|
|
268
|
+
| `number` | Scalar numeric values. Values are padded with explicit state tokens, normalized online during training, embedded with learned Fourier features, and decoded as regression targets. | `jitter`, `n_bands`, `offset`, `alpha`, `objective` (`mae`, `mse`, `huber`) |
|
|
269
|
+
| `category` | Single-label categorical values with an online vocabulary stored in the checkpoint. Unknown or overflow labels route to a reserved unavailable bucket instead of becoming `null`. Prediction output includes label probabilities and optional top-k candidates. | `max_vocab_size`, `n_bands`, `p_unavailable`, `topk` |
|
|
270
|
+
| `set` | Unordered collections of categorical labels, encoded as a multi-hot vector over an online vocabulary. Strings are treated as one-item sets, iterables as many-item sets, and unknown labels use the reserved unavailable bucket. | `max_vocab_size`, `p_unavailable` |
|
|
271
|
+
| `dateparts` | Datetime values represented through selected calendar/time components. Inputs may be native datetimes or strings parsed with a configured pattern. | `dateparts` (`day_of_year`, `week_of_year`, `month_of_year`, `day_of_month`, `week_of_month`, `day_of_week`, `hour_of_day`, `minute_of_hour`), `pattern` |
|
|
272
|
+
| `entity` | Hashable identifiers where the useful signal is equality or co-occurrence within the current observation rather than a global vocabulary. Values are re-indexed locally per observation and require at least two slots per observation. | `topk` |
|
|
273
|
+
| `vector` | Fixed-width numeric embeddings or dense feature vectors supplied by another model or system. Inputs may be lists, tuples, 1D NumPy arrays, or 1D Torch tensors and are projected into `d_model`. | `n_dim`, `objective` (`l1`, `l2`) |
|
|
274
|
+
| `text` | String values encoded by a frozen Hugging Face `AutoModel`, pooled, and projected into `d_model`. Masked or targeted text is trained by reconstructing the encoder representation rather than generating text. | `model_name`, `max_length`, `encoder_batch_size`, `encoder_pooling` (`cls`, `mean`, `pooler`), `objective` (`l1`, `l2`), `revision`, `local_files_only` |
|
|
275
|
+
|
|
276
|
+
The `text` tensorfield requires the optional `transformers` dependency, which is
|
|
277
|
+
not installed by default:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
uv sync --extra text
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Community
|
|
284
|
+
|
|
285
|
+
Join the Discord channel for questions, design discussion, and release notes:
|
|
286
|
+
<https://discord.gg/DVyZUkvTFA>
|
|
287
|
+
|
|
288
|
+
## Repository Layout
|
|
289
|
+
|
|
290
|
+
- `src/json2vec/architecture`: model assembly, attention, pooling, and parcel routing
|
|
291
|
+
- `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
|
|
292
|
+
- `src/json2vec/inference`: serving and prediction callbacks
|
|
293
|
+
- `src/json2vec/logging`: runtime logging callbacks
|
|
294
|
+
- `src/json2vec/preprocessors`: preprocessor registry
|
|
295
|
+
- `src/json2vec/structs`: pydantic config models, enums, and tree nodes
|
|
296
|
+
- `src/json2vec/tensorfields`: tensorfield plugin system and built-in field types
|
|
297
|
+
- `tests/`: package test suite
|
|
298
|
+
- [`docs/whitepaper.typ`](https://json2vec.github.io/json2vec/whitepaper.pdf): longer written documentation
|
|
299
|
+
|
|
300
|
+
## Development
|
|
301
|
+
|
|
302
|
+
Run the test suite with:
|
|
303
|
+
|
|
304
|
+
```bash
|
|
305
|
+
uv run pytest
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
Run lint checks with:
|
|
309
|
+
|
|
310
|
+
```bash
|
|
311
|
+
uv run ruff check
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
## License
|
|
315
|
+
|
|
316
|
+
Licensed under the Apache License, Version 2.0. See `LICENSE` and `NOTICE`.
|
|
317
|
+
|
|
318
|
+
## References
|
|
319
|
+
|
|
320
|
+
- `BIBLIOGRAPHY.md`
|
|
321
|
+
- `CITATION.bib`
|