json2vec 0.2.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. json2vec-0.4.0/PKG-INFO +355 -0
  2. json2vec-0.4.0/README.md +321 -0
  3. {json2vec-0.2.0 → json2vec-0.4.0}/pyproject.toml +16 -12
  4. json2vec-0.4.0/src/json2vec/__init__.py +84 -0
  5. json2vec-0.4.0/src/json2vec/architecture/attention.py +75 -0
  6. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/encoder.py +36 -20
  7. json2vec-0.4.0/src/json2vec/architecture/node.py +39 -0
  8. json2vec-0.4.0/src/json2vec/architecture/plot.py +560 -0
  9. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/pool.py +17 -1
  10. json2vec-0.4.0/src/json2vec/architecture/root.py +812 -0
  11. json2vec-0.4.0/src/json2vec/data/datasets/__init__.py +36 -0
  12. json2vec-0.4.0/src/json2vec/data/datasets/base.py +94 -0
  13. json2vec-0.4.0/src/json2vec/data/datasets/polars.py +353 -0
  14. json2vec-0.4.0/src/json2vec/data/datasets/streaming.py +492 -0
  15. json2vec-0.4.0/src/json2vec/data/iterables.py +284 -0
  16. json2vec-0.4.0/src/json2vec/distributed.py +53 -0
  17. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/inference/callback.py +48 -25
  18. json2vec-0.4.0/src/json2vec/inference/deployment.py +396 -0
  19. json2vec-0.4.0/src/json2vec/logging/__init__.py +4 -0
  20. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/logging/config.py +2 -7
  21. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/logging/epoch.py +2 -2
  22. json2vec-0.4.0/src/json2vec/logging/throughput.py +60 -0
  23. json2vec-0.4.0/src/json2vec/preprocessors/__init__.py +14 -0
  24. json2vec-0.4.0/src/json2vec/preprocessors/base.py +160 -0
  25. json2vec-0.4.0/src/json2vec/preprocessors/extensions/__init__.py +1 -0
  26. {json2vec-0.2.0/src/json2vec/processors → json2vec-0.4.0/src/json2vec/preprocessors}/spec.py +1 -1
  27. json2vec-0.4.0/src/json2vec/structs/enums.py +128 -0
  28. json2vec-0.4.0/src/json2vec/structs/experiment.py +746 -0
  29. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/structs/packages.py +35 -6
  30. json2vec-0.4.0/src/json2vec/structs/structure.py +56 -0
  31. json2vec-0.4.0/src/json2vec/structs/tree.py +210 -0
  32. json2vec-0.4.0/src/json2vec/tensorfields/__init__.py +25 -0
  33. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/base.py +136 -43
  34. json2vec-0.4.0/src/json2vec/tensorfields/extensions/__init__.py +17 -0
  35. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/category.py +102 -178
  36. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/dateparts.py +33 -32
  37. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/entity.py +36 -39
  38. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/number.py +72 -47
  39. json2vec-0.4.0/src/json2vec/tensorfields/extensions/set.py +418 -0
  40. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/text.py +79 -89
  41. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/extensions/vector.py +38 -38
  42. json2vec-0.4.0/src/json2vec/tensorfields/shared/__init__.py +12 -0
  43. json2vec-0.4.0/src/json2vec/tensorfields/shared/counter.py +175 -0
  44. json2vec-0.4.0/src/json2vec/tensorfields/shared/vocabulary.py +286 -0
  45. json2vec-0.4.0/src/json2vec.egg-info/PKG-INFO +355 -0
  46. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/SOURCES.txt +18 -14
  47. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/requires.txt +10 -8
  48. json2vec-0.4.0/tests/test_callbacks.py +56 -0
  49. json2vec-0.4.0/tests/test_public_api.py +26 -0
  50. json2vec-0.2.0/PKG-INFO +0 -264
  51. json2vec-0.2.0/README.md +0 -230
  52. json2vec-0.2.0/src/json2vec/__main__.py +0 -32
  53. json2vec-0.2.0/src/json2vec/architecture/attention.py +0 -64
  54. json2vec-0.2.0/src/json2vec/architecture/counter.py +0 -37
  55. json2vec-0.2.0/src/json2vec/architecture/node.py +0 -34
  56. json2vec-0.2.0/src/json2vec/architecture/root.py +0 -338
  57. json2vec-0.2.0/src/json2vec/data/datasets.py +0 -518
  58. json2vec-0.2.0/src/json2vec/entrypoints/__init__.py +0 -3
  59. json2vec-0.2.0/src/json2vec/entrypoints/pipeline.py +0 -174
  60. json2vec-0.2.0/src/json2vec/inference/deployment.py +0 -175
  61. json2vec-0.2.0/src/json2vec/logging/__init__.py +0 -0
  62. json2vec-0.2.0/src/json2vec/logging/throughput.py +0 -39
  63. json2vec-0.2.0/src/json2vec/logging/tracking.py +0 -152
  64. json2vec-0.2.0/src/json2vec/processors/__init__.py +0 -8
  65. json2vec-0.2.0/src/json2vec/processors/base.py +0 -109
  66. json2vec-0.2.0/src/json2vec/processors/extensions/__init__.py +0 -0
  67. json2vec-0.2.0/src/json2vec/processors/extensions/example.py +0 -6
  68. json2vec-0.2.0/src/json2vec/structs/__init__.py +0 -0
  69. json2vec-0.2.0/src/json2vec/structs/enums.py +0 -84
  70. json2vec-0.2.0/src/json2vec/structs/environment.py +0 -138
  71. json2vec-0.2.0/src/json2vec/structs/experiment.py +0 -330
  72. json2vec-0.2.0/src/json2vec/structs/structure.py +0 -70
  73. json2vec-0.2.0/src/json2vec/structs/tree.py +0 -92
  74. json2vec-0.2.0/src/json2vec/tensorfields/__init__.py +0 -8
  75. json2vec-0.2.0/src/json2vec/tensorfields/extensions/__init__.py +0 -0
  76. json2vec-0.2.0/src/json2vec.egg-info/PKG-INFO +0 -264
  77. json2vec-0.2.0/src/json2vec.egg-info/entry_points.txt +0 -2
  78. {json2vec-0.2.0 → json2vec-0.4.0}/LICENSE +0 -0
  79. {json2vec-0.2.0 → json2vec-0.4.0}/NOTICE +0 -0
  80. {json2vec-0.2.0 → json2vec-0.4.0}/setup.cfg +0 -0
  81. {json2vec-0.2.0/src/json2vec → json2vec-0.4.0/src/json2vec/architecture}/__init__.py +0 -0
  82. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/architecture/rotary.py +0 -0
  83. {json2vec-0.2.0/src/json2vec/architecture → json2vec-0.4.0/src/json2vec/data}/__init__.py +0 -0
  84. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/data/processing.py +0 -0
  85. {json2vec-0.2.0/src/json2vec/data → json2vec-0.4.0/src/json2vec/inference}/__init__.py +0 -0
  86. {json2vec-0.2.0/src/json2vec/inference → json2vec-0.4.0/src/json2vec/structs}/__init__.py +0 -0
  87. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec/tensorfields/spec.py +0 -0
  88. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/dependency_links.txt +0 -0
  89. {json2vec-0.2.0 → json2vec-0.4.0}/src/json2vec.egg-info/top_level.txt +0 -0
@@ -0,0 +1,355 @@
1
+ Metadata-Version: 2.4
2
+ Name: json2vec
3
+ Version: 0.4.0
4
+ Summary: JSON -> [*]
5
+ License-Expression: Apache-2.0
6
+ Requires-Python: >=3.12
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ License-File: NOTICE
10
+ Requires-Dist: beartype>=0.21.0
11
+ Requires-Dist: pluggy>=1.6.0
12
+ Requires-Dist: rich>=14.0.0
13
+ Requires-Dist: pydantic>=2.11.7
14
+ Requires-Dist: jmespath>=1.0.1
15
+ Requires-Dist: loguru>=0.7.3
16
+ Requires-Dist: anytree>=2.13.0
17
+ Requires-Dist: ordered-set>=4.1.0
18
+ Requires-Dist: pyarrow>=21.0.0
19
+ Requires-Dist: polars>=1.35.2
20
+ Requires-Dist: numpy>=2.2.6
21
+ Requires-Dist: lightning>=2.6.4
22
+ Requires-Dist: tensordict>=0.10.0
23
+ Requires-Dist: torch>=2.7.1
24
+ Provides-Extra: serving
25
+ Requires-Dist: litserve>=0.2.13; extra == "serving"
26
+ Requires-Dist: pydantic-settings>=2.10.1; extra == "serving"
27
+ Provides-Extra: text
28
+ Requires-Dist: transformers>=4.55.0; extra == "text"
29
+ Provides-Extra: docs
30
+ Requires-Dist: mkdocs-material>=9.6; extra == "docs"
31
+ Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
32
+ Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
33
+ Dynamic: license-file
34
+
35
+ <p align="center">
36
+ <img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="JSON2Vec logo" width="180">
37
+ </p>
38
+
39
+ <h1 align="center">JSON2Vec</h1>
40
+
41
+ <p align="center">
42
+ <img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&amp;logoColor=white" />
43
+ <a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
44
+ <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&amp;logoColor=white" /></a>
45
+ <!-- discord-invite:start -->
46
+ <a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&amp;logoColor=white" /></a>
47
+ <!-- discord-invite:end -->
48
+ </p>
49
+
50
+ `json2vec` is a schema-driven framework for predictive modeling over nested,
51
+ structured records without flattening them into a fixed feature table first.
52
+
53
+ The schema becomes the encoder: leaf tensorfield plugins encode raw values,
54
+ array nodes aggregate child embeddings with transformer layers, and
55
+ datatype-specific decoders reconstruct masked, targeted, or supervised fields
56
+ from the surrounding hierarchy.
57
+
58
+ This supports self-supervised pretraining, supervised targets, embeddings, and
59
+ schema evolution in one model surface. Customer/account/transaction data,
60
+ flight itineraries, order fulfillment events, clickstream sessions, and other
61
+ nested records can use the same machinery while keeping proprietary data,
62
+ schemas, and checkpoints private.
63
+
64
+ ## What Makes This Different
65
+
66
+ - **Attributed embeddings.** The model can emit embeddings from any configured
67
+ field or array, not only from the root. That makes branch-level similarity and
68
+ retrieval workflows possible without flattening the record.
69
+ - **Extensible data types for predictive modeling.** Masked values,
70
+ targeted fields, and explicit supervised targets all flow through the same
71
+ datatype-specific heads. A new
72
+ [tensorfield type](https://json2vec.github.io/json2vec/guides/tensorfields/) brings its own embedding,
73
+ decoding, loss, and writing logic, so the framework stays reusable as schemas
74
+ grow.
75
+ - **Schema evolution is a first-class workflow.** Between training loops
76
+ (pretraining, finetuning, refitting, and task adaptation), the model can be
77
+ mutated. Fields can be added (`model.extend`), removed (`model.delete`),
78
+ updated (`model.update` / `with model.override`), and reset (`model.reset`).
79
+ See the [model update guide](https://json2vec.github.io/json2vec/guides/model-update/).
80
+ - **Production semantics for missingness.** `null`, `padded`, `masked`, and
81
+ `valued` are distinct states in the tensorfield type system.
82
+ They are not collapsed into one generic missing-value bucket.
83
+ - **Online state lives with the model.** Stateful components such as category
84
+ vocabularies, counters, and numeric normalization state are learned during
85
+ streaming training and serialized with checkpoints, so deployment does not
86
+ depend on a parallel tokenizer or normalizer artifact.
87
+ - **Training-serving parity.** The same configured graph is used for fitting,
88
+ validation, testing, batch prediction, and LitServe-backed online inference.
89
+ - **Target-trained counterfactuals.** Training can periodically remove whole
90
+ field instances with `target=True` or `p_prune`, not just mask individual
91
+ values. At inference time, schema overrides support ablation questions such
92
+ as "what changes if device data is unavailable?" without retraining a separate
93
+ model for every feature-removal scenario.
94
+
95
+ ## Where It Fits
96
+
97
+ Use `json2vec` when the hierarchy is part of the signal:
98
+
99
+ - customer, account, transaction, statement, device, and session records
100
+ - flight itineraries, legs, segments, and events
101
+ - orders, shipments, fulfillment events, and support histories
102
+ - entities with repeated sub-objects, evolving schemas, and mixed datatypes
103
+ - embedding retrieval, anomaly detection, counterfactual ablation, and
104
+ multi-target prediction over nested records
105
+
106
+ For more context on the modeling problem, read
107
+ [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/).
108
+
109
+ ## What It Does Not Do
110
+
111
+ `json2vec` stops at the representation and typed prediction layer. It does not
112
+ try to be a feature store, governance system, rule engine, authorization layer,
113
+ decision-capture system, or audit platform. Those systems can consume
114
+ `json2vec` embeddings and predictions, but their policies and operational
115
+ controls remain separate concerns.
116
+
117
+ It also does not require users to publish data, schemas, checkpoints, or model
118
+ parameters. The open-source layer is the reusable encoder and runtime
119
+ infrastructure. Your data stays yours, and so do your parameters.
120
+ The framework works under the assumption that model parameters will not be shared.
121
+
122
+ ## What Is In This Repository
123
+
124
+ This repository currently contains:
125
+
126
+ - the core library under `src/json2vec/`
127
+ - tensorfield plugins for `number`, `category`, `set`, `dateparts`, `entity`, `vector`, and `text`
128
+ - a preprocessor registry for dataset-specific preprocessing
129
+ - a LitServe deployment entrypoint for serving from checkpoints
130
+ - tests covering structure loading, data processing, tensorfields, training helpers, logging, and inference
131
+ - rendered tutorial and guide notebooks under [`docs/`](https://json2vec.github.io/json2vec/)
132
+ - diagrams and the whitepaper in [`docs/`](https://json2vec.github.io/json2vec/)
133
+
134
+ ## Install
135
+
136
+ For local development:
137
+
138
+ ```bash
139
+ uv sync
140
+ ```
141
+
142
+ The package requires Python `>=3.12`.
143
+
144
+ ## Hello World Notebook
145
+
146
+ The [hello world notebook](https://json2vec.github.io/json2vec/tutorials/hello-world/) trains a tiny model
147
+ from the bundled Iris JSONL buffer. It demonstrates the full loop: create a
148
+ Polars DataFrame, declare a schema, train a supervised category target, then
149
+ call `predict` and `embed`.
150
+
151
+ ```python
152
+ import lightning.pytorch as lit
153
+ import polars as pl
154
+ import torch
155
+ from rich.pretty import pprint
156
+
157
+ import json2vec as j2v
158
+
159
+
160
+ records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
161
+
162
+ model = j2v.Model.from_schema(
163
+ j2v.Number("sepal_length"),
164
+ j2v.Number("petal_length"),
165
+ j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
166
+ d_model=16,
167
+ n_layers=1,
168
+ n_heads=4,
169
+ batch_size=8,
170
+ embed=True,
171
+ optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
172
+ )
173
+
174
+ datamodule = j2v.PolarsDataModule.from_model(
175
+ model,
176
+ train=records,
177
+ validate=records,
178
+ num_workers=0,
179
+ persistent_workers=False,
180
+ pin_memory=False,
181
+ observation_buffer_size=32,
182
+ sample_rate=1.0,
183
+ )
184
+
185
+ trainer = lit.Trainer(
186
+ accelerator="cpu",
187
+ max_epochs=1,
188
+ logger=False,
189
+ enable_progress_bar=False,
190
+ enable_model_summary=False,
191
+ enable_checkpointing=False,
192
+ limit_train_batches=1,
193
+ limit_val_batches=1,
194
+ )
195
+
196
+ trainer.fit(model=model, datamodule=datamodule)
197
+
198
+ batch = [[record] for record in records.to_dicts()[:3]]
199
+
200
+ pprint(model.predict(batch))
201
+ pprint(model.embed(batch))
202
+ ```
203
+
204
+ The prediction call returns a typed result for `record/species`. The embedding
205
+ call returns the configured `record` embedding for each input observation.
206
+
207
+ ## Documentation
208
+
209
+ The tutorial examples live as self-contained notebooks under `docs/` and are
210
+ rendered on the documentation site. Build the site locally with:
211
+
212
+ ```bash
213
+ uv run --extra docs mkdocs build --strict
214
+ ```
215
+
216
+ Useful entry points:
217
+
218
+ - [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
219
+ - [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/)
220
+ - [Schemas & Queries](https://json2vec.github.io/json2vec/guides/model-schemas/)
221
+ - [Model Updates](https://json2vec.github.io/json2vec/guides/model-update/)
222
+ - [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
223
+ - [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
224
+ - [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
225
+ - [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
226
+ - [Field Ablation](https://json2vec.github.io/json2vec/guides/field-ablation/)
227
+ - [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
228
+ - [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
229
+ - [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
230
+ - [API Reference](https://json2vec.github.io/json2vec/reference/api/)
231
+ - [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
232
+
233
+ ## Core Concepts
234
+
235
+ - `Model.from_schema(...)` builds the model tree plus masking, targeting, and embedding controls.
236
+ - `Array` nodes describe hierarchical grouping and aggregation.
237
+ - Field `Request` nodes declare a `type`, a `query`, and type-specific options.
238
+ - `Address` values are stable paths such as `record/account/transaction/amount`.
239
+ - `jmespath` queries extract values from each observation.
240
+ - `TensorField` instances preserve typed content plus state tokens such as
241
+ `valued`, `null`, `padded`, and `masked`.
242
+ - `Parcel` objects carry embeddings from leaves to parent arrays and then up
243
+ the tree.
244
+ - `heritage` is the path from a leaf to the root; decoders use that path when
245
+ reconstructing masked, targeted, or supervised fields.
246
+
247
+ For large local or cloud-hosted datasets, `StreamingDataModule` supports these
248
+ dataset suffixes:
249
+
250
+ - `ndjson`
251
+ - `parquet`
252
+ - `feather`
253
+ - `avro`
254
+ - `csv`
255
+ - `orc`
256
+ - `json`
257
+
258
+ Supported dataset roots are local paths and `s3://...` URIs.
259
+
260
+ ## How The Graph Runs
261
+
262
+ For each batch:
263
+
264
+ 1. Each field request extracts values with its `jmespath` query.
265
+ 2. The matching tensorfield plugin tensorizes values, updates online state when
266
+ allowed for the current split, and records trainable targets when masking or
267
+ targeting occurs.
268
+ 3. Leaf embedders emit parcels to their parent arrays.
269
+ 4. Array nodes run bottom-up, aggregate child parcels, and emit parent context.
270
+ 5. Leaf decoders consume their context path to reconstruct trainable targets.
271
+
272
+ Random `p_mask` corrupts individual values. Random `p_prune` removes whole
273
+ field instances across an observation. `target=True` is shorthand for
274
+ `p_prune=1.0`; `embed=True` exposes embeddings during prediction.
275
+
276
+ ## Preprocessor Model
277
+
278
+ Preprocessors are optional registered Python callables. See the
279
+ [preprocessor guide](https://json2vec.github.io/json2vec/guides/preprocessors/) for examples. If no
280
+ preprocessor is configured, each observation is used as-is without calling a
281
+ default function.
282
+
283
+ Custom preprocessors are registered with `@preprocess(yields=False)` for single-object transformations or `@preprocess(yields=True)` for generators.
284
+
285
+ - transformation preprocessors must return a single `dict`
286
+ - generator preprocessors may yield `dict` objects or return a `list[dict]`
287
+ - every emitted object is wrapped as a single-item root array before tensorization
288
+
289
+ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported keyword arguments automatically ignored.
290
+
291
+ ## Tensorfield Plugins
292
+
293
+ Each tensorfield plugin provides a request schema plus the model components
294
+ needed to encode values, decode predictions, compute losses, and optionally
295
+ serialize outputs. See [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
296
+ for a custom plugin walkthrough. Built-in tensorfields share the base leaf
297
+ options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
298
+ `p_mask`, and `p_prune`.
299
+
300
+ | Type | Use It For | Key Options |
301
+ | --- | --- | --- |
302
+ | `number` | Scalar numeric values. Values are padded with explicit state tokens, normalized online during training, embedded with learned Fourier features, and decoded as regression targets. | `jitter`, `n_bands`, `offset`, `alpha`, `objective` (`mae`, `mse`, `huber`) |
303
+ | `category` | Single-label categorical values with an online vocabulary stored in the checkpoint. Unknown or overflow labels route to a reserved unavailable bucket instead of becoming `null`. Prediction output includes label probabilities and optional top-k candidates. | `max_vocab_size`, `n_bands`, `p_unavailable`, `topk` |
304
+ | `set` | Unordered collections of categorical labels, encoded as a multi-hot vector over an online vocabulary. Strings are treated as one-item sets, iterables as many-item sets, and unknown labels use the reserved unavailable bucket. | `max_vocab_size`, `p_unavailable` |
305
+ | `dateparts` | Datetime values represented through selected calendar/time components. Inputs may be native datetimes or strings parsed with a configured pattern. | `dateparts` (`day_of_year`, `week_of_year`, `month_of_year`, `day_of_month`, `week_of_month`, `day_of_week`, `hour_of_day`, `minute_of_hour`), `pattern` |
306
+ | `entity` | Hashable identifiers where the useful signal is equality or co-occurrence within the current observation rather than a global vocabulary. Values are re-indexed locally per observation and require at least two slots per observation. | `topk` |
307
+ | `vector` | Fixed-width numeric embeddings or dense feature vectors supplied by another model or system. Inputs may be lists, tuples, 1D NumPy arrays, or 1D Torch tensors and are projected into `d_model`. | `n_dim`, `objective` (`l1`, `l2`) |
308
+ | `text` | String values encoded by a frozen Hugging Face `AutoModel`, pooled, and projected into `d_model`. Masked or targeted text is trained by reconstructing the encoder representation rather than generating text. | `model_name`, `max_length`, `encoder_batch_size`, `encoder_pooling` (`cls`, `mean`, `pooler`), `objective` (`l1`, `l2`), `revision`, `local_files_only` |
309
+
310
+ The `text` tensorfield requires the optional `transformers` dependency and is
311
+ not installed by default:
312
+
313
+ ```bash
314
+ uv sync --extra text
315
+ ```
316
+
317
+ ## Community
318
+
319
+ Join the Discord channel for questions, design discussion, and release notes:
320
+ <https://discord.gg/DVyZUkvTFA>
321
+
322
+ ## Repository Layout
323
+
324
+ - `src/json2vec/architecture`: model assembly, attention, pooling, and parcel routing
325
+ - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
326
+ - `src/json2vec/inference`: serving and prediction callbacks
327
+ - `src/json2vec/logging`: runtime logging callbacks
328
+ - `src/json2vec/preprocessors`: preprocessor registry
329
+ - `src/json2vec/structs`: pydantic config models, enums, and tree nodes
330
+ - `src/json2vec/tensorfields`: tensorfield plugin system and built-in field types
331
+ - `tests/`: package test suite
332
+ - [`docs/whitepaper.typ`](https://json2vec.github.io/json2vec/whitepaper.pdf): longer written documentation
333
+
334
+ ## Development
335
+
336
+ Run the test suite with:
337
+
338
+ ```bash
339
+ uv run pytest
340
+ ```
341
+
342
+ Run lint checks with:
343
+
344
+ ```bash
345
+ uv run ruff check
346
+ ```
347
+
348
+ ## License
349
+
350
+ Licensed under the Apache License, Version 2.0. See `LICENSE` and `NOTICE`.
351
+
352
+ ## References
353
+
354
+ - `BIBLIOGRAPHY.md`
355
+ - `CITATION.bib`
@@ -0,0 +1,321 @@
1
+ <p align="center">
2
+ <img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="JSON2Vec logo" width="180">
3
+ </p>
4
+
5
+ <h1 align="center">JSON2Vec</h1>
6
+
7
+ <p align="center">
8
+ <img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&amp;logoColor=white" />
9
+ <a href="LICENSE"><img alt="Apache-2.0 license" src="https://img.shields.io/badge/license-Apache--2.0-2E8B57" /></a>
10
+ <a href="https://json2vec.github.io/json2vec/"><img alt="Documentation" src="https://img.shields.io/badge/docs-MkDocs-526CFE?logo=materialformkdocs&amp;logoColor=white" /></a>
11
+ <!-- discord-invite:start -->
12
+ <a href="https://discord.gg/DVyZUkvTFA"><img alt="Discord channel invite" src="https://img.shields.io/badge/discord-join%20the%20channel-5865F2?logo=discord&amp;logoColor=white" /></a>
13
+ <!-- discord-invite:end -->
14
+ </p>
15
+
16
+ `json2vec` is a schema-driven framework for predictive modeling over nested,
17
+ structured records without flattening them into a fixed feature table first.
18
+
19
+ The schema becomes the encoder: leaf tensorfield plugins encode raw values,
20
+ array nodes aggregate child embeddings with transformer layers, and
21
+ datatype-specific decoders reconstruct masked, targeted, or supervised fields
22
+ from the surrounding hierarchy.
23
+
24
+ This supports self-supervised pretraining, supervised targets, embeddings, and
25
+ schema evolution in one model surface. Customer/account/transaction data,
26
+ flight itineraries, order fulfillment events, clickstream sessions, and other
27
+ nested records can use the same machinery while keeping proprietary data,
28
+ schemas, and checkpoints private.
29
+
30
+ ## What Makes This Different
31
+
32
+ - **Attributed embeddings.** The model can emit embeddings from any configured
33
+ field or array, not only from the root. That makes branch-level similarity and
34
+ retrieval workflows possible without flattening the record.
35
+ - **Extensible data types for predictive modeling.** Masked values,
36
+ targeted fields, and explicit supervised targets all flow through the same
37
+ datatype-specific heads. A new
38
+ [tensorfield type](https://json2vec.github.io/json2vec/guides/tensorfields/) brings its own embedding,
39
+ decoding, loss, and writing logic, so the framework stays reusable as schemas
40
+ grow.
41
+ - **Schema evolution is a first-class workflow.** Between training loops
42
+ (pretraining, finetuning, refitting, and task adaptation), the model can be
43
+ mutated. Fields can be added (`model.extend`), removed (`model.delete`),
44
+ updated (`model.update` / `with model.override`), and reset (`model.reset`).
45
+ See the [model update guide](https://json2vec.github.io/json2vec/guides/model-update/).
46
+ - **Production semantics for missingness.** `null`, `padded`, `masked`, and
47
+ `valued` are distinct states in the tensorfield type system.
48
+ They are not collapsed into one generic missing-value bucket.
49
+ - **Online state lives with the model.** Stateful components such as category
50
+ vocabularies, counters, and numeric normalization state are learned during
51
+ streaming training and serialized with checkpoints, so deployment does not
52
+ depend on a parallel tokenizer or normalizer artifact.
53
+ - **Training-serving parity.** The same configured graph is used for fitting,
54
+ validation, testing, batch prediction, and LitServe-backed online inference.
55
+ - **Target-trained counterfactuals.** Training can periodically remove whole
56
+ field instances with `target=True` or `p_prune`, not just mask individual
57
+ values. At inference time, schema overrides support ablation questions such
58
+ as "what changes if device data is unavailable?" without retraining a separate
59
+ model for every feature-removal scenario.
60
+
61
+ ## Where It Fits
62
+
63
+ Use `json2vec` when the hierarchy is part of the signal:
64
+
65
+ - customer, account, transaction, statement, device, and session records
66
+ - flight itineraries, legs, segments, and events
67
+ - orders, shipments, fulfillment events, and support histories
68
+ - entities with repeated sub-objects, evolving schemas, and mixed datatypes
69
+ - embedding retrieval, anomaly detection, counterfactual ablation, and
70
+ multi-target prediction over nested records
71
+
72
+ For more context on the modeling problem, read
73
+ [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/).
74
+
75
+ ## What It Does Not Do
76
+
77
+ `json2vec` stops at the representation and typed prediction layer. It does not
78
+ try to be a feature store, governance system, rule engine, authorization layer,
79
+ decision-capture system, or audit platform. Those systems can consume
80
+ `json2vec` embeddings and predictions, but their policies and operational
81
+ controls remain separate concerns.
82
+
83
+ It also does not require users to publish data, schemas, checkpoints, or model
84
+ parameters. The open-source layer is the reusable encoder and runtime
85
+ infrastructure. Your data stays yours, and so do your parameters.
86
+ The framework works under the assumption that model parameters will not be shared.
87
+
88
+ ## What Is In This Repository
89
+
90
+ This repository currently contains:
91
+
92
+ - the core library under `src/json2vec/`
93
+ - tensorfield plugins for `number`, `category`, `set`, `dateparts`, `entity`, `vector`, and `text`
94
+ - a preprocessor registry for dataset-specific preprocessing
95
+ - a LitServe deployment entrypoint for serving from checkpoints
96
+ - tests covering structure loading, data processing, tensorfields, training helpers, logging, and inference
97
+ - rendered tutorial and guide notebooks under [`docs/`](https://json2vec.github.io/json2vec/)
98
+ - diagrams and the whitepaper in [`docs/`](https://json2vec.github.io/json2vec/)
99
+
100
+ ## Install
101
+
102
+ For local development:
103
+
104
+ ```bash
105
+ uv sync
106
+ ```
107
+
108
+ The package requires Python `>=3.12`.
109
+
110
+ ## Hello World Notebook
111
+
112
+ The [hello world notebook](https://json2vec.github.io/json2vec/tutorials/hello-world/) trains a tiny model
113
+ from the bundled Iris JSONL buffer. It demonstrates the full loop: create a
114
+ Polars DataFrame, declare a schema, train a supervised category target, then
115
+ call `predict` and `embed`.
116
+
117
+ ```python
118
+ import lightning.pytorch as lit
119
+ import polars as pl
120
+ import torch
121
+ from rich.pretty import pprint
122
+
123
+ import json2vec as j2v
124
+
125
+
126
+ records = pl.read_ndjson("docs/data/iris.jsonl").head(36)
127
+
128
+ model = j2v.Model.from_schema(
129
+ j2v.Number("sepal_length"),
130
+ j2v.Number("petal_length"),
131
+ j2v.Category("species", target=True, max_vocab_size=4, topk=[2]),
132
+ d_model=16,
133
+ n_layers=1,
134
+ n_heads=4,
135
+ batch_size=8,
136
+ embed=True,
137
+ optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
138
+ )
139
+
140
+ datamodule = j2v.PolarsDataModule.from_model(
141
+ model,
142
+ train=records,
143
+ validate=records,
144
+ num_workers=0,
145
+ persistent_workers=False,
146
+ pin_memory=False,
147
+ observation_buffer_size=32,
148
+ sample_rate=1.0,
149
+ )
150
+
151
+ trainer = lit.Trainer(
152
+ accelerator="cpu",
153
+ max_epochs=1,
154
+ logger=False,
155
+ enable_progress_bar=False,
156
+ enable_model_summary=False,
157
+ enable_checkpointing=False,
158
+ limit_train_batches=1,
159
+ limit_val_batches=1,
160
+ )
161
+
162
+ trainer.fit(model=model, datamodule=datamodule)
163
+
164
+ batch = [[record] for record in records.to_dicts()[:3]]
165
+
166
+ pprint(model.predict(batch))
167
+ pprint(model.embed(batch))
168
+ ```
169
+
170
+ The prediction call returns a typed result for `record/species`. The embedding
171
+ call returns the configured `record` embedding for each input observation.
172
+
173
+ ## Documentation
174
+
175
+ The tutorial examples live as self-contained notebooks under `docs/` and are
175
+ rendered on the documentation site. Build the site locally with:
177
+
178
+ ```bash
179
+ uv run --extra docs mkdocs build --strict
180
+ ```
181
+
182
+ Useful entry points:
183
+
184
+ - [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
185
+ - [Why JSON2Vec](https://json2vec.github.io/json2vec/motivation/)
186
+ - [Schemas & Queries](https://json2vec.github.io/json2vec/guides/model-schemas/)
187
+ - [Model Updates](https://json2vec.github.io/json2vec/guides/model-update/)
188
+ - [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
189
+ - [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
190
+ - [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
191
+ - [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
192
+ - [Field Ablation](https://json2vec.github.io/json2vec/guides/field-ablation/)
193
+ - [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
194
+ - [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
195
+ - [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
196
+ - [API Reference](https://json2vec.github.io/json2vec/reference/api/)
197
+ - [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
198
+
199
+ ## Core Concepts
200
+
201
+ - `Model.from_schema(...)` builds the model tree plus masking, targeting, and embedding controls.
202
+ - `Array` nodes describe hierarchical grouping and aggregation.
203
+ - Field `Request` nodes declare a `type`, a `query`, and type-specific options.
204
+ - `Address` values are stable paths such as `record/account/transaction/amount`.
205
+ - `jmespath` queries extract values from each observation.
206
+ - `TensorField` instances preserve typed content plus state tokens such as
207
+ `valued`, `null`, `padded`, and `masked`.
208
+ - `Parcel` objects carry embeddings from leaves to parent arrays and then up
209
+ the tree.
210
+ - `heritage` is the path from a leaf to the root; decoders use that path when
211
+ reconstructing masked, targeted, or supervised targets.
212
+
213
+ For large local or cloud-hosted datasets, `StreamingDataModule` supports these
214
+ dataset suffixes:
215
+
216
+ - `ndjson`
217
+ - `parquet`
218
+ - `feather`
219
+ - `avro`
220
+ - `csv`
221
+ - `orc`
222
+ - `json`
223
+
224
+ Supported dataset roots are local paths and `s3://...` URIs.
225
+
226
+ ## How The Graph Runs
227
+
228
+ For each batch:
229
+
230
+ 1. Each field request extracts values with its `jmespath` query.
231
+ 2. The matching tensorfield plugin tensorizes values, updates online state when
232
+ allowed for the current split, and records trainable targets when masking or
233
+ targeting occurs.
234
+ 3. Leaf embedders emit parcels to their parent arrays.
235
+ 4. Array nodes run bottom-up, aggregate child parcels, and emit parent context.
236
+ 5. Leaf decoders consume their context path to reconstruct trainable targets.
237
+
238
+ Random masking (`p_mask`) corrupts individual values. Random pruning
238
+ (`p_prune`) removes whole field instances across an observation. `target=True`
239
+ is shorthand for `p_prune=1.0`; `embed=True` exposes embeddings during prediction.
241
+
242
+ ## Preprocessor Model
243
+
244
+ Preprocessors are optional registered Python callables. See the
245
+ [preprocessor guide](https://json2vec.github.io/json2vec/guides/preprocessors/) for examples. If no
246
+ preprocessor is configured, each observation is used as-is without calling a
247
+ default function.
248
+
249
+ Custom preprocessors are registered with `@preprocess(yields=False)` for single-object transformations or `@preprocess(yields=True)` for generators.
250
+
251
+ - transformation preprocessors must return a single `dict`
252
+ - generator preprocessors may yield `dict` objects or return a `list[dict]`
253
+ - every emitted object is wrapped as a single-item root array before tensorization
254
+
255
+ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported keyword arguments automatically ignored.
256
+
257
+ ## Tensorfield Plugins
258
+
259
+ Each tensorfield plugin provides a request schema plus the model components
260
+ needed to encode values, decode predictions, compute losses, and optionally
261
+ serialize outputs. See [Tensorfield Extensions](https://json2vec.github.io/json2vec/guides/tensorfields/)
262
+ for a custom plugin walkthrough. Built-in tensorfields share the base leaf
263
+ options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
264
+ `p_mask`, and `p_prune`.
265
+
266
+ | Type | Use It For | Key Options |
267
+ | --- | --- | --- |
268
+ | `number` | Scalar numeric values. Values are padded with explicit state tokens, normalized online during training, embedded with learned Fourier features, and decoded as regression targets. | `jitter`, `n_bands`, `offset`, `alpha`, `objective` (`mae`, `mse`, `huber`) |
269
+ | `category` | Single-label categorical values with an online vocabulary stored in the checkpoint. Unknown or overflow labels route to a reserved unavailable bucket instead of becoming `null`. Prediction output includes label probabilities and optional top-k candidates. | `max_vocab_size`, `n_bands`, `p_unavailable`, `topk` |
270
+ | `set` | Unordered collections of categorical labels, encoded as a multi-hot vector over an online vocabulary. Strings are treated as one-item sets, iterables as many-item sets, and unknown labels use the reserved unavailable bucket. | `max_vocab_size`, `p_unavailable` |
271
+ | `dateparts` | Datetime values represented through selected calendar/time components. Inputs may be native datetimes or strings parsed with a configured pattern. | `dateparts` (`day_of_year`, `week_of_year`, `month_of_year`, `day_of_month`, `week_of_month`, `day_of_week`, `hour_of_day`, `minute_of_hour`), `pattern` |
272
+ | `entity` | Hashable identifiers where the useful signal is equality or co-occurrence within the current observation rather than a global vocabulary. Values are re-indexed locally per observation and require at least two slots per observation. | `topk` |
273
+ | `vector` | Fixed-width numeric embeddings or dense feature vectors supplied by another model or system. Inputs may be lists, tuples, 1D NumPy arrays, or 1D Torch tensors and are projected into `d_model`. | `n_dim`, `objective` (`l1`, `l2`) |
274
+ | `text` | String values encoded by a frozen Hugging Face `AutoModel`, pooled, and projected into `d_model`. Masked or targeted text is trained by reconstructing the encoder representation rather than generating text. | `model_name`, `max_length`, `encoder_batch_size`, `encoder_pooling` (`cls`, `mean`, `pooler`), `objective` (`l1`, `l2`), `revision`, `local_files_only` |
275
+
276
+ The `text` tensorfield requires the optional `transformers` dependency, which
276
+ is not installed by default:
278
+
279
+ ```bash
280
+ uv sync --extra text
281
+ ```
282
+
283
+ ## Community
284
+
285
+ Join the Discord channel for questions, design discussion, and release notes:
286
+ <https://discord.gg/DVyZUkvTFA>
287
+
288
+ ## Repository Layout
289
+
290
+ - `src/json2vec/architecture`: model assembly, attention, pooling, and parcel routing
291
+ - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
292
+ - `src/json2vec/inference`: serving and prediction callbacks
293
+ - `src/json2vec/logging`: runtime logging callbacks
294
+ - `src/json2vec/preprocessors`: preprocessor registry
295
+ - `src/json2vec/structs`: pydantic config models, enums, and tree nodes
296
+ - `src/json2vec/tensorfields`: tensorfield plugin system and built-in field types
297
+ - `tests/`: package test suite
298
+ - [`docs/whitepaper.typ`](https://json2vec.github.io/json2vec/whitepaper.pdf): longer written documentation
299
+
300
+ ## Development
301
+
302
+ Run the test suite with:
303
+
304
+ ```bash
305
+ uv run pytest
306
+ ```
307
+
308
+ Run lint checks with:
309
+
310
+ ```bash
311
+ uv run ruff check
312
+ ```
313
+
314
+ ## License
315
+
316
+ Licensed under the Apache License, Version 2.0. See `LICENSE` and `NOTICE`.
317
+
318
+ ## References
319
+
320
+ - `BIBLIOGRAPHY.md`
321
+ - `CITATION.bib`