json2vec 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. {json2vec-0.2.0/src/json2vec.egg-info → json2vec-0.2.1}/PKG-INFO +129 -5
  2. json2vec-0.2.1/README.md +354 -0
  3. {json2vec-0.2.0 → json2vec-0.2.1}/pyproject.toml +1 -1
  4. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/counter.py +10 -0
  5. json2vec-0.2.1/src/json2vec/architecture/plot.py +252 -0
  6. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/root.py +11 -0
  7. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/inference/deployment.py +4 -5
  8. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/enums.py +1 -0
  9. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/base.py +32 -5
  10. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/category.py +35 -1
  11. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/number.py +23 -0
  12. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/text.py +2 -4
  13. {json2vec-0.2.0 → json2vec-0.2.1/src/json2vec.egg-info}/PKG-INFO +129 -5
  14. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec.egg-info/SOURCES.txt +1 -0
  15. json2vec-0.2.0/README.md +0 -230
  16. {json2vec-0.2.0 → json2vec-0.2.1}/LICENSE +0 -0
  17. {json2vec-0.2.0 → json2vec-0.2.1}/NOTICE +0 -0
  18. {json2vec-0.2.0 → json2vec-0.2.1}/setup.cfg +0 -0
  19. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/__init__.py +0 -0
  20. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/__main__.py +0 -0
  21. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/__init__.py +0 -0
  22. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/attention.py +0 -0
  23. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/encoder.py +0 -0
  24. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/node.py +0 -0
  25. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/pool.py +0 -0
  26. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/architecture/rotary.py +0 -0
  27. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/data/__init__.py +0 -0
  28. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/data/datasets.py +0 -0
  29. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/data/processing.py +0 -0
  30. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/entrypoints/__init__.py +0 -0
  31. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/entrypoints/pipeline.py +0 -0
  32. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/inference/__init__.py +0 -0
  33. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/inference/callback.py +0 -0
  34. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/logging/__init__.py +0 -0
  35. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/logging/config.py +0 -0
  36. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/logging/epoch.py +0 -0
  37. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/logging/throughput.py +0 -0
  38. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/logging/tracking.py +0 -0
  39. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/processors/__init__.py +0 -0
  40. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/processors/base.py +0 -0
  41. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/processors/extensions/__init__.py +0 -0
  42. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/processors/extensions/example.py +0 -0
  43. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/processors/spec.py +0 -0
  44. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/__init__.py +0 -0
  45. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/environment.py +0 -0
  46. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/experiment.py +0 -0
  47. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/packages.py +0 -0
  48. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/structure.py +0 -0
  49. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/structs/tree.py +0 -0
  50. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/__init__.py +0 -0
  51. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/__init__.py +0 -0
  52. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/dateparts.py +0 -0
  53. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/entity.py +0 -0
  54. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/extensions/vector.py +0 -0
  55. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec/tensorfields/spec.py +0 -0
  56. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec.egg-info/dependency_links.txt +0 -0
  57. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec.egg-info/entry_points.txt +0 -0
  58. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec.egg-info/requires.txt +0 -0
  59. {json2vec-0.2.0 → json2vec-0.2.1}/src/json2vec.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json2vec
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: JSON -> [*]
5
5
  License-Expression: Apache-2.0
6
6
  Requires-Python: >=3.12
@@ -34,9 +34,83 @@ Dynamic: license-file
34
34
 
35
35
  # JSON2Vec
36
36
 
37
- `json2vec` is a Python library for learning embeddings directly from nested, semi-structured records without flattening them into a fixed feature table first.
38
-
39
- The model is defined as a tree of contexts and typed fields. Leaf tensorfield plugins encode raw values, context nodes aggregate them with attention, and the same configured pipeline is used for training, batch prediction, and online inference.
37
+ `json2vec` is a schema-driven framework for learning embeddings and task
38
+ heads directly from nested, semi-structured records without flattening them
39
+ into a fixed feature table first.
40
+
41
+ The central idea is that the schema is the encoder. A declared tree of
42
+ contexts and typed fields becomes an addressable neural graph: leaf tensorfield
43
+ plugins encode raw values, context nodes aggregate child embeddings with
44
+ rotary self-attention and learned-query cross-attention pooling, and
45
+ datatype-specific decoders reconstruct masked, pruned, or supervised targets
46
+ from the surrounding hierarchy.
47
+
48
+ This makes `json2vec` a factory for structure-aware encoders rather than a
49
+ single domain model. Customer/account/transaction data, flight itineraries,
50
+ order fulfillment events, clickstream sessions, and other nested records can
51
+ all use the same machinery while keeping their proprietary data, schemas, and
52
+ trained checkpoints private.
53
+
54
+ ## What Makes This Different
55
+
56
+ - **Attributed-distance embeddings.** The model can emit embeddings at any
57
+ configured field or context, not only at the root. That means two observations
58
+ can be similar overall while still exposing which branch of the hierarchy
59
+ accounts for the difference: customer profile, monthly statement, login
60
+ session, transaction history, or any other declared context.
61
+ - **Prune-trained counterfactuals.** Training can periodically remove whole
62
+ fields, not just mask individual values. At inference time, the
63
+ same mechanism supports zero-shot ablation questions such as "what changes if
64
+ device data is unavailable?" without retraining a separate model for every
65
+ feature-removal scenario.
66
+ - **One path for self-supervised and supervised learning.** Masked values,
67
+ pruned fields, and explicit supervised targets all flow through the same
68
+ datatype-specific heads. A new tensorfield type brings its own embedding,
69
+ decoding, loss, and writing logic, so the framework stays reusable as schemas
70
+ grow.
71
+ - **Schema evolution is a first-class workflow.** Because modules are addressed
72
+ by the schema tree, structures can be patched, fields can be added or
73
+ removed, and selected fields can be pruned across sessions without rebuilding
74
+ a separate feature pipeline.
75
+ - **Production semantics for missingness.** `null`, `padded`, `masked`,
76
+ `pruned`, and `valued` are distinct states in the tensorfield type system.
77
+ They are not collapsed into one generic missing-value bucket.
78
+ - **Online state lives with the model.** Stateful components such as category
79
+ vocabularies, counters, and numeric normalization state are learned during
80
+ streaming training and serialized with checkpoints, so deployment does not
81
+ depend on a parallel tokenizer or normalizer artifact.
82
+ - **Training-serving parity.** The same configured graph is used for fitting,
83
+ validation, testing, batch prediction, and LitServe-backed online inference.
84
+
85
+ The attributed embeddings and prune-trained ablations are model-level
86
+ explanation primitives. They help answer where two records differ and how a
87
+ prediction changes when an information source is withheld. They are not a
88
+ complete compliance story by themselves, but they make governance and audit
89
+ layers easier to build on top of the representation layer.
90
+
91
+ ## Where It Fits
92
+
93
+ Use `json2vec` when the hierarchy is part of the signal:
94
+
95
+ - customer, account, transaction, statement, device, and session records
96
+ - flight itineraries, legs, segments, and events
97
+ - orders, shipments, fulfillment events, and support histories
98
+ - entities with repeated sub-objects, evolving schemas, and mixed datatypes
99
+ - embedding retrieval, anomaly detection, counterfactual ablation, and
100
+ multi-target prediction over nested records
101
+
102
+ ## What It Does Not Do
103
+
104
+ `json2vec` stops at the representation and typed prediction layer. It does not
105
+ try to be a feature store, governance system, rule engine, authorization layer,
106
+ decision-capture system, or audit platform. Those systems can consume
107
+ `json2vec` embeddings and predictions, but their policies and operational
108
+ controls remain separate concerns.
109
+
110
+ It also does not require users to publish data, schemas, checkpoints, or model
111
+ parameters. The open-source layer is the reusable encoder and runtime
112
+ infrastructure. Your data stays yours, as do your parameters.
113
+ The framework works under the assumption that model parameters will not be shared.
40
114
 
41
115
  ## What Is In This Repository
42
116
 
@@ -51,6 +125,8 @@ This repository currently contains:
51
125
 
52
126
  It does not currently ship maintained example experiments or `make` shortcuts. Older references to `experiments/`, `examples/`, and `make train` were removed because they no longer reflect the checked-in code.
53
127
 
128
+ More examples based on publicly available data will soon be included to showcase implementation and expected behavior.
129
+
54
130
  ## Install
55
131
 
56
132
  For local development:
@@ -72,8 +148,16 @@ The package requires Python `>=3.12`.
72
148
  - `Structure` defines the model tree.
73
149
  - `Context` nodes describe hierarchical grouping and aggregation.
74
150
  - Field `Request` nodes declare a `type`, a `query`, and type-specific options.
151
+ - `Address` values are stable paths such as `root/account/transaction/amount`.
75
152
  - `jmespath` queries extract values from each observation.
76
- - `Session` combines a dataset, structure, task, and runtime controls.
153
+ - `TensorField` instances preserve typed content plus state tokens such as
154
+ `valued`, `null`, `padded`, `masked`, and `pruned`.
155
+ - `Parcel` objects carry embeddings from leaves to parent contexts and then up
156
+ the tree.
157
+ - `heritage` is the path from a leaf to the root; decoders use that path as
158
+ context when reconstructing masked, pruned, or supervised targets.
159
+ - `Session` combines a dataset, structure, task, masking/pruning controls, and
160
+ selected embedding outputs.
77
161
  - `Experiment` is an ordered list of sessions loaded from config files.
78
162
 
79
163
  Supported session tasks are:
@@ -94,6 +178,27 @@ Supported dataset suffixes are:
94
178
  - `json`
95
179
 
96
180
  Supported dataset roots are local paths and `s3://...` URIs. If `dataset.root` is `null`, the pipeline runs in processor-driven mode and expects the configured processor to generate observations.
181
+ This will likely expand to support `@register`-based UDFs for arbitrary data sources and file formats.
182
+
183
+ ## How The Graph Runs
184
+
185
+ For each batch:
186
+
187
+ 1. Each field request extracts values with its `jmespath` query.
188
+ 2. The matching tensorfield plugin tensorizes those values, updates any online
189
+ state allowed for the current split, and records trainable targets when
190
+ masking or pruning occurs.
191
+ 3. Leaf embedders emit parcels to their parent contexts.
192
+ 4. Context nodes run bottom-up. Each context concatenates available child
193
+ parcels, applies rotary transformer layers, compresses with learned-query
194
+ cross-attention, and emits a new parcel to its parent.
195
+ 5. Leaf decoders consume the parcel sequence along their heritage path to
196
+ reconstruct trainable targets.
197
+
198
+ Random `p_mask` corrupts individual values. Random `p_prune` removes whole
199
+ field instances across an observation. Session-level `pruned` fields are always
200
+ withheld and become supervised targets; session-level `output` addresses are
201
+ serialized as embeddings during prediction.
97
202
 
98
203
  ## Minimal Training Workflow
99
204
 
@@ -115,6 +220,10 @@ sessions:
115
220
  - name: train
116
221
  task: fit
117
222
  learning_rate: 0.001
223
+ p_mask: 0.15
224
+ p_prune: 0.05
225
+ output:
226
+ - root
118
227
  dataset:
119
228
  root: /path/to/data
120
229
  sample_rate: 1.0
@@ -139,6 +248,9 @@ sessions:
139
248
  type: context
140
249
  context_size: 1
141
250
  n_outputs: 1
251
+ n_layers: 1
252
+ n_heads: 4
253
+ n_linear: 1
142
254
  fields:
143
255
  - name: identifier
144
256
  type: category
@@ -148,10 +260,22 @@ sessions:
148
260
 
149
261
  `fit` sessions write checkpoints to `models/`. In multi-session experiments, the output checkpoint from a `fit` session is automatically passed to later `validate`, `test`, or `predict` sessions.
150
262
 
263
+ To turn a field into a supervised target, include its address in
264
+ `session.pruned` for a fit, validate, test, or predict session. The model will
265
+ withhold that field from the encoder and use the same datatype-specific decoder
266
+ that is used for masked/pruned reconstruction. To export embeddings, include
267
+ field or context addresses in `session.output`.
268
+
151
269
  ## Inference And Serving
152
270
 
153
271
  Batch prediction uses the same experiment/session machinery as training. Prediction outputs are written to `tmp/predictions/`.
154
272
 
273
+ Checkpoints carry the Lightning weights, serialized session configuration, and
274
+ stateful tensorfield state such as online category vocabularies, numeric
275
+ normalization buffers, and class-frequency counters. This tight coupling is
276
+ intentional: the deployed model should not depend on a separate, manually
277
+ synchronized tokenizer or normalizer artifact.
278
+
155
279
  For online serving, the repository exposes `json2vec.inference.deployment.Deployment`, which wraps a checkpoint-backed model in LitServe. Runtime configuration is environment-driven:
156
280
 
157
281
  - `JSON2VEC_CHECKPOINT` or `CHECKPOINT`
@@ -0,0 +1,354 @@
1
+ # JSON2Vec
2
+
3
+ `json2vec` is a schema-driven framework for learning embeddings and task
4
+ heads directly from nested, semi-structured records without flattening them
5
+ into a fixed feature table first.
6
+
7
+ The central idea is that the schema is the encoder. A declared tree of
8
+ contexts and typed fields becomes an addressable neural graph: leaf tensorfield
9
+ plugins encode raw values, context nodes aggregate child embeddings with
10
+ rotary self-attention and learned-query cross-attention pooling, and
11
+ datatype-specific decoders reconstruct masked, pruned, or supervised targets
12
+ from the surrounding hierarchy.
13
+
14
+ This makes `json2vec` a factory for structure-aware encoders rather than a
15
+ single domain model. Customer/account/transaction data, flight itineraries,
16
+ order fulfillment events, clickstream sessions, and other nested records can
17
+ all use the same machinery while keeping their proprietary data, schemas, and
18
+ trained checkpoints private.
19
+
20
+ ## What Makes This Different
21
+
22
+ - **Attributed-distance embeddings.** The model can emit embeddings at any
23
+ configured field or context, not only at the root. That means two observations
24
+ can be similar overall while still exposing which branch of the hierarchy
25
+ accounts for the difference: customer profile, monthly statement, login
26
+ session, transaction history, or any other declared context.
27
+ - **Prune-trained counterfactuals.** Training can periodically remove whole
28
+ fields, not just mask individual values. At inference time, the
29
+ same mechanism supports zero-shot ablation questions such as "what changes if
30
+ device data is unavailable?" without retraining a separate model for every
31
+ feature-removal scenario.
32
+ - **One path for self-supervised and supervised learning.** Masked values,
33
+ pruned fields, and explicit supervised targets all flow through the same
34
+ datatype-specific heads. A new tensorfield type brings its own embedding,
35
+ decoding, loss, and writing logic, so the framework stays reusable as schemas
36
+ grow.
37
+ - **Schema evolution is a first-class workflow.** Because modules are addressed
38
+ by the schema tree, structures can be patched, fields can be added or
39
+ removed, and selected fields can be pruned across sessions without rebuilding
40
+ a separate feature pipeline.
41
+ - **Production semantics for missingness.** `null`, `padded`, `masked`,
42
+ `pruned`, and `valued` are distinct states in the tensorfield type system.
43
+ They are not collapsed into one generic missing-value bucket.
44
+ - **Online state lives with the model.** Stateful components such as category
45
+ vocabularies, counters, and numeric normalization state are learned during
46
+ streaming training and serialized with checkpoints, so deployment does not
47
+ depend on a parallel tokenizer or normalizer artifact.
48
+ - **Training-serving parity.** The same configured graph is used for fitting,
49
+ validation, testing, batch prediction, and LitServe-backed online inference.
50
+
51
+ The attributed embeddings and prune-trained ablations are model-level
52
+ explanation primitives. They help answer where two records differ and how a
53
+ prediction changes when an information source is withheld. They are not a
54
+ complete compliance story by themselves, but they make governance and audit
55
+ layers easier to build on top of the representation layer.
56
+
57
+ ## Where It Fits
58
+
59
+ Use `json2vec` when the hierarchy is part of the signal:
60
+
61
+ - customer, account, transaction, statement, device, and session records
62
+ - flight itineraries, legs, segments, and events
63
+ - orders, shipments, fulfillment events, and support histories
64
+ - entities with repeated sub-objects, evolving schemas, and mixed datatypes
65
+ - embedding retrieval, anomaly detection, counterfactual ablation, and
66
+ multi-target prediction over nested records
67
+
68
+ ## What It Does Not Do
69
+
70
+ `json2vec` stops at the representation and typed prediction layer. It does not
71
+ try to be a feature store, governance system, rule engine, authorization layer,
72
+ decision-capture system, or audit platform. Those systems can consume
73
+ `json2vec` embeddings and predictions, but their policies and operational
74
+ controls remain separate concerns.
75
+
76
+ It also does not require users to publish data, schemas, checkpoints, or model
77
+ parameters. The open-source layer is the reusable encoder and runtime
78
+ infrastructure. Your data stays yours, as do your parameters.
79
+ The framework works under the assumption that model parameters will not be shared.
80
+
81
+ ## What Is In This Repository
82
+
83
+ This repository currently contains:
84
+
85
+ - the core library under `src/json2vec/`
86
+ - tensorfield plugins for `number`, `category`, `dateparts`, `entity`, `vector`, and `text`
87
+ - a processor registry for dataset-specific preprocessing
88
+ - a LitServe deployment entrypoint for serving from checkpoints
89
+ - tests covering structure loading, data processing, tensorfields, training helpers, logging, and inference
90
+ - diagrams plus longer design docs in `docs/`
91
+
92
+ It does not currently ship maintained example experiments or `make` shortcuts. Older references to `experiments/`, `examples/`, and `make train` were removed because they no longer reflect the checked-in code.
93
+
94
+ More examples based on publicly available data will soon be included to showcase implementation and expected behavior.
95
+
96
+ ## Install
97
+
98
+ For local development:
99
+
100
+ ```bash
101
+ uv sync
102
+ ```
103
+
104
+ If you want an editable install:
105
+
106
+ ```bash
107
+ pip install -e .
108
+ ```
109
+
110
+ The package requires Python `>=3.12`.
111
+
112
+ ## Core Concepts
113
+
114
+ - `Structure` defines the model tree.
115
+ - `Context` nodes describe hierarchical grouping and aggregation.
116
+ - Field `Request` nodes declare a `type`, a `query`, and type-specific options.
117
+ - `Address` values are stable paths such as `root/account/transaction/amount`.
118
+ - `jmespath` queries extract values from each observation.
119
+ - `TensorField` instances preserve typed content plus state tokens such as
120
+ `valued`, `null`, `padded`, `masked`, and `pruned`.
121
+ - `Parcel` objects carry embeddings from leaves to parent contexts and then up
122
+ the tree.
123
+ - `heritage` is the path from a leaf to the root; decoders use that path as
124
+ context when reconstructing masked, pruned, or supervised targets.
125
+ - `Session` combines a dataset, structure, task, masking/pruning controls, and
126
+ selected embedding outputs.
127
+ - `Experiment` is an ordered list of sessions loaded from config files.
128
+
129
+ Supported session tasks are:
130
+
131
+ - `fit`
132
+ - `validate`
133
+ - `test`
134
+ - `predict`
135
+
136
+ Supported dataset suffixes are:
137
+
138
+ - `ndjson`
139
+ - `parquet`
140
+ - `feather`
141
+ - `avro`
142
+ - `csv`
143
+ - `orc`
144
+ - `json`
145
+
146
+ Supported dataset roots are local paths and `s3://...` URIs. If `dataset.root` is `null`, the pipeline runs in processor-driven mode and expects the configured processor to generate observations.
147
+ This will likely expand to support `@register`-based UDFs for arbitrary data sources and file formats.
148
+
149
+ ## How The Graph Runs
150
+
151
+ For each batch:
152
+
153
+ 1. Each field request extracts values with its `jmespath` query.
154
+ 2. The matching tensorfield plugin tensorizes those values, updates any online
155
+ state allowed for the current split, and records trainable targets when
156
+ masking or pruning occurs.
157
+ 3. Leaf embedders emit parcels to their parent contexts.
158
+ 4. Context nodes run bottom-up. Each context concatenates available child
159
+ parcels, applies rotary transformer layers, compresses with learned-query
160
+ cross-attention, and emits a new parcel to its parent.
161
+ 5. Leaf decoders consume the parcel sequence along their heritage path to
162
+ reconstruct trainable targets.
163
+
164
+ Random `p_mask` corrupts individual values. Random `p_prune` removes whole
165
+ field instances across an observation. Session-level `pruned` fields are always
166
+ withheld and become supervised targets; session-level `output` addresses are
167
+ serialized as embeddings during prediction.
168
+
169
+ ## Minimal Training Workflow
170
+
171
+ The CLI entrypoint is:
172
+
173
+ ```bash
174
+ uv run python -m json2vec --experiments /path/to/configs --experiment demo --name local-dev --notes "first run"
175
+ ```
176
+
177
+ The same function is also exposed as the `train` console script after installation.
178
+
179
+ Config discovery is directory-based. `json2vec` can load `.json`, `.yaml`, `.yml`, `.toml`, and `.jsonnet` experiment files. If a config directory contains exactly one experiment file, `--experiment` can be omitted.
180
+
181
+ A minimal YAML experiment looks like this:
182
+
183
+ ```yaml
184
+ project: demo
185
+ sessions:
186
+ - name: train
187
+ task: fit
188
+ learning_rate: 0.001
189
+ p_mask: 0.15
190
+ p_prune: 0.05
191
+ output:
192
+ - root
193
+ dataset:
194
+ root: /path/to/data
195
+ sample_rate: 1.0
196
+ file_buffer_size: 16
197
+ observation_buffer_size: 16
198
+ processor: default
199
+ kwargs: {}
200
+ suffix: ndjson
201
+ patterns:
202
+ train: .*
203
+ validate: .*
204
+ test: .*
205
+ predict: .*
206
+ structure:
207
+ name: demo-structure
208
+ type: structure
209
+ batch_size: 2
210
+ dropout: 0.1
211
+ d_model: 16
212
+ fields:
213
+ name: root
214
+ type: context
215
+ context_size: 1
216
+ n_outputs: 1
217
+ n_layers: 1
218
+ n_heads: 4
219
+ n_linear: 1
220
+ fields:
221
+ - name: identifier
222
+ type: category
223
+ query: "[*].id"
224
+ max_vocab_size: 1024
225
+ ```
226
+
227
+ `fit` sessions write checkpoints to `models/`. In multi-session experiments, the output checkpoint from a `fit` session is automatically passed to later `validate`, `test`, or `predict` sessions.
228
+
229
+ To turn a field into a supervised target, include its address in
230
+ `session.pruned` for a fit, validate, test, or predict session. The model will
231
+ withhold that field from the encoder and use the same datatype-specific decoder
232
+ that is used for masked/pruned reconstruction. To export embeddings, include
233
+ field or context addresses in `session.output`.
234
+
235
+ ## Inference And Serving
236
+
237
+ Batch prediction uses the same experiment/session machinery as training. Prediction outputs are written to `tmp/predictions/`.
238
+
239
+ Checkpoints carry the Lightning weights, serialized session configuration, and
240
+ stateful tensorfield state such as online category vocabularies, numeric
241
+ normalization buffers, and class-frequency counters. This tight coupling is
242
+ intentional: the deployed model should not depend on a separate, manually
243
+ synchronized tokenizer or normalizer artifact.
244
+
245
+ For online serving, the repository exposes `json2vec.inference.deployment.Deployment`, which wraps a checkpoint-backed model in LitServe. Runtime configuration is environment-driven:
246
+
247
+ - `JSON2VEC_CHECKPOINT` or `CHECKPOINT`
248
+ - `JSON2VEC_MAX_BATCH_SIZE`
249
+ - `JSON2VEC_BATCH_TIMEOUT`
250
+ - `JSON2VEC_WORKERS_PER_DEVICE`
251
+ - `JSON2VEC_ACCELERATOR`
252
+ - `JSON2VEC_TRACK_REQUESTS`
253
+
254
+ A minimal serve entrypoint is:
255
+
256
+ ```python
257
+ from json2vec.inference.deployment import Deployment
258
+
259
+ Deployment.serve()
260
+ ```
261
+
262
+ ## Processor Model
263
+
264
+ Processors are registered Python callables. The built-in `default` processor returns each observation unchanged.
265
+
266
+ Custom processors live under `src/json2vec/processors/extensions/` and are registered with either `@register.transformation` or `@register.generator`.
267
+
268
+ - transformation processors must return a single `dict`
269
+ - generator processors may yield `dict` objects or return a `list[dict]`
270
+ - every emitted object is wrapped as a single-item root context before tensorization
271
+
272
+ Configured `dataset.kwargs` are passed into the processor, with unsupported keyword arguments automatically ignored.
273
+
274
+ ## Tensorfield Plugins
275
+
276
+ The current built-in tensorfield types are:
277
+
278
+ - `number`
279
+ - `category`
280
+ - `dateparts`
281
+ - `entity`
282
+ - `vector`
283
+ - `text`
284
+
285
+ Each tensorfield plugin provides a request schema plus the model components needed to encode values, decode predictions, compute losses, and optionally serialize outputs.
286
+
287
+ The `text` tensorfield requires the optional `transformers` dependency and is not installed by default.
288
+
289
+ ## Runtime Environment
290
+
291
+ Training and dataloading behavior is controlled with environment variables such as:
292
+
293
+ - `JSON2VEC_LOGGER`
294
+ - `WANDB_API_KEY`
295
+ - `NEPTUNE_API_TOKEN`
296
+ - `COMET_API_KEY`
297
+ - `MLFLOW_TRACKING_URI`
298
+ - `JSON2VEC_TENSORBOARD_LOG_DIR`
299
+ - `JSON2VEC_CSV_LOG_DIR`
300
+ - `JSON2VEC_NUM_WORKERS`
301
+ - `JSON2VEC_PERSISTENT_WORKERS`
302
+ - `JSON2VEC_PIN_MEMORY`
303
+ - `JSON2VEC_SHARDING`
304
+ - `JSON2VEC_CHUNK_BATCH_SIZE`
305
+
306
+ Supported sharding strategies are `file`, `chunk`, and `record`.
307
+
308
+ ## Repository Layout
309
+
310
+ - `src/json2vec/architecture`: model assembly, attention, pooling, and parcel routing
311
+ - `src/json2vec/data`: dataset fetch/read/process/batch/encode pipeline
312
+ - `src/json2vec/entrypoints`: training and evaluation orchestration
313
+ - `src/json2vec/inference`: serving and prediction callbacks
314
+ - `src/json2vec/logging`: tracking and runtime logging helpers
315
+ - `src/json2vec/processors`: processor registry and built-in extensions
316
+ - `src/json2vec/structs`: pydantic config models, enums, tree structures, and environment settings
317
+ - `src/json2vec/tensorfields`: tensorfield plugin system and built-in field types
318
+ - `tests/`: package test suite
319
+ - `docs/summary.typ` and `docs/whitepaper.typ`: longer written documentation
320
+
321
+ ## Diagrams
322
+
323
+ The repository includes architecture and pipeline diagrams:
324
+
325
+ ![Tree of encoding modules](docs/diagrams/tree.drawio.svg)
326
+
327
+ ![Single context node](docs/diagrams/node.drawio.svg)
328
+
329
+ ![Pipeline stages](docs/diagrams/pipeline.drawio.svg)
330
+
331
+ ![Example configured module tree](docs/diagrams/modules.drawio.svg)
332
+
333
+ ## Development
334
+
335
+ Run the test suite with:
336
+
337
+ ```bash
338
+ uv run pytest
339
+ ```
340
+
341
+ Run lint checks with:
342
+
343
+ ```bash
344
+ uv run ruff check
345
+ ```
346
+
347
+ ## License
348
+
349
+ Licensed under the Apache License, Version 2.0. See `LICENSE` and `NOTICE`.
350
+
351
+ ## References
352
+
353
+ - `BIBLIOGRAPHY.md`
354
+ - `CITATION.bib`
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "json2vec"
3
- version = "0.2.0"
3
+ version = "0.2.1"
4
4
  description = "JSON -> [*]"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -14,6 +14,16 @@ class Counter(torch.nn.Module):
14
14
  self.register_buffer("counts", torch.ones(size, dtype=torch.int64))
15
15
  self.is_full: bool = False
16
16
 
17
+ def __str__(self) -> str:
18
+ counts = self.counts.detach().cpu().tolist()
19
+ return "\n".join(
20
+ (
21
+ f"size: {self.size}",
22
+ f"is_full: {self.is_full}",
23
+ f"counts: {counts}",
24
+ )
25
+ )
26
+
17
27
  @torch.no_grad()
18
28
  def forward(self, values: torch.Tensor):
19
29
  if self.training and not self.is_full: