json2vec 0.4.5__tar.gz → 0.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json2vec-0.4.5/src/json2vec.egg-info → json2vec-0.4.6}/PKG-INFO +24 -21
- {json2vec-0.4.5 → json2vec-0.4.6}/README.md +21 -20
- {json2vec-0.4.5 → json2vec-0.4.6}/pyproject.toml +3 -1
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/__init__.py +60 -1
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/checkpoint.py +1 -1
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/encoder.py +1 -1
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/plot.py +2 -3
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/root.py +17 -61
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/runtime.py +52 -44
- json2vec-0.4.6/src/json2vec/inference/__init__.py +64 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/inference/callback.py +17 -25
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/inference/deployment.py +6 -11
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/preprocessors/base.py +1 -1
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/experiment.py +6 -14
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/packages.py +0 -25
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/structure.py +1 -2
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/tree.py +43 -43
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/base.py +4 -1
- {json2vec-0.4.5 → json2vec-0.4.6/src/json2vec.egg-info}/PKG-INFO +24 -21
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec.egg-info/requires.txt +2 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/tests/test_public_api.py +10 -0
- json2vec-0.4.5/src/json2vec/structs/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/LICENSE +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/NOTICE +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/setup.cfg +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/attention.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/contracts.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/graph.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/mutations.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/node.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/pool.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/architecture/rotary.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/datasets/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/datasets/base.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/datasets/polars.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/datasets/streaming.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/iterables.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/data/processing.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/distributed.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/logging/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/logging/config.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/logging/epoch.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/logging/throughput.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/preprocessors/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/preprocessors/extensions/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/preprocessors/spec.py +0 -0
- {json2vec-0.4.5/src/json2vec/inference → json2vec-0.4.6/src/json2vec/structs}/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/enums.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/structs/selectors.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/category.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/dateparts.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/entity.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/number.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/set.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/text.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/vector.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/__init__.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/counter.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/vocabulary.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec/tensorfields/spec.py +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec.egg-info/SOURCES.txt +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec.egg-info/dependency_links.txt +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/src/json2vec.egg-info/top_level.txt +0 -0
- {json2vec-0.4.5 → json2vec-0.4.6}/tests/test_callbacks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: json2vec
|
|
3
|
-
Version: 0.4.5
|
|
3
|
+
Version: 0.4.6
|
|
4
4
|
Summary: {...} -> [*]
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -27,16 +27,18 @@ Requires-Dist: pydantic-settings>=2.10.1; extra == "serving"
|
|
|
27
27
|
Provides-Extra: text
|
|
28
28
|
Requires-Dist: transformers>=4.55.0; extra == "text"
|
|
29
29
|
Provides-Extra: docs
|
|
30
|
+
Requires-Dist: litserve>=0.2.13; extra == "docs"
|
|
30
31
|
Requires-Dist: mkdocs-material>=9.6; extra == "docs"
|
|
31
32
|
Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
|
|
32
33
|
Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
|
|
34
|
+
Requires-Dist: pydantic-settings>=2.10.1; extra == "docs"
|
|
33
35
|
Dynamic: license-file
|
|
34
36
|
|
|
35
37
|
<p align="center">
|
|
36
|
-
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="
|
|
38
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="json2vec logo" width="180">
|
|
37
39
|
</p>
|
|
38
40
|
|
|
39
|
-
<h1 align="center">
|
|
41
|
+
<h1 align="center"><code>json2vec</code></h1>
|
|
40
42
|
|
|
41
43
|
<p align="center">
|
|
42
44
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
@@ -69,14 +71,14 @@ schemas, and checkpoints private.
|
|
|
69
71
|
- **Extensible data types for predictive modeling.** Masked values,
|
|
70
72
|
targeted fields, and explicit supervised targets all flow through the same
|
|
71
73
|
datatype-specific heads. A new
|
|
72
|
-
[tensorfield type](https://json2vec.github.io/json2vec/
|
|
74
|
+
[tensorfield type](https://json2vec.github.io/json2vec/data-types/tensorfields/) brings its own embedding,
|
|
73
75
|
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
74
76
|
grow.
|
|
75
77
|
- **Schema evolution is a first-class workflow.** Between training loops
|
|
76
78
|
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
77
79
|
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
78
80
|
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
79
|
-
See the [
|
|
81
|
+
See the [mutations guide](https://json2vec.github.io/json2vec/core-concepts/mutations/).
|
|
80
82
|
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
81
83
|
`valued` are distinct states in the tensorfield type system.
|
|
82
84
|
They are not collapsed into one generic missing-value bucket.
|
|
@@ -104,7 +106,7 @@ Use `json2vec` when the hierarchy is part of the signal:
|
|
|
104
106
|
multi-target prediction over nested records
|
|
105
107
|
|
|
106
108
|
For more context on the modeling problem, read
|
|
107
|
-
[Why
|
|
109
|
+
[Why `json2vec`](https://json2vec.github.io/json2vec/motivation/).
|
|
108
110
|
|
|
109
111
|
## What It Does Not Do
|
|
110
112
|
|
|
@@ -171,8 +173,8 @@ model = j2v.Model.from_schema(
|
|
|
171
173
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
172
174
|
)
|
|
173
175
|
|
|
174
|
-
datamodule = j2v.PolarsDataModule
|
|
175
|
-
model,
|
|
176
|
+
datamodule = j2v.PolarsDataModule(
|
|
177
|
+
model=model,
|
|
176
178
|
train=records,
|
|
177
179
|
validate=records,
|
|
178
180
|
num_workers=0,
|
|
@@ -195,14 +197,11 @@ trainer = lit.Trainer(
|
|
|
195
197
|
|
|
196
198
|
trainer.fit(model=model, datamodule=datamodule)
|
|
197
199
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
pprint(model.predict(batch))
|
|
201
|
-
pprint(model.embed(batch))
|
|
200
|
+
pprint(model.predict(records.to_dicts()[:3]))
|
|
202
201
|
```
|
|
203
202
|
|
|
204
|
-
The prediction call returns a typed result for `record/species
|
|
205
|
-
|
|
203
|
+
The prediction call returns a typed result for `record/species` and the
|
|
204
|
+
configured `record` embedding for each input observation.
|
|
206
205
|
|
|
207
206
|
## Documentation
|
|
208
207
|
|
|
@@ -216,16 +215,20 @@ uv run --extra docs mkdocs build --strict
|
|
|
216
215
|
Useful entry points:
|
|
217
216
|
|
|
218
217
|
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
219
|
-
- [
|
|
220
|
-
- [
|
|
221
|
-
- [
|
|
218
|
+
- [AI Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
|
|
219
|
+
- [Why `json2vec`](https://json2vec.github.io/json2vec/motivation/)
|
|
220
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
|
|
221
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
|
|
222
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
|
|
223
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
|
|
224
|
+
- [Mutations](https://json2vec.github.io/json2vec/core-concepts/mutations/)
|
|
222
225
|
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
223
|
-
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
224
226
|
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
227
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
225
228
|
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
226
|
-
- [Field
|
|
229
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
|
|
227
230
|
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
228
|
-
- [
|
|
231
|
+
- [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
229
232
|
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
230
233
|
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
231
234
|
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
@@ -292,7 +295,7 @@ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported k
|
|
|
292
295
|
|
|
293
296
|
Each tensorfield plugin provides a request schema plus the model components
|
|
294
297
|
needed to encode values, decode predictions, compute losses, and optionally
|
|
295
|
-
serialize outputs. See [
|
|
298
|
+
serialize outputs. See [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
296
299
|
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
297
300
|
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
298
301
|
`p_mask`, and `p_prune`.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="
|
|
2
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="json2vec logo" width="180">
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
|
-
<h1 align="center">
|
|
5
|
+
<h1 align="center"><code>json2vec</code></h1>
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
8
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
@@ -35,14 +35,14 @@ schemas, and checkpoints private.
|
|
|
35
35
|
- **Extensible data types for predictive modeling.** Masked values,
|
|
36
36
|
targeted fields, and explicit supervised targets all flow through the same
|
|
37
37
|
datatype-specific heads. A new
|
|
38
|
-
[tensorfield type](https://json2vec.github.io/json2vec/
|
|
38
|
+
[tensorfield type](https://json2vec.github.io/json2vec/data-types/tensorfields/) brings its own embedding,
|
|
39
39
|
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
40
40
|
grow.
|
|
41
41
|
- **Schema evolution is a first-class workflow.** Between training loops
|
|
42
42
|
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
43
43
|
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
44
44
|
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
45
|
-
See the [
|
|
45
|
+
See the [mutations guide](https://json2vec.github.io/json2vec/core-concepts/mutations/).
|
|
46
46
|
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
47
47
|
`valued` are distinct states in the tensorfield type system.
|
|
48
48
|
They are not collapsed into one generic missing-value bucket.
|
|
@@ -70,7 +70,7 @@ Use `json2vec` when the hierarchy is part of the signal:
|
|
|
70
70
|
multi-target prediction over nested records
|
|
71
71
|
|
|
72
72
|
For more context on the modeling problem, read
|
|
73
|
-
[Why
|
|
73
|
+
[Why `json2vec`](https://json2vec.github.io/json2vec/motivation/).
|
|
74
74
|
|
|
75
75
|
## What It Does Not Do
|
|
76
76
|
|
|
@@ -137,8 +137,8 @@ model = j2v.Model.from_schema(
|
|
|
137
137
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
datamodule = j2v.PolarsDataModule
|
|
141
|
-
model,
|
|
140
|
+
datamodule = j2v.PolarsDataModule(
|
|
141
|
+
model=model,
|
|
142
142
|
train=records,
|
|
143
143
|
validate=records,
|
|
144
144
|
num_workers=0,
|
|
@@ -161,14 +161,11 @@ trainer = lit.Trainer(
|
|
|
161
161
|
|
|
162
162
|
trainer.fit(model=model, datamodule=datamodule)
|
|
163
163
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
pprint(model.predict(batch))
|
|
167
|
-
pprint(model.embed(batch))
|
|
164
|
+
pprint(model.predict(records.to_dicts()[:3]))
|
|
168
165
|
```
|
|
169
166
|
|
|
170
|
-
The prediction call returns a typed result for `record/species
|
|
171
|
-
|
|
167
|
+
The prediction call returns a typed result for `record/species` and the
|
|
168
|
+
configured `record` embedding for each input observation.
|
|
172
169
|
|
|
173
170
|
## Documentation
|
|
174
171
|
|
|
@@ -182,16 +179,20 @@ uv run --extra docs mkdocs build --strict
|
|
|
182
179
|
Useful entry points:
|
|
183
180
|
|
|
184
181
|
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
185
|
-
- [
|
|
186
|
-
- [
|
|
187
|
-
- [
|
|
182
|
+
- [AI Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
|
|
183
|
+
- [Why `json2vec`](https://json2vec.github.io/json2vec/motivation/)
|
|
184
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
|
|
185
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
|
|
186
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
|
|
187
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
|
|
188
|
+
- [Mutations](https://json2vec.github.io/json2vec/core-concepts/mutations/)
|
|
188
189
|
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
189
|
-
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
190
190
|
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
191
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
191
192
|
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
192
|
-
- [Field
|
|
193
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
|
|
193
194
|
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
194
|
-
- [
|
|
195
|
+
- [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
195
196
|
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
196
197
|
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
197
198
|
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
@@ -258,7 +259,7 @@ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported k
|
|
|
258
259
|
|
|
259
260
|
Each tensorfield plugin provides a request schema plus the model components
|
|
260
261
|
needed to encode values, decode predictions, compute losses, and optionally
|
|
261
|
-
serialize outputs. See [
|
|
262
|
+
serialize outputs. See [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
262
263
|
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
263
264
|
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
264
265
|
`p_mask`, and `p_prune`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "json2vec"
|
|
3
|
-
version = "0.4.5"
|
|
3
|
+
version = "0.4.6"
|
|
4
4
|
description = "{...} -> [*]"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -31,9 +31,11 @@ text = [
|
|
|
31
31
|
"transformers>=4.55.0",
|
|
32
32
|
]
|
|
33
33
|
docs = [
|
|
34
|
+
"litserve>=0.2.13",
|
|
34
35
|
"mkdocs-material>=9.6",
|
|
35
36
|
"mkdocs-jupyter>=0.26.3",
|
|
36
37
|
"mkdocstrings[python]>=0.27",
|
|
38
|
+
"pydantic-settings>=2.10.1",
|
|
37
39
|
]
|
|
38
40
|
|
|
39
41
|
[dependency-groups]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Public
|
|
1
|
+
"""Public `json2vec` SDK surface.
|
|
2
2
|
|
|
3
3
|
The top-level package exports the constructors and helpers used by most
|
|
4
4
|
applications: `Model.from_schema(...)` for model construction, tensorfield
|
|
@@ -6,6 +6,8 @@ request constructors such as `Category` and `Number`, data modules, schema
|
|
|
6
6
|
mutation predicates, and the `@preprocess` decorator.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
9
11
|
from json2vec.architecture.root import (
|
|
10
12
|
Model,
|
|
11
13
|
MutationLockCallback,
|
|
@@ -15,6 +17,7 @@ from json2vec.architecture.root import (
|
|
|
15
17
|
SchedulerConfig,
|
|
16
18
|
)
|
|
17
19
|
from json2vec.data.datasets import PolarsDataModule, StreamingDataModule
|
|
20
|
+
from json2vec.inference.callback import Postprocessor, Writer
|
|
18
21
|
from json2vec.preprocessors import PREPROCESSORS, Preprocessor, PreprocessorMode, preprocess
|
|
19
22
|
from json2vec.structs.enums import AttentionMode, Component, Metric, ShardingStrategy, Strata, Suffix, TensorKey, Tokens
|
|
20
23
|
from json2vec.structs.experiment import (
|
|
@@ -37,20 +40,73 @@ from json2vec.tensorfields.extensions.text import Request as Text
|
|
|
37
40
|
from json2vec.tensorfields.extensions.vector import Request as Vector
|
|
38
41
|
from json2vec.tensorfields.shared.vocabulary import VocabularySyncCallback
|
|
39
42
|
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from json2vec.inference.deployment import (
|
|
45
|
+
API,
|
|
46
|
+
Accelerator,
|
|
47
|
+
BatchItem,
|
|
48
|
+
Deployment,
|
|
49
|
+
ErrorItem,
|
|
50
|
+
Input,
|
|
51
|
+
ModelSource,
|
|
52
|
+
UpdateOperation,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
_SERVING_EXPORTS = {
|
|
56
|
+
"API",
|
|
57
|
+
"Accelerator",
|
|
58
|
+
"BatchItem",
|
|
59
|
+
"Deployment",
|
|
60
|
+
"ErrorItem",
|
|
61
|
+
"Input",
|
|
62
|
+
"ModelSource",
|
|
63
|
+
"UpdateOperation",
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def __getattr__(name: str) -> Any:
|
|
68
|
+
if name not in _SERVING_EXPORTS:
|
|
69
|
+
raise AttributeError(f"module 'json2vec' has no attribute {name!r}")
|
|
70
|
+
|
|
71
|
+
try:
|
|
72
|
+
from json2vec.inference import deployment
|
|
73
|
+
except ModuleNotFoundError as error:
|
|
74
|
+
if error.name in {"litserve", "pydantic_settings"}:
|
|
75
|
+
raise ModuleNotFoundError(
|
|
76
|
+
f"json2vec.{name} requires the serving extra; install with `pip install json2vec[serving]`."
|
|
77
|
+
) from error
|
|
78
|
+
raise
|
|
79
|
+
|
|
80
|
+
value = getattr(deployment, name)
|
|
81
|
+
globals()[name] = value
|
|
82
|
+
return value
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def __dir__() -> list[str]:
|
|
86
|
+
return sorted([*globals(), *_SERVING_EXPORTS])
|
|
87
|
+
|
|
88
|
+
|
|
40
89
|
__all__ = [
|
|
41
90
|
"Address",
|
|
91
|
+
"API",
|
|
92
|
+
"Accelerator",
|
|
42
93
|
"Array",
|
|
43
94
|
"AttentionMode",
|
|
95
|
+
"BatchItem",
|
|
44
96
|
"Category",
|
|
45
97
|
"Component",
|
|
46
98
|
"DateParts",
|
|
47
99
|
"DecoderBase",
|
|
100
|
+
"Deployment",
|
|
48
101
|
"EmbedderBase",
|
|
49
102
|
"Entity",
|
|
103
|
+
"ErrorItem",
|
|
50
104
|
"Hyperparameters",
|
|
105
|
+
"Input",
|
|
51
106
|
"Leaf",
|
|
52
107
|
"Metric",
|
|
53
108
|
"Model",
|
|
109
|
+
"ModelSource",
|
|
54
110
|
"MutationLockCallback",
|
|
55
111
|
"NodeAttribute",
|
|
56
112
|
"NodePredicate",
|
|
@@ -59,6 +115,7 @@ __all__ = [
|
|
|
59
115
|
"PREPROCESSORS",
|
|
60
116
|
"Plugin",
|
|
61
117
|
"PolarsDataModule",
|
|
118
|
+
"Postprocessor",
|
|
62
119
|
"Preprocessor",
|
|
63
120
|
"PreprocessorMode",
|
|
64
121
|
"RequestBase",
|
|
@@ -76,8 +133,10 @@ __all__ = [
|
|
|
76
133
|
"TensorKey",
|
|
77
134
|
"Text",
|
|
78
135
|
"Tokens",
|
|
136
|
+
"UpdateOperation",
|
|
79
137
|
"Vector",
|
|
80
138
|
"VocabularySyncCallback",
|
|
139
|
+
"Writer",
|
|
81
140
|
"predicate",
|
|
82
141
|
"preprocess",
|
|
83
142
|
"where",
|
|
@@ -88,7 +88,7 @@ def render_schema_plot(
|
|
|
88
88
|
) -> RenderableType:
|
|
89
89
|
hyperparameters = module.hyperparameters
|
|
90
90
|
root = hyperparameters.fields if address is None else resolve_node(hyperparameters=hyperparameters, address=address)
|
|
91
|
-
title = "
|
|
91
|
+
title = "json2vec State" if state_focus else "json2vec Schema"
|
|
92
92
|
|
|
93
93
|
tree = Tree(render_node_label(module=module, node=root, state_focus=state_focus), guide_style="dim")
|
|
94
94
|
append_schema_children(tree=tree, module=module, node=root, detail=detail or state_focus, state_focus=state_focus)
|
|
@@ -252,7 +252,6 @@ def node_metadata_keys(node: Node, values: dict[str, Any], state_focus: bool) ->
|
|
|
252
252
|
"d_model",
|
|
253
253
|
"attention",
|
|
254
254
|
"max_length",
|
|
255
|
-
"n_outputs",
|
|
256
255
|
"n_layers",
|
|
257
256
|
"n_heads",
|
|
258
257
|
"batch_size",
|
|
@@ -267,7 +266,7 @@ def node_metadata_keys(node: Node, values: dict[str, Any], state_focus: bool) ->
|
|
|
267
266
|
elif isinstance(node, Leaf):
|
|
268
267
|
preferred = ["query", "pooling", "max_vocab_size", "topk", "objective", "weight"]
|
|
269
268
|
else:
|
|
270
|
-
preferred = ["attention", "max_length", "
|
|
269
|
+
preferred = ["attention", "max_length", "n_layers", "n_heads"]
|
|
271
270
|
|
|
272
271
|
remaining = [key for key in values if key not in preferred]
|
|
273
272
|
return preferred + remaining
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Public Lightning model facade for
|
|
1
|
+
"""Public Lightning model facade for `json2vec` schemas."""
|
|
2
2
|
|
|
3
3
|
from collections import Counter
|
|
4
4
|
from collections.abc import Callable, Iterator, Sequence
|
|
@@ -20,7 +20,7 @@ from json2vec.architecture.contracts import ContractScheduler
|
|
|
20
20
|
from json2vec.architecture.graph import ModelGraph
|
|
21
21
|
from json2vec.architecture.mutations import SchemaEditor
|
|
22
22
|
from json2vec.architecture.plot import PlotMode
|
|
23
|
-
from json2vec.architecture.runtime import
|
|
23
|
+
from json2vec.architecture.runtime import ModelRuntime, Postprocessor, Preprocessor, step
|
|
24
24
|
from json2vec.data.datasets.base import EncodedBatch, EncodedInput
|
|
25
25
|
from json2vec.structs.enums import AttentionMode, Strata
|
|
26
26
|
from json2vec.structs.experiment import (
|
|
@@ -30,7 +30,7 @@ from json2vec.structs.experiment import (
|
|
|
30
30
|
SchemaField,
|
|
31
31
|
)
|
|
32
32
|
from json2vec.structs.packages import Prediction
|
|
33
|
-
from json2vec.structs.tree import Address, Node,
|
|
33
|
+
from json2vec.structs.tree import Address, Node, Rate
|
|
34
34
|
from json2vec.tensorfields.base import TENSORFIELDS, Plugin, TensorFieldBase
|
|
35
35
|
|
|
36
36
|
OptimizerConfig = torch.optim.Optimizer | Callable[["Model"], torch.optim.Optimizer]
|
|
@@ -138,11 +138,11 @@ class RollbackCheckpoint(ModelCheckpoint):
|
|
|
138
138
|
|
|
139
139
|
|
|
140
140
|
class Model(lit.LightningModule):
|
|
141
|
-
"""Neural model generated from a
|
|
141
|
+
"""Neural model generated from a `json2vec` schema tree.
|
|
142
142
|
|
|
143
143
|
`Model` owns the schema hyperparameters, tensorfield embedders, array
|
|
144
|
-
encoders, decoders, and convenience methods for prediction,
|
|
145
|
-
|
|
144
|
+
encoders, decoders, and convenience methods for prediction, checkpointing,
|
|
145
|
+
plotting, and schema mutation.
|
|
146
146
|
|
|
147
147
|
Example:
|
|
148
148
|
```python
|
|
@@ -169,16 +169,13 @@ class Model(lit.LightningModule):
|
|
|
169
169
|
n_heads: int,
|
|
170
170
|
batch_size: int = 1,
|
|
171
171
|
fields: Sequence[SchemaField] | None = None,
|
|
172
|
-
|
|
172
|
+
name: str = "record",
|
|
173
173
|
description: str | None = None,
|
|
174
174
|
embed: bool = False,
|
|
175
175
|
attention: AttentionMode | str = AttentionMode.mha,
|
|
176
176
|
max_length: int = 1,
|
|
177
|
-
n_outputs: int = 1,
|
|
178
177
|
n_linear: int = 1,
|
|
179
178
|
dropout: Rate | None = None,
|
|
180
|
-
p_mask: Rate | None = None,
|
|
181
|
-
p_prune: PruneRate | None = None,
|
|
182
179
|
optimizer: OptimizerConfig | None = None,
|
|
183
180
|
scheduler: SchedulerConfig | None = None,
|
|
184
181
|
) -> Self:
|
|
@@ -193,16 +190,13 @@ class Model(lit.LightningModule):
|
|
|
193
190
|
batch_size: Batch size used by data modules, examples, and mocked
|
|
194
191
|
Lightning input arrays.
|
|
195
192
|
fields: Optional sequence form of `field_args`.
|
|
196
|
-
|
|
193
|
+
name: Root array name. Defaults to `record`.
|
|
197
194
|
description: Optional description on the generated root array.
|
|
198
195
|
embed: Configure the generated root array as an embedding output.
|
|
199
196
|
attention: Attention mode for the generated root array.
|
|
200
197
|
max_length: Maximum number of records per observation at the root.
|
|
201
|
-
n_outputs: Number of pooled outputs emitted by the generated root array.
|
|
202
198
|
n_linear: Feed-forward block count on the generated root array.
|
|
203
199
|
dropout: Optional dropout rate on the generated root array.
|
|
204
|
-
p_mask: Optional mask rate on the generated root array.
|
|
205
|
-
p_prune: Optional prune rate on the generated root array.
|
|
206
200
|
optimizer: Optimizer instance or factory used by Lightning training.
|
|
207
201
|
scheduler: Optional scheduler config or factory.
|
|
208
202
|
|
|
@@ -215,16 +209,13 @@ class Model(lit.LightningModule):
|
|
|
215
209
|
n_layers=n_layers,
|
|
216
210
|
n_heads=n_heads,
|
|
217
211
|
fields=fields,
|
|
218
|
-
|
|
212
|
+
name=name,
|
|
219
213
|
description=description,
|
|
220
214
|
embed=embed,
|
|
221
215
|
attention=attention,
|
|
222
216
|
max_length=max_length,
|
|
223
|
-
n_outputs=n_outputs,
|
|
224
217
|
n_linear=n_linear,
|
|
225
218
|
dropout=dropout,
|
|
226
|
-
p_mask=p_mask,
|
|
227
|
-
p_prune=p_prune,
|
|
228
219
|
)
|
|
229
220
|
return cls(
|
|
230
221
|
hyperparameters=hyperparameters,
|
|
@@ -299,7 +290,7 @@ class Model(lit.LightningModule):
|
|
|
299
290
|
"""Mutate selected schema nodes and rebuild compatible modules.
|
|
300
291
|
|
|
301
292
|
`target=True` is shorthand for `p_prune=1.0`; `target=False` clears
|
|
302
|
-
target behavior by setting `p_prune=
|
|
293
|
+
target behavior by setting `p_prune=0.0`.
|
|
303
294
|
|
|
304
295
|
Args:
|
|
305
296
|
*predicates: Predicates used to select nodes.
|
|
@@ -507,7 +498,7 @@ class Model(lit.LightningModule):
|
|
|
507
498
|
CheckpointState.dump(self, checkpoint)
|
|
508
499
|
|
|
509
500
|
def restore_checkpoint_state(self, checkpoint: dict[str, Any]) -> None:
|
|
510
|
-
"""Restore this model in place from a
|
|
501
|
+
"""Restore this model in place from a `json2vec` checkpoint dictionary."""
|
|
511
502
|
CheckpointState.restore(self, checkpoint)
|
|
512
503
|
|
|
513
504
|
@classmethod
|
|
@@ -515,16 +506,14 @@ class Model(lit.LightningModule):
|
|
|
515
506
|
"""Load a `Model` checkpoint written by `Model.save(...)`."""
|
|
516
507
|
return cast(Self, CheckpointState.load(cls, checkpoint))
|
|
517
508
|
|
|
518
|
-
def write(
|
|
519
|
-
self, predictions: list[Prediction]
|
|
520
|
-
) -> tuple[dict[Address, dict[str, Any]], dict[Address, dict[str, Any]]]:
|
|
509
|
+
def write(self, predictions: list[Prediction]) -> dict[Address, dict[str, Any]]:
|
|
521
510
|
return ModelRuntime.write(self, predictions)
|
|
522
511
|
|
|
523
512
|
@immutable("inference")
|
|
524
513
|
def encode(
|
|
525
514
|
self,
|
|
526
515
|
batch: EncodedBatch | list[dict[str, Any]],
|
|
527
|
-
preprocess:
|
|
516
|
+
preprocess: Preprocessor | None = None,
|
|
528
517
|
strata: Strata | str = Strata.predict,
|
|
529
518
|
) -> EncodedInput:
|
|
530
519
|
"""Return encoded tensorfield inputs for raw or processed observations."""
|
|
@@ -536,52 +525,19 @@ class Model(lit.LightningModule):
|
|
|
536
525
|
)
|
|
537
526
|
|
|
538
527
|
@immutable("inference")
|
|
539
|
-
def evaluate(
|
|
540
|
-
self,
|
|
541
|
-
batch: EncodedBatch | list[dict[str, Any]],
|
|
542
|
-
preprocess: PreprocessFn | None = None,
|
|
543
|
-
postprocess: Postprocessor | None = None,
|
|
544
|
-
) -> EvaluationResult:
|
|
545
|
-
"""Run prediction and embedding for encoded or raw observations.
|
|
546
|
-
|
|
547
|
-
If `preprocess` is omitted, raw records are encoded unchanged.
|
|
548
|
-
"""
|
|
549
|
-
return ModelRuntime.evaluate(
|
|
550
|
-
self,
|
|
551
|
-
batch=batch,
|
|
552
|
-
preprocess=preprocess,
|
|
553
|
-
postprocess=postprocess,
|
|
554
|
-
)
|
|
555
|
-
|
|
556
528
|
def predict(
|
|
557
529
|
self,
|
|
558
530
|
batch: EncodedBatch | list[dict[str, Any]],
|
|
559
|
-
preprocess:
|
|
531
|
+
preprocess: Preprocessor | None = None,
|
|
560
532
|
postprocess: Postprocessor | None = None,
|
|
561
533
|
) -> dict[Address, dict[str, Any]]:
|
|
562
|
-
"""Return typed predictions for a raw or encoded batch."""
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
batch=batch,
|
|
566
|
-
preprocess=preprocess,
|
|
567
|
-
postprocess=postprocess,
|
|
568
|
-
)
|
|
569
|
-
|
|
570
|
-
return result.predictions
|
|
571
|
-
|
|
572
|
-
def embed(
|
|
573
|
-
self,
|
|
574
|
-
batch: EncodedBatch | list[dict[str, Any]],
|
|
575
|
-
preprocess: PreprocessFn | None = None,
|
|
576
|
-
postprocess: Postprocessor | None = None,
|
|
577
|
-
) -> dict[Address, dict[str, Any]]:
|
|
578
|
-
"""Return configured embeddings for a raw or encoded batch."""
|
|
579
|
-
result = self.evaluate(
|
|
534
|
+
"""Return typed predictions and configured embeddings for a raw or encoded batch."""
|
|
535
|
+
return ModelRuntime.predict(
|
|
536
|
+
self,
|
|
580
537
|
batch=batch,
|
|
581
538
|
preprocess=preprocess,
|
|
582
539
|
postprocess=postprocess,
|
|
583
540
|
)
|
|
584
|
-
return result.embeddings
|
|
585
541
|
|
|
586
542
|
training_step = partialmethod(step, strata=Strata.train)
|
|
587
543
|
validation_step = partialmethod(step, strata=Strata.validate)
|