json2vec 0.4.4__tar.gz → 0.4.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json2vec-0.4.4/src/json2vec.egg-info → json2vec-0.4.6}/PKG-INFO +24 -21
- {json2vec-0.4.4 → json2vec-0.4.6}/README.md +21 -20
- {json2vec-0.4.4 → json2vec-0.4.6}/pyproject.toml +3 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/__init__.py +60 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/checkpoint.py +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/contracts.py +3 -3
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/encoder.py +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/graph.py +18 -5
- json2vec-0.4.6/src/json2vec/architecture/mutations.py +323 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/plot.py +2 -3
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/root.py +26 -54
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/runtime.py +69 -55
- json2vec-0.4.6/src/json2vec/inference/__init__.py +64 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/inference/callback.py +17 -25
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/inference/deployment.py +6 -11
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/preprocessors/base.py +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/experiment.py +16 -19
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/packages.py +0 -25
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/selectors.py +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/structure.py +1 -2
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/tree.py +43 -28
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/base.py +4 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/category.py +10 -2
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/dateparts.py +46 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/number.py +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/set.py +18 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/vector.py +50 -8
- {json2vec-0.4.4 → json2vec-0.4.6/src/json2vec.egg-info}/PKG-INFO +24 -21
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec.egg-info/SOURCES.txt +1 -1
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec.egg-info/requires.txt +2 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/tests/test_public_api.py +10 -0
- json2vec-0.4.4/src/json2vec/architecture/schema_editor.py +0 -126
- json2vec-0.4.4/src/json2vec/structs/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/LICENSE +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/NOTICE +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/setup.cfg +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/attention.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/node.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/pool.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/architecture/rotary.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/datasets/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/datasets/base.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/datasets/polars.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/datasets/streaming.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/iterables.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/data/processing.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/distributed.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/logging/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/logging/config.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/logging/epoch.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/logging/throughput.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/preprocessors/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/preprocessors/extensions/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/preprocessors/spec.py +0 -0
- {json2vec-0.4.4/src/json2vec/inference → json2vec-0.4.6/src/json2vec/structs}/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/structs/enums.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/entity.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/extensions/text.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/__init__.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/counter.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/shared/vocabulary.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec/tensorfields/spec.py +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec.egg-info/dependency_links.txt +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/src/json2vec.egg-info/top_level.txt +0 -0
- {json2vec-0.4.4 → json2vec-0.4.6}/tests/test_callbacks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: json2vec
|
|
3
|
-
Version: 0.4.4
|
|
3
|
+
Version: 0.4.6
|
|
4
4
|
Summary: {...} -> [*]
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Requires-Python: >=3.12
|
|
@@ -27,16 +27,18 @@ Requires-Dist: pydantic-settings>=2.10.1; extra == "serving"
|
|
|
27
27
|
Provides-Extra: text
|
|
28
28
|
Requires-Dist: transformers>=4.55.0; extra == "text"
|
|
29
29
|
Provides-Extra: docs
|
|
30
|
+
Requires-Dist: litserve>=0.2.13; extra == "docs"
|
|
30
31
|
Requires-Dist: mkdocs-material>=9.6; extra == "docs"
|
|
31
32
|
Requires-Dist: mkdocs-jupyter>=0.26.3; extra == "docs"
|
|
32
33
|
Requires-Dist: mkdocstrings[python]>=0.27; extra == "docs"
|
|
34
|
+
Requires-Dist: pydantic-settings>=2.10.1; extra == "docs"
|
|
33
35
|
Dynamic: license-file
|
|
34
36
|
|
|
35
37
|
<p align="center">
|
|
36
|
-
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="
|
|
38
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="json2vec logo" width="180">
|
|
37
39
|
</p>
|
|
38
40
|
|
|
39
|
-
<h1 align="center">
|
|
41
|
+
<h1 align="center"><code>json2vec</code></h1>
|
|
40
42
|
|
|
41
43
|
<p align="center">
|
|
42
44
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
@@ -69,14 +71,14 @@ schemas, and checkpoints private.
|
|
|
69
71
|
- **Extensible data types for predictive modeling.** Masked values,
|
|
70
72
|
targeted fields, and explicit supervised targets all flow through the same
|
|
71
73
|
datatype-specific heads. A new
|
|
72
|
-
[tensorfield type](https://json2vec.github.io/json2vec/
|
|
74
|
+
[tensorfield type](https://json2vec.github.io/json2vec/data-types/tensorfields/) brings its own embedding,
|
|
73
75
|
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
74
76
|
grow.
|
|
75
77
|
- **Schema evolution is a first-class workflow.** Between training loops
|
|
76
78
|
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
77
79
|
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
78
80
|
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
79
|
-
See the [
|
|
81
|
+
See the [mutations guide](https://json2vec.github.io/json2vec/core-concepts/mutations/).
|
|
80
82
|
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
81
83
|
`valued` are distinct states in the tensorfield type system.
|
|
82
84
|
They are not collapsed into one generic missing-value bucket.
|
|
@@ -104,7 +106,7 @@ Use `json2vec` when the hierarchy is part of the signal:
|
|
|
104
106
|
multi-target prediction over nested records
|
|
105
107
|
|
|
106
108
|
For more context on the modeling problem, read
|
|
107
|
-
[Why
|
|
109
|
+
[Why `json2vec`](https://json2vec.github.io/json2vec/motivation/).
|
|
108
110
|
|
|
109
111
|
## What It Does Not Do
|
|
110
112
|
|
|
@@ -171,8 +173,8 @@ model = j2v.Model.from_schema(
|
|
|
171
173
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
172
174
|
)
|
|
173
175
|
|
|
174
|
-
datamodule = j2v.PolarsDataModule
|
|
175
|
-
model,
|
|
176
|
+
datamodule = j2v.PolarsDataModule(
|
|
177
|
+
model=model,
|
|
176
178
|
train=records,
|
|
177
179
|
validate=records,
|
|
178
180
|
num_workers=0,
|
|
@@ -195,14 +197,11 @@ trainer = lit.Trainer(
|
|
|
195
197
|
|
|
196
198
|
trainer.fit(model=model, datamodule=datamodule)
|
|
197
199
|
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
pprint(model.predict(batch))
|
|
201
|
-
pprint(model.embed(batch))
|
|
200
|
+
pprint(model.predict(records.to_dicts()[:3]))
|
|
202
201
|
```
|
|
203
202
|
|
|
204
|
-
The prediction call returns a typed result for `record/species
|
|
205
|
-
|
|
203
|
+
The prediction call returns a typed result for `record/species` and the
|
|
204
|
+
configured `record` embedding for each input observation.
|
|
206
205
|
|
|
207
206
|
## Documentation
|
|
208
207
|
|
|
@@ -216,16 +215,20 @@ uv run --extra docs mkdocs build --strict
|
|
|
216
215
|
Useful entry points:
|
|
217
216
|
|
|
218
217
|
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
219
|
-
- [
|
|
220
|
-
- [
|
|
221
|
-
- [
|
|
218
|
+
- [AI Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
|
|
219
|
+
- [Why `json2vec`](https://json2vec.github.io/json2vec/motivation/)
|
|
220
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
|
|
221
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
|
|
222
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
|
|
223
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
|
|
224
|
+
- [Mutations](https://json2vec.github.io/json2vec/core-concepts/mutations/)
|
|
222
225
|
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
223
|
-
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
224
226
|
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
227
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
225
228
|
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
226
|
-
- [Field
|
|
229
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
|
|
227
230
|
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
228
|
-
- [
|
|
231
|
+
- [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
229
232
|
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
230
233
|
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
231
234
|
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
@@ -292,7 +295,7 @@ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported k
|
|
|
292
295
|
|
|
293
296
|
Each tensorfield plugin provides a request schema plus the model components
|
|
294
297
|
needed to encode values, decode predictions, compute losses, and optionally
|
|
295
|
-
serialize outputs. See [
|
|
298
|
+
serialize outputs. See [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
296
299
|
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
297
300
|
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
298
301
|
`p_mask`, and `p_prune`.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="
|
|
2
|
+
<img src="https://json2vec.github.io/json2vec/diagrams/json2vec.png" alt="json2vec logo" width="180">
|
|
3
3
|
</p>
|
|
4
4
|
|
|
5
|
-
<h1 align="center">
|
|
5
|
+
<h1 align="center"><code>json2vec</code></h1>
|
|
6
6
|
|
|
7
7
|
<p align="center">
|
|
8
8
|
<img alt="Python 3.12+" src="https://img.shields.io/badge/python-3.12%2B-3776AB?logo=python&logoColor=white" />
|
|
@@ -35,14 +35,14 @@ schemas, and checkpoints private.
|
|
|
35
35
|
- **Extensible data types for predictive modeling.** Masked values,
|
|
36
36
|
targeted fields, and explicit supervised targets all flow through the same
|
|
37
37
|
datatype-specific heads. A new
|
|
38
|
-
[tensorfield type](https://json2vec.github.io/json2vec/
|
|
38
|
+
[tensorfield type](https://json2vec.github.io/json2vec/data-types/tensorfields/) brings its own embedding,
|
|
39
39
|
decoding, loss, and writing logic, so the framework stays reusable as schemas
|
|
40
40
|
grow.
|
|
41
41
|
- **Schema evolution is a first-class workflow.** Between training loops
|
|
42
42
|
(pretraining, finetuning, refitting, and task adaptation), the model can be
|
|
43
43
|
mutated. Fields can be added (`model.extend`), removed (`model.delete`),
|
|
44
44
|
updated (`model.update` / `with model.override`), and reset (`model.reset`).
|
|
45
|
-
See the [
|
|
45
|
+
See the [mutations guide](https://json2vec.github.io/json2vec/core-concepts/mutations/).
|
|
46
46
|
- **Production semantics for missingness.** `null`, `padded`, `masked`, and
|
|
47
47
|
`valued` are distinct states in the tensorfield type system.
|
|
48
48
|
They are not collapsed into one generic missing-value bucket.
|
|
@@ -70,7 +70,7 @@ Use `json2vec` when the hierarchy is part of the signal:
|
|
|
70
70
|
multi-target prediction over nested records
|
|
71
71
|
|
|
72
72
|
For more context on the modeling problem, read
|
|
73
|
-
[Why
|
|
73
|
+
[Why `json2vec`](https://json2vec.github.io/json2vec/motivation/).
|
|
74
74
|
|
|
75
75
|
## What It Does Not Do
|
|
76
76
|
|
|
@@ -137,8 +137,8 @@ model = j2v.Model.from_schema(
|
|
|
137
137
|
optimizer=lambda module: torch.optim.AdamW(module.parameters(), lr=1e-2),
|
|
138
138
|
)
|
|
139
139
|
|
|
140
|
-
datamodule = j2v.PolarsDataModule
|
|
141
|
-
model,
|
|
140
|
+
datamodule = j2v.PolarsDataModule(
|
|
141
|
+
model=model,
|
|
142
142
|
train=records,
|
|
143
143
|
validate=records,
|
|
144
144
|
num_workers=0,
|
|
@@ -161,14 +161,11 @@ trainer = lit.Trainer(
|
|
|
161
161
|
|
|
162
162
|
trainer.fit(model=model, datamodule=datamodule)
|
|
163
163
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
pprint(model.predict(batch))
|
|
167
|
-
pprint(model.embed(batch))
|
|
164
|
+
pprint(model.predict(records.to_dicts()[:3]))
|
|
168
165
|
```
|
|
169
166
|
|
|
170
|
-
The prediction call returns a typed result for `record/species
|
|
171
|
-
|
|
167
|
+
The prediction call returns a typed result for `record/species` and the
|
|
168
|
+
configured `record` embedding for each input observation.
|
|
172
169
|
|
|
173
170
|
## Documentation
|
|
174
171
|
|
|
@@ -182,16 +179,20 @@ uv run --extra docs mkdocs build --strict
|
|
|
182
179
|
Useful entry points:
|
|
183
180
|
|
|
184
181
|
- [Getting Started](https://json2vec.github.io/json2vec/getting-started/)
|
|
185
|
-
- [
|
|
186
|
-
- [
|
|
187
|
-
- [
|
|
182
|
+
- [AI Quickstart](https://json2vec.github.io/json2vec/ai-quickstart/)
|
|
183
|
+
- [Why `json2vec`](https://json2vec.github.io/json2vec/motivation/)
|
|
184
|
+
- [Query Paths](https://json2vec.github.io/json2vec/core-concepts/querypaths/)
|
|
185
|
+
- [Built-In Data Types](https://json2vec.github.io/json2vec/core-concepts/data-types/)
|
|
186
|
+
- [Learning Modes & Embeddings](https://json2vec.github.io/json2vec/core-concepts/embeddings/)
|
|
187
|
+
- [Model Tree](https://json2vec.github.io/json2vec/core-concepts/model-tree/)
|
|
188
|
+
- [Mutations](https://json2vec.github.io/json2vec/core-concepts/mutations/)
|
|
188
189
|
- [Hello World](https://json2vec.github.io/json2vec/tutorials/hello-world/)
|
|
189
|
-
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
190
190
|
- [Nested Supervised Training](https://json2vec.github.io/json2vec/tutorials/nested-supervised-training/)
|
|
191
|
+
- [Masked Pretraining](https://json2vec.github.io/json2vec/tutorials/pretraining/)
|
|
191
192
|
- [Supervised Tabular Training](https://json2vec.github.io/json2vec/tutorials/supervised-tabular-training/)
|
|
192
|
-
- [Field
|
|
193
|
+
- [Field Importance](https://json2vec.github.io/json2vec/guides/field-importance/)
|
|
193
194
|
- [Preprocessors](https://json2vec.github.io/json2vec/guides/preprocessors/)
|
|
194
|
-
- [
|
|
195
|
+
- [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
195
196
|
- [Serving](https://json2vec.github.io/json2vec/tutorials/serving/)
|
|
196
197
|
- [API Reference](https://json2vec.github.io/json2vec/reference/api/)
|
|
197
198
|
- [Whitepaper](https://json2vec.github.io/json2vec/whitepaper.pdf)
|
|
@@ -258,7 +259,7 @@ Configured `dataset.kwargs` are passed into the preprocessor, with unsupported k
|
|
|
258
259
|
|
|
259
260
|
Each tensorfield plugin provides a request schema plus the model components
|
|
260
261
|
needed to encode values, decode predictions, compute losses, and optionally
|
|
261
|
-
serialize outputs. See [
|
|
262
|
+
serialize outputs. See [Custom Data Types](https://json2vec.github.io/json2vec/data-types/tensorfields/)
|
|
262
263
|
for a custom plugin walkthrough. Built-in tensorfields share the base leaf
|
|
263
264
|
options `name`, `query`, `pooling`, `weight`, `n_heads`, `n_linear`, `dropout`,
|
|
264
265
|
`p_mask`, and `p_prune`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "json2vec"
|
|
3
|
-
version = "0.4.4"
|
|
3
|
+
version = "0.4.6"
|
|
4
4
|
description = "{...} -> [*]"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -31,9 +31,11 @@ text = [
|
|
|
31
31
|
"transformers>=4.55.0",
|
|
32
32
|
]
|
|
33
33
|
docs = [
|
|
34
|
+
"litserve>=0.2.13",
|
|
34
35
|
"mkdocs-material>=9.6",
|
|
35
36
|
"mkdocs-jupyter>=0.26.3",
|
|
36
37
|
"mkdocstrings[python]>=0.27",
|
|
38
|
+
"pydantic-settings>=2.10.1",
|
|
37
39
|
]
|
|
38
40
|
|
|
39
41
|
[dependency-groups]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Public
|
|
1
|
+
"""Public `json2vec` SDK surface.
|
|
2
2
|
|
|
3
3
|
The top-level package exports the constructors and helpers used by most
|
|
4
4
|
applications: `Model.from_schema(...)` for model construction, tensorfield
|
|
@@ -6,6 +6,8 @@ request constructors such as `Category` and `Number`, data modules, schema
|
|
|
6
6
|
mutation predicates, and the `@preprocess` decorator.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
+
from typing import TYPE_CHECKING, Any
|
|
10
|
+
|
|
9
11
|
from json2vec.architecture.root import (
|
|
10
12
|
Model,
|
|
11
13
|
MutationLockCallback,
|
|
@@ -15,6 +17,7 @@ from json2vec.architecture.root import (
|
|
|
15
17
|
SchedulerConfig,
|
|
16
18
|
)
|
|
17
19
|
from json2vec.data.datasets import PolarsDataModule, StreamingDataModule
|
|
20
|
+
from json2vec.inference.callback import Postprocessor, Writer
|
|
18
21
|
from json2vec.preprocessors import PREPROCESSORS, Preprocessor, PreprocessorMode, preprocess
|
|
19
22
|
from json2vec.structs.enums import AttentionMode, Component, Metric, ShardingStrategy, Strata, Suffix, TensorKey, Tokens
|
|
20
23
|
from json2vec.structs.experiment import (
|
|
@@ -37,20 +40,73 @@ from json2vec.tensorfields.extensions.text import Request as Text
|
|
|
37
40
|
from json2vec.tensorfields.extensions.vector import Request as Vector
|
|
38
41
|
from json2vec.tensorfields.shared.vocabulary import VocabularySyncCallback
|
|
39
42
|
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from json2vec.inference.deployment import (
|
|
45
|
+
API,
|
|
46
|
+
Accelerator,
|
|
47
|
+
BatchItem,
|
|
48
|
+
Deployment,
|
|
49
|
+
ErrorItem,
|
|
50
|
+
Input,
|
|
51
|
+
ModelSource,
|
|
52
|
+
UpdateOperation,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
_SERVING_EXPORTS = {
|
|
56
|
+
"API",
|
|
57
|
+
"Accelerator",
|
|
58
|
+
"BatchItem",
|
|
59
|
+
"Deployment",
|
|
60
|
+
"ErrorItem",
|
|
61
|
+
"Input",
|
|
62
|
+
"ModelSource",
|
|
63
|
+
"UpdateOperation",
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def __getattr__(name: str) -> Any:
|
|
68
|
+
if name not in _SERVING_EXPORTS:
|
|
69
|
+
raise AttributeError(f"module 'json2vec' has no attribute {name!r}")
|
|
70
|
+
|
|
71
|
+
try:
|
|
72
|
+
from json2vec.inference import deployment
|
|
73
|
+
except ModuleNotFoundError as error:
|
|
74
|
+
if error.name in {"litserve", "pydantic_settings"}:
|
|
75
|
+
raise ModuleNotFoundError(
|
|
76
|
+
f"json2vec.{name} requires the serving extra; install with `pip install json2vec[serving]`."
|
|
77
|
+
) from error
|
|
78
|
+
raise
|
|
79
|
+
|
|
80
|
+
value = getattr(deployment, name)
|
|
81
|
+
globals()[name] = value
|
|
82
|
+
return value
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def __dir__() -> list[str]:
|
|
86
|
+
return sorted([*globals(), *_SERVING_EXPORTS])
|
|
87
|
+
|
|
88
|
+
|
|
40
89
|
__all__ = [
|
|
41
90
|
"Address",
|
|
91
|
+
"API",
|
|
92
|
+
"Accelerator",
|
|
42
93
|
"Array",
|
|
43
94
|
"AttentionMode",
|
|
95
|
+
"BatchItem",
|
|
44
96
|
"Category",
|
|
45
97
|
"Component",
|
|
46
98
|
"DateParts",
|
|
47
99
|
"DecoderBase",
|
|
100
|
+
"Deployment",
|
|
48
101
|
"EmbedderBase",
|
|
49
102
|
"Entity",
|
|
103
|
+
"ErrorItem",
|
|
50
104
|
"Hyperparameters",
|
|
105
|
+
"Input",
|
|
51
106
|
"Leaf",
|
|
52
107
|
"Metric",
|
|
53
108
|
"Model",
|
|
109
|
+
"ModelSource",
|
|
54
110
|
"MutationLockCallback",
|
|
55
111
|
"NodeAttribute",
|
|
56
112
|
"NodePredicate",
|
|
@@ -59,6 +115,7 @@ __all__ = [
|
|
|
59
115
|
"PREPROCESSORS",
|
|
60
116
|
"Plugin",
|
|
61
117
|
"PolarsDataModule",
|
|
118
|
+
"Postprocessor",
|
|
62
119
|
"Preprocessor",
|
|
63
120
|
"PreprocessorMode",
|
|
64
121
|
"RequestBase",
|
|
@@ -76,8 +133,10 @@ __all__ = [
|
|
|
76
133
|
"TensorKey",
|
|
77
134
|
"Text",
|
|
78
135
|
"Tokens",
|
|
136
|
+
"UpdateOperation",
|
|
79
137
|
"Vector",
|
|
80
138
|
"VocabularySyncCallback",
|
|
139
|
+
"Writer",
|
|
81
140
|
"predicate",
|
|
82
141
|
"preprocess",
|
|
83
142
|
"where",
|
|
@@ -93,7 +93,7 @@ def sanitize(
|
|
|
93
93
|
require_core_tensors(module, address, tensorfield)
|
|
94
94
|
require_tensor_devices(module, address, tensorfield)
|
|
95
95
|
require_target_contract(module, address, tensorfield, strata=normalized)
|
|
96
|
-
require_mask_contract(module, address, tensorfield)
|
|
96
|
+
require_mask_contract(module, address, tensorfield, strata=normalized)
|
|
97
97
|
|
|
98
98
|
|
|
99
99
|
def is_backoff_index(index: int, *, periodic_interval: int) -> bool:
|
|
@@ -292,7 +292,7 @@ def require_tensor_devices(module: "Model", address: Address, tensorfield: Tenso
|
|
|
292
292
|
)
|
|
293
293
|
|
|
294
294
|
|
|
295
|
-
def require_mask_contract(module: "Model", address: Address, tensorfield: TensorFieldBase) -> None:
|
|
295
|
+
def require_mask_contract(module: "Model", address: Address, tensorfield: TensorFieldBase, *, strata: Strata) -> None:
|
|
296
296
|
state = tensorfield.state
|
|
297
297
|
trainable = tensorfield.trainable
|
|
298
298
|
is_masked = state.eq(Tokens.masked.value)
|
|
@@ -301,7 +301,7 @@ def require_mask_contract(module: "Model", address: Address, tensorfield: Tensor
|
|
|
301
301
|
if trainable.any() and not state.masked_select(trainable).eq(Tokens.masked.value).all():
|
|
302
302
|
raise ForwardContractError(f"forward input '{address}' trainable positions must have masked state")
|
|
303
303
|
|
|
304
|
-
if not is_target and (is_masked & ~trainable).any():
|
|
304
|
+
if strata != Strata.predict and not is_target and (is_masked & ~trainable).any():
|
|
305
305
|
raise ForwardContractError(f"forward input '{address}' has masked state where trainable is false")
|
|
306
306
|
|
|
307
307
|
if not trainable.any():
|
|
@@ -8,6 +8,8 @@ from typing import TYPE_CHECKING
|
|
|
8
8
|
import torch
|
|
9
9
|
|
|
10
10
|
from json2vec.architecture.node import NodeModule
|
|
11
|
+
from json2vec.data.datasets.base import EncodedInput
|
|
12
|
+
from json2vec.structs.enums import Strata
|
|
11
13
|
from json2vec.structs.experiment import Hyperparameters
|
|
12
14
|
from json2vec.structs.tree import Address, Node
|
|
13
15
|
|
|
@@ -19,9 +21,19 @@ class ModelGraph:
|
|
|
19
21
|
"""Build and rebuild runtime modules from schema hyperparameters."""
|
|
20
22
|
|
|
21
23
|
@staticmethod
|
|
22
|
-
def
|
|
24
|
+
def example_forward_kwargs(hyperparameters: Hyperparameters, batch_size: int) -> dict[str, EncodedInput | Strata]:
|
|
23
25
|
from json2vec.data.iterables import mock
|
|
24
26
|
|
|
27
|
+
return {
|
|
28
|
+
"inputs": mock(hyperparameters=hyperparameters, batch_size=batch_size),
|
|
29
|
+
"strata": Strata.predict,
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def build(
|
|
34
|
+
hyperparameters: Hyperparameters,
|
|
35
|
+
batch_size: int,
|
|
36
|
+
) -> tuple[torch.nn.ModuleDict, dict[str, EncodedInput | Strata]]:
|
|
25
37
|
nodes: torch.nn.ModuleDict[str, NodeModule] = torch.nn.ModuleDict()
|
|
26
38
|
|
|
27
39
|
for address in hyperparameters.requests | hyperparameters.arrays:
|
|
@@ -31,7 +43,7 @@ class ModelGraph:
|
|
|
31
43
|
batch_size=batch_size,
|
|
32
44
|
)
|
|
33
45
|
|
|
34
|
-
return nodes,
|
|
46
|
+
return nodes, ModelGraph.example_forward_kwargs(hyperparameters=hyperparameters, batch_size=batch_size)
|
|
35
47
|
|
|
36
48
|
@staticmethod
|
|
37
49
|
def install(module: "Model") -> None:
|
|
@@ -72,8 +84,6 @@ class ModelGraph:
|
|
|
72
84
|
|
|
73
85
|
@staticmethod
|
|
74
86
|
def reset_selected(module: "Model", selected: list[Node], *, descendants: bool = False) -> None:
|
|
75
|
-
from json2vec.data.iterables import mock
|
|
76
|
-
|
|
77
87
|
selected_by_address: dict[Address, Node] = {}
|
|
78
88
|
for node in selected:
|
|
79
89
|
if node.address in module.nodes:
|
|
@@ -94,7 +104,10 @@ class ModelGraph:
|
|
|
94
104
|
batch_size=module.batch_size,
|
|
95
105
|
)
|
|
96
106
|
|
|
97
|
-
module.example_input_array =
|
|
107
|
+
module.example_input_array = ModelGraph.example_forward_kwargs(
|
|
108
|
+
hyperparameters=module.hyperparameters,
|
|
109
|
+
batch_size=module.batch_size,
|
|
110
|
+
)
|
|
98
111
|
device = module.device
|
|
99
112
|
if isinstance(device, torch.device):
|
|
100
113
|
module.to(device=device)
|