spacy-accelerate 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spacy_accelerate-0.3.0/.claude/settings.local.json +11 -0
- spacy_accelerate-0.3.0/.dockerignore +20 -0
- spacy_accelerate-0.3.0/.gitignore +88 -0
- spacy_accelerate-0.3.0/LICENSE +21 -0
- spacy_accelerate-0.3.0/Makefile +52 -0
- spacy_accelerate-0.3.0/PKG-INFO +341 -0
- spacy_accelerate-0.3.0/README.md +296 -0
- spacy_accelerate-0.3.0/benchmarks/README.md +109 -0
- spacy_accelerate-0.3.0/benchmarks/benchmark.py +806 -0
- spacy_accelerate-0.3.0/benchmarks/check_tensorrt.py +128 -0
- spacy_accelerate-0.3.0/benchmarks/dataset_loader.py +394 -0
- spacy_accelerate-0.3.0/benchmarks/docker/Dockerfile +31 -0
- spacy_accelerate-0.3.0/benchmarks/docker/run.sh +21 -0
- spacy_accelerate-0.3.0/benchmarks/requirements.txt +28 -0
- spacy_accelerate-0.3.0/docs/article.html +1385 -0
- spacy_accelerate-0.3.0/docs/article.md +479 -0
- spacy_accelerate-0.3.0/docs/article_en.html +1481 -0
- spacy_accelerate-0.3.0/docs/article_en.md +442 -0
- spacy_accelerate-0.3.0/pyproject.toml +101 -0
- spacy_accelerate-0.3.0/requirements.txt +36 -0
- spacy_accelerate-0.3.0/spacy_accelerate/__init__.py +35 -0
- spacy_accelerate-0.3.0/spacy_accelerate/__main__.py +95 -0
- spacy_accelerate-0.3.0/spacy_accelerate/_logging.py +44 -0
- spacy_accelerate-0.3.0/spacy_accelerate/_version.py +2 -0
- spacy_accelerate-0.3.0/spacy_accelerate/api.py +292 -0
- spacy_accelerate-0.3.0/spacy_accelerate/cache/__init__.py +5 -0
- spacy_accelerate-0.3.0/spacy_accelerate/cache/manager.py +262 -0
- spacy_accelerate-0.3.0/spacy_accelerate/config.py +150 -0
- spacy_accelerate-0.3.0/spacy_accelerate/conversion/__init__.py +5 -0
- spacy_accelerate-0.3.0/spacy_accelerate/conversion/exporter.py +254 -0
- spacy_accelerate-0.3.0/spacy_accelerate/conversion/fp16_converter.py +80 -0
- spacy_accelerate-0.3.0/spacy_accelerate/conversion/weight_mapper.py +268 -0
- spacy_accelerate-0.3.0/spacy_accelerate/core/__init__.py +12 -0
- spacy_accelerate-0.3.0/spacy_accelerate/core/discovery.py +143 -0
- spacy_accelerate-0.3.0/spacy_accelerate/core/patcher.py +55 -0
- spacy_accelerate-0.3.0/spacy_accelerate/core/validation.py +40 -0
- spacy_accelerate-0.3.0/spacy_accelerate/exceptions.py +37 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/__init__.py +9 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/cpu_proxy.py +133 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/io_binding_proxy.py +369 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/ort_proxy.py +142 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/providers.py +264 -0
- spacy_accelerate-0.3.0/spacy_accelerate/runtime/proxy_base.py +115 -0
- spacy_accelerate-0.3.0/tests/__init__.py +1 -0
- spacy_accelerate-0.3.0/tests/conftest.py +66 -0
- spacy_accelerate-0.3.0/tests/test_api.py +207 -0
- spacy_accelerate-0.3.0/tests/test_cache.py +134 -0
- spacy_accelerate-0.3.0/tests/test_discovery.py +36 -0
- spacy_accelerate-0.3.0/tests/test_patcher.py +45 -0
- spacy_accelerate-0.3.0/tests/test_providers.py +33 -0
- spacy_accelerate-0.3.0/tests/test_validation.py +49 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
|
|
27
|
+
# PyInstaller
|
|
28
|
+
*.manifest
|
|
29
|
+
*.spec
|
|
30
|
+
|
|
31
|
+
# Installer logs
|
|
32
|
+
pip-log.txt
|
|
33
|
+
pip-delete-this-directory.txt
|
|
34
|
+
|
|
35
|
+
# Unit test / coverage reports
|
|
36
|
+
htmlcov/
|
|
37
|
+
.tox/
|
|
38
|
+
.nox/
|
|
39
|
+
.coverage
|
|
40
|
+
.coverage.*
|
|
41
|
+
.cache
|
|
42
|
+
nosetests.xml
|
|
43
|
+
coverage.xml
|
|
44
|
+
*.cover
|
|
45
|
+
*.py,cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDE
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
|
|
69
|
+
# OS
|
|
70
|
+
.DS_Store
|
|
71
|
+
Thumbs.db
|
|
72
|
+
|
|
73
|
+
# ONNX models
|
|
74
|
+
*.onnx
|
|
75
|
+
|
|
76
|
+
# TensorRT engines
|
|
77
|
+
*.engine
|
|
78
|
+
*.plan
|
|
79
|
+
trt_engines/
|
|
80
|
+
|
|
81
|
+
# Cache
|
|
82
|
+
.cache/
|
|
83
|
+
|
|
84
|
+
# PyPI
|
|
85
|
+
.pypirc
|
|
86
|
+
|
|
87
|
+
# Benchmark artifacts
|
|
88
|
+
artifacts/
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Siarhei Niaverau
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
IMAGE_NAME ?= spacy-accelerate-bench
|
|
2
|
+
ARTIFACTS_DIR ?= $(CURDIR)/artifacts/benchmarks/docker
|
|
3
|
+
DOCKERFILE ?= benchmarks/docker/Dockerfile
|
|
4
|
+
MODELS ?=
|
|
5
|
+
BENCHMARK_ARGS ?=
|
|
6
|
+
PYTHON ?= .venv/bin/python
|
|
7
|
+
TEST_PYPI_REPOSITORY_URL ?= https://test.pypi.org/legacy/
|
|
8
|
+
|
|
9
|
+
.PHONY: build benchmark benchmark-full benchmark-ner-only run docker-build docker-benchmark docker-benchmark-full docker-benchmark-ner-only docker-benchmark-shell package package-check publish-testpypi
|
|
10
|
+
|
|
11
|
+
docker-build:
|
|
12
|
+
docker build -f $(DOCKERFILE) -t $(IMAGE_NAME) .
|
|
13
|
+
|
|
14
|
+
docker-benchmark:
|
|
15
|
+
mkdir -p $(ARTIFACTS_DIR)
|
|
16
|
+
docker run --rm --gpus all \
|
|
17
|
+
-v "$(ARTIFACTS_DIR):/artifacts" \
|
|
18
|
+
$(IMAGE_NAME) \
|
|
19
|
+
$(if $(strip $(MODELS)),--models $(MODELS),) \
|
|
20
|
+
$(BENCHMARK_ARGS)
|
|
21
|
+
|
|
22
|
+
docker-benchmark-full:
|
|
23
|
+
$(MAKE) docker-benchmark BENCHMARK_ARGS="$(BENCHMARK_ARGS)"
|
|
24
|
+
|
|
25
|
+
docker-benchmark-ner-only:
|
|
26
|
+
$(MAKE) docker-benchmark BENCHMARK_ARGS="--ner-only $(BENCHMARK_ARGS)"
|
|
27
|
+
|
|
28
|
+
docker-benchmark-shell:
|
|
29
|
+
mkdir -p $(ARTIFACTS_DIR)
|
|
30
|
+
docker run --rm -it --gpus all --entrypoint bash \
|
|
31
|
+
-v "$(ARTIFACTS_DIR):/artifacts" \
|
|
32
|
+
$(IMAGE_NAME)
|
|
33
|
+
|
|
34
|
+
build: docker-build
|
|
35
|
+
|
|
36
|
+
benchmark: docker-benchmark-full
|
|
37
|
+
|
|
38
|
+
benchmark-full: docker-benchmark-full
|
|
39
|
+
|
|
40
|
+
benchmark-ner-only: docker-benchmark-ner-only
|
|
41
|
+
|
|
42
|
+
run: docker-benchmark-full
|
|
43
|
+
|
|
44
|
+
package:
|
|
45
|
+
rm -rf dist/
|
|
46
|
+
$(PYTHON) -m build
|
|
47
|
+
|
|
48
|
+
package-check: package
|
|
49
|
+
$(PYTHON) -m twine check dist/*
|
|
50
|
+
|
|
51
|
+
publish-testpypi: package-check
|
|
52
|
+
$(PYTHON) -m twine upload --repository-url $(TEST_PYPI_REPOSITORY_URL) dist/*
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spacy-accelerate
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Accelerate spaCy transformers with TensorRT/ONNX Runtime
|
|
5
|
+
Project-URL: Homepage, https://github.com/nesergey/spacy-accelerate
|
|
6
|
+
Project-URL: Documentation, https://github.com/nesergey/spacy-accelerate#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/nesergey/spacy-accelerate
|
|
8
|
+
Project-URL: Issues, https://github.com/nesergey/spacy-accelerate/issues
|
|
9
|
+
Author-email: Siarhei Niaverau <nesergey@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: acceleration,nlp,onnx,spacy,tensorrt,transformer
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Requires-Dist: cupy-cuda12x==13.6.0
|
|
24
|
+
Requires-Dist: numpy==2.4.1
|
|
25
|
+
Requires-Dist: onnx==1.20.1
|
|
26
|
+
Requires-Dist: onnxruntime-gpu==1.23.2
|
|
27
|
+
Requires-Dist: onnxscript<0.2.0,>=0.1.0
|
|
28
|
+
Requires-Dist: spacy-transformers==1.3.9
|
|
29
|
+
Requires-Dist: spacy==3.8.2
|
|
30
|
+
Requires-Dist: tensorrt-cu12-bindings==10.15.1.29
|
|
31
|
+
Requires-Dist: tensorrt-cu12-libs==10.15.1.29
|
|
32
|
+
Requires-Dist: tensorrt-cu12==10.15.1.29
|
|
33
|
+
Requires-Dist: tensorrt==10.15.1.29
|
|
34
|
+
Requires-Dist: thinc==8.3.10
|
|
35
|
+
Requires-Dist: torch==2.5.1
|
|
36
|
+
Requires-Dist: transformers==4.41.2
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: black>=24.0.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: datasets>=2.0.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: ruff>=0.3.0; extra == 'dev'
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# spacy-accelerate
|
|
47
|
+
|
|
48
|
+
Accelerate spaCy transformers with TensorRT/ONNX Runtime. Drop-in replacement for transformer-based spaCy pipelines with Docker-verified GPU benchmark workflows.
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
`spacy-accelerate` depends on a CUDA/TensorRT stack that must stay version-aligned.
|
|
53
|
+
The two failure modes we hit in practice were:
|
|
54
|
+
|
|
55
|
+
- a second dependency resolution pass upgrading parts of the stack to different CUDA majors;
|
|
56
|
+
- CUDA/TensorRT shared libraries from pip wheels not being visible to CuPy / ONNX Runtime.
|
|
57
|
+
|
|
58
|
+
The package now pins the runtime versions in `pyproject.toml`, and it configures
|
|
59
|
+
the pip-installed native libraries automatically on import.
|
|
60
|
+
|
|
61
|
+
Benchmark Docker files live under `benchmarks/docker/`, and canonical benchmark
|
|
62
|
+
artifacts are saved under `artifacts/benchmarks/docker/`. The root
|
|
63
|
+
`.dockerignore` is kept at repository level because Docker build context
|
|
64
|
+
filtering applies to the whole repo root.
|
|
65
|
+
|
|
66
|
+
### PyPI install
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install spacy-accelerate
|
|
70
|
+
pip install --force-reinstall \
|
|
71
|
+
--extra-index-url https://pypi.nvidia.com \
|
|
72
|
+
onnxruntime-gpu==1.23.2
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
The second command is still required to guarantee the TensorRT-enabled
|
|
76
|
+
`onnxruntime-gpu` build from NVIDIA.
|
|
77
|
+
|
|
78
|
+
### Source / editable install
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install -r requirements.txt
|
|
82
|
+
pip install -e . --no-deps
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Do not run plain `pip install -e .` after that. It can trigger a second resolver
|
|
86
|
+
pass and replace the pinned CUDA 12 stack with newer incompatible packages.
|
|
87
|
+
|
|
88
|
+
**Verify the installation:**
|
|
89
|
+
```bash
|
|
90
|
+
python -m spacy_accelerate
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
You should see `TensorRT EP : OK` and `CUDA EP : OK` in the output.
|
|
94
|
+
|
|
95
|
+
**Requirements:**
|
|
96
|
+
- Python 3.11+
|
|
97
|
+
- CUDA 12.x
|
|
98
|
+
- NVIDIA GPU with TensorRT support (Ampere / Ada Lovelace recommended)
|
|
99
|
+
- spaCy 3.8+ with spacy-transformers
|
|
100
|
+
|
|
101
|
+
## Quick Start
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
import spacy
|
|
105
|
+
import spacy_accelerate
|
|
106
|
+
|
|
107
|
+
# Load your spaCy transformer model
|
|
108
|
+
nlp = spacy.load("en_core_web_trf")
|
|
109
|
+
|
|
110
|
+
# Optimize with one line!
|
|
111
|
+
nlp = spacy_accelerate.optimize(nlp, precision="fp16")
|
|
112
|
+
|
|
113
|
+
# Use as normal - same API, faster inference
|
|
114
|
+
doc = nlp("Apple Inc. was founded by Steve Jobs in Cupertino.")
|
|
115
|
+
print([(ent.text, ent.label_) for ent in doc.ents])
|
|
116
|
+
# [('Apple Inc.', 'ORG'), ('Steve Jobs', 'PERSON'), ('Cupertino', 'GPE')]
|
|
117
|
+
|
|
118
|
+
# Batch processing works too
|
|
119
|
+
texts = ["Text one.", "Text two.", "Text three."]
|
|
120
|
+
docs = list(nlp.pipe(texts, batch_size=32))
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## API Reference
|
|
124
|
+
|
|
125
|
+
### `optimize(nlp, **kwargs)`
|
|
126
|
+
|
|
127
|
+
Optimize a spaCy transformer pipeline with ONNX Runtime / TensorRT.
|
|
128
|
+
|
|
129
|
+
**Parameters:**
|
|
130
|
+
|
|
131
|
+
| Parameter | Type | Default | Description |
|
|
132
|
+
|-----------|------|---------|-------------|
|
|
133
|
+
| `nlp` | `spacy.Language` | required | spaCy pipeline with transformer |
|
|
134
|
+
| `precision` | `"fp32"` \| `"fp16"` | `"fp16"` | Model precision |
|
|
135
|
+
| `provider` | `"tensorrt"` \| `"cuda"` \| `"cpu"` | `"cuda"` | Execution provider |
|
|
136
|
+
| `cache_dir` | `Path` \| `str` | `~/.cache/spacy-accelerate` | ONNX model cache directory |
|
|
137
|
+
| `warmup` | `bool` | `True` | Run warmup inference |
|
|
138
|
+
| `device_id` | `int` | `0` | CUDA device ID |
|
|
139
|
+
| `max_batch_size` | `int` | `128` | Max batch size for IO Binding |
|
|
140
|
+
| `max_seq_length` | `int` | `512` | Max sequence length for IO Binding |
|
|
141
|
+
| `use_io_binding` | `bool` | `True` | Use zero-copy IO Binding |
|
|
142
|
+
| `verbose` | `bool` | `False` | Enable verbose logging |
|
|
143
|
+
|
|
144
|
+
**TensorRT-specific parameters:**
|
|
145
|
+
|
|
146
|
+
| Parameter | Type | Default | Description |
|
|
147
|
+
|-----------|------|---------|-------------|
|
|
148
|
+
| `trt_max_workspace_size` | `int` | `4GB` | TensorRT workspace size |
|
|
149
|
+
| `trt_builder_optimization_level` | `int` | `3` | Optimization level (0-5) |
|
|
150
|
+
| `trt_timing_cache` | `bool` | `True` | Enable timing cache |
|
|
151
|
+
|
|
152
|
+
**Returns:** The optimized `spacy.Language` object (modified in-place).
|
|
153
|
+
|
|
154
|
+
### Cache Management
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import spacy_accelerate
|
|
158
|
+
|
|
159
|
+
# List cached models
|
|
160
|
+
cached = spacy_accelerate.list_cached()
|
|
161
|
+
print(f"Cached models: {cached}")
|
|
162
|
+
|
|
163
|
+
# Get cache size
|
|
164
|
+
size_bytes = spacy_accelerate.get_cache_size()
|
|
165
|
+
print(f"Cache size: {size_bytes / 1024**2:.1f} MB")
|
|
166
|
+
|
|
167
|
+
# Clear cache
|
|
168
|
+
cleared = spacy_accelerate.clear_cache()
|
|
169
|
+
print(f"Cleared {cleared} cache entries")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Performance
|
|
173
|
+
|
|
174
|
+
Canonical benchmark results are the Docker runs under [artifacts/benchmarks/docker](artifacts/benchmarks/docker).
|
|
175
|
+
|
|
176
|
+
Benchmark commands and runner details are maintained in [benchmarks/README.md](benchmarks/README.md).
|
|
177
|
+
|
|
178
|
+
Latest full-pipeline Docker measurement for `en_core_web_trf` on **NVIDIA RTX 4000 SFF Ada Generation**, **CoNLL-2003** test set, `batch_size=128`, `1` discarded prime pass and `3` measured passes averaged:
|
|
179
|
+
|
|
180
|
+
| Execution Provider | Speed (WPS) | Speedup vs PyTorch | Accuracy |
|
|
181
|
+
|--------------------|-------------|--------------------|----------|
|
|
182
|
+
| PyTorch Baseline (FP32) | 6,241 | 1.00x | 100.00% |
|
|
183
|
+
| PyTorch Baseline (FP16) | 6,166 | 0.99x | 100.00% |
|
|
184
|
+
| CUDA FP32 | 9,910 | 1.59x | 99.90% |
|
|
185
|
+
| CUDA FP16 | 15,763 | 2.53x | 99.75% |
|
|
186
|
+
| TensorRT FP32 | 10,552 | 1.69x | 99.95% |
|
|
187
|
+
| **TensorRT FP16** | **16,935** | **2.71x** | **99.50%** |
|
|
188
|
+
|
|
189
|
+
Latest Docker NER-only measurement for `en_core_web_trf` with `tagger`, `parser`, `attribute_ruler`, and `lemmatizer` disabled:
|
|
190
|
+
|
|
191
|
+
| Execution Provider | Speed (WPS) | Speedup vs PyTorch | Accuracy |
|
|
192
|
+
|--------------------|-------------|--------------------|----------|
|
|
193
|
+
| PyTorch Baseline (FP32) | 7,066 | 1.00x | 100.00% |
|
|
194
|
+
| PyTorch Baseline (FP16) | 6,859 | 0.97x | 100.00% |
|
|
195
|
+
| CUDA FP32 | 11,972 | 1.69x | 99.90% |
|
|
196
|
+
| CUDA FP16 | 22,394 | 3.17x | 99.75% |
|
|
197
|
+
| TensorRT FP32 | 13,138 | 1.86x | 99.95% |
|
|
198
|
+
| **TensorRT FP16** | **24,823** | **3.51x** | **99.65%** |
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
## Examples
|
|
202
|
+
|
|
203
|
+
### Using TensorRT for Maximum Performance
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
import spacy
|
|
207
|
+
import spacy_accelerate
|
|
208
|
+
|
|
209
|
+
nlp = spacy.load("en_core_web_trf")
|
|
210
|
+
|
|
211
|
+
nlp = spacy_accelerate.optimize(
|
|
212
|
+
nlp,
|
|
213
|
+
provider="tensorrt",
|
|
214
|
+
precision="fp16",
|
|
215
|
+
trt_max_workspace_size=8 * 1024**3, # 8GB
|
|
216
|
+
trt_builder_optimization_level=5, # Maximum optimization
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
# First inference builds TensorRT engine (cached for subsequent runs)
|
|
220
|
+
doc = nlp("TensorRT provides maximum inference speed.")
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
### Custom Cache Directory
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
import spacy
|
|
229
|
+
import spacy_accelerate
|
|
230
|
+
|
|
231
|
+
nlp = spacy.load("en_core_web_trf")
|
|
232
|
+
|
|
233
|
+
nlp = spacy_accelerate.optimize(
|
|
234
|
+
nlp,
|
|
235
|
+
cache_dir="/path/to/custom/cache",
|
|
236
|
+
precision="fp16",
|
|
237
|
+
)
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Verbose Mode for Debugging
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
import spacy
|
|
244
|
+
import spacy_accelerate
|
|
245
|
+
|
|
246
|
+
nlp = spacy.load("en_core_web_trf")
|
|
247
|
+
|
|
248
|
+
nlp = spacy_accelerate.optimize(
|
|
249
|
+
nlp,
|
|
250
|
+
verbose=True, # Print detailed logs
|
|
251
|
+
)
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
## Supported Models
|
|
255
|
+
|
|
256
|
+
Right now the confirmed spaCy model support is:
|
|
257
|
+
|
|
258
|
+
- `en_core_web_trf`
|
|
259
|
+
|
|
260
|
+
The earlier wording here listed transformer architecture families, not actual
|
|
261
|
+
published spaCy package names. Internally, the exporter and architecture
|
|
262
|
+
detection logic currently target curated-transformer / RoBERTa-style backbones,
|
|
263
|
+
with partial code paths for BERT and XLM-RoBERTa families, but those are not yet
|
|
264
|
+
claimed here as generally supported spaCy packages.
|
|
265
|
+
|
|
266
|
+
## How It Works
|
|
267
|
+
|
|
268
|
+
1. **Weight Mapping**: Extracts transformer weights from spaCy's internal format and maps them to HuggingFace format.
|
|
269
|
+
|
|
270
|
+
2. **ONNX Export**: Exports the mapped model to ONNX format with dynamic batch and sequence dimensions.
|
|
271
|
+
|
|
272
|
+
3. **FP16 Optimization** (optional): Applies BERT-style optimizations and converts to FP16 for faster inference.
|
|
273
|
+
|
|
274
|
+
4. **Runtime Patching**: Replaces the PyTorch transformer with an ONNX Runtime proxy that provides the same interface.
|
|
275
|
+
|
|
276
|
+
5. **Caching**: Converted models are cached to avoid re-conversion on subsequent loads.
|
|
277
|
+
|
|
278
|
+
## Troubleshooting
|
|
279
|
+
|
|
280
|
+
### TensorRT provider not available
|
|
281
|
+
|
|
282
|
+
Run the diagnostic tool first:
|
|
283
|
+
```bash
|
|
284
|
+
python -m spacy_accelerate
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
If you see `TensorRT EP : MISSING`, the NVIDIA build of onnxruntime-gpu is not installed.
|
|
288
|
+
Fix with step 2 from the installation instructions:
|
|
289
|
+
```bash
|
|
290
|
+
pip install --force-reinstall \
|
|
291
|
+
--extra-index-url https://pypi.nvidia.com \
|
|
292
|
+
onnxruntime-gpu==1.23.2
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### libnvinfer.so / libcublas.so / libcublasLt.so not found
|
|
296
|
+
|
|
297
|
+
If you see errors like `libnvinfer.so.10`, `libcublas.so.12`, or
|
|
298
|
+
`libcublasLt.so.12: cannot open shared object file`:
|
|
299
|
+
|
|
300
|
+
**Automatic fix:** `spacy-accelerate` automatically configures both TensorRT
|
|
301
|
+
libraries and the CUDA libraries installed under `site-packages/nvidia/*/lib`.
|
|
302
|
+
Import `spacy_accelerate` before creating ONNX Runtime sessions or calling
|
|
303
|
+
`spacy.require_gpu()`.
|
|
304
|
+
|
|
305
|
+
**Manual fix:** If the automatic configuration doesn't work (e.g., running scripts directly):
|
|
306
|
+
```bash
|
|
307
|
+
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
|
|
308
|
+
export LD_LIBRARY_PATH="$SITE_PACKAGES/tensorrt_libs:$SITE_PACKAGES/nvidia/cublas/lib:$SITE_PACKAGES/nvidia/cuda_runtime/lib:$SITE_PACKAGES/nvidia/cudnn/lib:$LD_LIBRARY_PATH"
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
### CUDA out of memory
|
|
312
|
+
|
|
313
|
+
Reduce workspace size or batch size:
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
nlp = spacy_accelerate.optimize(
|
|
317
|
+
nlp,
|
|
318
|
+
trt_max_workspace_size=2 * 1024**3, # 2GB instead of 4GB
|
|
319
|
+
max_batch_size=16, # Smaller batches
|
|
320
|
+
)
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### First inference is slow
|
|
324
|
+
|
|
325
|
+
TensorRT builds optimized engines on first run. Enable caching:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
nlp = spacy_accelerate.optimize(
|
|
329
|
+
nlp,
|
|
330
|
+
provider="tensorrt",
|
|
331
|
+
trt_timing_cache=True, # Cache timing data
|
|
332
|
+
)
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
## License
|
|
336
|
+
|
|
337
|
+
MIT License
|
|
338
|
+
|
|
339
|
+
## Contributing
|
|
340
|
+
|
|
341
|
+
Contributions are welcome! Please open an issue or submit a pull request.
|