llguidance 0.7.23.tar.gz → 0.7.24.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llguidance-0.7.23 → llguidance-0.7.24}/CHANGELOG.md +4 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/Cargo.lock +5 -5
- {llguidance-0.7.23 → llguidance-0.7.24}/PKG-INFO +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/Cargo.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/pyproject.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/Cargo.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/lib.rs +1 -0
- llguidance-0.7.24/python_ext/src/llamatokenizer.rs +169 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/py.rs +4 -87
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/Cargo.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_downloader/Cargo.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_tokenizers/Cargo.toml +1 -1
- {llguidance-0.7.23 → llguidance-0.7.24}/.github/workflows/rust.yml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/.github/workflows/wheels.yml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/.gitignore +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/CODE_OF_CONDUCT.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/Cargo.toml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/LICENSE +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/SECURITY.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/SUPPORT.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/c_sample/Makefile +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/c_sample/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/c_sample/c_sample.cpp +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/fast_forward.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/json_schema.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/mask_plot.png +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/optimizations.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/special_tokens.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/syntax.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/docs/toktrie.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/Cargo.toml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/expected_maskbench.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/jstats.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/scripts/split-stats.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/scripts/split_plot.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/src/json_stats.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/src/lib.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/json_stats/src/stats.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/LICENSE +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/build.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/cbindgen.toml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/grammars/character.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/grammars/json.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/llguidance.h +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/api.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/constraint.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/from_guidance.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/grammar.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/lexer.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/lexerspec.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/mod.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/parser.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/perf.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/regexvec.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/earley/slicer.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/factory.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/ffi.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/ffi_par.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/grammar_builder.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/compiler.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/context_ref.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/context_simple/context.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/context_simple/draft.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/context_simple/mod.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/formats.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/mod.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/numeric.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/schema.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json/shared_context.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/json_validation.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/ast.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/common.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/compiler.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/lexer.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/mod.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lark/parser.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/lib.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/logging.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/matcher.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/output.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/panic_utils.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/regex_rewrite.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/stop_controller.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/substring.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/tokenizer_json.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/parser/src/tokenparser.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/plan.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/__init__.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/_grammar_from.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/_lib.pyi +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/_struct_tag.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/_tokenizer.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/_util.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/cli.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/gbnf_to_lark.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/hf.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/llamacpp.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/mlx.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/numpy.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/py.typed +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/llguidance/torch.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/mypy.ini +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/torch_tests/__init__.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/torch_tests/test_bitmask.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/torch_tests/test_hf.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/torch_tests/test_llamacpp.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python/torch_tests/test_matcher.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/llinterpreter.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/llmatcher.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/parserlimits.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/pyjson.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/Cargo.toml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/cli.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/blog.sample.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/blog.schema.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/blog.schema.ll.json +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/arithmetic.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/c.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/chess.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/english.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/japanese.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/json.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/json_arr.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/list.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/from-llama.cpp/vllm-sql.gbnf +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/lark.lark +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/rfc.lark +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/rfc.xml +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/data/ulysses.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/gtest.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/lark.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/run.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/src/lib.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/src/minimal.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/src/sample_parser.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/tests/test_lark.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/tests/test_ll.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/tests/test_raw_parser.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/sample_parser/tests/test_stop.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/annotate_asm.js +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/bump.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/cbindgen.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/checklinks.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/checklinks.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/ci-publish.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/disasm.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/gbnf_to_lark.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/gen-testcase.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/git-version.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/install-deps.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/jsonschema-stats.js +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/remote-guidance-test.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/rust-size.js +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/rust_size.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/test-guidance.sh +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/tokenizer_test.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/scripts/update-git.py +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/LICENSE +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/README.md +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/bytes.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/lib.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/recognizer.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/rng.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/svob.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/tokenv.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/src/toktree.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie/tests/test_svob.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_downloader/LICENSE +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_downloader/src/lib.rs +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_tokenizers/LICENSE +0 -0
- {llguidance-0.7.23 → llguidance-0.7.24}/toktrie_hf_tokenizers/src/lib.rs +0 -0

{llguidance-0.7.23 → llguidance-0.7.24}/CHANGELOG.md +4 -0

```diff
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. Dates are d
 
 If a release doesn't introduce any interesting changes (build fixes etc.), it's skipped.
 
+#### [0.7.24](https://github.com/guidance-ai/llguidance/compare/v0.7.23...0.7.24) 2025-05-23
+
+- add the sentinel token hack, fixes #180 [`#180`](https://github.com/guidance-ai/llguidance/issues/180)
+
 #### [0.7.23](https://github.com/guidance-ai/llguidance/compare/v0.7.22...0.7.23) 2025-05-22
 
 - native llama.cpp tokenizer support [`#179`](https://github.com/guidance-ai/llguidance/pull/179)
```

{llguidance-0.7.23 → llguidance-0.7.24}/Cargo.lock +5 -5

```diff
@@ -1174,7 +1174,7 @@ checksum = "23fb14cb19457329c82206317a5663005a4d404783dc74f4252769b0d5f42856"
 
 [[package]]
 name = "llguidance"
-version = "0.7.23"
+version = "0.7.24"
 dependencies = [
  "anyhow",
  "derivre",
@@ -1193,7 +1193,7 @@ dependencies = [
 
 [[package]]
 name = "llguidance_py"
-version = "0.7.23"
+version = "0.7.24"
 dependencies = [
  "anyhow",
  "bytemuck",
@@ -2336,7 +2336,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie"
-version = "0.7.23"
+version = "0.7.24"
 dependencies = [
  "anyhow",
  "bytemuck",
@@ -2347,7 +2347,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie_hf_downloader"
-version = "0.7.23"
+version = "0.7.24"
 dependencies = [
  "anyhow",
  "hf-hub",
@@ -2358,7 +2358,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie_hf_tokenizers"
-version = "0.7.23"
+version = "0.7.24"
 dependencies = [
  "anyhow",
  "log",
```

llguidance-0.7.24/python_ext/src/llamatokenizer.rs +169 -0 (new file)

```diff
@@ -0,0 +1,169 @@
+use std::sync::Arc;
+
+use anyhow::{ensure, Result};
+use llguidance::toktrie::{self, TokEnv, TokRxInfo, TokTrie, TokenId, TokenizerEnv};
+
+type LlamaTokenizeFn = unsafe extern "C" fn(
+    vocab: *const std::os::raw::c_void,
+    text: *const std::os::raw::c_char,
+    text_len: i32,
+    tokens: *mut i32,
+    n_tokens_max: i32,
+    add_special: bool,
+    parse_special: bool,
+) -> i32;
+
+struct LlamaTokenizer {
+    trie: TokTrie,
+    tokenize_fn: LlamaTokenizeFn,
+    vocab: *const std::os::raw::c_void,
+    sentinel: Option<u8>,
+    sentinel_tokens: Vec<TokenId>,
+}
+// SAFETY: tokenize_fn is required to be thread-safe
+unsafe impl Send for LlamaTokenizer {}
+unsafe impl Sync for LlamaTokenizer {}
+
+impl LlamaTokenizer {
+    fn tokenize_with_sentinel(&self, s: &[u8]) -> Result<Vec<toktrie::TokenId>> {
+        if s.is_empty() {
+            return Ok(vec![]);
+        }
+
+        if let Some(sentinel) = self.sentinel {
+            let mut b = Vec::with_capacity(s.len() + 1);
+            b.push(sentinel);
+            b.extend_from_slice(s);
+            let mut res = self.raw_tokenize(&b);
+            ensure!(
+                res.len() > self.sentinel_tokens.len(),
+                "tokenize_with_sentinel: res.len() <= sentinel_tokens.len()"
+            );
+            ensure!(
+                res[0..self.sentinel_tokens.len()] == self.sentinel_tokens,
+                "tokenize_with_sentinel: res[0..sentinel_tokens.len()] != sentinel_tokens"
+            );
+            res.splice(0..self.sentinel_tokens.len(), []);
+            Ok(res)
+        } else {
+            Ok(self.raw_tokenize(s))
+        }
+    }
+
+    fn raw_tokenize(&self, s: &[u8]) -> Vec<toktrie::TokenId> {
+        let mut res_toks = vec![0u32; s.len() / 4 + 5];
+        let res = unsafe {
+            (self.tokenize_fn)(
+                self.vocab,
+                s.as_ptr() as *const std::os::raw::c_char,
+                s.len().try_into().unwrap(),
+                res_toks.as_mut_ptr() as *mut i32,
+                res_toks.len().try_into().unwrap(),
+                false,
+                false,
+            )
+        };
+
+        let res = if res < 0 {
+            let n_toks = (-res) as usize;
+            res_toks.resize(n_toks, 0);
+            let res2 = unsafe {
+                (self.tokenize_fn)(
+                    self.vocab,
+                    s.as_ptr() as *const std::os::raw::c_char,
+                    s.len().try_into().unwrap(),
+                    res_toks.as_mut_ptr() as *mut i32,
+                    res_toks.len().try_into().unwrap(),
+                    false,
+                    false,
+                )
+            };
+            assert!(res2 == n_toks as i32);
+            res2
+        } else {
+            res
+        };
+
+        res_toks.truncate(res as usize);
+        res_toks
+    }
+}
+
+impl TokenizerEnv for LlamaTokenizer {
+    fn tok_trie(&self) -> &TokTrie {
+        &self.trie
+    }
+
+    fn tokenize_bytes(&self, s: &[u8]) -> Vec<toktrie::TokenId> {
+        // llama.cpp tokenizer encodes invalid UTF8 as Unicode replacement character U+FFFD,
+        // so we need the greedy fallback
+        self.trie.tokenize_with_greedy_fallback(s, |s| {
+            self.tokenize_with_sentinel(s.as_bytes())
+                .expect("tokenize_with_sentinel failed")
+        })
+    }
+}
+
+pub fn tokenv_from_llamacpp(
+    tokens: Vec<Vec<u8>>,
+    vocab_ptr: usize,
+    tokenize_fptr: usize,
+    eos_token: u32,
+) -> Result<TokEnv> {
+    ensure!(vocab_ptr != 0, "vocab_ptr must be non-null");
+    ensure!(tokenize_fptr != 0, "tokenize_fptr must be non-null");
+
+    let info = TokRxInfo::new(tokens.len() as u32, eos_token);
+    let trie = TokTrie::from(&info, &tokens);
+
+    let mut llama_tok = LlamaTokenizer {
+        trie,
+        tokenize_fn: unsafe { std::mem::transmute::<usize, LlamaTokenizeFn>(tokenize_fptr) },
+        vocab: vocab_ptr as *const std::os::raw::c_void,
+        sentinel: None,
+        sentinel_tokens: vec![],
+    };
+
+    let trie = &llama_tok.trie;
+    let t0 = llama_tok.raw_tokenize(b"a");
+    if trie.decode(&t0) != b"a" {
+        // Now, this likely means that the tokenizer is adding a space in front of the token
+        // (or possibly <BOS> token)
+        // We will "fix" this by tokenizing [sentinel] + s instead of just s
+        // and then removing tokens corresponding to the sentinel
+
+        // find a good sentinel token - one that doesn't start any other token
+        let sentinel = (1u8..32)
+            .find(|&b| {
+                trie.token_id(&[b]).is_some()
+                    && !trie.has_extensions(&[b])
+                    && !trie.has_extensions(&[b' ', b])
+            })
+            .ok_or_else(|| {
+                anyhow::anyhow!("could not find a good sentinel token in the range 1..32")
+            })?;
+
+        llama_tok.sentinel_tokens = llama_tok.raw_tokenize(&[sentinel]);
+        llama_tok.sentinel = Some(sentinel);
+
+        // now, check if it works
+        let t1 = llama_tok.tokenize_with_sentinel(b"a")?;
+        ensure!(
+            trie.decode(&t1) == b"a",
+            "tokenizer is not working with the sentinel {} {:?}",
+            sentinel,
+            trie.decode(&t1)
+        );
+
+        // make sure we can tokenize double-sentinel
+        let t3 = llama_tok.tokenize_with_sentinel(&[sentinel])?;
+        ensure!(
+            trie.decode(&t3) == [sentinel],
+            "tokenizer is not working with the sentinel (rec) {} {:?}",
+            sentinel,
+            trie.decode(&t3)
+        );
+    }
+
+    Ok(Arc::new(llama_tok))
+}
```
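The sentinel logic added above compensates for tokenizers that silently prepend a space (or a BOS marker) to raw text: the input is tokenized as `[sentinel] + text`, and the tokens belonging to the sentinel alone are stripped from the front. The following is a minimal, self-contained sketch of that idea, not part of the package: it uses a toy byte-level tokenizer closure and a hypothetical `strip_sentinel` helper instead of the llama.cpp FFI.

```rust
/// Illustrative only: strip the tokens produced for a known sentinel prefix.
/// `tokenize` stands in for any tokenizer that may inject a space/BOS prefix.
fn strip_sentinel(
    tokenize: impl Fn(&[u8]) -> Vec<u32>,
    sentinel: u8,
    sentinel_tokens: &[u32],
    text: &[u8],
) -> Result<Vec<u32>, String> {
    if text.is_empty() {
        return Ok(vec![]);
    }
    // Tokenize "[sentinel] + text" so that whatever prefix the tokenizer
    // injects attaches to the sentinel, not to the user's text.
    let mut input = Vec::with_capacity(text.len() + 1);
    input.push(sentinel);
    input.extend_from_slice(text);
    let mut toks = tokenize(&input);
    // The result must start with exactly the tokens of the sentinel on its own;
    // otherwise the trick does not apply to this tokenizer.
    if toks.len() <= sentinel_tokens.len() || &toks[..sentinel_tokens.len()] != sentinel_tokens {
        return Err("sentinel prefix not found in tokenization".to_string());
    }
    toks.drain(..sentinel_tokens.len());
    Ok(toks)
}

fn main() {
    // Toy tokenizer: prepends a space, then emits one token id per byte.
    let tokenize = |s: &[u8]| {
        let mut with_space = vec![b' '];
        with_space.extend_from_slice(s);
        with_space.iter().map(|&b| b as u32).collect::<Vec<u32>>()
    };
    let sentinel = 0x1Fu8;
    let sentinel_tokens = tokenize(&[sentinel]); // tokens for the sentinel alone
    let toks = strip_sentinel(tokenize, sentinel, &sentinel_tokens, b"abc").unwrap();
    assert_eq!(toks, vec![b'a' as u32, b'b' as u32, b'c' as u32]);
    println!("{:?}", toks);
}
```

The real code additionally probes the tokenizer with `"a"` first and only enables the sentinel path when a round-trip through decode does not reproduce the input.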

{llguidance-0.7.23 → llguidance-0.7.24}/python_ext/src/py.rs +4 -87

```diff
@@ -14,6 +14,8 @@ use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use toktrie_hf_tokenizers::ByteTokenizer;
 
+use crate::llamatokenizer::tokenv_from_llamacpp;
+
 struct PyTokenizer {
     tok_trie: Arc<toktrie::TokTrie>,
     tokenizer_fun: Py<PyAny>,
@@ -34,78 +36,6 @@ struct PyMidProcessResult {
     temperature: f32,
 }
 
-type LlamaTokenizeFn = unsafe extern "C" fn(
-    vocab: *const std::os::raw::c_void,
-    text: *const std::os::raw::c_char,
-    text_len: i32,
-    tokens: *mut i32,
-    n_tokens_max: i32,
-    add_special: bool,
-    parse_special: bool,
-) -> i32;
-
-struct LlamaTokenizerInner {
-    trie: TokTrie,
-    tokenize_fn: LlamaTokenizeFn,
-    vocab: *const std::os::raw::c_void,
-}
-// SAFETY: tokenize_fn is required to be thread-safe
-unsafe impl Send for LlamaTokenizerInner {}
-unsafe impl Sync for LlamaTokenizerInner {}
-
-impl LlamaTokenizerInner {
-    fn raw_tokenize(&self, s: &[u8]) -> Vec<toktrie::TokenId> {
-        let mut res_toks = vec![0u32; s.len() / 4 + 5];
-        let res = unsafe {
-            (self.tokenize_fn)(
-                self.vocab,
-                s.as_ptr() as *const std::os::raw::c_char,
-                s.len().try_into().unwrap(),
-                res_toks.as_mut_ptr() as *mut i32,
-                res_toks.len().try_into().unwrap(),
-                false,
-                false,
-            )
-        };
-
-        let res = if res < 0 {
-            let n_toks = (-res) as usize;
-            res_toks.resize(n_toks, 0);
-            let res2 = unsafe {
-                (self.tokenize_fn)(
-                    self.vocab,
-                    s.as_ptr() as *const std::os::raw::c_char,
-                    s.len().try_into().unwrap(),
-                    res_toks.as_mut_ptr() as *mut i32,
-                    res_toks.len().try_into().unwrap(),
-                    false,
-                    false,
-                )
-            };
-            assert!(res2 == n_toks as i32);
-            res2
-        } else {
-            res
-        };
-
-        res_toks.truncate(res as usize);
-        res_toks
-    }
-}
-
-impl TokenizerEnv for LlamaTokenizerInner {
-    fn tok_trie(&self) -> &TokTrie {
-        &self.trie
-    }
-
-    fn tokenize_bytes(&self, s: &[u8]) -> Vec<toktrie::TokenId> {
-        // llama.cpp tokenizer encodes invalid UTF8 as Unicode replacement character U+FFFD,
-        // so we need the greedy fallback
-        self.trie
-            .tokenize_with_greedy_fallback(s, |s| self.raw_tokenize(s.as_bytes()))
-    }
-}
-
 #[pymethods]
 impl LLTokenizer {
     #[new]
@@ -154,22 +84,9 @@ impl LLTokenizer {
         eos_token: u32,
         slices: Option<Vec<String>>,
     ) -> PyResult<Self> {
-        if vocab_ptr == 0 {
-            return Err(PyValueError::new_err("vocab_ptr must be non-null"));
-        }
-        if tokenize_fptr == 0 {
-            return Err(PyValueError::new_err("tokenize_fptr must be non-null"));
-        }
+        let tok_env =
+            tokenv_from_llamacpp(tokens, vocab_ptr, tokenize_fptr, eos_token).map_err(val_error)?;
 
-        let info = TokRxInfo::new(tokens.len() as u32, eos_token);
-        let trie = TokTrie::from(&info, &tokens);
-
-        let llama_tok = LlamaTokenizerInner {
-            trie,
-            tokenize_fn: unsafe { std::mem::transmute::<usize, LlamaTokenizeFn>(tokenize_fptr) },
-            vocab: vocab_ptr as *const std::os::raw::c_void,
-        };
-        let tok_env: TokEnv = Arc::new(llama_tok);
         let factory = ParserFactory::new(
             &tok_env,
             InferenceCapabilities::default(),
```
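The constructor receives the llama.cpp vocab and `llama_tokenize` as plain `usize` values from Python, and `tokenv_from_llamacpp` turns the integer back into a C-ABI function pointer via `transmute`. A self-contained sketch of that round trip, using a stand-in `extern "C"` function rather than the real llama.cpp symbols (names here are illustrative, not part of the package), might look like this:

```rust
use std::os::raw::{c_char, c_void};

// Same shape as the LlamaTokenizeFn alias in llamatokenizer.rs.
type TokenizeFn = unsafe extern "C" fn(
    vocab: *const c_void,
    text: *const c_char,
    text_len: i32,
    tokens: *mut i32,
    n_tokens_max: i32,
    add_special: bool,
    parse_special: bool,
) -> i32;

// Stand-in for llama_tokenize: emits one token per input byte, or returns
// the negated required length when the output buffer is too small.
unsafe extern "C" fn fake_tokenize(
    _vocab: *const c_void,
    text: *const c_char,
    text_len: i32,
    tokens: *mut i32,
    n_tokens_max: i32,
    _add_special: bool,
    _parse_special: bool,
) -> i32 {
    if text_len > n_tokens_max {
        return -text_len; // caller must retry with a larger buffer
    }
    for i in 0..text_len as usize {
        unsafe { *tokens.add(i) = *text.add(i) as i32 };
    }
    text_len
}

fn main() {
    // The Python side hands the pointer over as a plain integer…
    let fptr = fake_tokenize as TokenizeFn as usize;
    // …and the Rust side turns it back into a callable function pointer.
    let tokenize = unsafe { std::mem::transmute::<usize, TokenizeFn>(fptr) };

    let text = b"hi";
    let mut out = vec![0i32; 8];
    let n = unsafe {
        tokenize(
            std::ptr::null(),
            text.as_ptr() as *const c_char,
            text.len() as i32,
            out.as_mut_ptr(),
            out.len() as i32,
            false,
            false,
        )
    };
    out.truncate(n as usize);
    println!("{} tokens: {:?}", n, out);
}
```

The negative-return retry path mirrors how `raw_tokenize` above grows its buffer when llama.cpp reports that more space is needed.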