llguidance 1.0.1__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llguidance-1.0.1 → llguidance-1.1.0}/CHANGELOG.md +7 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/Cargo.lock +6 -6
- {llguidance-1.0.1 → llguidance-1.1.0}/PKG-INFO +1 -1
- llguidance-1.1.0/docs/de_recursing.md +51 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/syntax.md +5 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/Cargo.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/grammar_builder.rs +75 -2
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/compiler.rs +17 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/pyproject.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/hf.py +12 -7
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/llamacpp.py +8 -2
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/Cargo.toml +1 -1
- llguidance-1.1.0/sample_parser/tests/common_lark_utils/mod.rs +144 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/tests/test_lark.rs +15 -139
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/tests/test_ll.rs +24 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/Cargo.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_downloader/Cargo.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_tokenizers/Cargo.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_tiktoken/Cargo.toml +1 -1
- {llguidance-1.0.1 → llguidance-1.1.0}/.github/workflows/rust.yml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/.github/workflows/wheels.yml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/.gitignore +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/CODE_OF_CONDUCT.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/Cargo.toml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/SECURITY.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/SUPPORT.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/c_sample/Makefile +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/c_sample/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/c_sample/c_sample.cpp +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/fast_forward.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/json_schema.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/mask_plot.png +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/optimizations.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/parametric.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/special_tokens.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/docs/toktrie.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/Cargo.toml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/expected_maskbench.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/jstats.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/scripts/split-stats.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/scripts/split_plot.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/src/json_stats.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/json_stats/src/stats.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/build.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/cbindgen.toml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/grammars/character.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/grammars/json.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/llguidance.h +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/api.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/constraint.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/from_guidance.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/grammar.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/lexer.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/lexerspec.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/mod.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/parser.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/perf.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/regexvec.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/earley/slicer.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/factory.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/ffi.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/ffi_par.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/hashcons.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/compiler.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/context_ref.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/context_simple/context.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/context_simple/draft.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/context_simple/mod.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/formats.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/mod.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/numeric.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/schema.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json/shared_context.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/json_validation.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/ast.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/common.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/lexer.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/mod.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/parser.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/logging.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/matcher.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/output.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/panic_utils.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/regex_rewrite.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/stop_controller.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/substring.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/tokenizer_json.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/parser/src/tokenparser.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/plan.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/__init__.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/_grammar_from.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/_lib.pyi +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/_struct_tag.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/_tokenizer.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/_util.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/cli.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/gbnf_to_lark.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/mlx.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/numpy.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/py.typed +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/tiktoken.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/torch.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/mypy.ini +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/__init__.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/test_bitmask.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/test_hf.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/test_llamacpp.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/test_matcher.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python/torch_tests/test_tiktoken.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/llamatokenizer.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/llinterpreter.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/llmatcher.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/parserlimits.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/py.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/python_ext/src/pyjson.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/Cargo.toml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/cli.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/blog.sample.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/blog.schema.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/blog.schema.ll.json +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/arithmetic.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/c.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/chess.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/english.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/japanese.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/json.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/json_arr.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/list.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/from-llama.cpp/vllm-sql.gbnf +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/lark.lark +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/rfc.lark +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/rfc.xml +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/data/ulysses.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/gtest.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/lark.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/run.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/src/minimal.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/src/sample_parser.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/tests/test_raw_parser.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/tests/test_stop.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/annotate_asm.js +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/bump.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/cbindgen.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/checklinks.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/checklinks.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/ci-publish.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/disasm.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/gbnf_to_lark.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/gen-testcase.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/git-version.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/install-deps.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/jsonschema-stats.js +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/remote-guidance-test.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/rust-size.js +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/rust_size.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/test-guidance.sh +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/tokenizer_test.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/scripts/update-git.py +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/README.md +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/bytes.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/recognizer.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/rng.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/svob.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/tokenv.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/src/toktree.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie/tests/test_svob.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_downloader/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_downloader/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_tokenizers/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_hf_tokenizers/src/lib.rs +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_tiktoken/LICENSE +0 -0
- {llguidance-1.0.1 → llguidance-1.1.0}/toktrie_tiktoken/src/lib.rs +0 -0
{llguidance-1.0.1 → llguidance-1.1.0}/CHANGELOG.md

```diff
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. Dates are d
 
 If a release doesn't introduce any interesting changes (build fixes etc.), it's skipped.
 
+#### [1.1.0](https://github.com/guidance-ai/llguidance/compare/v1.0.1...1.1.0) 2025-07-18
+
+- disable hf tokenizer truncation and padding [`#205`](https://github.com//hudson-ai/llguidance.git/pull/205); fixes [`#1322`](https://github.com/guidance-ai/guidance/issues/1322)
+- llama_cpp tokenizers: infer added tokens starting/ending with < and > to be special tokens [`#202`](https://github.com//hudson-ai/llguidance.git/pull/202)
+- add lark syntax for "any token" and negation of token ranges [`#201`](https://github.com//hudson-ai/llguidance.git/pull/201)
+- add de-recursion cook book to docs [`#199`](https://github.com//hudson-ai/llguidance.git/pull/199)
+
 #### [1.0.1](https://github.com/guidance-ai/llguidance/compare/v1.0.0...1.0.1) 2025-07-03
 
 - fix: tokenizers normalizers sequence api changed [`#195`](https://github.com/guidance-ai/llguidance/pull/195)
```
{llguidance-1.0.1 → llguidance-1.1.0}/Cargo.lock

```diff
@@ -1229,7 +1229,7 @@ checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
 
 [[package]]
 name = "llguidance"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "derivre",
@@ -1248,7 +1248,7 @@ dependencies = [
 
 [[package]]
 name = "llguidance_py"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "bytemuck",
@@ -2405,7 +2405,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "bytemuck",
@@ -2416,7 +2416,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie_hf_downloader"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "hf-hub",
@@ -2427,7 +2427,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie_hf_tokenizers"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "log",
@@ -2439,7 +2439,7 @@ dependencies = [
 
 [[package]]
 name = "toktrie_tiktoken"
-version = "1.0.1"
+version = "1.1.0"
 dependencies = [
  "anyhow",
  "log",
```
llguidance-1.1.0/docs/de_recursing.md (new file)

````diff
@@ -0,0 +1,51 @@
+# De-Recursing Grammars
+
+This is a cookbook of examples to help in removing recursion where possible from grammars (see [Syntax](./syntax.md) for more details).
+The examples below will generally already be left-recursive.
+
+## Simple lists
+
+```lark
+item_list : item
+          | item_list item
+```
+can become
+```lark
+item_list : item+
+```
+
+## Lists with Delimiters
+
+```lark
+sep_list : item
+         | sep_list SEP item
+```
+becomes
+```lark
+sep_list : item (SEP item)*
+```
+
+## List with alternatives
+
+```lark
+postfix_expression: primary_expression
+          | postfix_expression "[" expression "]"
+          | postfix_expression "(" ")"
+          | postfix_expression "(" argument_expression_list ")"
+          | postfix_expression "." IDENTIFIER
+          | postfix_expression PTR_OP IDENTIFIER
+          | "(" type_name ")" "{" initializer_list "}"
+          | "(" type_name ")" "{" initializer_list "," "}"
+```
+becomes (note the additional rule):
+```lark
+postfix_expression: primary_expression postfix_suffix*
+          | "(" type_name ")" "{" initializer_list "}"
+          | "(" type_name ")" "{" initializer_list "," "}"
+
+postfix_suffix: "[" expression "]"
+          | "(" ")"
+          | "(" argument_expression_list ")"
+          | "." IDENTIFIER
+          | PTR_OP IDENTIFIER
+```
````
{llguidance-1.0.1 → llguidance-1.1.0}/docs/syntax.md

```diff
@@ -70,6 +70,11 @@ You can also use numeric token ids, as in `<[128010]>` (this is `<|python_tag|>`
 You can also use ranges like `<[128000-128255]>` for all Llama special tokens, or
 even lists of ranges like `<[128000-128100,128130-128170]>`; ranges are inclusive.
 
+Individual numeric token ids and ranges can be negated with the caret operator, like `<[^128000,128130-128170]>`.
+This is equivalent to `<[0-127999,128001-128129,128171-MAX]>`.
+
+You can also use a *wildcard* token range, `<[*]>`, denoting `<[0-MAX]>`.
+
 For example, this is how to constrain JSON function calling for Meta Llama 3.1,
 according to their [source repo](https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/prompt_format.md#model-response-format-5) (and yes, it's [different](https://github.com/meta-llama/llama-models/issues/266) than the website).
 
```
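As a quick illustration of the new syntax (a hedged sketch, not part of the diff): assuming the `lark_ok` / `lark_err_test` helpers from `sample_parser/tests/common_lark_utils` (shown later in this diff) are in scope, the wildcard and negation forms behave as follows; the token ids are placeholders.

```rust
// Sketch only: exercises the new <[*]> and <[^...]> forms via the shared
// test helpers (lark_ok / lark_err_test) added later in this diff.
#[test]
fn token_range_wildcard_and_negation() {
    // <[*]> matches any single token
    lark_ok(r#"start: <[*]>"#);
    // <[^...]> matches any token outside the listed ids/ranges (ranges are inclusive)
    lark_ok(r#"start: <[^128000,128130-128170]>"#);
    // a negated wildcard would match nothing and is rejected when the grammar is built
    lark_err_test(
        r#"start: <[^*]>"#,
        "negated wildcard token <[^*]> is not supported",
    );
}
```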
{llguidance-1.0.1 → llguidance-1.1.0}/parser/src/grammar_builder.rs

```diff
@@ -282,8 +282,6 @@ impl GrammarBuilder {
     pub fn token_ranges(&mut self, token_ranges: Vec<RangeInclusive<u32>>) -> Result<NodeRef> {
         self.check_limits()?;
 
-        let name = token_ranges_to_string(&token_ranges);
-
         let trie = self.tok_env.as_ref().map(|t| t.tok_trie());
         for r in &token_ranges {
             ensure!(r.start() <= r.end(), "Invalid token range: {:?}", r);
@@ -300,10 +298,69 @@ impl GrammarBuilder {
             self.add_warning("no tokenizer - can't validate <[...]>".to_string());
         }
 
+        let name = token_ranges_to_string(&token_ranges);
         let id = self.regex.spec.add_special_token(name, token_ranges)?;
         Ok(self.lexeme_to_node(id))
     }
 
+    pub fn negated_token_ranges(
+        &mut self,
+        token_ranges: Vec<RangeInclusive<u32>>,
+    ) -> Result<NodeRef> {
+        let negated_ranges = if let Some(te) = &self.tok_env {
+            let trie = te.tok_trie();
+
+            let (min, max) = (0u32, trie.vocab_size() as u32 - 1);
+            ensure!(
+                !token_ranges.is_empty(),
+                "negation of empty token ranges is not supported"
+            );
+
+            let mut sorted = token_ranges.clone();
+            sorted.sort_by_key(|r| *r.start());
+
+            let mut negated = vec![];
+            let mut current = min;
+            for range in sorted {
+                ensure!(
+                    *range.end() < trie.vocab_size() as u32,
+                    "Token range end too large: {:?}",
+                    range.end()
+                );
+                ensure!(
+                    range.start() <= range.end(),
+                    "Invalid token range: {:?}",
+                    range
+                );
+
+                let (&start, &end) = (range.start(), range.end());
+                ensure!(start <= end, "Invalid token range: {:?}", range);
+                if end < current {
+                    // skip this range, it is already covered by the previous one
+                    continue;
+                }
+                if start > current {
+                    // add a range from the current to the start of this one
+                    negated.push(current..=start - 1);
+                }
+                // update the current to the end of this range
+                current = current.max(end + 1);
+            }
+            if current <= max {
+                // add the last range from the current to the max
+                negated.push(current..=max);
+            }
+            negated
+        } else {
+            self.add_warning("no tokenizer - can't validate <[^...]>".to_string());
+            vec![INVALID_TOKEN..=INVALID_TOKEN]
+        };
+
+        let name = token_ranges_to_string(&negated_ranges);
+        let id = self.regex.spec.add_special_token(name, negated_ranges)?;
+        Ok(self.lexeme_to_node(id))
+    }
+
     pub fn special_token(&mut self, token: &str) -> Result<NodeRef> {
         self.check_limits()?;
 
@@ -331,6 +388,22 @@ impl GrammarBuilder {
         Ok(self.lexeme_to_node(idx))
     }
 
+    pub fn any_token(&mut self) -> Result<NodeRef> {
+        self.check_limits()?;
+        let range = if let Some(te) = &self.tok_env {
+            let trie = te.tok_trie();
+            0..=trie.vocab_size() as u32 - 1
+        } else {
+            self.add_warning("no tokenizer - can't validate <any_token>".to_string());
+            INVALID_TOKEN..=INVALID_TOKEN
+        };
+        let idx = self
+            .regex
+            .spec
+            .add_special_token("<[*]>".to_string(), vec![range])?;
+        Ok(self.lexeme_to_node(idx))
+    }
+
     pub fn gen_grammar(&mut self, data: GenGrammarOptions, props: NodeProps) -> NodeRef {
         if props.max_tokens.is_some() {
             self.regex.spec.has_max_tokens = true;
```
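The heart of `negated_token_ranges` is an interval complement over `0..=vocab_size-1`: sort the input ranges, walk them in order, and emit the gaps between them. Below is a standalone sketch of just that step as a hypothetical free function (not part of the llguidance API), useful for seeing the logic without the builder plumbing.

```rust
use std::ops::RangeInclusive;

/// Complement a set of inclusive token-id ranges against 0..=max.
/// Mirrors the sort-and-sweep done inside `negated_token_ranges` above.
fn complement_ranges(mut ranges: Vec<RangeInclusive<u32>>, max: u32) -> Vec<RangeInclusive<u32>> {
    ranges.sort_by_key(|r| *r.start());
    let mut negated = Vec::new();
    let mut current = 0u32;
    for r in ranges {
        let (start, end) = (*r.start(), *r.end());
        if end < current {
            // already covered by a previous range
            continue;
        }
        if start > current {
            // emit the gap before this range
            negated.push(current..=start - 1);
        }
        // advance past this range (saturating to avoid overflow at u32::MAX)
        current = current.max(end.saturating_add(1));
    }
    if current <= max {
        negated.push(current..=max);
    }
    negated
}

fn main() {
    // <[^100,200-300]> over a hypothetical 1000-token vocabulary:
    let neg = complement_ranges(vec![100..=100, 200..=300], 999);
    assert_eq!(neg, vec![0..=99, 101..=199, 301..=999]);
    println!("{neg:?}");
}
```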
{llguidance-1.0.1 → llguidance-1.1.0}/parser/src/lark/compiler.rs

```diff
@@ -313,6 +313,18 @@ impl Compiler {
             Value::SpecialToken(s) => {
                 if s.starts_with("<[") && s.ends_with("]>") {
                     let s = &s[2..s.len() - 2];
+                    let negate = s.starts_with("^");
+                    let s = if negate { &s[1..] } else { s };
+                    if s == "*" {
+                        if negate {
+                            bail!("negated wildcard token <[^*]> is not supported");
+                        }
+                        return self.builder.any_token();
+                    } else if s.contains('*') {
+                        bail!(
+                            "wildcard token range '*' must not contain additional tokens"
+                        );
+                    }
                     let mut ranges = vec![];
                     for range in s.split(",") {
                         let ends: Vec<&str> = range.split('-').map(|s| s.trim()).collect();
@@ -334,7 +346,11 @@ impl Compiler {
                         ranges.push(start..=end);
                     }
                     ensure!(!ranges.is_empty(), "empty token range");
-                    return self.builder.token_ranges(ranges);
+                    return if negate {
+                        self.builder.negated_token_ranges(ranges)
+                    } else {
+                        self.builder.token_ranges(ranges)
+                    };
                 }
                 return self.builder.special_token(s);
             }
```
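For reference, the text inside `<[...]>` is a comma-separated list of ids and inclusive `lo-hi` ranges, split on `,` and then `-` as in the hunk above. A self-contained sketch of that parsing step (a hypothetical helper, not the actual compiler code):

```rust
use std::ops::RangeInclusive;

/// Parse the inner text of <[...]> (e.g. "128000-128100,128130") into inclusive ranges.
fn parse_token_ranges(s: &str) -> Result<Vec<RangeInclusive<u32>>, String> {
    let mut ranges = Vec::new();
    for part in s.split(',') {
        let ends: Vec<&str> = part.split('-').map(|e| e.trim()).collect();
        let range = match ends.as_slice() {
            // a single id such as "128130"
            [single] if !single.is_empty() => {
                let v: u32 = single.parse().map_err(|_| format!("bad token id {single:?}"))?;
                v..=v
            }
            // an inclusive range such as "128000-128100"
            [lo, hi] => {
                let lo: u32 = lo.parse().map_err(|_| format!("bad token id {lo:?}"))?;
                let hi: u32 = hi.parse().map_err(|_| format!("bad token id {hi:?}"))?;
                lo..=hi
            }
            _ => return Err(format!("invalid token range {part:?}")),
        };
        ranges.push(range);
    }
    Ok(ranges)
}

fn main() {
    let r = parse_token_ranges("128000-128100,128130").unwrap();
    assert_eq!(r, vec![128000..=128100, 128130..=128130]);
    println!("{r:?}");
}
```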
{llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/hf.py

```diff
@@ -1,8 +1,10 @@
+from copy import copy
 from typing import List, Optional
-from ._lib import LLTokenizer
 
 import transformers
 
+from ._lib import LLTokenizer
+
 
 def from_tokenizer(
     hf_tokenizer: transformers.PreTrainedTokenizerFast,
@@ -28,15 +30,18 @@ def from_tokenizer(
         # this will JSON-serialize the Rust impl of the tokenizer,
         # including added tokens from tokenizer_config.json
         # (which may be missing from tokenizer.json)
-        s = hf_tokenizer.backend_tokenizer.to_str()
+        backend_tokenizer = copy(
+            hf_tokenizer.backend_tokenizer  # type: ignore[attr-defined]
+        )
+        # disable padding and truncation on copy before converting to string
+        backend_tokenizer.no_padding()
+        backend_tokenizer.no_truncation()
+        s = backend_tokenizer.to_str()
         # This is probably not needed - it should figure it out by itself
         # if n_vocab is None:
         #     n_vocab = hf_tokenizer.backend_tokenizer.get_vocab_size(with_added_tokens=True)
         if eos_token is None:
-            eos_token = hf_tokenizer.eos_token_id
-        return LLTokenizer(s,
-                           n_vocab=n_vocab,
-                           eos_token=eos_token,
-                           slices=slices)
+            eos_token = hf_tokenizer.eos_token_id  # type: ignore
+        return LLTokenizer(s, n_vocab=n_vocab, eos_token=eos_token, slices=slices)
     else:
         raise ValueError("Only fast tokenizers are supported")
```
{llguidance-1.0.1 → llguidance-1.1.0}/python/llguidance/llamacpp.py

```diff
@@ -44,8 +44,14 @@ def lltokenizer_from_vocab(
         assert n <= buffer_len
         tok = bytes(buffer[:n])  # type: ignore
         attr = llama_cpp.llama_token_get_attr(vocab, token)
-        if attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL:
-            tok = b"\xff" + tok
+        # If the token is a control token or a user-defined token that looks like a control token,
+        # we prefix it with 0xff to indicate that it should be treated as a special token.
+        if attr & llama_cpp.LLAMA_TOKEN_ATTR_CONTROL or (
+            attr & llama_cpp.LLAMA_TOKEN_ATTR_USER_DEFINED
+            and tok.startswith(b"<")
+            and tok.endswith(b">")
+        ):
+            tok = b"\xff" + tok
         tokens.append(tok)
 
     if n_vocab is not None:
```
llguidance-1.1.0/sample_parser/tests/common_lark_utils/mod.rs (new file)

```diff
@@ -0,0 +1,144 @@
+/*
+Per
+https://doc.rust-lang.org/book/ch11-03-test-organization.html#submodules-in-integration-tests
+we do have an 'old style' mod.rs, so that the test runner doesn't look inside
+*/
+
+use anyhow::Result;
+use llguidance::{
+    api::{GrammarInit, TopLevelGrammar},
+    toktrie::bytes::limit_str,
+    TokenParser,
+};
+use sample_parser::*;
+use serde_json::Value;
+
+pub fn make_parser(lark: &str, quiet: bool) -> Result<TokenParser> {
+    let grm = TopLevelGrammar::from_lark(lark.to_string());
+    let mut parser = get_parser_factory().create_parser_from_init(
+        GrammarInit::Serialized(grm),
+        if quiet { 0 } else { 2 },
+        if quiet { 1 } else { 2 },
+    )?;
+    parser.start_without_prompt();
+    Ok(parser)
+}
+
+pub fn consume(parser: &mut TokenParser, tok: u32) {
+    let n = parser.consume_token(tok).unwrap();
+    assert!(n == 0);
+}
+
+pub fn lark_ok(lark: &str) {
+    if let Err(e) = make_parser(lark, false) {
+        panic!("unexpected error: {e}, grm:\n{lark}")
+    }
+}
+
+pub fn lark_err_test(lark: &str, err: &str) {
+    match make_parser(lark, false) {
+        Err(e) => {
+            let e = format!("{e}");
+            if !e.contains(err) {
+                panic!("unexpected error: {e}, expecting {err:?}; grm:\n{lark}");
+            }
+        }
+        Ok(_) => panic!("expected error: {err}; grm:\n{lark}"),
+    }
+}
+
+pub fn json_err_test(schema: &Value, err: &str) {
+    lark_err_test(
+        &format!(r#"start: %json {}"#, serde_json::to_string(schema).unwrap()),
+        err,
+    );
+}
+
+pub fn lark_str_test(lark: &str, should_accept: bool, input: &str, quiet: bool) {
+    let trie = get_tok_env().tok_trie();
+    let (final_reject, input) = if let Some(input) = input.strip_prefix("FINAL_REJECT:") {
+        (true, input)
+    } else {
+        (false, input)
+    };
+    let tokens = get_tok_env().tokenize(input);
+    let info = format!(
+        "\ninput: {:?}, grm: {:?}",
+        limit_str(input, 500),
+        limit_str(lark, 100)
+    );
+    if !quiet {
+        println!(
+            "\n\ntokens: {}, accpt={}\ngrm:\n{}\n",
+            trie.tokens_dbg(&tokens),
+            should_accept,
+            lark
+        );
+    }
+
+    // let t0 = std::time::Instant::now();
+    let mut p = make_parser(lark, quiet).unwrap();
+    // println!("make_parser: {:?}", t0.elapsed());
+
+    for tok in tokens.iter() {
+        let m = p.compute_mask().unwrap();
+        if m.is_allowed(*tok) {
+            consume(&mut p, *tok);
+        } else {
+            if should_accept {
+                panic!("unexpected token: {}{info}", trie.token_dbg(*tok));
+            }
+            if final_reject {
+                panic!(
+                    "unexpected token: {}; expecting reject only at the end{info}",
+                    trie.token_dbg(*tok)
+                );
+            }
+            return;
+        }
+    }
+
+    if !final_reject && !should_accept {
+        panic!(
+            "expected rejection (in the middle; final accept={})",
+            p.is_accepting()
+        );
+    }
+
+    if p.is_accepting() == final_reject {
+        if p.is_accepting() {
+            panic!("unexpected accept{info}");
+        } else {
+            panic!("unexpected reject{info}");
+        }
+    }
+}
+
+pub fn lark_str_test_many_ext(quiet: bool, lark: &str, passing: &[&str], failing: &[&str]) {
+    for s in passing {
+        lark_str_test(lark, true, s, quiet);
+    }
+    for s in failing {
+        lark_str_test(lark, false, s, quiet);
+    }
+}
+
+pub fn json_test_many(schema: &Value, passing: &[Value], failing: &[Value]) {
+    let lark = format!(r#"start: %json {}"#, serde_json::to_string(schema).unwrap());
+    for s in passing {
+        let s = serde_json::to_string(s).unwrap();
+        lark_str_test(&lark, true, &s, false);
+    }
+    for s in failing {
+        let s = serde_json::to_string(s).unwrap();
+        lark_str_test(&lark, false, &s, false);
+    }
+}
+
+pub fn lark_str_test_many(lark: &str, passing: &[&str], failing: &[&str]) {
+    lark_str_test_many_ext(false, lark, passing, failing);
+}
+
+pub fn lark_str_test_many_quiet(lark: &str, passing: &[&str], failing: &[&str]) {
+    lark_str_test_many_ext(true, lark, passing, failing);
+}
```
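A hypothetical usage sketch (not part of the diff) of how an integration test under `sample_parser/tests/` pulls in these shared helpers via the `mod.rs` pattern described in the header comment; the grammar and inputs are illustrative only.

```rust
// Illustrative test file using the shared helpers; the mod declaration
// mirrors how test_lark.rs includes common_lark_utils below.
mod common_lark_utils;
use common_lark_utils::*;

#[test]
fn literal_grammar_round_trip() {
    lark_str_test_many(
        r#"start: "ab""#,
        // inputs that must be accepted
        &["ab"],
        // "ba" must be rejected at the first token; "a" is a valid prefix,
        // so the FINAL_REJECT: marker allows rejection only once input ends
        &["ba", "FINAL_REJECT:a"],
    );
}
```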
{llguidance-1.0.1 → llguidance-1.1.0}/sample_parser/tests/test_lark.rs

```diff
@@ -1,143 +1,8 @@
-use anyhow::Result;
-use llguidance::{
-    api::{GrammarInit, TopLevelGrammar},
-    earley::XorShift,
-    substring::chunk_into_words,
-    toktrie::bytes::limit_str,
-    TokenParser,
-};
-use sample_parser::*;
-use serde_json::{json, Value};
-
-fn make_parser(lark: &str, quiet: bool) -> Result<TokenParser> {
-    let grm = TopLevelGrammar::from_lark(lark.to_string());
-    let mut parser = get_parser_factory().create_parser_from_init(
-        GrammarInit::Serialized(grm),
-        if quiet { 0 } else { 2 },
-        if quiet { 1 } else { 2 },
-    )?;
-    parser.start_without_prompt();
-    Ok(parser)
-}
-
-fn consume(parser: &mut TokenParser, tok: u32) {
-    let n = parser.consume_token(tok).unwrap();
-    assert!(n == 0);
-}
-
-fn lark_ok(lark: &str) {
-    if let Err(e) = make_parser(lark, false) {
-        panic!("unexpected error: {e}, grm:\n{lark}")
-    }
-}
+use llguidance::{earley::XorShift, substring::chunk_into_words};
+use serde_json::json;
 
-fn lark_err_test(lark: &str, err: &str) {
-    match make_parser(lark, false) {
-        Err(e) => {
-            let e = format!("{e}");
-            if !e.contains(err) {
-                panic!("unexpected error: {e}, expecting {err:?}; grm:\n{lark}");
-            }
-        }
-        Ok(_) => panic!("expected error: {err}; grm:\n{lark}"),
-    }
-}
-
-fn json_err_test(schema: &Value, err: &str) {
-    lark_err_test(
-        &format!(r#"start: %json {}"#, serde_json::to_string(schema).unwrap()),
-        err,
-    );
-}
-
-fn lark_str_test(lark: &str, should_accept: bool, input: &str, quiet: bool) {
-    let trie = get_tok_env().tok_trie();
-    let (final_reject, input) = if let Some(input) = input.strip_prefix("FINAL_REJECT:") {
-        (true, input)
-    } else {
-        (false, input)
-    };
-    let tokens = get_tok_env().tokenize(input);
-    let info = format!(
-        "\ninput: {:?}, grm: {:?}",
-        limit_str(input, 500),
-        limit_str(lark, 100)
-    );
-    if !quiet {
-        println!(
-            "\n\ntokens: {}, accpt={}\ngrm:\n{}\n",
-            trie.tokens_dbg(&tokens),
-            should_accept,
-            lark
-        );
-    }
-
-    // let t0 = std::time::Instant::now();
-    let mut p = make_parser(lark, quiet).unwrap();
-    // println!("make_parser: {:?}", t0.elapsed());
-
-    for tok in tokens.iter() {
-        let m = p.compute_mask().unwrap();
-        if m.is_allowed(*tok) {
-            consume(&mut p, *tok);
-        } else {
-            if should_accept {
-                panic!("unexpected token: {}{info}", trie.token_dbg(*tok));
-            }
-            if final_reject {
-                panic!(
-                    "unexpected token: {}; expecting reject only at the end{info}",
-                    trie.token_dbg(*tok)
-                );
-            }
-            return;
-        }
-    }
-
-    if !final_reject && !should_accept {
-        panic!(
-            "expected rejection (in the middle; final accept={})",
-            p.is_accepting()
-        );
-    }
-
-    if p.is_accepting() == final_reject {
-        if p.is_accepting() {
-            panic!("unexpected accept{info}");
-        } else {
-            panic!("unexpected reject{info}");
-        }
-    }
-}
-
-fn lark_str_test_many_ext(quiet: bool, lark: &str, passing: &[&str], failing: &[&str]) {
-    for s in passing {
-        lark_str_test(lark, true, s, quiet);
-    }
-    for s in failing {
-        lark_str_test(lark, false, s, quiet);
-    }
-}
-
-fn json_test_many(schema: &Value, passing: &[Value], failing: &[Value]) {
-    let lark = format!(r#"start: %json {}"#, serde_json::to_string(schema).unwrap());
-    for s in passing {
-        let s = serde_json::to_string(s).unwrap();
-        lark_str_test(&lark, true, &s, false);
-    }
-    for s in failing {
-        let s = serde_json::to_string(s).unwrap();
-        lark_str_test(&lark, false, &s, false);
-    }
-}
-
-fn lark_str_test_many(lark: &str, passing: &[&str], failing: &[&str]) {
-    lark_str_test_many_ext(false, lark, passing, failing);
-}
-
-fn lark_str_test_many_quiet(lark: &str, passing: &[&str], failing: &[&str]) {
-    lark_str_test_many_ext(true, lark, passing, failing);
-}
+mod common_lark_utils;
+use common_lark_utils::*;
 
 #[test]
 fn test_dot_unicode() {
@@ -318,6 +183,17 @@ fn test_lark_syntax_general() {
     lark_err_test(r#"start: <[,]>"#, "empty token range");
     lark_err_test(r#"start: <[200-100]>"#, "invalid token range");
     lark_err_test(r#"start: <[200 - 100]>"#, "lexer error");
+    lark_ok(r#"start: <[*]>"#);
+    lark_err_test(
+        r#"start: <[^*]>"#,
+        "negated wildcard token <[^*]> is not supported",
+    );
+    lark_err_test(
+        r#"start: <[*,100]>"#,
+        "wildcard token range '*' must not contain additional tokens",
+    );
+    lark_ok(r#"start: <[^100,200-300]>"#);
+    lark_ok(r#"start: <[^100-200,100-300]>"#);
 
     lark_err_test(
         r#"
```