@mlc-ai/web-xgrammar 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ /**
2
+ * Various testing methods that are not optimized for performance.
3
+ */
4
+ export declare class Testings {
5
+ /**
6
+ * Convert JSON schema string to EBNF grammar string. For test purposes.
7
+ *
8
+ * @param {string} schema The schema string.
9
+ * @param {number} [indent=2] The number of spaces for indentation. If -1, the grammar will
10
+ * enforce the output to be in one line.
11
+ * @param {[string, string]} [separators] Two separators that will be enforced by the grammar:
12
+ * comma and colon. Examples: (",", ":"), (", ", ": "). If undefined, the default separators will
13
+ * be used: (",", ": ") when the indent is not undefined, and (", ", ": ") otherwise. This follows
14
+ * the convention in Python's json.dumps(). Currently unsupported and will use the default value.
15
+ * @param {boolean} [strictMode=true] Whether to use strict mode. In strict mode, the generated
16
+ * grammar will not allow properties and items that are not specified in the schema. This is
17
+ * equivalent to setting unevaluatedProperties and unevaluatedItems to false.
18
+ * @returns {Promise<string>} A promise that resolves to the EBNF grammar string.
19
+ */
20
+ static _jsonSchemaToEBNF(schema: string, indent?: number, separators?: [string, string], strictMode?: boolean): Promise<string>;
21
+ /**
22
+ * Get the vocab IDs that are rejected by a bitmask.
23
+ * @param {Int32Array} bitmask Bitmask returned by getNextTokenBitmask().
24
+ * @param {number} vocabSize Vocab size returned by getVocabSize().
25
+ * @param {number} [index=0] The batch index of the bitmask. For batch inference, bitmask[index] will
26
+ * be used. Defaults to 0.
27
+ * @returns {Promise<Int32Array>} A promise that resolves to the vocab IDs that will be rejected as a result of the bitmask.
28
+ */
29
+ static debugGetMaskedTokensFromBitmask(bitmask: Int32Array, vocabSize: number, index?: number): Promise<Int32Array>;
30
+ }
31
+ /**
32
+ * This class stores the abstract syntax tree (AST) of the Backus-Naur Form (BNF) grammar and
33
+ * provides utilities to parse and print the AST. User should provide a BNF/EBNF (Extended
34
+ * Backus-Naur Form) grammar, and use fromEBNF to parse and simplify the grammar into an
35
+ * AST of BNF grammar.
36
+ */
37
+ export declare class Grammar {
38
+ handle: any;
39
+ /**
40
+ * @internal
41
+ * Private constructor. Factory methods are used since binding initialization is asynchronous.
42
+ * @param {any} handle handle of Grammar created by binding.
43
+ */
44
+ constructor(handle: any);
45
+ /**
46
+ * Dispose this Grammar.
47
+ */
48
+ dispose(): void;
49
+ /**
50
+ * Construct a BNF grammar with an EBNF-formatted string. The grammar will be normalized
51
+ * (simplified) by default.
52
+ * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:
53
+ * 1. Use # as the comment mark
54
+ * 2. Use C-style unicode escape sequence \u01AB, \U000001AB, \xAB
55
+ * 3. A-B (match A and not match B) is not supported yet
56
+ * 4. Lookahead assertion can be added at the end of a rule to speed up matching. E.g.
57
+ * ```
58
+ * root ::= "ab" a [a-z]
59
+ * a ::= "cd" (=[a-z])
60
+ * ```
61
+ * The assertion (=[a-z]) means a must be followed by [a-z].
62
+ * @param {string} ebnfString The grammar string
63
+ * @param {string} [rootRule="root"] The name of the root rule. Default: "root".
64
+ * @returns {Promise<Grammar>} A promise that resolves to the parsed BNF grammar.
65
+ */
66
+ static fromEBNF(ebnfString: string, rootRule?: string): Promise<Grammar>;
67
+ /**
68
+ * Get the grammar of standard JSON.
69
+ * @returns {Promise<Grammar>} A promise that resolves to the JSON grammar.
70
+ */
71
+ static builtinJSONGrammar(): Promise<Grammar>;
72
+ /**
73
+ * Construct a BNF grammar from the json schema string. The schema string should be in the
74
+ * format of the schema of a JSON file. We will parse the schema and generate a BNF grammar.
75
+ *
76
+ * @param {string} schema The schema string.
77
+ * @param {number} [indent=2] The number of spaces for indentation. If -1, the grammar will
78
+ * enforce the output to be in one line.
79
+ * @param {[string, string]} [separators] Two separators that will be enforced by the grammar:
80
+ * comma and colon. Examples: (",", ":"), (", ", ": "). If undefined, the default separators will
81
+ * be used: (",", ": ") when the indent is not undefined, and (", ", ": ") otherwise. This follows
82
+ * the convention in Python's json.dumps(). Currently unsupported and will use the default value.
83
+ * @param {boolean} [strictMode=true] Whether to use strict mode. In strict mode, the generated
84
+ * grammar will not allow properties and items that are not specified in the schema. This is
85
+ * equivalent to setting unevaluatedProperties and unevaluatedItems to false.
86
+ * @returns {Promise<Grammar>} A promise that resolves to the generated BNF grammar.
87
+ */
88
+ static fromJSONSchema(schema: string, indent?: number, separators?: [string, string], strictMode?: boolean): Promise<Grammar>;
89
+ /**
90
+ * Print the BNF grammar to a string, in standard BNF format.
91
+ * @returns The BNF grammar string.
92
+ */
93
+ toString(): string;
94
+ }
95
+ /**
96
+ * A class that wraps a preprocessed vocab, needed to instantiate GrammarCompiler.
97
+ */
98
+ export declare class TokenizerInfo {
99
+ handle: any;
100
+ /**
101
+ * @internal
102
+ * Private constructor. Factory methods are used since binding initialization is asynchronous.
103
+ * @param {any} handle handle of TokenizerInfo created by binding.
104
+ */
105
+ constructor(handle: any);
106
+ /**
107
+ * Dispose this tokenizer info object.
108
+ */
109
+ dispose(): void;
110
+ /**
111
+ * Get the vocab size.
112
+ */
113
+ getVocabSize(): number;
114
+ /**
115
+ * Get the post-processed vocab. Returned as a handle of type binding.VectorString
116
+ */
117
+ getDecodedVocabHandle(): any;
118
+ /**
119
+ * Instantiate with raw vocab and the vocab type by internally post-processing
120
+ * the raw vocab by decoding each token with the provided vocab type.
121
+ * @param {string[]} encodedVocab: the vocab in the form of a string list of tokens,
122
+ * ordered by their token id. It should include all the special tokens.
123
+ * @param {string} vocabType: either "byte_fallback", "byte_level", or `raw`. See `tokenizer.cc`
124
+ * for their semantics.
125
+ * @param {boolean} prependSpaceInTokenization: whether the tokenizer will prepend a space before
126
+ * the text in the tokenization process.
127
+ * @param {number} [vocabSize]: the full vocab size read from `config.json`. If not provided, will
128
+ * use length of `encodedVocab`. Note that some models have a larger vocab size in `config.json` due
129
+ * to padding. Essentially the size of the logits.
130
+ * @param {number[] | number} [stopTokenIds=undefined] Stop tokens to override the default ones.
131
+ */
132
+ static createTokenizerInfo(encodedVocab: string[], vocabType: string, prependSpaceInTokenization: boolean, vocabSize?: number, stopTokenIds?: number[] | number): Promise<TokenizerInfo>;
133
+ }
134
+ export declare class CompiledGrammar {
135
+ handle: any;
136
+ /**
137
+ * @internal
138
+ * Private constructor. Factory methods are used since binding initialization is asynchronous.
139
+ * @param {any} handle handle of CompiledGrammar created by binding.
140
+ */
141
+ constructor(handle: any);
142
+ /**
143
+ * Dispose this compiled grammar object.
144
+ */
145
+ dispose(): void;
146
+ /**
147
+ * @returns {Grammar} The grammar used to compile this CompiledGrammar.
148
+ */
149
+ grammar(): Grammar;
150
+ /**
151
+ * @returns {TokenizerInfo} The tokenizer info used to compile this CompiledGrammar.
152
+ */
153
+ tokenizerInfo(): TokenizerInfo;
154
+ }
155
+ export declare class GrammarCompiler {
156
+ handle: any;
157
+ /**
158
+ * @internal
159
+ * Private constructor. Factory methods are used since binding initialization is asynchronous.
160
+ * @param {any} handle handle of GrammarCompiler created by binding.
161
+ */
162
+ private constructor();
163
+ /**
164
+ * Dispose this grammar compiler object.
165
+ */
166
+ dispose(): void;
167
+ /**
168
+ * Construct a GrammarCompiler.
169
+ * @param {TokenizerInfo} tokenizerInfo The tokenizer info that contains the preprocessed vocab.
170
+ * @param {boolean} [cacheEnabled=true] Whether to enable caching. Default is true.
171
+ */
172
+ static createGrammarCompiler(tokenizerInfo: TokenizerInfo, cacheEnabled?: boolean): Promise<GrammarCompiler>;
173
+ /**
174
+ * Get CompiledGrammar from the json schema string. The schema string should be in the
175
+ * format of the schema of a JSON file. We will parse the schema and generate a BNF grammar.
176
+ *
177
+ * @param {string} schema The schema string.
178
+ * @param {number} [indent=2] The number of spaces for indentation. If -1, the grammar will
179
+ * enforce the output to be in one line.
180
+ * @param {[string, string]} [separators] Two separators that will be enforced by the grammar:
181
+ * comma and colon. Examples: (",", ":"), (", ", ": "). If undefined, the default separators will
182
+ * be used: (",", ": ") when the indent is not undefined, and (", ", ": ") otherwise. This follows
183
+ * the convention in Python's json.dumps(). Currently unsupported and will use the default value.
184
+ * @param {boolean} [strictMode=true] Whether to use strict mode. In strict mode, the generated
185
+ * grammar will not allow properties and items that are not specified in the schema. This is
186
+ * equivalent to setting unevaluatedProperties and unevaluatedItems to false.
187
+ * @returns {Promise<CompiledGrammar>} A promise that resolves to the compiled grammar for the specified JSON schema.
188
+ */
189
+ compileJSONSchema(schema: string, indent?: number, separators?: [string, string], strictMode?: boolean): Promise<CompiledGrammar>;
190
+ /**
191
+ * @returns {Promise<CompiledGrammar>} A promise that resolves to the compiled grammar for JSON.
192
+ */
193
+ compileBuiltinJSONGrammar(): Promise<CompiledGrammar>;
194
+ /**
195
+ * Get CompiledGrammar from a Grammar object or an EBNF-formatted string. The grammar will be normalized
196
+ * (simplified) by default.
197
+ * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:
198
+ * 1. Use # as the comment mark
199
+ * 2. Use C-style unicode escape sequence \u01AB, \U000001AB, \xAB
200
+ * 3. A-B (match A and not match B) is not supported yet
201
+ * 4. Lookahead assertion can be added at the end of a rule to speed up matching. E.g.
202
+ * ```
203
+ * root ::= "ab" a [a-z]
204
+ * a ::= "cd" (=[a-z])
205
+ * ```
206
+ * The assertion (=[a-z]) means a must be followed by [a-z].
207
+ * @param {Grammar | string} grammar A Grammar object, or the EBNF grammar string.
208
+ * @param {string} [rootRule="root"] The name of the root rule. Default: "root". Only accepted when `grammar` is a string.
209
+ * @returns {Promise<CompiledGrammar>} A promise that resolves to the compiled grammar for the specified grammar.
210
+ */
211
+ compileGrammar(grammar: Grammar): Promise<CompiledGrammar>;
212
+ compileGrammar(grammar: string, rootRule?: string): Promise<CompiledGrammar>;
213
+ }
214
+ /**
215
+ * A stateful matcher to match tokens to the specified BNF grammar. This class is the core logic
216
+ * of the grammar-guided generation.
217
+ *
218
+ * This class implements the non-deterministic pushdown automaton (NPDA) matching algorithm to
219
+ * match characters to a BNF grammar. It keeps track of the current state of the matching process by
220
+ * maintaining several stacks internally as possible paths in the NPDA. It also supports
221
+ * backtracking.
222
+ *
223
+ * It is particularly capable of finding the set of tokens that are acceptable for the next step
224
+ * and storing them in a bitmask. This aids in grammar-guided generation.
225
+ */
226
+ export declare class GrammarMatcher {
227
+ private handle;
228
+ private vocab_size;
229
+ /**
230
+ * @internal
231
+ * Private constructor. Factory methods are used since binding initialization is asynchronous.
232
+ * @param {any} handle handle of GrammarMatcher created by binding.
233
+ */
234
+ private constructor();
235
+ /**
236
+ * Dispose this grammar state matcher.
237
+ */
238
+ dispose(): void;
239
+ /**
240
+ * Construct a GrammarMatcher.
241
+ * @param {CompiledGrammar} compiledGrammar A compiled grammar from GrammarCompiler.
242
+ * @param {number[] | number} [overrideStopTokens=undefined] Stop tokens to override the default ones.
243
+ * @param {boolean} [terminateWithoutStopToken=false] Whether to terminate without stop token.
244
+ * @param {number} [maxRollbackTokens=0] Max rollback tokens.
245
+ * @returns {Promise<GrammarMatcher>} A promise that resolves to the constructed GrammarMatcher.
246
+ */
247
+ static createGrammarMatcher(compiledGrammar: CompiledGrammar, overrideStopTokens?: number[] | number, terminateWithoutStopToken?: boolean, maxRollbackTokens?: number): Promise<GrammarMatcher>;
248
+ /**
249
+ * Get the maximum number of rollback tokens allowed.
250
+ */
251
+ getMaxRollbackTokens(): number;
252
+ /**
253
+ * Accept one token and update the state of the matcher.
254
+ * @param {number} tokenID The id of the token to accept.
255
+ * @param {boolean} [verbose=false] To print debugging info
256
+ * @returns {boolean} Whether the token is accepted.
257
+ */
258
+ acceptToken(tokenID: number, verbose?: boolean): boolean;
259
+ /**
260
+ * Accept one unicode codepoint to the current state. For test purposes.
261
+ * @param {string} inputStr The unicode codepoint of the character to be accepted.
262
+ * @param {boolean} [verbose=false] To print debugging info
263
+ * @returns {boolean} Whether the input string is accepted.
264
+ */
265
+ _debugAcceptString(inputStr: string, verbose?: boolean): boolean;
266
+ /**
267
+ * Returns a bitmask in the form of an Int32Array of length ceildiv(vocab_size, 32)
268
+ * based on what tokens can/cannot be accepted by the current state of the grammar state matcher.
269
+ *
270
+ * @returns {Promise<Int32Array>} A promise that resolves to an array representing the bitmask that masks the rejected token IDs
271
+ */
272
+ getNextTokenBitmask(): Promise<Int32Array>;
273
+ /**
274
+ * Check if the matcher has accepted the stop token and terminated. See also
275
+ * GrammarMatcher.acceptToken.
276
+ */
277
+ isTerminated(): boolean;
278
+ /**
279
+ * Reset the matcher to the initial state.
280
+ */
281
+ reset(): void;
282
+ /**
283
+ * Find the jump-forward string for jump-forward decoding. This is the longest string that
284
+ * will be valid according to the current syntax.
285
+ * @returns {string} The jump-forward string.
286
+ */
287
+ findJumpForwardString(): string;
288
+ /**
289
+ * Rollback the matcher to a previous state.
290
+ * @param {number} numTokens The number of tokens to rollback. It cannot exceed the current
291
+ * number of steps, nor can it exceed the specified maximum number of rollback tokens.
292
+ */
293
+ rollBack(numTokens: number): void;
294
+ }
295
+ //# sourceMappingURL=xgrammar.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"xgrammar.d.ts","sourceRoot":"","sources":["../src/xgrammar.ts"],"names":[],"mappings":"AAUA;;GAEG;AACH,qBAAa,QAAQ;IACnB;;;;;;;;;;;;;;OAcG;WACU,iBAAiB,CAC5B,MAAM,EAAE,MAAM,EACd,MAAM,SAAI,EACV,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAC7B,UAAU,UAAO,GAChB,OAAO,CAAC,MAAM,CAAC;IAiBlB;;;;;;;OAOG;WACU,+BAA+B,CAC1C,OAAO,EAAE,UAAU,EACnB,SAAS,EAAE,MAAM,EACjB,KAAK,GAAE,MAAU,GAChB,OAAO,CAAC,UAAU,CAAC;CAavB;AAED;;;;;GAKG;AACH,qBAAa,OAAO;IAClB,MAAM,EAAE,GAAG,CAAC;IAEZ;;;;OAIG;gBACS,MAAM,EAAE,GAAG;IAIvB;;OAEG;IACH,OAAO;IAIP;;;;;;;;;;;;;;;;OAgBG;WACU,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC;IAK9E;;;OAGG;WACU,kBAAkB,IAAI,OAAO,CAAC,OAAO,CAAC;IAKnD;;;;;;;;;;;;;;;OAeG;WACU,cAAc,CACzB,MAAM,EAAE,MAAM,EACd,MAAM,SAAI,EACV,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAC7B,UAAU,UAAO,GAChB,OAAO,CAAC,OAAO,CAAC;IAkBnB;;;OAGG;IACH,QAAQ,IAAI,MAAM;CAGnB;AAED;;GAEG;AACH,qBAAa,aAAa;IACxB,MAAM,EAAE,GAAG,CAAC;IAEZ;;;;OAIG;gBACS,MAAM,EAAE,GAAG;IAIvB;;OAEG;IACH,OAAO;IAIP;;OAEG;IACH,YAAY,IAAI,MAAM;IAItB;;OAEG;IACH,qBAAqB,IAAI,GAAG;IAI5B;;;;;;;;;;;;;OAaG;WACU,mBAAmB,CAC9B,YAAY,EAAE,MAAM,EAAE,EACtB,SAAS,EAAE,MAAM,EACjB,0BAA0B,EAAE,OAAO,EACnC,SAAS,CAAC,EAAE,MAAM,EAClB,YAAY,CAAC,EAAE,MAAM,EAAE,GAAG,MAAM,GAC/B,OAAO,CAAC,aAAa,CAAC;CAoB1B;AAED,qBAAa,eAAe;IAC1B,MAAM,EAAE,GAAG,CAAC;IAEZ;;;;OAIG;gBACS,MAAM,EAAE,GAAG;IAIvB;;OAEG;IACH,OAAO;IAIP;;OAEG;IACH,OAAO,IAAI,OAAO;IAIlB;;OAEG;IACH,aAAa,IAAI,aAAa;CAG/B;AAED,qBAAa,eAAe;IAC1B,MAAM,EAAE,GAAG,CAAC;IAEZ;;;;OAIG;IACH,OAAO;IAIP;;OAEG;IACH,OAAO;IAIP;;;;OAIG;WACU,qBAAqB,CAChC,aAAa,EAAE,aAAa,EAC5B,YAAY,GAAE,OAAc,GAC3B,OAAO,CAAC,eAAe,CAAC;IAU3B;;;;;;;;;;;;;;;OAeG;IACG,iBAAiB,CACrB,MAAM,EAAE,MAAM,EACd,MAAM,SAAI,EACV,UAAU,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAC7B,UAAU,UAAO,GAChB,OAAO,CAAC,eAAe,CAAC;IAkB3B;;OAEG;IACG,yBAAyB,IAAI,OAAO,CAAC,eAAe,CAAC;IAK3D;;;;;;;;;;;;;;;;OAgBG;IACG,cAAc,CAAC,OAAO,EAAE,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC;IAC1D,cAAc,CAAC,OAAO,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,eAAe,CAAC;CAUnF;AAED;;;;;;;;;;;GAWG
;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,MAAM,CAAM;IACpB,OAAO,CAAC,UAAU,CAAS;IAE3B;;;;OAIG;IACH,OAAO;IAKP;;OAEG;IACH,OAAO;IAIP;;;;;;;OAOG;WACU,oBAAoB,CAC/B,eAAe,EAAE,eAAe,EAChC,kBAAkB,CAAC,EAAE,MAAM,EAAE,GAAG,MAAM,EACtC,yBAAyB,GAAE,OAAe,EAC1C,iBAAiB,GAAE,MAAU,GAC5B,OAAO,CAAC,cAAc,CAAC;IAiB1B;;OAEG;IACH,oBAAoB,IAAI,MAAM;IAI9B;;;;;OAKG;IACH,WAAW,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,GAAE,OAAe,GAAG,OAAO;IAI/D;;;;;OAKG;IACH,kBAAkB,CAAC,QAAQ,EAAE,MAAM,EAAE,OAAO,GAAE,OAAe,GAAG,OAAO;IAIvE;;;;;OAKG;IACG,mBAAmB,IAAI,OAAO,CAAC,UAAU,CAAC;IAShD;;;OAGG;IACH,YAAY,IAAI,OAAO;IAIvB;;OAEG;IACH,KAAK,IAAI,IAAI;IAIb;;;;OAIG;IACH,qBAAqB,IAAI,MAAM;IAI/B;;;;OAIG;IACH,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,IAAI;CAGlC"}
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@mlc-ai/web-xgrammar",
3
+ "version": "0.1.0",
4
+ "description": "",
5
+ "main": "lib/index.js",
6
+ "types": "lib/index.d.ts",
7
+ "type": "module",
8
+ "scripts": {
9
+ "build": "./build.sh; rollup -c",
10
+ "lint": "npx eslint .",
11
+ "test": "./run_test.sh"
12
+ },
13
+ "files": [
14
+ "lib"
15
+ ],
16
+ "repository": {
17
+ "type": "git",
18
+ "url": "git+https://github.com/mlc-ai/xgrammar"
19
+ },
20
+ "keywords": [
21
+ "machine_learning",
22
+ "llm",
23
+ "nlp"
24
+ ],
25
+ "license": "Apache-2.0",
26
+ "homepage": "https://github.com/mlc-ai/xgrammar/tree/main/web",
27
+ "devDependencies": {
28
+ "@jest/globals": "^29.7.0",
29
+ "@rollup/plugin-commonjs": "^20.0.0",
30
+ "@rollup/plugin-node-resolve": "^13.0.4",
31
+ "@rollup/plugin-wasm": "^5.1.2",
32
+ "@types/jest": "^29.5.12",
33
+ "@typescript-eslint/eslint-plugin": "^5.59.6",
34
+ "@typescript-eslint/parser": "^5.59.6",
35
+ "eslint": "^8.41.0",
36
+ "jest": "^29.7.0",
37
+ "rollup": "^2.56.2",
38
+ "rollup-plugin-typescript2": "^0.34.1",
39
+ "ts-jest": "^29.2.5",
40
+ "tslib": "^2.3.1",
41
+ "typescript": "^4.9.5",
42
+ "@mlc-ai/web-tokenizers": "^0.1.5"
43
+ }
44
+ }