@fuzdev/fuz_code 0.37.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +25 -0
- package/README.md +185 -0
- package/dist/Code.svelte +146 -0
- package/dist/Code.svelte.d.ts +79 -0
- package/dist/Code.svelte.d.ts.map +1 -0
- package/dist/CodeHighlight.svelte +205 -0
- package/dist/CodeHighlight.svelte.d.ts +101 -0
- package/dist/CodeHighlight.svelte.d.ts.map +1 -0
- package/dist/code_sample.d.ts +8 -0
- package/dist/code_sample.d.ts.map +1 -0
- package/dist/code_sample.js +2 -0
- package/dist/grammar_clike.d.ts +12 -0
- package/dist/grammar_clike.d.ts.map +1 -0
- package/dist/grammar_clike.js +43 -0
- package/dist/grammar_css.d.ts +11 -0
- package/dist/grammar_css.d.ts.map +1 -0
- package/dist/grammar_css.js +70 -0
- package/dist/grammar_js.d.ts +11 -0
- package/dist/grammar_js.d.ts.map +1 -0
- package/dist/grammar_js.js +180 -0
- package/dist/grammar_json.d.ts +11 -0
- package/dist/grammar_json.d.ts.map +1 -0
- package/dist/grammar_json.js +35 -0
- package/dist/grammar_markdown.d.ts +8 -0
- package/dist/grammar_markdown.d.ts.map +1 -0
- package/dist/grammar_markdown.js +228 -0
- package/dist/grammar_markup.d.ts +31 -0
- package/dist/grammar_markup.d.ts.map +1 -0
- package/dist/grammar_markup.js +192 -0
- package/dist/grammar_svelte.d.ts +12 -0
- package/dist/grammar_svelte.d.ts.map +1 -0
- package/dist/grammar_svelte.js +150 -0
- package/dist/grammar_ts.d.ts +11 -0
- package/dist/grammar_ts.d.ts.map +1 -0
- package/dist/grammar_ts.js +95 -0
- package/dist/highlight_manager.d.ts +25 -0
- package/dist/highlight_manager.d.ts.map +1 -0
- package/dist/highlight_manager.js +139 -0
- package/dist/highlight_priorities.d.ts +3 -0
- package/dist/highlight_priorities.d.ts.map +1 -0
- package/dist/highlight_priorities.gen.d.ts +4 -0
- package/dist/highlight_priorities.gen.d.ts.map +1 -0
- package/dist/highlight_priorities.gen.js +58 -0
- package/dist/highlight_priorities.js +55 -0
- package/dist/syntax_styler.d.ts +277 -0
- package/dist/syntax_styler.d.ts.map +1 -0
- package/dist/syntax_styler.js +426 -0
- package/dist/syntax_styler_global.d.ts +3 -0
- package/dist/syntax_styler_global.d.ts.map +1 -0
- package/dist/syntax_styler_global.js +18 -0
- package/dist/syntax_token.d.ts +34 -0
- package/dist/syntax_token.d.ts.map +1 -0
- package/dist/syntax_token.js +27 -0
- package/dist/theme.css +98 -0
- package/dist/theme_highlight.css +160 -0
- package/dist/theme_variables.css +20 -0
- package/dist/tokenize_syntax.d.ts +28 -0
- package/dist/tokenize_syntax.d.ts.map +1 -0
- package/dist/tokenize_syntax.js +194 -0
- package/package.json +117 -0
- package/src/lib/code_sample.ts +10 -0
- package/src/lib/grammar_clike.ts +48 -0
- package/src/lib/grammar_css.ts +84 -0
- package/src/lib/grammar_js.ts +215 -0
- package/src/lib/grammar_json.ts +38 -0
- package/src/lib/grammar_markdown.ts +289 -0
- package/src/lib/grammar_markup.ts +225 -0
- package/src/lib/grammar_svelte.ts +165 -0
- package/src/lib/grammar_ts.ts +114 -0
- package/src/lib/highlight_manager.ts +182 -0
- package/src/lib/highlight_priorities.gen.ts +71 -0
- package/src/lib/highlight_priorities.ts +110 -0
- package/src/lib/syntax_styler.ts +583 -0
- package/src/lib/syntax_styler_global.ts +20 -0
- package/src/lib/syntax_token.ts +49 -0
- package/src/lib/tokenize_syntax.ts +270 -0
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
import {SyntaxToken, type SyntaxTokenStream} from './syntax_token.js';
|
|
2
|
+
import {tokenize_syntax} from './tokenize_syntax.js';
|
|
3
|
+
|
|
4
|
+
/**
 * A registration function that adds one language's grammar to the given
 * `SyntaxStyler` (see the `add_grammar_*` modules for implementations).
 */
export type AddSyntaxGrammar = (syntax_styler: SyntaxStyler) => void;
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Based on Prism (https://github.com/PrismJS/prism)
|
|
8
|
+
* by Lea Verou (https://lea.verou.me/)
|
|
9
|
+
*
|
|
10
|
+
* MIT license
|
|
11
|
+
*
|
|
12
|
+
* @see LICENSE
|
|
13
|
+
*/
|
|
14
|
+
export class SyntaxStyler {
|
|
15
|
+
langs: Record<string, SyntaxGrammar | undefined> = {
|
|
16
|
+
plaintext: {},
|
|
17
|
+
};
|
|
18
|
+
|
|
19
|
+
// constructor() {
|
|
20
|
+
// TODO this API? problem is the grammars rely on mutating existing grammars in the `syntax_styler`,
|
|
21
|
+
// so for now adding grammars will remain inherently stateful
|
|
22
|
+
// export interface SyntaxStylerOptions {
|
|
23
|
+
// grammars?: AddGrammar[];
|
|
24
|
+
// }
|
|
25
|
+
// options: SyntaxStylerOptions = {}
|
|
26
|
+
// const {grammars} = options;
|
|
27
|
+
// if (grammars) {
|
|
28
|
+
// for (const add_grammar of grammars) {
|
|
29
|
+
// this.langs[id] =
|
|
30
|
+
// add_grammar(this);
|
|
31
|
+
// }
|
|
32
|
+
// }
|
|
33
|
+
// }
|
|
34
|
+
|
|
35
|
+
add_lang(id: string, grammar: SyntaxGrammarRaw, aliases?: Array<string>): void {
|
|
36
|
+
// Normalize grammar once at registration for optimal runtime performance
|
|
37
|
+
// Use a visited set to handle circular references
|
|
38
|
+
this.normalize_grammar(grammar, new Set());
|
|
39
|
+
// After normalization, grammar has the shape of SyntaxGrammar
|
|
40
|
+
const normalized = grammar as unknown as SyntaxGrammar;
|
|
41
|
+
this.langs[id] = normalized;
|
|
42
|
+
if (aliases !== undefined) {
|
|
43
|
+
for (var alias of aliases) {
|
|
44
|
+
this.langs[alias] = normalized;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
add_extended_lang(
|
|
50
|
+
base_id: string,
|
|
51
|
+
extension_id: string,
|
|
52
|
+
extension: SyntaxGrammarRaw,
|
|
53
|
+
aliases?: Array<string>,
|
|
54
|
+
): SyntaxGrammar {
|
|
55
|
+
// extend_grammar returns already normalized grammar
|
|
56
|
+
var grammar = this.extend_grammar(base_id, extension);
|
|
57
|
+
// Store the normalized grammar directly
|
|
58
|
+
this.langs[extension_id] = grammar;
|
|
59
|
+
if (aliases !== undefined) {
|
|
60
|
+
for (var alias of aliases) {
|
|
61
|
+
this.langs[alias] = grammar;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return grammar;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
get_lang(id: string): SyntaxGrammar {
|
|
68
|
+
var lang = this.langs[id];
|
|
69
|
+
if (lang === undefined) {
|
|
70
|
+
throw Error(`The language "${id}" has no grammar.`);
|
|
71
|
+
}
|
|
72
|
+
return lang;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
/**
|
|
76
|
+
* Generates HTML with syntax highlighting from source code.
|
|
77
|
+
*
|
|
78
|
+
* **Process:**
|
|
79
|
+
* 1. Runs `before_tokenize` hook
|
|
80
|
+
* 2. Tokenizes code using the provided or looked-up grammar
|
|
81
|
+
* 3. Runs `after_tokenize` hook
|
|
82
|
+
* 4. Runs `wrap` hook on each token
|
|
83
|
+
* 5. Converts tokens to HTML with CSS classes
|
|
84
|
+
*
|
|
85
|
+
* **Parameter Relationship:**
|
|
86
|
+
* - `lang` is ALWAYS required for hook context and identification
|
|
87
|
+
* - `grammar` is optional; when undefined, automatically looks up via `this.get_lang(lang)`
|
|
88
|
+
* - When both are provided, `grammar` is used for tokenization, `lang` for metadata
|
|
89
|
+
*
|
|
90
|
+
* **Use cases:**
|
|
91
|
+
* - Standard usage: `stylize(code, 'ts')` - uses registered TypeScript grammar
|
|
92
|
+
* - Custom grammar: `stylize(code, 'ts', customGrammar)` - uses custom grammar but keeps 'ts' label
|
|
93
|
+
* - Extended grammar: `stylize(code, 'custom', this.extend_grammar('ts', extension))` - new language variant
|
|
94
|
+
*
|
|
95
|
+
* @param text - The source code to syntax highlight.
|
|
96
|
+
* @param lang - Language identifier (e.g., 'ts', 'css', 'html'). Used for:
|
|
97
|
+
* - Grammar lookup when `grammar` is undefined
|
|
98
|
+
* - Hook context (`lang` field passed to hooks)
|
|
99
|
+
* - Language identification in output
|
|
100
|
+
* @param grammar - Optional custom grammar object. When undefined, automatically
|
|
101
|
+
* looks up the grammar via `this.get_lang(lang)`. Provide this to use a custom
|
|
102
|
+
* or modified grammar instead of the registered one.
|
|
103
|
+
*
|
|
104
|
+
* @returns HTML string with syntax highlighting using CSS classes (`.token_*`)
|
|
105
|
+
*
|
|
106
|
+
* @example
|
|
107
|
+
* // Standard usage - uses registered grammar
|
|
108
|
+
* stylize('var foo = true;', 'ts');
|
|
109
|
+
*
|
|
110
|
+
* @example
|
|
111
|
+
* // Custom grammar - overrides registered grammar
|
|
112
|
+
* const customGrammar = { keyword: [...], string: [...] };
|
|
113
|
+
* stylize('var foo = false;', 'ts', customGrammar);
|
|
114
|
+
*
|
|
115
|
+
* @example
|
|
116
|
+
* // Extended grammar - builds on existing grammar
|
|
117
|
+
* const extended = this.extend_grammar('ts', { customToken: [...] });
|
|
118
|
+
* stylize('var foo = 42;', 'ts-extended', extended);
|
|
119
|
+
*/
|
|
120
|
+
stylize(
|
|
121
|
+
text: string,
|
|
122
|
+
lang: string,
|
|
123
|
+
grammar: SyntaxGrammar | undefined = this.get_lang(lang),
|
|
124
|
+
): string {
|
|
125
|
+
var ctx: HookBeforeTokenizeCallbackContext = {
|
|
126
|
+
code: text,
|
|
127
|
+
grammar,
|
|
128
|
+
lang,
|
|
129
|
+
tokens: undefined,
|
|
130
|
+
};
|
|
131
|
+
this.run_hook_before_tokenize(ctx);
|
|
132
|
+
const c = ctx as any as HookAfterTokenizeCallbackContext;
|
|
133
|
+
c.tokens = tokenize_syntax(c.code, c.grammar);
|
|
134
|
+
this.run_hook_after_tokenize(c);
|
|
135
|
+
return this.stringify_token(c.tokens, c.lang);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Inserts tokens _before_ another token in a language definition or any other grammar.
|
|
140
|
+
*
|
|
141
|
+
* ## Usage
|
|
142
|
+
*
|
|
143
|
+
* This helper method makes it easy to modify existing languages. For example, the CSS language definition
|
|
144
|
+
* not only defines CSS styling for CSS documents, but also needs to define styling for CSS embedded
|
|
145
|
+
* in HTML through `<style>` elements. To do this, it needs to modify `syntax_styler.get_lang('markup')` and add the
|
|
146
|
+
* appropriate tokens. However, `syntax_styler.get_lang('markup')` is a regular JS object literal, so if you do
|
|
147
|
+
* this:
|
|
148
|
+
*
|
|
149
|
+
* ```js
|
|
150
|
+
* syntax_styler.get_lang('markup').style = {
|
|
151
|
+
* // token
|
|
152
|
+
* };
|
|
153
|
+
* ```
|
|
154
|
+
*
|
|
155
|
+
* then the `style` token will be added (and processed) at the end. `insert_before` allows you to insert tokens
|
|
156
|
+
* before existing tokens. For the CSS example above, you would use it like this:
|
|
157
|
+
*
|
|
158
|
+
* ```js
|
|
159
|
+
* grammar_insert_before('markup', 'cdata', {
|
|
160
|
+
* 'style': {
|
|
161
|
+
* // token
|
|
162
|
+
* }
|
|
163
|
+
* });
|
|
164
|
+
* ```
|
|
165
|
+
*
|
|
166
|
+
* ## Special cases
|
|
167
|
+
*
|
|
168
|
+
* If the grammars of `inside` and `insert` have tokens with the same name, the tokens in `inside`'s grammar
|
|
169
|
+
* will be ignored.
|
|
170
|
+
*
|
|
171
|
+
* This behavior can be used to insert tokens after `before`:
|
|
172
|
+
*
|
|
173
|
+
* ```js
|
|
174
|
+
* grammar_insert_before('markup', 'comment', {
|
|
175
|
+
* 'comment': syntax_styler.get_lang('markup').comment,
|
|
176
|
+
* // tokens after 'comment'
|
|
177
|
+
* });
|
|
178
|
+
* ```
|
|
179
|
+
*
|
|
180
|
+
* ## Limitations
|
|
181
|
+
*
|
|
182
|
+
* The main problem `insert_before` has to solve is iteration order. Since ES2015, the iteration order for object
|
|
183
|
+
* properties is guaranteed to be the insertion order (except for integer keys) but some browsers behave
|
|
184
|
+
* differently when keys are deleted and re-inserted. So `insert_before` can't be implemented by temporarily
|
|
185
|
+
* deleting properties which is necessary to insert at arbitrary positions.
|
|
186
|
+
*
|
|
187
|
+
* To solve this problem, `insert_before` doesn't actually insert the given tokens into the target object.
|
|
188
|
+
* Instead, it will create a new object and replace all references to the target object with the new one. This
|
|
189
|
+
* can be done without temporarily deleting properties, so the iteration order is well-defined.
|
|
190
|
+
*
|
|
191
|
+
* However, only references that can be reached from `syntax_styler.langs` or `insert` will be replaced. I.e. if
|
|
192
|
+
* you hold the target object in a variable, then the value of the variable will not change.
|
|
193
|
+
*
|
|
194
|
+
* ```js
|
|
195
|
+
* var oldMarkup = syntax_styler.get_lang('markup');
|
|
196
|
+
* var newMarkup = grammar_insert_before('markup', 'comment', { ... });
|
|
197
|
+
*
|
|
198
|
+
* assert(oldMarkup !== syntax_styler.get_lang('markup'));
|
|
199
|
+
* assert(newMarkup === syntax_styler.get_lang('markup'));
|
|
200
|
+
* ```
|
|
201
|
+
*
|
|
202
|
+
* @param inside - The property of `root` (e.g. a language id in `syntax_styler.langs`) that contains the
|
|
203
|
+
* object to be modified.
|
|
204
|
+
* @param before - The key to insert before.
|
|
205
|
+
* @param insert - An object containing the key-value pairs to be inserted.
|
|
206
|
+
* @param root - The object containing `inside`, i.e. the object that contains the
|
|
207
|
+
* object to be modified.
|
|
208
|
+
*
|
|
209
|
+
* Defaults to `syntax_styler.langs`.
|
|
210
|
+
*
|
|
211
|
+
* @returns the new grammar object
|
|
212
|
+
*/
|
|
213
|
+
grammar_insert_before(
|
|
214
|
+
inside: string,
|
|
215
|
+
before: string,
|
|
216
|
+
insert: SyntaxGrammarRaw,
|
|
217
|
+
root: Record<string, any> = this.langs,
|
|
218
|
+
): SyntaxGrammar {
|
|
219
|
+
var grammar = root[inside];
|
|
220
|
+
var updated: SyntaxGrammarRaw = {};
|
|
221
|
+
|
|
222
|
+
for (var token in grammar) {
|
|
223
|
+
if (token === before) {
|
|
224
|
+
for (var new_token in insert) {
|
|
225
|
+
updated[new_token] = insert[new_token];
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// Do not insert tokens which also occur in insert.
|
|
230
|
+
if (!Object.hasOwn(insert, token)) {
|
|
231
|
+
updated[token] = grammar[token];
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// Normalize the updated grammar to ensure inserted patterns have consistent shape
|
|
236
|
+
this.normalize_grammar(updated, new Set());
|
|
237
|
+
|
|
238
|
+
// After normalization, cast to SyntaxGrammar
|
|
239
|
+
const normalized = updated as unknown as SyntaxGrammar;
|
|
240
|
+
var old = root[inside];
|
|
241
|
+
root[inside] = normalized;
|
|
242
|
+
|
|
243
|
+
// Update references in other language definitions
|
|
244
|
+
depth_first_search(this.langs, (o, key, value) => {
|
|
245
|
+
if (value === old && key !== inside) {
|
|
246
|
+
o[key] = normalized;
|
|
247
|
+
}
|
|
248
|
+
});
|
|
249
|
+
|
|
250
|
+
return normalized;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Converts the given token or token stream to an HTML representation.
|
|
255
|
+
*
|
|
256
|
+
* Runs the `wrap` hook on each `SyntaxToken`.
|
|
257
|
+
*
|
|
258
|
+
* @param o - The token or token stream to be converted.
|
|
259
|
+
* @param lang - The name of current language.
|
|
260
|
+
* @returns The HTML representation of the token or token stream.
|
|
261
|
+
*/
|
|
262
|
+
stringify_token(o: string | SyntaxToken | SyntaxTokenStream, lang: string): string {
|
|
263
|
+
if (typeof o === 'string') {
|
|
264
|
+
return o
|
|
265
|
+
.replace(/&/g, '&')
|
|
266
|
+
.replace(/</g, '<')
|
|
267
|
+
.replace(/\u00a0/g, ' ');
|
|
268
|
+
}
|
|
269
|
+
if (Array.isArray(o)) {
|
|
270
|
+
var s = '';
|
|
271
|
+
for (var e of o) {
|
|
272
|
+
s += this.stringify_token(e, lang);
|
|
273
|
+
}
|
|
274
|
+
return s;
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
var ctx: HookWrapCallbackContext = {
|
|
278
|
+
type: o.type,
|
|
279
|
+
content: this.stringify_token(o.content, lang),
|
|
280
|
+
tag: 'span',
|
|
281
|
+
classes: [`token_${o.type}`],
|
|
282
|
+
attributes: {},
|
|
283
|
+
lang,
|
|
284
|
+
};
|
|
285
|
+
|
|
286
|
+
var aliases = o.alias;
|
|
287
|
+
// alias is always an array after normalization
|
|
288
|
+
for (const a of aliases) {
|
|
289
|
+
ctx.classes.push(`token_${a}`);
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
this.run_hook_wrap(ctx);
|
|
293
|
+
|
|
294
|
+
var attributes = '';
|
|
295
|
+
for (var name in ctx.attributes) {
|
|
296
|
+
attributes += ' ' + name + '="' + (ctx.attributes[name] || '').replace(/"/g, '"') + '"';
|
|
297
|
+
}
|
|
298
|
+
|
|
299
|
+
return (
|
|
300
|
+
'<' +
|
|
301
|
+
ctx.tag +
|
|
302
|
+
' class="' +
|
|
303
|
+
ctx.classes.join(' ') +
|
|
304
|
+
'"' +
|
|
305
|
+
attributes +
|
|
306
|
+
'>' +
|
|
307
|
+
ctx.content +
|
|
308
|
+
'</' +
|
|
309
|
+
ctx.tag +
|
|
310
|
+
'>'
|
|
311
|
+
);
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
/**
|
|
315
|
+
* Creates a deep copy of the language with the given id and appends the given tokens.
|
|
316
|
+
*
|
|
317
|
+
* If a token in `extension` also appears in the copied language, then the existing token in the copied language
|
|
318
|
+
* will be overwritten at its original position.
|
|
319
|
+
*
|
|
320
|
+
* ## Best practices
|
|
321
|
+
*
|
|
322
|
+
* Since the position of overwriting tokens (token in `extension` that overwrite tokens in the copied language)
|
|
323
|
+
* doesn't matter, they can technically be in any order. However, this can be confusing to others that trying to
|
|
324
|
+
* understand the language definition because, normally, the order of tokens matters in the grammars.
|
|
325
|
+
*
|
|
326
|
+
* Therefore, it is encouraged to order overwriting tokens according to the positions of the overwritten tokens.
|
|
327
|
+
* Furthermore, all non-overwriting tokens should be placed after the overwriting ones.
|
|
328
|
+
*
|
|
329
|
+
* @param base_id - The id of the language to extend. This has to be a key in `syntax_styler.langs`.
|
|
330
|
+
* @param extension - The new tokens to append.
|
|
331
|
+
* @returns the new grammar
|
|
332
|
+
*/
|
|
333
|
+
extend_grammar(base_id: string, extension: SyntaxGrammarRaw): SyntaxGrammar {
|
|
334
|
+
// Merge normalized base with un-normalized extension
|
|
335
|
+
const extended = {...structuredClone(this.get_lang(base_id)), ...extension};
|
|
336
|
+
// Normalize the extension parts
|
|
337
|
+
this.normalize_grammar(extended as SyntaxGrammarRaw, new Set());
|
|
338
|
+
// Return as SyntaxGrammar
|
|
339
|
+
return extended as unknown as SyntaxGrammar;
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/**
|
|
343
|
+
* Normalize a single pattern to have consistent shape.
|
|
344
|
+
* This ensures all patterns have the same object shape for V8 optimization.
|
|
345
|
+
*/
|
|
346
|
+
private normalize_pattern(
|
|
347
|
+
pattern: RegExp | SyntaxGrammarTokenRaw,
|
|
348
|
+
visited: Set<number>,
|
|
349
|
+
): SyntaxGrammarToken {
|
|
350
|
+
const p = pattern instanceof RegExp ? {pattern} : pattern;
|
|
351
|
+
|
|
352
|
+
let regex = p.pattern;
|
|
353
|
+
|
|
354
|
+
// Add global flag if greedy and not already present
|
|
355
|
+
if ((p.greedy ?? false) && !regex.global) {
|
|
356
|
+
const flags = regex.flags;
|
|
357
|
+
regex = new RegExp(regex.source, flags.includes('g') ? flags : flags + 'g');
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
// Normalize alias to always be an array
|
|
361
|
+
let normalized_alias: Array<string> = [];
|
|
362
|
+
if (p.alias) {
|
|
363
|
+
normalized_alias = Array.isArray(p.alias) ? p.alias : [p.alias];
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
// Recursively normalize the inside grammar if present
|
|
367
|
+
let normalized_inside: SyntaxGrammar | null = null;
|
|
368
|
+
if (p.inside) {
|
|
369
|
+
this.normalize_grammar(p.inside, visited);
|
|
370
|
+
// After normalization, cast to SyntaxGrammar
|
|
371
|
+
normalized_inside = p.inside as unknown as SyntaxGrammar;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
return {
|
|
375
|
+
pattern: regex,
|
|
376
|
+
lookbehind: p.lookbehind ?? false,
|
|
377
|
+
greedy: p.greedy ?? false,
|
|
378
|
+
alias: normalized_alias,
|
|
379
|
+
inside: normalized_inside,
|
|
380
|
+
};
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/**
|
|
384
|
+
* Normalize a grammar to have consistent object shapes.
|
|
385
|
+
* This performs several optimizations:
|
|
386
|
+
* 1. Merges `rest` property into main grammar
|
|
387
|
+
* 2. Ensures all pattern values are arrays
|
|
388
|
+
* 3. Normalizes all pattern objects to have consistent shapes
|
|
389
|
+
* 4. Adds global flag to greedy patterns
|
|
390
|
+
*
|
|
391
|
+
* This is called once at registration time to avoid runtime overhead.
|
|
392
|
+
* @param visited - Set of grammar object IDs already normalized (for circular references)
|
|
393
|
+
*/
|
|
394
|
+
private normalize_grammar(grammar: SyntaxGrammarRaw, visited: Set<number>): void {
|
|
395
|
+
// Check if we've already normalized this grammar (circular reference)
|
|
396
|
+
const grammar_id = id_of(grammar);
|
|
397
|
+
if (visited.has(grammar_id)) {
|
|
398
|
+
return;
|
|
399
|
+
}
|
|
400
|
+
visited.add(grammar_id);
|
|
401
|
+
|
|
402
|
+
// Step 1: Merge rest into grammar first
|
|
403
|
+
if (grammar.rest) {
|
|
404
|
+
for (const token in grammar.rest) {
|
|
405
|
+
if (!grammar[token]) {
|
|
406
|
+
// Don't overwrite existing tokens
|
|
407
|
+
grammar[token] = grammar.rest[token];
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
delete grammar.rest;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
// Step 2: Normalize all patterns
|
|
414
|
+
for (const key in grammar) {
|
|
415
|
+
if (key === 'rest') continue;
|
|
416
|
+
|
|
417
|
+
const value = grammar[key];
|
|
418
|
+
if (!value) {
|
|
419
|
+
grammar[key] = [];
|
|
420
|
+
continue;
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
// Always store as array of normalized patterns
|
|
424
|
+
const patterns = Array.isArray(value) ? value : [value];
|
|
425
|
+
grammar[key] = patterns.map((p) => this.normalize_pattern(p, visited));
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
// TODO add some builtins
|
|
430
|
+
plugins: Record<string, any> = {};
|
|
431
|
+
|
|
432
|
+
// TODO maybe extend/compose an event listener?
|
|
433
|
+
hooks_before_tokenize: Array<HookBeforeTokenizeCallback> = [];
|
|
434
|
+
hooks_after_tokenize: Array<HookAfterTokenizeCallback> = [];
|
|
435
|
+
hooks_wrap: Array<HookWrapCallback> = [];
|
|
436
|
+
|
|
437
|
+
add_hook_before_tokenize(cb: HookBeforeTokenizeCallback): void {
|
|
438
|
+
this.hooks_before_tokenize.push(cb);
|
|
439
|
+
}
|
|
440
|
+
add_hook_after_tokenize(cb: HookAfterTokenizeCallback): void {
|
|
441
|
+
this.hooks_after_tokenize.push(cb);
|
|
442
|
+
}
|
|
443
|
+
add_hook_wrap(cb: HookWrapCallback): void {
|
|
444
|
+
this.hooks_wrap.push(cb);
|
|
445
|
+
}
|
|
446
|
+
|
|
447
|
+
run_hook_before_tokenize(ctx: HookBeforeTokenizeCallbackContext): void {
|
|
448
|
+
for (var cb of this.hooks_before_tokenize) {
|
|
449
|
+
cb(ctx);
|
|
450
|
+
}
|
|
451
|
+
}
|
|
452
|
+
run_hook_after_tokenize(ctx: HookAfterTokenizeCallbackContext): void {
|
|
453
|
+
for (var cb of this.hooks_after_tokenize) {
|
|
454
|
+
cb(ctx);
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
run_hook_wrap(ctx: HookWrapCallbackContext): void {
|
|
458
|
+
for (var cb of this.hooks_wrap) {
|
|
459
|
+
cb(ctx);
|
|
460
|
+
}
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
|
|
464
|
+
export type SyntaxGrammarValueRaw =
|
|
465
|
+
| RegExp
|
|
466
|
+
| SyntaxGrammarTokenRaw
|
|
467
|
+
| Array<RegExp | SyntaxGrammarTokenRaw>;
|
|
468
|
+
|
|
469
|
+
export type SyntaxGrammarRaw = Record<string, SyntaxGrammarValueRaw | undefined> & {
|
|
470
|
+
rest?: SyntaxGrammarRaw | undefined;
|
|
471
|
+
};
|
|
472
|
+
|
|
473
|
+
/**
|
|
474
|
+
* The expansion of a simple `RegExp` literal to support additional properties.
|
|
475
|
+
*
|
|
476
|
+
* The `inside` grammar will be used to tokenize the text value of each token of this kind.
|
|
477
|
+
*
|
|
478
|
+
* This can be used to make nested and even recursive language definitions.
|
|
479
|
+
*
|
|
480
|
+
* Note: This can cause infinite recursion. Be careful when you embed different languages or even the same language into
|
|
481
|
+
* each another.
|
|
482
|
+
*
|
|
483
|
+
* Note: Grammar authors can use optional properties, but they will be normalized
|
|
484
|
+
* to required properties at registration time for optimal performance.
|
|
485
|
+
*/
|
|
486
|
+
export interface SyntaxGrammarTokenRaw {
|
|
487
|
+
/**
|
|
488
|
+
* The regular expression of the token.
|
|
489
|
+
*/
|
|
490
|
+
pattern: RegExp;
|
|
491
|
+
/**
|
|
492
|
+
* If `true`, then the first capturing group of `pattern` will (effectively)
|
|
493
|
+
* behave as a lookbehind group meaning that the captured text will not be part of the matched text of the new token.
|
|
494
|
+
* @default false
|
|
495
|
+
*/
|
|
496
|
+
lookbehind?: boolean;
|
|
497
|
+
/**
|
|
498
|
+
* Whether the token is greedy.
|
|
499
|
+
* @default false
|
|
500
|
+
*/
|
|
501
|
+
greedy?: boolean;
|
|
502
|
+
/**
|
|
503
|
+
* An optional alias or list of aliases.
|
|
504
|
+
*/
|
|
505
|
+
alias?: string | Array<string>;
|
|
506
|
+
/**
|
|
507
|
+
* The nested grammar of this token.
|
|
508
|
+
*/
|
|
509
|
+
inside?: SyntaxGrammarRaw | null;
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
/**
|
|
513
|
+
* Grammar token with all properties required.
|
|
514
|
+
* This is the normalized representation used at runtime.
|
|
515
|
+
*/
|
|
516
|
+
export interface SyntaxGrammarToken {
|
|
517
|
+
pattern: RegExp;
|
|
518
|
+
lookbehind: boolean;
|
|
519
|
+
greedy: boolean;
|
|
520
|
+
alias: Array<string>;
|
|
521
|
+
inside: SyntaxGrammar | null;
|
|
522
|
+
}
|
|
523
|
+
|
|
524
|
+
/**
|
|
525
|
+
* A grammar after normalization.
|
|
526
|
+
* All values are arrays of normalized tokens with consistent shapes.
|
|
527
|
+
*/
|
|
528
|
+
export type SyntaxGrammar = Record<string, Array<SyntaxGrammarToken>>;
|
|
529
|
+
|
|
530
|
+
const depth_first_search = (
|
|
531
|
+
o: any,
|
|
532
|
+
cb: (obj: any, key: string, value: any) => void,
|
|
533
|
+
visited: Set<number> = new Set(),
|
|
534
|
+
): void => {
|
|
535
|
+
for (var key in o) {
|
|
536
|
+
cb(o, key, o[key]);
|
|
537
|
+
|
|
538
|
+
var property = o[key];
|
|
539
|
+
|
|
540
|
+
if (
|
|
541
|
+
property &&
|
|
542
|
+
typeof property === 'object' &&
|
|
543
|
+
!(property instanceof RegExp) &&
|
|
544
|
+
!visited.has(id_of(property))
|
|
545
|
+
) {
|
|
546
|
+
visited.add(id_of(property));
|
|
547
|
+
depth_first_search(property, cb, visited);
|
|
548
|
+
}
|
|
549
|
+
}
|
|
550
|
+
};
|
|
551
|
+
|
|
552
|
+
export type HookBeforeTokenizeCallback = (ctx: HookBeforeTokenizeCallbackContext) => void;
|
|
553
|
+
export type HookAfterTokenizeCallback = (ctx: HookAfterTokenizeCallbackContext) => void;
|
|
554
|
+
export type HookWrapCallback = (ctx: HookWrapCallbackContext) => void;
|
|
555
|
+
|
|
556
|
+
export interface HookBeforeTokenizeCallbackContext {
|
|
557
|
+
code: string;
|
|
558
|
+
grammar: SyntaxGrammar;
|
|
559
|
+
lang: string;
|
|
560
|
+
tokens: undefined;
|
|
561
|
+
}
|
|
562
|
+
export interface HookAfterTokenizeCallbackContext {
|
|
563
|
+
code: string;
|
|
564
|
+
grammar: SyntaxGrammar;
|
|
565
|
+
lang: string;
|
|
566
|
+
tokens: SyntaxTokenStream;
|
|
567
|
+
}
|
|
568
|
+
export interface HookWrapCallbackContext {
|
|
569
|
+
type: string;
|
|
570
|
+
content: string;
|
|
571
|
+
tag: string;
|
|
572
|
+
classes: Array<string>;
|
|
573
|
+
attributes: Record<string, string>;
|
|
574
|
+
lang: string;
|
|
575
|
+
}
|
|
576
|
+
|
|
577
|
+
var unique_id = 0;
|
|
578
|
+
|
|
579
|
+
/**
|
|
580
|
+
* Returns a unique number for the given object. Later calls will still return the same number.
|
|
581
|
+
*/
|
|
582
|
+
const ID = Symbol('id');
|
|
583
|
+
const id_of = (obj: any): number => (obj[ID] ??= ++unique_id);
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import {SyntaxStyler} from './syntax_styler.js';
|
|
2
|
+
import {add_grammar_markup} from './grammar_markup.js';
|
|
3
|
+
import {add_grammar_css} from './grammar_css.js';
|
|
4
|
+
import {add_grammar_clike} from './grammar_clike.js';
|
|
5
|
+
import {add_grammar_js} from './grammar_js.js';
|
|
6
|
+
import {add_grammar_ts} from './grammar_ts.js';
|
|
7
|
+
import {add_grammar_svelte} from './grammar_svelte.js';
|
|
8
|
+
import {add_grammar_json} from './grammar_json.js';
|
|
9
|
+
import {add_grammar_markdown} from './grammar_markdown.js';
|
|
10
|
+
|
|
11
|
+
export const syntax_styler_global = new SyntaxStyler();
|
|
12
|
+
|
|
13
|
+
add_grammar_markup(syntax_styler_global);
|
|
14
|
+
add_grammar_css(syntax_styler_global);
|
|
15
|
+
add_grammar_clike(syntax_styler_global);
|
|
16
|
+
add_grammar_js(syntax_styler_global);
|
|
17
|
+
add_grammar_ts(syntax_styler_global);
|
|
18
|
+
add_grammar_svelte(syntax_styler_global);
|
|
19
|
+
add_grammar_json(syntax_styler_global);
|
|
20
|
+
add_grammar_markdown(syntax_styler_global);
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
export class SyntaxToken {
|
|
2
|
+
/**
|
|
3
|
+
* The type of the token.
|
|
4
|
+
*
|
|
5
|
+
* This is usually the key of a pattern in a `Grammar`.
|
|
6
|
+
*/
|
|
7
|
+
type: string;
|
|
8
|
+
|
|
9
|
+
/**
|
|
10
|
+
* The strings or tokens contained by this token.
|
|
11
|
+
*
|
|
12
|
+
* This will be a token stream if the pattern matched also defined an `inside` grammar.
|
|
13
|
+
*/
|
|
14
|
+
content: string | SyntaxTokenStream;
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* The alias(es) of the token.
|
|
18
|
+
* Always an array, even if empty or single value.
|
|
19
|
+
*/
|
|
20
|
+
alias: Array<string>;
|
|
21
|
+
|
|
22
|
+
length: number;
|
|
23
|
+
|
|
24
|
+
constructor(
|
|
25
|
+
type: string,
|
|
26
|
+
content: string | SyntaxTokenStream,
|
|
27
|
+
alias: string | Array<string> | undefined,
|
|
28
|
+
matched_str: string = '',
|
|
29
|
+
) {
|
|
30
|
+
this.type = type;
|
|
31
|
+
this.content = content;
|
|
32
|
+
// Normalize alias to always be an array
|
|
33
|
+
this.alias = alias ? (Array.isArray(alias) ? alias : [alias]) : [];
|
|
34
|
+
this.length = matched_str.length;
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/**
|
|
39
|
+
* A token stream is an array of strings and `SyntaxToken` objects.
|
|
40
|
+
*
|
|
41
|
+
* Syntax token streams have to fulfill a few properties that are assumed by most functions (mostly internal ones) that process
|
|
42
|
+
* them.
|
|
43
|
+
*
|
|
44
|
+
* 1. No adjacent strings.
|
|
45
|
+
* 2. No empty strings.
|
|
46
|
+
*
|
|
47
|
+
* The only exception here is the token stream that only contains the empty string and nothing else.
|
|
48
|
+
*/
|
|
49
|
+
export type SyntaxTokenStream = Array<string | SyntaxToken>;
|