immunio 1.0.4 → 1.0.5

@@ -1,68 +0,0 @@
1
- -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
2
- -- JavaScript LPeg lexer.
3
-
4
- local l = require('lexer')
5
- local token, word_match = l.token, l.word_match
6
- local P, R, S = lpeg.P, lpeg.R, lpeg.S
7
-
8
- local M = {_NAME = 'javascript'}
9
-
10
- -- Whitespace.
11
- local ws = token(l.WHITESPACE, l.space^1)
12
-
13
- -- Comments.
14
- local line_comment = '//' * l.nonnewline_esc^0
15
- local block_comment = '/*' * (l.any - '*/')^0 * P('*/')^-1
16
- local comment = token(l.COMMENT, line_comment + block_comment)
17
-
18
- -- Strings.
19
- local sq_str = l.delimited_range("'")
20
- local dq_str = l.delimited_range('"')
21
- local regex = token( "regex", l.last_char_includes('+-*%^!=&|?:;,([{<>') *
22
- l.delimited_range('/', true) * S('igm')^0 )
23
- local string = token(l.STRING, sq_str + dq_str) --+ token(l.REGEX, regex_str)
24
-
25
- -- Numbers.
26
- local number = token(l.NUMBER, l.float + l.integer)
27
-
28
- -- Keywords.
29
- local keyword = token(l.KEYWORD, word_match{
30
- 'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
31
- 'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
32
- 'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
33
- 'function', 'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
34
- 'interface', 'let', 'long', 'native', 'new', 'null', 'package', 'private',
35
- 'protected', 'public', 'return', 'short', 'static', 'super', 'switch',
36
- 'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
37
- 'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield'
38
- })
39
-
40
- -- Identifiers.
41
- local identifier = token(l.IDENTIFIER, l.word)
42
-
43
- -- Operators.
44
- local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))
45
-
46
- -- Immunio marker
47
- local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
48
-
49
-
50
- M._rules = {
51
- {'whitespace', ws},
52
- {'marker', marker},
53
- {'keyword', keyword},
54
- {'identifier', identifier},
55
- {'comment', comment},
56
- {'number', number},
57
- {'string', string},
58
- {'regex', regex},
59
- {'operator', operator},
60
- }
61
-
62
- M._foldsymbols = {
63
- _patterns = {'[{}]', '/%*', '%*/', '//'},
64
- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
65
- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')}
66
- }
67
-
68
- return M
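
For orientation, here is a minimal, hedged sketch of how this removed lexer could be driven stand-alone through the `lexer` module whose removal follows below. It assumes the `lpeg` module is available as a global (as the lexer files themselves assume) and that both files are reachable via `lexer.LEXERPATH`, which defaults to `package.path`; the sample string and variable names are illustrative only.

    local lexer = require('lexer')
    -- lexer.LEXERPATH defaults to package.path; point it at the directory that
    -- holds javascript.lua and lexer.lua if they are not already on that path.
    local js = lexer.load('javascript')

    -- A small chunk of JavaScript containing an Immunio marker of the shape the
    -- 'marker' token above matches: '{immunio-var:<integer>:<hex digits>}'.
    local sample = 'var x = {immunio-var:0:deadbeef}; // tracked value'

    -- lex() returns a flat table of alternating token names and positions; see
    -- the lex() documentation in the module below.
    local tokens = lexer.lex(js, sample)
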
@@ -1,1575 +0,0 @@
1
- -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
2
-
3
- local M = {}
4
-
5
- --[=[ This comment is for LuaDoc.
6
- ---
7
- -- Lexes Scintilla documents with Lua and LPeg.
8
- --
9
- -- ## Overview
10
- --
11
- -- Lexers highlight the syntax of source code. Scintilla (the editing component
12
- -- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
13
- -- lexers which are notoriously difficult to create and/or extend. On the other
14
- -- hand, Lua makes it easy to rapidly create new lexers, extend existing
15
- -- ones, and embed lexers within one another. Lua lexers tend to be more
16
- -- readable than C++ lexers too.
17
- --
18
- -- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
19
- -- [LPeg library][]. The following table comes from the LPeg documentation and
20
- -- summarizes all you need to know about constructing basic LPeg patterns. This
21
- -- module provides convenience functions for creating and working with other
22
- -- more advanced patterns and concepts.
23
- --
24
- -- Operator | Description
25
- -- ---------------------|------------
26
- -- `lpeg.P(string)` | Matches `string` literally.
27
- -- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters.
28
- -- `lpeg.S(string)` | Matches any character in set `string`.
29
- -- `lpeg.R("`_`xy`_`")` | Matches any character in the range `x` to `y`.
30
- -- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`.
31
- -- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`.
32
- -- `patt1 * patt2` | Matches `patt1` followed by `patt2`.
33
- -- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice).
34
- -- `patt1 - patt2` | Matches `patt1` if `patt2` does not match.
35
- -- `-patt` | Equivalent to `("" - patt)`.
36
- -- `#patt` | Matches `patt` but consumes no input.
37
- --
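
As a quick stand-alone illustration of the operators above (this is ordinary LPeg usage, independent of this module, and the names are illustrative), the following sketch composes a pattern for a C-style identifier:

    local lpeg = require('lpeg')
    local P, R = lpeg.P, lpeg.R

    -- A letter or underscore followed by any number of letters, digits, or
    -- underscores.
    local identifier = (R('az', 'AZ') + P('_')) * (R('az', 'AZ', '09') + P('_'))^0

    print(lpeg.match(identifier, 'foo_bar1'))  --> 9 (one past the end of the match)
    print(lpeg.match(identifier, '1foo'))      --> nil (must not start with a digit)
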
38
- -- The first part of this document deals with rapidly constructing a simple
39
- -- lexer. The next part deals with more advanced techniques, such as custom
40
- -- coloring and embedding lexers within one another. Following that is a
41
- -- discussion about code folding, or being able to tell Scintilla which code
42
- -- blocks are "foldable" (temporarily hideable from view). After that are
43
- -- instructions on how to use LPeg lexers with the aforementioned Textadept and
44
- -- SciTE editors. Finally there are comments on lexer performance and
45
- -- limitations.
46
- --
47
- -- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
48
- -- [Textadept]: http://foicica.com/textadept
49
- -- [SciTE]: http://scintilla.org/SciTE.html
50
- --
51
- -- ## Lexer Basics
52
- --
53
- -- The *lexers/* directory contains all lexers, including your new one. Before
54
- -- attempting to write one from scratch though, first determine if your
55
- -- programming language is similar to any of the 80+ languages supported. If so,
56
- -- you may be able to copy and modify that lexer, saving some time and effort.
57
- -- The filename of your lexer should be the name of your programming language in
58
- -- lower case followed by a *.lua* extension. For example, a new Lua lexer has
59
- -- the name *lua.lua*.
60
- --
61
- -- Note: Try to refrain from using one-character language names like "b", "c",
62
- -- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
63
- -- respectively.
64
- --
65
- -- ### New Lexer Template
66
- --
67
- -- There is a *lexers/template.txt* file that contains a simple template for a
68
- -- new lexer. Feel free to use it, replacing the '?'s with the name of your
69
- -- lexer:
70
- --
71
- -- -- ? LPeg lexer.
72
- --
73
- -- local l = require('lexer')
74
- -- local token, word_match = l.token, l.word_match
75
- -- local P, R, S = lpeg.P, lpeg.R, lpeg.S
76
- --
77
- -- local M = {_NAME = '?'}
78
- --
79
- -- -- Whitespace.
80
- -- local ws = token(l.WHITESPACE, l.space^1)
81
- --
82
- -- M._rules = {
83
- -- {'whitespace', ws},
84
- -- }
85
- --
86
- -- M._tokenstyles = {
87
- --
88
- -- }
89
- --
90
- -- return M
91
- --
92
- -- The first 4 lines of code simply define often used convenience variables. The
93
- -- 5th and last lines define and return the lexer object Scintilla uses; they
94
- -- are very important and must be part of every lexer. The sixth line defines
95
- -- something called a "token", an essential building block of lexers. You will
96
- -- learn about tokens shortly. The rest of the code defines a set of grammar
97
- -- rules and token styles. You will learn about those later. Note, however, the
98
- -- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
99
- -- belong to their respective lexers, but any non-local variables need the `M.`
100
- -- prefix too so as not to affect Lua's global environment. All in all, this is
101
- -- a minimal, working lexer that you can build on.
102
- --
103
- -- ### Tokens
104
- --
105
- -- Take a moment to think about your programming language's structure. What kind
106
- -- of key elements does it have? In the template shown earlier, one predefined
107
- -- element all languages have is whitespace. Your language probably also has
108
- -- elements like comments, strings, and keywords. Lexers refer to these elements
109
- -- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
110
- -- break down source code into tokens for coloring, which results in the syntax
111
- -- highlighting familiar to you. It is up to you how specific your lexer is when
112
- -- it comes to tokens. Perhaps only distinguishing between keywords and
113
- -- identifiers is necessary, or maybe recognizing constants and built-in
114
- -- functions, methods, or libraries is desirable. The Lua lexer, for example,
115
- -- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
116
- -- functions, constants, built-in libraries, identifiers, labels, and operators.
117
- -- Even though constants, built-in functions, and built-in libraries are subsets
118
- -- of identifiers, Lua programmers find it helpful for the lexer to distinguish
119
- -- between them all. It is perfectly acceptable to just recognize keywords and
120
- -- identifiers.
121
- --
122
- -- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
123
- -- sequence of characters recognized as an instance of that token. Create tokens
124
- -- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
125
- -- defined in the template shown earlier:
126
- --
127
- -- local ws = token(l.WHITESPACE, l.space^1)
128
- --
129
- -- At first glance, the first argument does not appear to be a string name and
130
- -- the second argument does not appear to be an LPeg pattern. Perhaps you
131
- -- expected something like:
132
- --
133
- -- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
134
- --
135
- -- The `lexer` (`l`) module actually provides a convenient list of common token
136
- -- names and common LPeg patterns for you to use. Token names include
137
- -- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
138
- -- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
139
- -- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
140
- -- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
141
- -- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
142
- -- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
143
- -- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
144
- -- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
145
- -- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
146
- -- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
147
- -- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
148
- -- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
149
- -- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
150
- -- none of the above fit your language, but an advantage to using predefined
151
- -- token names is that your lexer's tokens will inherit the universal syntax
152
- -- highlighting color theme used by your text editor.
153
- --
154
- -- #### Example Tokens
155
- --
156
- -- So, how might you define other tokens like comments, strings, and keywords?
157
- -- Here are some examples.
158
- --
159
- -- **Comments**
160
- --
161
- -- Line-style comments with a prefix character(s) are easy to express with LPeg:
162
- --
163
- -- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
164
- -- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
165
- --
166
- -- The comments above start with a '#' or "//" and go to the end of the line.
167
- -- The second comment recognizes the next line also as a comment if the current
168
- -- line ends with a '\' escape character.
169
- --
170
- -- C-style "block" comments with a start and end delimiter are also easy to
171
- -- express:
172
- --
173
- -- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
174
- --
175
- -- This comment starts with a "/\*" sequence and contains anything up to and
176
- -- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
177
- -- can recognize unfinished comments as comments and highlight them properly.
178
- --
179
- -- **Strings**
180
- --
181
- -- It is tempting to think that a string is not much different from the block
182
- -- comment shown above in that both have start and end delimiters:
183
- --
184
- -- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
185
- -- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
186
- -- local simple_string = token(l.STRING, dq_str + sq_str)
187
- --
188
- -- However, most programming languages allow escape sequences in strings such
189
- -- that a sequence like "\\&quot;" in a double-quoted string indicates that the
190
- -- '&quot;' is not the end of the string. The above token incorrectly matches
191
- -- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
192
- -- function.
193
- --
194
- -- local dq_str = l.delimited_range('"')
195
- -- local sq_str = l.delimited_range("'")
196
- -- local string = token(l.STRING, dq_str + sq_str)
197
- --
198
- -- In this case, the lexer treats '\' as an escape character in a string
199
- -- sequence.
200
- --
201
- -- **Keywords**
202
- --
203
- -- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
204
- -- choices, use another convenience function: [`lexer.word_match()`](). It is
205
- -- much easier and more efficient to write word matches like:
206
- --
207
- -- local keyword = token(l.KEYWORD, l.word_match{
208
- -- 'keyword_1', 'keyword_2', ..., 'keyword_n'
209
- -- })
210
- --
211
- -- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
212
- -- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
213
- -- }, nil, true))
214
- --
215
- -- local hyphened_keyword = token(l.KEYWORD, l.word_match({
216
- -- 'keyword-1', 'keyword-2', ..., 'keyword-n'
217
- -- }, '-'))
218
- --
219
- -- By default, characters considered to be in keywords are in the set of
220
- -- alphanumeric characters and underscores. The last token demonstrates how to
221
- -- allow '-' (hyphen) characters to be in keywords as well.
222
- --
223
- -- **Numbers**
224
- --
225
- -- Most programming languages have the same format for integer and float tokens,
226
- -- so it might be as simple as using a couple of predefined LPeg patterns:
227
- --
228
- -- local number = token(l.NUMBER, l.float + l.integer)
229
- --
230
- -- However, some languages allow postfix characters on integers.
231
- --
232
- -- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
233
- -- local number = token(l.NUMBER, l.float + l.hex_num + integer)
234
- --
235
- -- Your language may need other tweaks, but it is up to you how fine-grained you
236
- -- want your highlighting to be. After all, you are not writing a compiler or
237
- -- interpreter!
238
- --
239
- -- ### Rules
240
- --
241
- -- Programming languages have grammars, which specify valid token structure. For
242
- -- example, comments usually cannot appear within a string. Grammars consist of
243
- -- rules, which are simply combinations of tokens. Recall from the lexer
244
- -- template the `_rules` table, which defines all the rules used by the lexer
245
- -- grammar:
246
- --
247
- -- M._rules = {
248
- -- {'whitespace', ws},
249
- -- }
250
- --
251
- -- Each entry in a lexer's `_rules` table consists of a rule name and its
252
- -- associated pattern. Rule names are completely arbitrary and serve only to
253
- -- identify and distinguish between different rules. Rule order is important: if
254
- -- text does not match the first rule, the lexer tries the second rule, and so
255
- -- on. This simple grammar says to match whitespace tokens under a rule named
256
- -- "whitespace".
257
- --
258
- -- To illustrate the importance of rule order, here is an example of a
259
- -- simplified Lua grammar:
260
- --
261
- -- M._rules = {
262
- -- {'whitespace', ws},
263
- -- {'keyword', keyword},
264
- -- {'identifier', identifier},
265
- -- {'string', string},
266
- -- {'comment', comment},
267
- -- {'number', number},
268
- -- {'label', label},
269
- -- {'operator', operator},
270
- -- }
271
- --
272
- -- Note how identifiers come after keywords. In Lua, as with most programming
273
- -- languages, the characters allowed in keywords and identifiers are in the same
274
- -- set (alphanumerics plus underscores). If the lexer specified the "identifier"
275
- -- rule before the "keyword" rule, all keywords would match identifiers and thus
276
- -- incorrectly highlight as identifiers instead of keywords. The same idea
277
- -- applies to function, constant, etc. tokens that you may want to distinguish
278
- -- between: their rules should come before identifiers.
279
- --
280
- -- So what about text that does not match any rules? For example in Lua, the '!'
281
- -- character is meaningless outside a string or comment. Normally the lexer
282
- -- skips over such text. If instead you want to highlight these "syntax errors",
283
- -- add an additional end rule:
284
- --
285
- -- M._rules = {
286
- -- {'whitespace', ws},
287
- -- {'error', token(l.ERROR, l.any)},
288
- -- }
289
- --
290
- -- This identifies and highlights any character not matched by an existing
291
- -- rule as a `lexer.ERROR` token.
292
- --
293
- -- Even though the rules defined in the examples above contain a single token,
294
- -- rules may consist of multiple tokens. For example, a rule for an HTML tag
295
- -- could consist of a tag token followed by an arbitrary number of attribute
296
- -- tokens, allowing the lexer to highlight all tokens separately. The rule might
297
- -- look something like this:
298
- --
299
- -- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
300
- --
301
- -- Note however that lexers with complex rules like these are more prone to lose
302
- -- track of their state.
303
- --
304
- -- ### Summary
305
- --
306
- -- Lexers primarily consist of tokens and grammar rules. At your disposal are a
307
- -- number of convenience patterns and functions for rapidly creating a lexer. If
308
- -- you choose to use predefined token names for your tokens, you do not have to
309
- -- define how the lexer highlights them. The tokens will inherit the default
310
- -- syntax highlighting color theme your editor uses.
311
- --
312
- -- ## Advanced Techniques
313
- --
314
- -- ### Styles and Styling
315
- --
316
- -- The most basic form of syntax highlighting is assigning different colors to
317
- -- different tokens. Instead of highlighting with just colors, Scintilla allows
318
- -- for more rich highlighting, or "styling", with different fonts, font sizes,
319
- -- font attributes, and foreground and background colors, just to name a few.
320
- -- The unit of this rich highlighting is called a "style". Styles are simply
321
- -- strings of comma-separated property settings. By default, lexers associate
322
- -- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
323
- -- `lexer.STRING`, etc. with particular styles as part of a universal color
324
- -- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
325
- -- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
326
- -- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
327
- -- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
328
- -- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
329
- -- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
330
- -- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
331
- -- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
332
- -- predefined token names and LPeg patterns, you may define your own styles. At
333
- -- their core, styles are just strings, so you may create new ones and/or modify
334
- -- existing ones. Each style consists of the following comma-separated settings:
335
- --
336
- -- Setting | Description
337
- -- ---------------|------------
338
- -- font:_name_ | The name of the font the style uses.
339
- -- size:_int_ | The size of the font the style uses.
340
- -- [not]bold | Whether or not the font face is bold.
341
- -- [not]italics | Whether or not the font face is italic.
342
- -- [not]underlined| Whether or not the font face is underlined.
343
- -- fore:_color_ | The foreground color of the font face.
344
- -- back:_color_ | The background color of the font face.
345
- -- [not]eolfilled | Does the background color extend to the end of the line?
346
- -- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
347
- -- [not]visible | Whether or not the text is visible.
348
- -- [not]changeable| Whether the text is changeable or read-only.
349
- -- [not]hotspot | Whether or not the text is clickable.
350
- --
351
- -- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
352
- -- decimal equivalent of the latter. As with token names, LPeg patterns, and
353
- -- styles, there is a set of predefined color names, but they vary depending on
354
- -- the current color theme in use. Therefore, it is generally not a good idea to
355
- -- manually define colors within styles in your lexer since they might not fit
356
- -- into a user's chosen color theme. Try to refrain from even using predefined
357
- -- colors in a style because that color may be theme-specific. Instead, the best
358
- -- practice is to either use predefined styles or derive new color-agnostic
359
- -- styles from predefined ones. For example, Lua "longstring" tokens use the
360
- -- existing `lexer.STYLE_STRING` style instead of defining a new one.
361
- --
362
- -- #### Example Styles
363
- --
364
- -- Defining styles is pretty straightforward. An empty style that inherits the
365
- -- default theme settings is simply an empty string:
366
- --
367
- -- local style_nothing = ''
368
- --
369
- -- A similar style but with a bold font face looks like this:
370
- --
371
- -- local style_bold = 'bold'
372
- --
373
- -- If you want the same style, but also with an italic font face, define the new
374
- -- style in terms of the old one:
375
- --
376
- -- local style_bold_italic = style_bold..',italics'
377
- --
378
- -- This allows you to derive new styles from predefined ones without having to
379
- -- rewrite them. This operation leaves the old style unchanged. Thus if you
380
- -- had a "static variable" token whose style you wanted to base off of
381
- -- `lexer.STYLE_VARIABLE`, it would probably look like:
382
- --
383
- -- local style_static_var = l.STYLE_VARIABLE..',italics'
384
- --
385
- -- The color theme files in the *lexers/themes/* folder give more examples of
386
- -- style definitions.
387
- --
388
- -- ### Token Styles
389
- --
390
- -- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
391
- -- Recall the token definition and `_tokenstyles` table from the lexer template:
392
- --
393
- -- local ws = token(l.WHITESPACE, l.space^1)
394
- --
395
- -- ...
396
- --
397
- -- M._tokenstyles = {
398
- --
399
- -- }
400
- --
401
- -- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
402
- -- earlier, lexers automatically associate tokens that use predefined token
403
- -- names with a particular style. Only tokens with custom token names need
404
- -- manual style associations. As an example, consider a custom whitespace token:
405
- --
406
- -- local ws = token('custom_whitespace', l.space^1)
407
- --
408
- -- Assigning a style to this token looks like:
409
- --
410
- -- M._tokenstyles = {
411
- -- custom_whitespace = l.STYLE_WHITESPACE
412
- -- }
413
- --
414
- -- Do not confuse token names with rule names. They are completely different
415
- -- entities. In the example above, the lexer assigns the "custom_whitespace"
416
- -- token the existing style for `WHITESPACE` tokens. If instead you want to
417
- -- color the background of whitespace a shade of grey, it might look like:
418
- --
419
- -- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
420
- -- M._tokenstyles = {
421
- -- custom_whitespace = custom_style
422
- -- }
423
- --
424
- -- Notice that the lexer performs Scintilla/SciTE-style "$()" property expansion.
425
- -- You may also use "%()". Remember to refrain from assigning specific colors in
426
- -- styles, but in this case, all user color themes probably define the
427
- -- "color.grey" property.
428
- --
429
- -- ### Line Lexers
430
- --
431
- -- By default, lexers match the arbitrary chunks of text passed to them by
432
- -- Scintilla. These chunks may be a full document, only the visible part of a
433
- -- document, or even just portions of lines. Some lexers need to match whole
434
- -- lines. For example, a lexer for the output of a file "diff" needs to know if
435
- -- the line started with a '+' or '-' and then style the entire line
436
- -- accordingly. To indicate that your lexer matches by line, use the
437
- -- `_LEXBYLINE` field:
438
- --
439
- -- M._LEXBYLINE = true
440
- --
441
- -- Now the input text for the lexer is a single line at a time. Keep in mind
442
- -- that line lexers do not have the ability to look ahead at subsequent lines.
443
- --
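
To make `_LEXBYLINE` concrete, here is a hedged sketch of a whole-line lexer for diff-like output, written in the style of the template shown earlier. The `simplediff` name and the `addition`/`deletion` token names are illustrative, and the styles reuse predefined ones rather than hard-coding colors:

    -- simplediff LPeg lexer (illustrative sketch).
    local l = require('lexer')
    local token = l.token
    local P, R, S = lpeg.P, lpeg.R, lpeg.S

    local M = {_NAME = 'simplediff'}

    -- Whitespace.
    local ws = token(l.WHITESPACE, l.space^1)
    -- Whole-line tokens: with _LEXBYLINE each rule sees one line at a time.
    local added = token('addition', P('+') * l.nonnewline^0)
    local removed = token('deletion', P('-') * l.nonnewline^0)

    M._rules = {
      {'whitespace', ws},
      {'addition', added},
      {'deletion', removed},
    }

    M._tokenstyles = {
      addition = l.STYLE_STRING,  -- reuse predefined styles, not raw colors
      deletion = l.STYLE_ERROR,
    }

    M._LEXBYLINE = true

    return M
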
444
- -- ### Embedded Lexers
445
- --
446
- -- Lexers embed within one another very easily, requiring minimal effort. In the
447
- -- following sections, the lexer being embedded is called the "child" lexer and
448
- -- the lexer a child is being embedded in is called the "parent". For example,
449
- -- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
450
- -- their respective HTML and CSS files. However, CSS can be embedded inside
451
- -- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
452
- -- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
453
- -- sounds a lot like the case with CSS, but there is a subtle difference: PHP
454
- -- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
455
- -- difference results in two types of embedded lexers: a parent lexer that
456
- -- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
457
- -- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
458
- --
459
- -- #### Parent Lexer
460
- --
461
- -- Before embedding a child lexer into a parent lexer, the parent lexer needs to
462
- -- load the child lexer. This is done with the [`lexer.load()`]() function. For
463
- -- example, loading the CSS lexer within the HTML lexer looks like:
464
- --
465
- -- local css = l.load('css')
466
- --
467
- -- The next part of the embedding process is telling the parent lexer when to
468
- -- switch over to the child lexer and when to switch back. The lexer refers to
469
- -- these indications as the "start rule" and "end rule", respectively, which are
470
- -- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
471
- -- HTML to CSS is when the lexer encounters a "style" tag with a "type"
472
- -- attribute whose value is "text/css":
473
- --
474
- -- local css_tag = P('<style') * P(function(input, index)
475
- -- if input:find('^[^>]+type="text/css"', index) then
476
- -- return index
477
- -- end
478
- -- end)
479
- --
480
- -- This pattern looks for the beginning of a "style" tag and searches its
481
- -- attribute list for the text "`type="text/css"`". (In this simplified example,
482
- -- the Lua pattern does not consider whitespace around the '=' nor does it
483
- -- consider that using single quotes is valid.) If there is a match, the
484
- -- functional pattern returns a value instead of `nil`. In this case, the value
485
- -- returned does not matter because we ultimately want to style the "style" tag
486
- -- as an HTML tag, so the actual start rule looks like this:
487
- --
488
- -- local css_start_rule = #css_tag * tag
489
- --
490
- -- Now that the parent knows when to switch to the child, it needs to know when
491
- -- to switch back. In the case of HTML/CSS, the switch back occurs when the
492
- -- lexer encounters an ending "style" tag, though the lexer should still style
493
- -- the tag as an HTML tag:
494
- --
495
- -- local css_end_rule = #P('</style>') * tag
496
- --
497
- -- Once the parent loads the child lexer and defines the child's start and end
498
- -- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
499
- --
500
- -- l.embed_lexer(M, css, css_start_rule, css_end_rule)
501
- --
502
- -- The first parameter is the parent lexer object to embed the child in, which
503
- -- in this case is `M`. The other three parameters are the child lexer object
504
- -- loaded earlier followed by its start and end rules.
505
- --
506
- -- #### Child Lexer
507
- --
508
- -- The process for instructing a child lexer to embed itself into a parent is
509
- -- very similar to embedding a child into a parent: first, load the parent lexer
510
- -- into the child lexer with the [`lexer.load()`]() function and then create
511
- -- start and end rules for the child lexer. However, in this case, swap the
512
- -- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
513
- -- lexer:
514
- --
515
- -- local html = l.load('html')
516
- -- local php_start_rule = token('php_tag', '<?php ')
517
- -- local php_end_rule = token('php_tag', '?>')
518
- -- l.embed_lexer(html, M, php_start_rule, php_end_rule)
519
- --
520
- -- ## Code Folding
521
- --
522
- -- When reading source code, it is occasionally helpful to temporarily hide
523
- -- blocks of code like functions, classes, comments, etc. This is the concept of
524
- -- "folding". In the Textadept and SciTE editors for example, little indicators
525
- -- in the editor margins appear next to code that can be folded at places called
526
- -- "fold points". When the user clicks an indicator, the editor hides the code
527
- -- associated with the indicator until the user clicks the indicator again. The
528
- -- lexer specifies these fold points and what code exactly to fold.
529
- --
530
- -- The fold points for most languages occur on keywords or character sequences.
531
- -- Examples of fold keywords are "if" and "end" in Lua and examples of fold
532
- -- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
533
- -- comment delimiters, respectively. However, these fold points cannot occur
534
- -- just anywhere. For example, lexers should not recognize fold keywords that
535
- -- appear within strings or comments. The lexer's `_foldsymbols` table allows
536
- -- you to conveniently define fold points with such granularity. For example,
537
- -- consider C:
538
- --
539
- -- M._foldsymbols = {
540
- -- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
541
- -- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
542
- -- _patterns = {'[{}]', '/%*', '%*/'}
543
- -- }
544
- --
545
- -- The first assignment states that any '{' or '}' that the lexer recognized as
546
- -- a `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
547
- -- match is a beginning fold point and `-1` indicates the match is an ending
548
- -- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
549
- -- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
550
- -- The lexer does not consider any occurrences of these characters outside their
551
- -- defined tokens (such as in a string) as fold points. Finally, every
552
- -- `_foldsymbols` table must have a `_patterns` field that contains a list of
553
- -- [Lua patterns][] that match fold points. If the lexer encounters text that
554
- -- matches one of those patterns, the lexer looks up the matched text in its
555
- -- token's table to determine whether or not the text is a fold point. In the
556
- -- example above, the first Lua pattern matches any '{' or '}' characters. When
557
- -- the lexer comes across one of those characters, it checks if the match is an
558
- -- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
559
- -- point. The same idea applies for the other patterns. (The '%' is in the other
560
- -- patterns because '\*' is a special character in Lua patterns that needs
561
- -- escaping.) How do you specify fold keywords? Here is an example for Lua:
562
- --
563
- -- M._foldsymbols = {
564
- -- [l.KEYWORD] = {
565
- -- ['if'] = 1, ['do'] = 1, ['function'] = 1,
566
- -- ['end'] = -1, ['repeat'] = 1, ['until'] = -1
567
- -- },
568
- -- _patterns = {'%l+'}
569
- -- }
570
- --
571
- -- Any time the lexer encounters a lower case word, if that word is a
572
- -- `lexer.KEYWORD` token and in the associated list of fold points, the lexer
573
- -- identifies the word as a fold point.
574
- --
575
- -- If your lexer needs to do some additional processing to determine if a match
576
- -- is a fold point, assign a function that returns an integer. Returning `1` or
577
- -- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
578
- -- For example:
579
- --
580
- -- local function fold_strange_token(text, pos, line, s, match)
581
- -- if ... then
582
- -- return 1 -- beginning fold point
583
- -- elseif ... then
584
- -- return -1 -- ending fold point
585
- -- end
586
- -- return 0
587
- -- end
588
- --
589
- -- M._foldsymbols = {
590
- -- ['strange_token'] = {['|'] = fold_strange_token},
591
- -- _patterns = {'|'}
592
- -- }
593
- --
594
- -- Any time the lexer encounters a '|' that is a "strange_token", it calls the
595
- -- `fold_strange_token` function to determine if '|' is a fold point. The lexer
596
- -- calls these functions with the following arguments: the text to identify fold
597
- -- points in, the beginning position of the current line in the text to fold,
598
- -- the current line's text, the position in the current line the matched text
599
- -- starts at, and the matched text itself.
600
- --
601
- -- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
602
- --
603
- -- ## Using Lexers
604
- --
605
- -- ### Textadept
606
- --
607
- -- Put your lexer in your *~/.textadept/lexers/* directory so you do not
608
- -- overwrite it when upgrading Textadept. Also, lexers in this directory
609
- -- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
610
- -- the default *lua* lexer. This is convenient for tweaking a default lexer to
611
- -- your liking. Then add a [file type][] for your lexer if necessary.
612
- --
613
- -- [file type]: _M.textadept.file_types.html
614
- --
615
- -- ### SciTE
616
- --
617
- -- Create a *.properties* file for your lexer and `import` it in either your
618
- -- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the
619
- -- *.properties* file should contain:
620
- --
621
- -- file.patterns.[lexer_name]=[file_patterns]
622
- -- lexer.$(file.patterns.[lexer_name])=[lexer_name]
623
- --
624
- -- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
625
- -- and `[file_patterns]` is a set of file extensions to use your lexer for.
626
- --
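
Filling in that template for, say, the JavaScript lexer removed above (the *.js* file pattern is illustrative):

    file.patterns.javascript=*.js
    lexer.$(file.patterns.javascript)=javascript
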
627
- -- Please note that Lua lexers ignore any styling information in *.properties*
628
- -- files. Your theme file in the *lexers/themes/* directory contains styling
629
- -- information.
630
- --
631
- -- ## Considerations
632
- --
633
- -- ### Performance
634
- --
635
- -- There might be some slight overhead when initializing a lexer, but loading a
636
- -- file from disk into Scintilla is usually more expensive. On modern computer
637
- -- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
638
- -- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
639
- -- so that the most common rules match first. Do keep in mind that order matters
640
- -- for similar rules.
641
- --
642
- -- ### Limitations
643
- --
644
- -- Embedded preprocessor languages like PHP cannot completely embed in their
645
- -- parent languages in that the parent's tokens do not support start and end
646
- -- rules. This mostly goes unnoticed, but code like
647
- --
648
- -- <div id="<?php echo $id; ?>">
649
- --
650
- -- or
651
- --
652
- -- <div <?php if ($odd) { echo 'class="odd"'; } ?>>
653
- --
654
- -- will not style correctly.
655
- --
656
- -- ### Troubleshooting
657
- --
658
- -- Errors in lexers can be tricky to debug. Lexers print Lua errors to
659
- -- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
660
- -- from a terminal is the easiest way to see errors as they occur.
661
- --
662
- -- ### Risks
663
- --
664
- -- Poorly written lexers have the ability to crash Scintilla (and thus its
665
- -- containing application), so unsaved data might be lost. However, I have only
666
- -- observed these crashes in early lexer development, when syntax errors or
667
- -- pattern errors are present. Once the lexer actually starts styling text
668
- -- (either correctly or incorrectly, it does not matter), I have not observed
669
- -- any crashes.
670
- --
671
- -- ### Acknowledgements
672
- --
673
- -- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
674
- -- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
675
- --
676
- -- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
677
- -- @field LEXERPATH (string)
678
- -- The path used to search for a lexer to load.
679
- -- Identical in format to Lua's `package.path` string.
680
- -- The default value is `package.path`.
681
- -- @field DEFAULT (string)
682
- -- The token name for default tokens.
683
- -- @field WHITESPACE (string)
684
- -- The token name for whitespace tokens.
685
- -- @field COMMENT (string)
686
- -- The token name for comment tokens.
687
- -- @field STRING (string)
688
- -- The token name for string tokens.
689
- -- @field NUMBER (string)
690
- -- The token name for number tokens.
691
- -- @field KEYWORD (string)
692
- -- The token name for keyword tokens.
693
- -- @field IDENTIFIER (string)
694
- -- The token name for identifier tokens.
695
- -- @field OPERATOR (string)
696
- -- The token name for operator tokens.
697
- -- @field ERROR (string)
698
- -- The token name for error tokens.
699
- -- @field PREPROCESSOR (string)
700
- -- The token name for preprocessor tokens.
701
- -- @field CONSTANT (string)
702
- -- The token name for constant tokens.
703
- -- @field VARIABLE (string)
704
- -- The token name for variable tokens.
705
- -- @field FUNCTION (string)
706
- -- The token name for function tokens.
707
- -- @field CLASS (string)
708
- -- The token name for class tokens.
709
- -- @field TYPE (string)
710
- -- The token name for type tokens.
711
- -- @field LABEL (string)
712
- -- The token name for label tokens.
713
- -- @field REGEX (string)
714
- -- The token name for regex tokens.
715
- -- @field STYLE_CLASS (string)
716
- -- The style typically used for class definitions.
717
- -- @field STYLE_COMMENT (string)
718
- -- The style typically used for code comments.
719
- -- @field STYLE_CONSTANT (string)
720
- -- The style typically used for constants.
721
- -- @field STYLE_ERROR (string)
722
- -- The style typically used for erroneous syntax.
723
- -- @field STYLE_FUNCTION (string)
724
- -- The style typically used for function definitions.
725
- -- @field STYLE_KEYWORD (string)
726
- -- The style typically used for language keywords.
727
- -- @field STYLE_LABEL (string)
728
- -- The style typically used for labels.
729
- -- @field STYLE_NUMBER (string)
730
- -- The style typically used for numbers.
731
- -- @field STYLE_OPERATOR (string)
732
- -- The style typically used for operators.
733
- -- @field STYLE_REGEX (string)
734
- -- The style typically used for regular expression strings.
735
- -- @field STYLE_STRING (string)
736
- -- The style typically used for strings.
737
- -- @field STYLE_PREPROCESSOR (string)
738
- -- The style typically used for preprocessor statements.
739
- -- @field STYLE_TYPE (string)
740
- -- The style typically used for static types.
741
- -- @field STYLE_VARIABLE (string)
742
- -- The style typically used for variables.
743
- -- @field STYLE_WHITESPACE (string)
744
- -- The style typically used for whitespace.
745
- -- @field STYLE_EMBEDDED (string)
746
- -- The style typically used for embedded code.
747
- -- @field STYLE_IDENTIFIER (string)
748
- -- The style typically used for identifier words.
749
- -- @field STYLE_DEFAULT (string)
750
- -- The style all styles are based off of.
751
- -- @field STYLE_LINENUMBER (string)
752
- -- The style used for all margins except fold margins.
753
- -- @field STYLE_BRACELIGHT (string)
754
- -- The style used for highlighted brace characters.
755
- -- @field STYLE_BRACEBAD (string)
756
- -- The style used for unmatched brace characters.
757
- -- @field STYLE_CONTROLCHAR (string)
758
- -- The style used for control characters.
759
- -- Color attributes are ignored.
760
- -- @field STYLE_INDENTGUIDE (string)
761
- -- The style used for indentation guides.
762
- -- @field STYLE_CALLTIP (string)
763
- -- The style used by call tips if [`buffer.call_tip_use_style`]() is set.
764
- -- Only the font name, size, and color attributes are used.
765
- -- @field any (pattern)
766
- -- A pattern that matches any single character.
767
- -- @field ascii (pattern)
768
- -- A pattern that matches any ASCII character (codes 0 to 127).
769
- -- @field extend (pattern)
770
- -- A pattern that matches any ASCII extended character (codes 0 to 255).
771
- -- @field alpha (pattern)
772
- -- A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
773
- -- @field digit (pattern)
774
- -- A pattern that matches any digit ('0'-'9').
775
- -- @field alnum (pattern)
776
- -- A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
777
- -- '0'-'9').
778
- -- @field lower (pattern)
779
- -- A pattern that matches any lower case character ('a'-'z').
780
- -- @field upper (pattern)
781
- -- A pattern that matches any upper case character ('A'-'Z').
782
- -- @field xdigit (pattern)
783
- -- A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
784
- -- @field cntrl (pattern)
785
- -- A pattern that matches any control character (ASCII codes 0 to 31).
786
- -- @field graph (pattern)
787
- -- A pattern that matches any graphical character ('!' to '~').
788
- -- @field print (pattern)
789
- -- A pattern that matches any printable character (' ' to '~').
790
- -- @field punct (pattern)
791
- -- A pattern that matches any punctuation character ('!' to '/', ':' to '@',
792
- -- '[' to '`', '{' to '~').
793
- -- @field space (pattern)
794
- -- A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
795
- -- '\r', space).
796
- -- @field newline (pattern)
797
- -- A pattern that matches any set of end of line characters.
798
- -- @field nonnewline (pattern)
799
- -- A pattern that matches any single, non-newline character.
800
- -- @field nonnewline_esc (pattern)
801
- -- A pattern that matches any single, non-newline character or any set of end
802
- -- of line characters escaped with '\'.
803
- -- @field dec_num (pattern)
804
- -- A pattern that matches a decimal number.
805
- -- @field hex_num (pattern)
806
- -- A pattern that matches a hexadecimal number.
807
- -- @field oct_num (pattern)
808
- -- A pattern that matches an octal number.
809
- -- @field integer (pattern)
810
- -- A pattern that matches either a decimal, hexadecimal, or octal number.
811
- -- @field float (pattern)
812
- -- A pattern that matches a floating point number.
813
- -- @field word (pattern)
814
- -- A pattern that matches a typical word. Words begin with a letter or
815
- -- underscore and consist of alphanumeric and underscore characters.
816
- -- @field FOLD_BASE (number)
817
- -- The initial (root) fold level.
818
- -- @field FOLD_BLANK (number)
819
- -- Flag indicating that the line is blank.
820
- -- @field FOLD_HEADER (number)
821
- -- Flag indicating that the line is a fold point.
822
- -- @field fold_level (table, Read-only)
823
- -- Table of fold level bit-masks for line numbers starting from zero.
824
- -- Fold level masks are composed of an integer level combined with any of the
825
- -- following bits:
826
- --
827
- -- * `lexer.FOLD_BASE`
828
- -- The initial fold level.
829
- -- * `lexer.FOLD_BLANK`
830
- -- The line is blank.
831
- -- * `lexer.FOLD_HEADER`
832
- -- The line is a header, or fold point.
833
- -- @field indent_amount (table, Read-only)
834
- -- Table of indentation amounts in character columns, for line numbers
835
- -- starting from zero.
836
- -- @field property (table)
837
- -- Map of key-value string pairs.
838
- -- @field property_expanded (table, Read-only)
839
- -- Map of key-value string pairs with `$()` and `%()` variable replacement
840
- -- performed in values.
841
- -- @field property_int (table, Read-only)
842
- -- Map of key-value pairs with values interpreted as numbers, or `0` if not
843
- -- found.
844
- -- @field style_at (table, Read-only)
845
- -- Table of style names at positions in the buffer starting from zero.
846
- module('lexer')]=]
847
-
848
- local lpeg = require('lpeg')
849
- local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
850
- local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
851
- local lpeg_Cmt, lpeg_C, lpeg_Cg = lpeg.Cmt, lpeg.C, lpeg.Cg
852
- local lpeg_match = lpeg.match
853
-
854
- M.LEXERPATH = package.path
855
-
856
- -- Table of loaded lexers.
857
- local lexers = {}
858
-
859
- -- Keep track of the last parent lexer loaded. This lexer's rules are used for
860
- -- proxy lexers (those that load parent and child lexers to embed) that do not
861
- -- declare a parent lexer.
862
- local parent_lexer
863
-
864
- if not package.searchpath then
865
- -- Searches for the given *name* in the given *path*.
866
- -- This is an implementation of Lua 5.2's `package.searchpath()` function for
867
- -- Lua 5.1.
868
- function package.searchpath(name, path)
869
- local tried = {}
870
- for part in path:gmatch('[^;]+') do
871
- local filename = part:gsub('%?', name)
872
- local f = io.open(filename, 'r')
873
- if f then f:close() return filename end
874
- tried[#tried + 1] = ("no file '%s'"):format(filename)
875
- end
876
- return nil, table.concat(tried, '\n')
877
- end
878
- end
879
-
880
- -- Adds a rule to a lexer's current ordered list of rules.
881
- -- @param lexer The lexer to add the given rule to.
882
- -- @param name The name associated with this rule. It is used for other lexers
883
- -- to access this particular rule from the lexer's `_RULES` table. It does not
884
- -- have to be the same as the name passed to `token`.
885
- -- @param rule The LPeg pattern of the rule.
886
- local function add_rule(lexer, id, rule)
887
- if not lexer._RULES then
888
- lexer._RULES = {}
889
- -- Contains an ordered list (by numerical index) of rule names. This is used
890
- -- in conjunction with lexer._RULES for building _TOKENRULE.
891
- lexer._RULEORDER = {}
892
- end
893
- lexer._RULES[id] = rule
894
- lexer._RULEORDER[#lexer._RULEORDER + 1] = id
895
- end
896
-
897
- -- Adds a new Scintilla style to Scintilla.
898
- -- @param lexer The lexer to add the given style to.
899
- -- @param token_name The name of the token associated with this style.
900
- -- @param style A Scintilla style created from `style()`.
901
- -- @see style
902
- local function add_style(lexer, token_name, style)
903
- local num_styles = lexer._numstyles
904
- if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
905
- if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
906
- lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
907
- lexer._EXTRASTYLES[token_name] = style
908
- end
909
-
910
- -- (Re)constructs `lexer._TOKENRULE`.
911
- -- @param parent The parent lexer.
912
- local function join_tokens(lexer)
913
- local patterns, order = lexer._RULES, lexer._RULEORDER
914
- local token_rule = patterns[order[1]]
915
- for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
916
- lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
917
- return lexer._TOKENRULE
918
- end
919
-
920
- -- Adds a given lexer and any of its embedded lexers to a given grammar.
921
- -- @param grammar The grammar to add the lexer to.
922
- -- @param lexer The lexer to add.
923
- local function add_lexer(grammar, lexer, token_rule)
924
- local token_rule = join_tokens(lexer)
925
- local lexer_name = lexer._NAME
926
- for _, child in ipairs(lexer._CHILDREN) do
927
- if child._CHILDREN then add_lexer(grammar, child) end
928
- local child_name = child._NAME
929
- local rules = child._EMBEDDEDRULES[lexer_name]
930
- local rules_token_rule = grammar['__'..child_name] or rules.token_rule
931
- grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
932
- rules.end_rule^-1 * lpeg_V(lexer_name)
933
- local embedded_child = '_'..child_name
934
- grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
935
- rules_token_rule)^0 * rules.end_rule^-1
936
- token_rule = lpeg_V(embedded_child) + token_rule
937
- end
938
- grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
939
- grammar[lexer_name] = token_rule^0
940
- end
941
-
942
- -- (Re)constructs `lexer._GRAMMAR`.
943
- -- @param lexer The parent lexer.
944
- -- @param initial_rule The name of the rule to start lexing with. The default
945
- -- value is `lexer._NAME`. Multilang lexers use this to start with a child
946
- -- rule if necessary.
947
- local function build_grammar(lexer, initial_rule)
948
- -- local children = lexer._CHILDREN
949
- -- if children then
950
- local lexer_name = lexer._NAME
951
- if not initial_rule then initial_rule = lexer_name end
952
- local grammar = {initial_rule}
953
- if not lexer._CHILDREN then lexer._CHILDREN={} end
954
- add_lexer(grammar, lexer)
955
- lexer._INITIALRULE = initial_rule
956
- lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
957
- -- else
958
- -- lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
959
- -- end
960
- end
961
-
962
- local string_upper = string.upper
963
- -- Default styles.
964
- local default = {
965
- 'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
966
- 'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
967
- 'function', 'class', 'type', 'label', 'regex', 'embedded'
968
- }
969
- for _, v in ipairs(default) do
970
- M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
971
- end
972
- -- Predefined styles.
973
- local predefined = {
974
- 'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
975
- 'indentguide', 'calltip'
976
- }
977
- for _, v in ipairs(predefined) do
978
- M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
979
- end
980
-
981
- ---
982
- -- Initializes or loads and returns the lexer of string name *name*.
983
- -- Scintilla calls this function to load a lexer. Parent lexers also call this
984
- -- function to load child lexers and vice-versa. The user calls this function
985
- -- to load a lexer when using Scintillua as a Lua library.
986
- -- @param name The name of the lexing language.
987
- -- @param alt_name The alternate name of the lexing language. This is useful for
988
- -- embedding the same child lexer with multiple sets of start and end tokens.
989
- -- @return lexer object
990
- -- @name load
991
- function M.load(name, alt_name)
992
- if lexers[alt_name or name] then return lexers[alt_name or name] end
993
- parent_lexer = nil -- reset
994
-
995
- -- When using Scintillua as a stand-alone module, the `property` and
996
- -- `property_int` tables do not exist (they are not useful). Create them to
997
- -- prevent errors from occurring.
998
- if not M.property then
999
- M.property, M.property_int = {}, setmetatable({}, {
1000
- __index = function(t, k)
1001
- return tostring(tonumber(M.property[k]) or 0)
1002
- end,
1003
- __newindex = function() error('read-only property') end
1004
- })
1005
- end
1006
-
1007
- -- Load the language lexer with its rules, styles, etc.
1008
- M.WHITESPACE = (alt_name or name)..'_whitespace'
1009
- local lexer_file, error = package.searchpath(name, M.LEXERPATH)
1010
- local ok, lexer = pcall(dofile, lexer_file or '')
1011
- if not ok then
1012
- _G.print(error or lexer) -- error message
1013
- lexer = {_NAME = alt_name or name}
1014
- end
1015
- if alt_name then lexer._NAME = alt_name end
1016
-
1017
- -- Create the initial maps for token names to style numbers and styles.
1018
- local token_styles = {}
1019
- for i = 1, #default do token_styles[default[i]] = i - 1 end
1020
- for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
1021
- lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
1022
- lexer._EXTRASTYLES = {}
1023
-
1024
- -- If the lexer is a proxy (loads parent and child lexers to embed) and does
1025
- -- not declare a parent, try and find one and use its rules.
1026
- if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end
1027
-
1028
- -- If the lexer is a proxy or a child that embedded itself, add its rules and
1029
- -- styles to the parent lexer. Then set the parent to be the main lexer.
1030
- if lexer._lexer then
1031
- local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
1032
- if not l._tokenstyles then l._tokenstyles = {} end
1033
- for _, r in ipairs(_r or {}) do
1034
- -- Prevent rule id clashes.
1035
- l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
1036
- end
1037
- for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
1038
- lexer = l
1039
- end
1040
-
1041
- -- Add the lexer's styles and build its grammar.
1042
- if lexer._rules then
1043
- for token, style in pairs(lexer._tokenstyles or {}) do
1044
- add_style(lexer, token, style)
1045
- end
1046
- for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
1047
- build_grammar(lexer)
1048
- end
1049
- -- Add the lexer's unique whitespace style.
1050
- add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)
1051
-
1052
- -- Process the lexer's fold symbols.
1053
- if lexer._foldsymbols and lexer._foldsymbols._patterns then
1054
- local patterns = lexer._foldsymbols._patterns
1055
- for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
1056
- end
1057
-
1058
- lexer.lex, lexer.fold = M.lex, M.fold
1059
- -- Immun.io copy over some of our helpful functions
1060
- if M.lex_recursive then lexer.lex_recursive = M.lex_recursive end
1061
- if M.unlex_rules then lexer.unlex_rules = M.unlex_rules end
1062
- lexers[alt_name or name] = lexer
1063
- return lexer
1064
- end
1065
-
1066
- ---
1067
- -- Lexes a chunk of text *text* (that has an initial style number of
1068
- -- *init_style*) with lexer *lexer*.
1069
- -- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
1070
- -- Otherwise the text is lexed as a whole.
1071
- -- @param lexer The lexer object to lex with.
1072
- -- @param text The text in the buffer to lex.
1073
- -- @param init_style The current style. Multiple-language lexers use this to
1074
- -- determine which language to start lexing in.
1075
- -- @return table of token names and positions.
1076
- -- @name lex
1077
- function M.lex(lexer, text, init_style)
1078
- if not lexer._LEXBYLINE then
1079
- -- For multilang lexers, build a new grammar whose initial_rule is the
1080
- -- current language.
1081
- if lexer._CHILDREN then
1082
- for style, style_num in pairs(lexer._TOKENSTYLES) do
1083
- if style_num == init_style then
1084
- local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
1085
- if lexer._INITIALRULE ~= lexer_name then
1086
- build_grammar(lexer, lexer_name)
1087
- end
1088
- break
1089
- end
1090
- end
1091
- end
1092
- return lpeg_match(lexer._GRAMMAR, text)
1093
- else
1094
- local tokens = {}
1095
- local function append(tokens, line_tokens, offset)
1096
- for i = 1, #line_tokens, 2 do
1097
- tokens[#tokens + 1] = line_tokens[i]
1098
- tokens[#tokens + 1] = line_tokens[i + 1] + offset
1099
- end
1100
- end
1101
- local offset = 0
1102
- local grammar = lexer._GRAMMAR
1103
- for line in text:gmatch('[^\r\n]*\r?\n?') do
1104
- local line_tokens = lpeg_match(grammar, line)
1105
- if line_tokens then append(tokens, line_tokens, offset) end
1106
- offset = offset + #line
1107
- -- Use the default style to the end of the line if none was specified.
1108
- if tokens[#tokens] ~= offset then
1109
- tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
1110
- end
1111
- end
1112
- return tokens
1113
- end
1114
- end
1115
-
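
The flat table returned by `lex()` can be folded back into (token name, token text) pairs. A hedged sketch, assuming each recorded position is the index just past the end of its token, which is how the line-by-line branch above accumulates offsets; the `runs` helper and the reuse of a `js` lexer loaded earlier are illustrative:

    local function runs(text, tokens)
      local out, start = {}, 1
      for i = 1, #tokens, 2 do
        local name, finish = tokens[i], tokens[i + 1]
        out[#out + 1] = {name, text:sub(start, finish - 1)}
        start = finish
      end
      return out
    end

    -- Example: print each token name next to the text it covered.
    local text = 'var x = 1;'
    for _, run in ipairs(runs(text, lexer.lex(js, text))) do
      print(run[1], run[2])
    end
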
1116
- ---
1117
- -- Folds a chunk of text *text* with lexer *lexer*.
1118
- -- Folds *text* starting at position *start_pos* on line number *start_line*
1119
- -- with a beginning fold level of *start_level* in the buffer. If *lexer* has
1120
- -- a `_fold` function or a `_foldsymbols` table, that field is used to perform
1121
- -- folding. Otherwise, if a `fold.by.indentation` property is set, folding by
1122
- -- indentation is done.
1123
- -- @param lexer The lexer object to fold with.
1124
- -- @param text The text in the buffer to fold.
1125
- -- @param start_pos The position in the buffer *text* starts at.
1126
- -- @param start_line The line number *text* starts on.
1127
- -- @param start_level The fold level *text* starts on.
1128
- -- @return table of fold levels.
1129
- -- @name fold
1130
- function M.fold(lexer, text, start_pos, start_line, start_level)
1131
- local folds = {}
1132
- if text == '' then return folds end
1133
- local fold = M.property_int['fold'] > 0
1134
- local FOLD_BASE = M.FOLD_BASE
1135
- local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
1136
- if fold and lexer._fold then
1137
- return lexer._fold(text, start_pos, start_line, start_level)
1138
- elseif fold and lexer._foldsymbols then
1139
- local lines = {}
1140
- for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
1141
- lines[#lines + 1] = {p, l}
1142
- end
1143
- local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
1144
- local fold_symbols = lexer._foldsymbols
1145
- local fold_symbols_patterns = fold_symbols._patterns
1146
- local style_at, fold_level = M.style_at, M.fold_level
1147
- local line_num, prev_level = start_line, start_level
1148
- local current_level = prev_level
1149
- for i = 1, #lines do
1150
- local pos, line = lines[i][1], lines[i][2]
1151
- if line ~= '' then
1152
- local level_decreased = false
1153
- for j = 1, #fold_symbols_patterns do
1154
- for s, match in line:gmatch(fold_symbols_patterns[j]) do
1155
- local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
1156
- local l = symbols and symbols[match]
1157
- if type(l) == 'function' then l = l(text, pos, line, s, match) end
1158
- if type(l) == 'number' then
1159
- current_level = current_level + l
1160
- if l < 0 and current_level < prev_level then
1161
- -- Potential zero-sum line. If the level were to go back up on
1162
- -- the same line, the line may be marked as a fold header.
1163
- level_decreased = true
1164
- end
1165
- end
1166
- end
1167
- end
1168
- folds[line_num] = prev_level
1169
- if current_level > prev_level then
1170
- folds[line_num] = prev_level + FOLD_HEADER
1171
- elseif level_decreased and current_level == prev_level and
1172
- fold_zero_sum_lines then
1173
- if line_num > start_line then
1174
- folds[line_num] = prev_level - 1 + FOLD_HEADER
1175
- else
1176
- -- Typing within a zero-sum line.
1177
- local level = fold_level[line_num - 1] - 1
1178
- if level > FOLD_HEADER then level = level - FOLD_HEADER end
1179
- if level > FOLD_BLANK then level = level - FOLD_BLANK end
1180
- folds[line_num] = level + FOLD_HEADER
1181
- current_level = current_level + 1
1182
- end
1183
- end
1184
- if current_level < FOLD_BASE then current_level = FOLD_BASE end
1185
- prev_level = current_level
1186
- else
1187
- folds[line_num] = prev_level + FOLD_BLANK
1188
- end
1189
- line_num = line_num + 1
1190
- end
1191
- elseif fold and M.property_int['fold.by.indentation'] > 0 then
1192
- -- Indentation based folding.
1193
- -- Calculate indentation per line.
1194
- local indentation = {}
1195
- for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
1196
- indentation[#indentation + 1] = line ~= '' and #indent
1197
- end
1198
- -- Make line before start_line a fold header if necessary.
1199
- if start_line > 0 and indentation[1] then
1200
- local indent = M.indent_amount[start_line - 1]
1201
- if indentation[1] > indent then
1202
- folds[start_line - 1] = FOLD_BASE + indent + FOLD_HEADER
1203
- end
1204
- end
1205
- -- Iterate over lines, setting fold numbers and fold flags.
1206
- local line_num, prev_level = start_line, FOLD_BASE + (indentation[1] or 0)
1207
- local current_level = prev_level
1208
- for i = 1, #indentation do
1209
- if indentation[i] then
1210
- for j = i + 1, #indentation do
1211
- if indentation[j] then
1212
- current_level = FOLD_BASE + indentation[j]
1213
- break
1214
- end
1215
- end
1216
- folds[line_num] = prev_level
1217
- if current_level > prev_level then
1218
- folds[line_num] = prev_level + FOLD_HEADER
1219
- end
1220
- prev_level = current_level
1221
- else
1222
- folds[line_num] = prev_level + FOLD_BLANK
1223
- end
1224
- line_num = line_num + 1
1225
- end
1226
- else
1227
- -- No folding, reset fold levels if necessary.
1228
- local current_line = start_line
1229
- for _ in text:gmatch('\r?\n') do
1230
- folds[current_line] = start_level
1231
- current_line = current_line + 1
1232
- end
1233
- end
1234
- return folds
1235
- end
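
A hedged sketch of exercising `fold` outside Scintilla. The `property_int`, `style_at`, and `fold_level` tables are normally supplied by the host application, so this sketch stubs `property_int`, clears `_foldsymbols` to force the indentation branch, and assumes the usual values for the fold constants when they are not already defined.

    local l = require('lexer')
    l.FOLD_BASE = l.FOLD_BASE or 0x400
    l.FOLD_HEADER = l.FOLD_HEADER or 0x2000
    l.FOLD_BLANK = l.FOLD_BLANK or 0x1000
    -- stub: every property defaults to 0 except the two switched on here
    l.property_int = setmetatable({['fold'] = 1, ['fold.by.indentation'] = 1},
                                  {__index = function() return 0 end})
    local js = l.load('javascript')       -- assumes the lexer path is configured
    js._foldsymbols = nil                 -- force folding by indentation for this sketch
    local levels = js:fold('if (x) {\n  y()\n}\n', 1, 0, l.FOLD_BASE)
    for line = 0, 2 do print(line, levels[line]) end
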
1236
-
1237
- -- The following are utility functions lexers will have access to.
1238
-
1239
- -- Common patterns.
1240
- M.any = lpeg_P(1)
1241
- M.ascii = lpeg_R('\000\127')
1242
- M.extend = lpeg_R('\000\255')
1243
- M.alpha = lpeg_R('AZ', 'az')
1244
- M.digit = lpeg_R('09')
1245
- M.alnum = lpeg_R('AZ', 'az', '09')
1246
- M.lower = lpeg_R('az')
1247
- M.upper = lpeg_R('AZ')
1248
- M.xdigit = lpeg_R('09', 'AF', 'af')
1249
- M.cntrl = lpeg_R('\000\031')
1250
- M.graph = lpeg_R('!~')
1251
- M.print = lpeg_R(' ~')
1252
- M.punct = lpeg_R('!/', ':@', '[\'', '{~')
1253
- M.space = lpeg_S('\t\v\f\n\r ')
1254
-
1255
- M.newline = lpeg_S('\r\n\f')^1
1256
- M.nonnewline = 1 - M.newline
1257
- M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any
1258
-
1259
- M.dec_num = M.digit^1
1260
- M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
1261
- M.oct_num = '0' * lpeg_R('07')^1
1262
- M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
1263
- M.float = lpeg_S('+-')^-1 *
1264
- (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 +
1265
- M.digit^1) *
1266
- lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1
1267
- M.word = (M.alpha + '_') * (M.alnum + '_')^0
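
The prebuilt classes above are plain LPeg patterns, so they compose directly with `lpeg` operators. A small sketch (the `hex_byte` pattern is illustrative):

    local l = require('lexer')
    local lpeg = require('lpeg')
    print(lpeg.match(l.integer, '0x1F'))          --> 5 (position just past the match)
    print(lpeg.match(l.word, 'foo_bar42'))        --> 10
    local hex_byte = '0' * lpeg.S('xX') * l.xdigit * l.xdigit
    print(lpeg.match(hex_byte, '0xFF and more'))  --> 5
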
1268
-
1269
- ---
1270
- -- Creates and returns a token pattern with token name *name* and pattern
1271
- -- *patt*.
1272
- -- If *name* is not a predefined token name, its style must be defined in the
1273
- -- lexer's `_tokenstyles` table.
1274
- -- @param name The name of token. If this name is not a predefined token name,
1275
- -- then a style needs to be associated with it in the lexer's `_tokenstyles`
1276
- -- table.
1277
- -- @param patt The LPeg pattern associated with the token.
1278
- -- @return pattern
1279
- -- @usage local ws = token(l.WHITESPACE, l.space^1)
1280
- -- @usage local annotation = token('annotation', '@' * l.word)
1281
- -- @name token
1282
- function M.token(name, patt)
1283
- --return lpeg_Cg(patt, name)
1284
- return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_C(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1285
- end
1286
-
1287
- function M.parent_token(name, patt)
1288
- --return lpeg_Cg(patt, name)
1289
- return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_Ct(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1290
- end
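
A short sketch of the capture shape this `token` variant produces; the `annotation` name is illustrative and would need a matching entry in a lexer's `_tokenstyles` for real use.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local annotation = l.token('annotation', '@' * l.word)
    local t = lpeg.match(annotation, '@deprecated')
    -- each token is captured as a table, not a flat name/position pair
    print(t.token, t.val, t.pos)   --> annotation   @deprecated   12
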
1291
-
1292
- ---
1293
- -- Creates and returns a pattern that matches a range of text bounded by
1294
- -- *chars* characters.
1295
- -- This is a convenience function for matching more complicated delimited ranges
1296
- -- like strings with escape characters and balanced parentheses. *single_line*
1297
- -- indicates whether or not the range must be on a single line, *no_escape*
1298
- -- indicates whether or not to ignore '\' as an escape character, and *balanced*
1299
- -- indicates whether or not to handle balanced ranges like parentheses and
1300
- -- requires *chars* to be composed of two characters.
1301
- -- @param chars The character(s) that bound the matched range.
1302
- -- @param single_line Optional flag indicating whether or not the range must be
1303
- -- on a single line.
1304
- -- @param no_escape Optional flag indicating whether or not the range end
1305
- -- character may be escaped by a '\\' character.
1306
- -- @param balanced Optional flag indicating whether or not to match a balanced
1307
- -- range, like the "%b" Lua pattern. This flag only applies if *chars*
1308
- -- consists of two different characters (e.g. "()").
1309
- -- @return pattern
1310
- -- @usage local dq_str_escapes = l.delimited_range('"')
1311
- -- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
1312
- -- @usage local unbalanced_parens = l.delimited_range('()')
1313
- -- @usage local balanced_parens = l.delimited_range('()', false, false, true)
1314
- -- @see nested_pair
1315
- -- @name delimited_range
1316
- function M.delimited_range(chars, single_line, no_escape, balanced)
1317
- local s = chars:sub(1, 1)
1318
- local e = #chars == 2 and chars:sub(2, 2) or s
1319
- local range
1320
- local b = balanced and s or ''
1321
- local n = single_line and '\n' or ''
1322
- if no_escape then
1323
- local invalid = lpeg_S(e..n..b)
1324
- range = M.any - invalid
1325
- else
1326
- local invalid = lpeg_S(e..n..b) + '\\'
1327
- range = M.any - invalid + '\\' * M.any
1328
- end
1329
- if balanced and s ~= e then
1330
- return lpeg_P{s * (range + lpeg_V(1))^0 * e}
1331
- else
1332
- return s * range^0 * lpeg_P(e)^-1
1333
- end
1334
- end
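
Two hedged examples of the ranges this produces: an escapable single-quoted string, and a balanced parenthesised range that nests.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local sq_str = l.delimited_range("'")                 -- backslash escapes honoured
    print(lpeg.match(sq_str, [['it\'s fine']]))           --> 13
    local parens = l.delimited_range('()', false, false, true)
    print(lpeg.match(parens, '(a(b)c) tail'))             --> 8
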
1335
-
1336
- ---
1337
- -- Creates and returns a pattern that matches pattern *patt* only at the
1338
- -- beginning of a line.
1339
- -- @param patt The LPeg pattern to match on the beginning of a line.
1340
- -- @return pattern
1341
- -- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
1342
- -- l.nonnewline^0)
1343
- -- @name starts_line
1344
- function M.starts_line(patt)
1345
- return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
1346
- local pos = index - #match
1347
- if pos == 1 then return index, ... end
1348
- local char = input:sub(pos - 1, pos - 1)
1349
- if char == '\n' or char == '\r' or char == '\f' then return index, ... end
1350
- end)
1351
- end
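
For illustration, a '#' wrapped by `starts_line` only matches when it begins a line; the `lpeg.P(2)` prefix below just advances past the first two characters before trying the pattern mid-line.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local hash = l.starts_line('#')
    print(lpeg.match(hash, '#define X'))          --> 2
    print(lpeg.match(lpeg.P(2) * hash, 'x #y'))   --> nil ('#' is mid-line)
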
1352
-
1353
- ---
1354
- -- Creates and returns a pattern that verifies that string set *s* contains the
1355
- -- first non-whitespace character behind the current match position.
1356
- -- @param s String character set like one passed to `lpeg.S()`.
1357
- -- @return pattern
1358
- -- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
1359
- -- l.delimited_range('/')
1360
- -- @name last_char_includes
1361
- function M.last_char_includes(s)
1362
- s = '['..s:gsub('[-%%%[]', '%%%1')..']'
1363
- return lpeg_P(function(input, index)
1364
- if index == 1 then return index end
1365
- local i = index
1366
- while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
1367
- if input:sub(i - 1, i - 1):match(s) then return index end
1368
- end)
1369
- end
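
A sketch of the regex-versus-division disambiguation this enables; the operator set is illustrative, and the `lpeg.P(n)` prefixes simply position the match after some leading text.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local regex = l.last_char_includes('+-*!%^&|=,([{') * l.delimited_range('/', true)
    print(lpeg.match(lpeg.P(4) * regex, 'x = /ab/g'))   --> 9 (regex literal after '=')
    print(lpeg.match(lpeg.P(2) * regex, 'x /ab/'))      --> nil ('/' follows an identifier)
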
1370
-
1371
- ---
1372
- -- Returns a pattern that matches a balanced range of text that starts with
1373
- -- string *start_chars* and ends with string *end_chars*.
1374
- -- With single-character delimiters, this function is identical to
1375
- -- `delimited_range(start_chars..end_chars, false, true, true)`.
1376
- -- @param start_chars The string starting a nested sequence.
1377
- -- @param end_chars The string ending a nested sequence.
1378
- -- @return pattern
1379
- -- @usage local nested_comment = l.nested_pair('/*', '*/')
1380
- -- @see delimited_range
1381
- -- @name nested_pair
1382
- function M.nested_pair(start_chars, end_chars)
1383
- local s, e = start_chars, lpeg_P(end_chars)^-1
1384
- return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
1385
- end
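
For example, OCaml-style comments nest, which a single `delimited_range` call cannot express:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local ml_comment = l.nested_pair('(*', '*)')
    print(lpeg.match(ml_comment, '(* a (* b *) c *)'))   --> 18
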
1386
-
1387
- ---
1388
- -- Creates and returns a pattern that matches any single word in list *words*.
1389
- -- Words consist of alphanumeric and underscore characters, as well as the
1390
- -- characters in string set *word_chars*. *case_insensitive* indicates whether
1391
- -- or not to ignore case when matching words.
1392
- -- This is a convenience function for simplifying a set of ordered choice word
1393
- -- patterns.
1394
- -- @param words A table of words.
1395
- -- @param word_chars Optional string of additional characters considered to be
1396
- -- part of a word. By default, word characters are alphanumerics and
1397
- -- underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
1398
- -- to indicate no additional word characters.
1399
- -- @param case_insensitive Optional boolean flag indicating whether or not the
1400
- -- word match is case-insensitive. The default is `false`.
1401
- -- @return pattern
1402
- -- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
1403
- -- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
1404
- -- 'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
1405
- -- @name word_match
1406
- function M.word_match(words, word_chars, case_insensitive)
1407
- local word_list = {}
1408
- for _, word in ipairs(words) do
1409
- word_list[case_insensitive and word:lower() or word] = true
1410
- end
1411
- local chars = M.alnum + '_'
1412
- if word_chars then chars = chars + lpeg_S(word_chars) end
1413
- return lpeg_Cmt(chars^1, function(input, index, word)
1414
- if case_insensitive then word = word:lower() end
1415
- return word_list[word] and index or nil
1416
- end)
1417
- end
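
A sketch with hyphenated, case-insensitive words; identifiers not in the list fail to match.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local kw = l.word_match({'foo-bar', 'end-if'}, '-', true)
    print(lpeg.match(kw, 'FOO-BAR'))   --> 8
    print(lpeg.match(kw, 'foobar'))    --> nil (not in the list)
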
1418
-
1419
- ---
1420
- -- Embeds child lexer *child* in parent lexer *parent* using patterns
1421
- -- *start_rule* and *end_rule*, which signal the beginning and end of the
1422
- -- embedded lexer, respectively.
1423
- -- @param parent The parent lexer.
1424
- -- @param child The child lexer.
1425
- -- @param start_rule The pattern that signals the beginning of the embedded
1426
- -- lexer.
1427
- -- @param end_rule The pattern that signals the end of the embedded lexer.
1428
- -- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
1429
- -- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
1430
- -- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
1431
- -- @name embed_lexer
1432
- function M.embed_lexer(parent, child, start_rule, end_rule)
1433
- -- Add child rules.
1434
- if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
1435
- if not child._RULES then -- creating a child lexer to be embedded
1436
- if not child._rules then error('Cannot embed language with no rules') end
1437
- for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
1438
- end
1439
- child._EMBEDDEDRULES[parent._NAME] = {
1440
- ['start_rule'] = start_rule,
1441
- token_rule = join_tokens(child),
1442
- ['end_rule'] = end_rule
1443
- }
1444
- if not parent._CHILDREN then parent._CHILDREN = {} end
1445
- local children = parent._CHILDREN
1446
- children[#children + 1] = child
1447
- -- Add child styles.
1448
- if not parent._tokenstyles then parent._tokenstyles = {} end
1449
- local tokenstyles = parent._tokenstyles
1450
- tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
1451
- for token, style in pairs(child._tokenstyles or {}) do
1452
- tokenstyles[token] = style
1453
- end
1454
- child._lexer = parent -- use parent's tokens if child is embedding itself
1455
- parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
1456
- end
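
A hedged sketch of how this call would appear near the end of a hypothetical parent lexer module; `myhtml`, its rules, and the `css_tag` delimiters are made up here (this is not the bundled html lexer), and it assumes the bundled css lexer is on the lexer search path. `load` later builds the combined grammar when the parent is loaded.

    local l = require('lexer')
    local lpeg = require('lpeg')
    local token, P = l.token, lpeg.P
    local M = {_NAME = 'myhtml'}
    M._rules = {
      {'whitespace', token(l.WHITESPACE, l.space^1)},
      {'identifier', token(l.IDENTIFIER, l.word)},
    }
    M._tokenstyles = {css_tag = l.STYLE_TAG}   -- the embedded delimiters need a style
    local css = l.load('css')
    local css_start_rule = token('css_tag', P('<style>'))
    local css_end_rule = token('css_tag', P('</style>'))
    l.embed_lexer(M, css, css_start_rule, css_end_rule)
    return M
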
1457
-
1458
- -- Determines if the previous line is a comment.
1459
- -- This is used for determining if the current comment line is a fold point.
1460
- -- @param prefix The prefix string defining a comment.
1461
- -- @param text The text passed to a fold function.
1462
- -- @param pos The pos passed to a fold function.
1463
- -- @param line The line passed to a fold function.
1464
- -- @param s The s passed to a fold function.
1465
- local function prev_line_is_comment(prefix, text, pos, line, s)
1466
- local start = line:find('%S')
1467
- if start < s and not line:find(prefix, start, true) then return false end
1468
- local p = pos - 1
1469
- if text:sub(p, p) == '\n' then
1470
- p = p - 1
1471
- if text:sub(p, p) == '\r' then p = p - 1 end
1472
- if text:sub(p, p) ~= '\n' then
1473
- while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
1474
- while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1475
- return text:sub(p, p + #prefix - 1) == prefix
1476
- end
1477
- end
1478
- return false
1479
- end
1480
-
1481
- -- Determines if the next line is a comment.
1482
- -- This is used for determining if the current comment line is a fold point.
1483
- -- @param prefix The prefix string defining a comment.
1484
- -- @param text The text passed to a fold function.
1485
- -- @param pos The pos passed to a fold function.
1486
- -- @param line The line passed to a fold function.
1487
- -- @param s The s passed to a fold function.
1488
- local function next_line_is_comment(prefix, text, pos, line, s)
1489
- local p = text:find('\n', pos + s)
1490
- if p then
1491
- p = p + 1
1492
- while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1493
- return text:sub(p, p + #prefix - 1) == prefix
1494
- end
1495
- return false
1496
- end
1497
-
1498
- ---
1499
- -- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
1500
- -- that folds consecutive line comments that start with string *prefix*.
1501
- -- @param prefix The prefix string defining a line comment.
1502
- -- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
1503
- -- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
1504
- -- @name fold_line_comments
1505
- function M.fold_line_comments(prefix)
1506
- local property_int = M.property_int
1507
- return function(text, pos, line, s)
1508
- if property_int['fold.line.comments'] == 0 then return 0 end
1509
- if s > 1 and line:match('^%s*()') < s then return 0 end
1510
- local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
1511
- local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
1512
- if not prev_line_comment and next_line_comment then return 1 end
1513
- if prev_line_comment and not next_line_comment then return -1 end
1514
- return 0
1515
- end
1516
- end
1517
-
1518
- M.property_expanded = setmetatable({}, {
1519
- -- Returns the string property value associated with string property *key*,
1520
- -- replacing any "$()" and "%()" expressions with the values of their keys.
1521
- __index = function(t, key)
1522
- return M.property[key]:gsub('[$%%]%b()', function(key)
1523
- return t[key:sub(3, -2)]
1524
- end)
1525
- end,
1526
- __newindex = function() error('read-only property') end
1527
- })
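
A hedged illustration of the expansion, assuming the embedding application has populated `property` (the keys below are made up):

    local l = require('lexer')
    l.property = l.property or {}           -- normally supplied by the host
    l.property['fold.compact'] = '1'
    l.property['style.default'] = 'fold:$(fold.compact),size:10'
    print(l.property_expanded['style.default'])   --> fold:1,size:10
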
1528
-
1529
- --[[ The functions and fields below were defined in C.
1530
-
1531
- ---
1532
- -- Individual fields for a lexer instance.
1533
- -- @field _NAME The string name of the lexer.
1534
- -- @field _rules An ordered list of rules for a lexer grammar.
1535
- -- Each rule is a table containing an arbitrary rule name and the LPeg pattern
1536
- -- associated with the rule. The order of rules is important as rules are
1537
- -- matched sequentially.
1538
- -- Child lexers should not use this table to access and/or modify their
1539
- -- parent's rules and vice-versa. Use the `_RULES` table instead.
1540
- -- @field _tokenstyles A map of non-predefined token names to styles.
1541
- -- Remember to use token names, not rule names. It is recommended to use
1542
- -- predefined styles or color-agnostic styles derived from predefined styles
1543
- -- to ensure compatibility with user color themes.
1544
- -- @field _foldsymbols A table of recognized fold points for the lexer.
1545
- -- Keys are token names with table values defining fold points. Those table
1546
- -- values have string keys of keywords or characters that indicate a fold
1547
- -- point whose values are integers. A value of `1` indicates a beginning fold
1548
- -- point and a value of `-1` indicates an ending fold point. Values can also
1549
- -- be functions that return `1`, `-1`, or `0` (indicating no fold point) for
1550
- -- keys which need additional processing.
1551
- -- There is also a required `_patterns` key whose value is a table containing
1552
- -- Lua pattern strings that match all fold points (the string keys contained
1553
- -- in token name table values). When the lexer encounters text that matches
1554
- -- one of those patterns, the matched text is looked up in its token's table
1555
- -- to determine whether or not it is a fold point.
1556
- -- @field _fold If this function exists in the lexer, it is called for folding
1557
- -- the document instead of using `_foldsymbols` or indentation.
1558
- -- @field _lexer The parent lexer object whose rules should be used. This field
1559
- -- is only necessary to disambiguate a proxy lexer that loaded parent and
1560
- -- child lexers for embedding and ended up having multiple parents loaded.
1561
- -- @field _RULES A map of rule name keys with their associated LPeg pattern
1562
- -- values for the lexer.
1563
- -- This is constructed from the lexer's `_rules` table and accessible to other
1564
- -- lexers for embedded lexer applications like modifying parent or child
1565
- -- rules.
1566
- -- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
1567
- -- (instead of an arbitrary chunk of text) at a time.
1568
- -- The default value is `false`. Line lexers cannot look ahead to subsequent
1569
- -- lines.
1570
- -- @class table
1571
- -- @name lexer
1572
- local lexer
1573
- ]]
1574
-
1575
- return M