immunio 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,68 +0,0 @@
1
- -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
2
- -- JavaScript LPeg lexer.
3
-
4
- local l = require('lexer')
5
- local token, word_match = l.token, l.word_match
6
- local P, R, S = lpeg.P, lpeg.R, lpeg.S
7
-
8
- local M = {_NAME = 'javascript'}
9
-
10
- -- Whitespace.
11
- local ws = token(l.WHITESPACE, l.space^1)
12
-
13
- -- Comments.
14
- local line_comment = '//' * l.nonnewline_esc^0
15
- local block_comment = '/*' * (l.any - '*/')^0 * P('*/')^-1
16
- local comment = token(l.COMMENT, line_comment + block_comment)
17
-
18
- -- Strings.
19
- local sq_str = l.delimited_range("'")
20
- local dq_str = l.delimited_range('"')
21
- local regex = token( "regex", l.last_char_includes('+-*%^!=&|?:;,([{<>') *
22
- l.delimited_range('/', true) * S('igm')^0 )
23
- local string = token(l.STRING, sq_str + dq_str) --+ token(l.REGEX, regex_str)
24
-
25
- -- Numbers.
26
- local number = token(l.NUMBER, l.float + l.integer)
27
-
28
- -- Keywords.
29
- local keyword = token(l.KEYWORD, word_match{
30
- 'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
31
- 'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
32
- 'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
33
- 'function', 'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
34
- 'interface', 'let', 'long', 'native', 'new', 'null', 'package', 'private',
35
- 'protected', 'public', 'return', 'short', 'static', 'super', 'switch',
36
- 'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
37
- 'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield'
38
- })
39
-
40
- -- Identifiers.
41
- local identifier = token(l.IDENTIFIER, l.word)
42
-
43
- -- Operators.
44
- local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))
45
-
46
- -- Immunio marker
47
- local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
48
-
49
-
50
- M._rules = {
51
- {'whitespace', ws},
52
- {'marker', marker},
53
- {'keyword', keyword},
54
- {'identifier', identifier},
55
- {'comment', comment},
56
- {'number', number},
57
- {'string', string},
58
- {'regex', regex},
59
- {'operator', operator},
60
- }
61
-
62
- M._foldsymbols = {
63
- _patterns = {'[{}]', '/%*', '%*/', '//'},
64
- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
65
- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')}
66
- }
67
-
68
- return M
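
For reference, the removed JavaScript lexer above is an ordinary Scintillua-style lexer module, driven through the `lexer` module's `load()` and `lex()` functions that appear in the second removed file below. The following is a minimal usage sketch for stand-alone use, not part of the package itself; it assumes the removed files are available on Lua's `package.path` as `lexer.lua` and `javascript.lua`, and the sample input string is illustrative only.

    -- Sketch only: assumes lexer.lua and javascript.lua are reachable on package.path.
    local lexer = require('lexer')
    local js = lexer.load('javascript')          -- loads and returns the lexer object M above
    local tokens = js:lex('var x = 1; // done')  -- equivalent to lexer.lex(js, text)
    -- Per lexer.lex() below, the result table alternates token names and positions.
    for i = 1, #tokens, 2 do
      print(tokens[i], tokens[i + 1])
    end
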
@@ -1,1575 +0,0 @@
1
- -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
2
-
3
- local M = {}
4
-
5
- --[=[ This comment is for LuaDoc.
6
- ---
7
- -- Lexes Scintilla documents with Lua and LPeg.
8
- --
9
- -- ## Overview
10
- --
11
- -- Lexers highlight the syntax of source code. Scintilla (the editing component
12
- -- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
13
- -- lexers which are notoriously difficult to create and/or extend. On the other
14
- -- hand, Lua makes it easy to rapidly create new lexers, extend existing
15
- -- ones, and embed lexers within one another. Lua lexers tend to be more
16
- -- readable than C++ lexers too.
17
- --
18
- -- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
19
- -- [LPeg library][]. The following table comes from the LPeg documentation and
20
- -- summarizes all you need to know about constructing basic LPeg patterns. This
21
- -- module provides convenience functions for creating and working with other
22
- -- more advanced patterns and concepts.
23
- --
24
- -- Operator | Description
25
- -- ---------------------|------------
26
- -- `lpeg.P(string)` | Matches `string` literally.
27
- -- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters.
28
- -- `lpeg.S(string)` | Matches any character in set `string`.
29
- -- `lpeg.R("`_`xy`_`")` | Matches any character between range `x` and `y`.
30
- -- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`.
31
- -- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`.
32
- -- `patt1 * patt2` | Matches `patt1` followed by `patt2`.
33
- -- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice).
34
- -- `patt1 - patt2` | Matches `patt1` if `patt2` does not match.
35
- -- `-patt` | Equivalent to `("" - patt)`.
36
- -- `#patt` | Matches `patt` but consumes no input.
37
- --
38
- -- The first part of this document deals with rapidly constructing a simple
39
- -- lexer. The next part deals with more advanced techniques, such as custom
40
- -- coloring and embedding lexers within one another. Following that is a
41
- -- discussion about code folding, or being able to tell Scintilla which code
42
- -- blocks are "foldable" (temporarily hideable from view). After that are
43
- -- instructions on how to use LPeg lexers with the aforementioned Textadept and
44
- -- SciTE editors. Finally there are comments on lexer performance and
45
- -- limitations.
46
- --
47
- -- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
48
- -- [Textadept]: http://foicica.com/textadept
49
- -- [SciTE]: http://scintilla.org/SciTE.html
50
- --
51
- -- ## Lexer Basics
52
- --
53
- -- The *lexers/* directory contains all lexers, including your new one. Before
54
- -- attempting to write one from scratch though, first determine if your
55
- -- programming language is similar to any of the 80+ languages supported. If so,
56
- -- you may be able to copy and modify that lexer, saving some time and effort.
57
- -- The filename of your lexer should be the name of your programming language in
58
- -- lower case followed by a *.lua* extension. For example, a new Lua lexer has
59
- -- the name *lua.lua*.
60
- --
61
- -- Note: Try to refrain from using one-character language names like "b", "c",
62
- -- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
63
- -- respectively.
64
- --
65
- -- ### New Lexer Template
66
- --
67
- -- There is a *lexers/template.txt* file that contains a simple template for a
68
- -- new lexer. Feel free to use it, replacing the '?'s with the name of your
69
- -- lexer:
70
- --
71
- -- -- ? LPeg lexer.
72
- --
73
- -- local l = require('lexer')
74
- -- local token, word_match = l.token, l.word_match
75
- -- local P, R, S = lpeg.P, lpeg.R, lpeg.S
76
- --
77
- -- local M = {_NAME = '?'}
78
- --
79
- -- -- Whitespace.
80
- -- local ws = token(l.WHITESPACE, l.space^1)
81
- --
82
- -- M._rules = {
83
- -- {'whitespace', ws},
84
- -- }
85
- --
86
- -- M._tokenstyles = {
87
- --
88
- -- }
89
- --
90
- -- return M
91
- --
92
- -- The first 4 lines of code simply define often used convenience variables. The
93
- -- 5th and last lines define and return the lexer object Scintilla uses; they
94
- -- are very important and must be part of every lexer. The sixth line defines
95
- -- something called a "token", an essential building block of lexers. You will
96
- -- learn about tokens shortly. The rest of the code defines a set of grammar
97
- -- rules and token styles. You will learn about those later. Note, however, the
98
- -- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
99
- -- belong to their respective lexers, but any non-local variables need the `M.`
100
- -- prefix too so as not to affect Lua's global environment. All in all, this is
101
- -- a minimal, working lexer that you can build on.
102
- --
103
- -- ### Tokens
104
- --
105
- -- Take a moment to think about your programming language's structure. What kind
106
- -- of key elements does it have? In the template shown earlier, one predefined
107
- -- element all languages have is whitespace. Your language probably also has
108
- -- elements like comments, strings, and keywords. Lexers refer to these elements
109
- -- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
110
- -- break down source code into tokens for coloring, which results in the syntax
111
- -- highlighting familiar to you. It is up to you how specific your lexer is when
112
- -- it comes to tokens. Perhaps only distinguishing between keywords and
113
- -- identifiers is necessary, or maybe recognizing constants and built-in
114
- -- functions, methods, or libraries is desirable. The Lua lexer, for example,
115
- -- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
116
- -- functions, constants, built-in libraries, identifiers, labels, and operators.
117
- -- Even though constants, built-in functions, and built-in libraries are subsets
118
- -- of identifiers, Lua programmers find it helpful for the lexer to distinguish
119
- -- between them all. It is perfectly acceptable to just recognize keywords and
120
- -- identifiers.
121
- --
122
- -- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
123
- -- sequence of characters recognized as an instance of that token. Create tokens
124
- -- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
125
- -- defined in the template shown earlier:
126
- --
127
- -- local ws = token(l.WHITESPACE, l.space^1)
128
- --
129
- -- At first glance, the first argument does not appear to be a string name and
130
- -- the second argument does not appear to be an LPeg pattern. Perhaps you
131
- -- expected something like:
132
- --
133
- -- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
134
- --
135
- -- The `lexer` (`l`) module actually provides a convenient list of common token
136
- -- names and common LPeg patterns for you to use. Token names include
137
- -- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
138
- -- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
139
- -- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
140
- -- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
141
- -- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
142
- -- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
143
- -- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
144
- -- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
145
- -- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
146
- -- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
147
- -- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
148
- -- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
149
- -- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
150
- -- none of the above fit your language, but an advantage to using predefined
151
- -- token names is that your lexer's tokens will inherit the universal syntax
152
- -- highlighting color theme used by your text editor.
153
- --
154
- -- #### Example Tokens
155
- --
156
- -- So, how might you define other tokens like comments, strings, and keywords?
157
- -- Here are some examples.
158
- --
159
- -- **Comments**
160
- --
161
- -- Line-style comments with a prefix character(s) are easy to express with LPeg:
162
- --
163
- -- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
164
- -- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
165
- --
166
- -- The comments above start with a '#' or "//" and go to the end of the line.
167
- -- The second comment recognizes the next line also as a comment if the current
168
- -- line ends with a '\' escape character.
169
- --
170
- -- C-style "block" comments with a start and end delimiter are also easy to
171
- -- express:
172
- --
173
- -- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
174
- --
175
- -- This comment starts with a "/\*" sequence and contains anything up to and
176
- -- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
177
- -- can recognize unfinished comments as comments and highlight them properly.
178
- --
179
- -- **Strings**
180
- --
181
- -- It is tempting to think that a string is not much different from the block
182
- -- comment shown above in that both have start and end delimiters:
183
- --
184
- -- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
185
- -- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
186
- -- local simple_string = token(l.STRING, dq_str + sq_str)
187
- --
188
- -- However, most programming languages allow escape sequences in strings such
189
- -- that a sequence like "\\&quot;" in a double-quoted string indicates that the
190
- -- '&quot;' is not the end of the string. The above token incorrectly matches
191
- -- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
192
- -- function.
193
- --
194
- -- local dq_str = l.delimited_range('"')
195
- -- local sq_str = l.delimited_range("'")
196
- -- local string = token(l.STRING, dq_str + sq_str)
197
- --
198
- -- In this case, the lexer treats '\' as an escape character in a string
199
- -- sequence.
200
- --
201
- -- **Keywords**
202
- --
203
- -- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
204
- -- choices, use another convenience function: [`lexer.word_match()`](). It is
205
- -- much easier and more efficient to write word matches like:
206
- --
207
- -- local keyword = token(l.KEYWORD, l.word_match{
208
- -- 'keyword_1', 'keyword_2', ..., 'keyword_n'
209
- -- })
210
- --
211
- -- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
212
- -- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
213
- -- }, nil, true))
214
- --
215
- -- local hyphened_keyword = token(l.KEYWORD, l.word_match({
216
- -- 'keyword-1', 'keyword-2', ..., 'keyword-n'
217
- -- }, '-'))
218
- --
219
- -- By default, characters considered to be in keywords are in the set of
220
- -- alphanumeric characters and underscores. The last token demonstrates how to
221
- -- allow '-' (hyphen) characters to be in keywords as well.
222
- --
223
- -- **Numbers**
224
- --
225
- -- Most programming languages have the same format for integer and float tokens,
226
- -- so it might be as simple as using a couple of predefined LPeg patterns:
227
- --
228
- -- local number = token(l.NUMBER, l.float + l.integer)
229
- --
230
- -- However, some languages allow postfix characters on integers.
231
- --
232
- -- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
233
- -- local number = token(l.NUMBER, l.float + l.hex_num + integer)
234
- --
235
- -- Your language may need other tweaks, but it is up to you how fine-grained you
236
- -- want your highlighting to be. After all, you are not writing a compiler or
237
- -- interpreter!
238
- --
239
- -- ### Rules
240
- --
241
- -- Programming languages have grammars, which specify valid token structure. For
242
- -- example, comments usually cannot appear within a string. Grammars consist of
243
- -- rules, which are simply combinations of tokens. Recall from the lexer
244
- -- template the `_rules` table, which defines all the rules used by the lexer
245
- -- grammar:
246
- --
247
- -- M._rules = {
248
- -- {'whitespace', ws},
249
- -- }
250
- --
251
- -- Each entry in a lexer's `_rules` table consists of a rule name and its
252
- -- associated pattern. Rule names are completely arbitrary and serve only to
253
- -- identify and distinguish between different rules. Rule order is important: if
254
- -- text does not match the first rule, the lexer tries the second rule, and so
255
- -- on. This simple grammar says to match whitespace tokens under a rule named
256
- -- "whitespace".
257
- --
258
- -- To illustrate the importance of rule order, here is an example of a
259
- -- simplified Lua grammar:
260
- --
261
- -- M._rules = {
262
- -- {'whitespace', ws},
263
- -- {'keyword', keyword},
264
- -- {'identifier', identifier},
265
- -- {'string', string},
266
- -- {'comment', comment},
267
- -- {'number', number},
268
- -- {'label', label},
269
- -- {'operator', operator},
270
- -- }
271
- --
272
- -- Note how identifiers come after keywords. In Lua, as with most programming
273
- -- languages, the characters allowed in keywords and identifiers are in the same
274
- -- set (alphanumerics plus underscores). If the lexer specified the "identifier"
275
- -- rule before the "keyword" rule, all keywords would match identifiers and thus
276
- -- incorrectly highlight as identifiers instead of keywords. The same idea
277
- -- applies to function, constant, etc. tokens that you may want to distinguish
278
- -- between: their rules should come before identifiers.
279
- --
280
- -- So what about text that does not match any rules? For example in Lua, the '!'
281
- -- character is meaningless outside a string or comment. Normally the lexer
282
- -- skips over such text. If instead you want to highlight these "syntax errors",
283
- -- add an additional end rule:
284
- --
285
- -- M._rules = {
286
- -- {'whitespace', ws},
287
- -- {'error', token(l.ERROR, l.any)},
288
- -- }
289
- --
290
- -- This identifies and highlights any character not matched by an existing
291
- -- rule as an `lexer.ERROR` token.
292
- --
293
- -- Even though the rules defined in the examples above contain a single token,
294
- -- rules may consist of multiple tokens. For example, a rule for an HTML tag
295
- -- could consist of a tag token followed by an arbitrary number of attribute
296
- -- tokens, allowing the lexer to highlight all tokens separately. The rule might
297
- -- look something like this:
298
- --
299
- -- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
300
- --
301
- -- Note however that lexers with complex rules like these are more prone to lose
302
- -- track of their state.
303
- --
304
- -- ### Summary
305
- --
306
- -- Lexers primarily consist of tokens and grammar rules. At your disposal are a
307
- -- number of convenience patterns and functions for rapidly creating a lexer. If
308
- -- you choose to use predefined token names for your tokens, you do not have to
309
- -- define how the lexer highlights them. The tokens will inherit the default
310
- -- syntax highlighting color theme your editor uses.
311
- --
312
- -- ## Advanced Techniques
313
- --
314
- -- ### Styles and Styling
315
- --
316
- -- The most basic form of syntax highlighting is assigning different colors to
317
- -- different tokens. Instead of highlighting with just colors, Scintilla allows
318
- -- for more rich highlighting, or "styling", with different fonts, font sizes,
319
- -- font attributes, and foreground and background colors, just to name a few.
320
- -- The unit of this rich highlighting is called a "style". Styles are simply
321
- -- strings of comma-separated property settings. By default, lexers associate
322
- -- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
323
- -- `lexer.STRING`, etc. with particular styles as part of a universal color
324
- -- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
325
- -- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
326
- -- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
327
- -- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
328
- -- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
329
- -- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
330
- -- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
331
- -- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
332
- -- predefined token names and LPeg patterns, you may define your own styles. At
333
- -- their core, styles are just strings, so you may create new ones and/or modify
334
- -- existing ones. Each style consists of the following comma-separated settings:
335
- --
336
- -- Setting | Description
337
- -- ---------------|------------
338
- -- font:_name_ | The name of the font the style uses.
339
- -- size:_int_ | The size of the font the style uses.
340
- -- [not]bold | Whether or not the font face is bold.
341
- -- [not]italics | Whether or not the font face is italic.
342
- -- [not]underlined| Whether or not the font face is underlined.
343
- -- fore:_color_ | The foreground color of the font face.
344
- -- back:_color_ | The background color of the font face.
345
- -- [not]eolfilled | Does the background color extend to the end of the line?
346
- -- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
347
- -- [not]visible | Whether or not the text is visible.
348
- -- [not]changeable| Whether the text is changeable or read-only.
349
- -- [not]hotspot | Whether or not the text is clickable.
350
- --
351
- -- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
352
- -- decimal equivalent of the latter. As with token names, LPeg patterns, and
353
- -- styles, there is a set of predefined color names, but they vary depending on
354
- -- the current color theme in use. Therefore, it is generally not a good idea to
355
- -- manually define colors within styles in your lexer since they might not fit
356
- -- into a user's chosen color theme. Try to refrain from even using predefined
357
- -- colors in a style because that color may be theme-specific. Instead, the best
358
- -- practice is to either use predefined styles or derive new color-agnostic
359
- -- styles from predefined ones. For example, Lua "longstring" tokens use the
360
- -- existing `lexer.STYLE_STRING` style instead of defining a new one.
361
- --
362
- -- #### Example Styles
363
- --
364
- -- Defining styles is pretty straightforward. An empty style that inherits the
365
- -- default theme settings is simply an empty string:
366
- --
367
- -- local style_nothing = ''
368
- --
369
- -- A similar style but with a bold font face looks like this:
370
- --
371
- -- local style_bold = 'bold'
372
- --
373
- -- If you want the same style, but also with an italic font face, define the new
374
- -- style in terms of the old one:
375
- --
376
- -- local style_bold_italic = style_bold..',italics'
377
- --
378
- -- This allows you to derive new styles from predefined ones without having to
379
- -- rewrite them. This operation leaves the old style unchanged. Thus if you
380
- -- had a "static variable" token whose style you wanted to base off of
381
- -- `lexer.STYLE_VARIABLE`, it would probably look like:
382
- --
383
- -- local style_static_var = l.STYLE_VARIABLE..',italics'
384
- --
385
- -- The color theme files in the *lexers/themes/* folder give more examples of
386
- -- style definitions.
387
- --
388
- -- ### Token Styles
389
- --
390
- -- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
391
- -- Recall the token definition and `_tokenstyles` table from the lexer template:
392
- --
393
- -- local ws = token(l.WHITESPACE, l.space^1)
394
- --
395
- -- ...
396
- --
397
- -- M._tokenstyles = {
398
- --
399
- -- }
400
- --
401
- -- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
402
- -- earlier, lexers automatically associate tokens that use predefined token
403
- -- names with a particular style. Only tokens with custom token names need
404
- -- manual style associations. As an example, consider a custom whitespace token:
405
- --
406
- -- local ws = token('custom_whitespace', l.space^1)
407
- --
408
- -- Assigning a style to this token looks like:
409
- --
410
- -- M._tokenstyles = {
411
- -- custom_whitespace = l.STYLE_WHITESPACE
412
- -- }
413
- --
414
- -- Do not confuse token names with rule names. They are completely different
415
- -- entities. In the example above, the lexer assigns the "custom_whitespace"
416
- -- token the existing style for `WHITESPACE` tokens. If instead you want to
417
- -- color the background of whitespace a shade of grey, it might look like:
418
- --
419
- -- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
420
- -- M._tokenstyles = {
421
- -- custom_whitespace = custom_style
422
- -- }
423
- --
424
- -- Notice that the lexer performs Scintilla/SciTE-style "$()" property expansion.
425
- -- You may also use "%()". Remember to refrain from assigning specific colors in
426
- -- styles, but in this case, all user color themes probably define the
427
- -- "color.grey" property.
428
- --
429
- -- ### Line Lexers
430
- --
431
- -- By default, lexers match the arbitrary chunks of text passed to them by
432
- -- Scintilla. These chunks may be a full document, only the visible part of a
433
- -- document, or even just portions of lines. Some lexers need to match whole
434
- -- lines. For example, a lexer for the output of a file "diff" needs to know if
435
- -- the line started with a '+' or '-' and then style the entire line
436
- -- accordingly. To indicate that your lexer matches by line, use the
437
- -- `_LEXBYLINE` field:
438
- --
439
- -- M._LEXBYLINE = true
440
- --
441
- -- Now the input text for the lexer is a single line at a time. Keep in mind
442
- -- that line lexers do not have the ability to look ahead at subsequent lines.
443
- --
444
- -- ### Embedded Lexers
445
- --
446
- -- Lexers embed within one another very easily, requiring minimal effort. In the
447
- -- following sections, the lexer being embedded is called the "child" lexer and
448
- -- the lexer a child is being embedded in is called the "parent". For example,
449
- -- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
450
- -- their respective HTML and CSS files. However, CSS can be embedded inside
451
- -- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
452
- -- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
453
- -- sounds a lot like the case with CSS, but there is a subtle difference: PHP
454
- -- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
455
- -- difference results in two types of embedded lexers: a parent lexer that
456
- -- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
457
- -- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
458
- --
459
- -- #### Parent Lexer
460
- --
461
- -- Before embedding a child lexer into a parent lexer, the parent lexer needs to
462
- -- load the child lexer. This is done with the [`lexer.load()`]() function. For
463
- -- example, loading the CSS lexer within the HTML lexer looks like:
464
- --
465
- -- local css = l.load('css')
466
- --
467
- -- The next part of the embedding process is telling the parent lexer when to
468
- -- switch over to the child lexer and when to switch back. The lexer refers to
469
- -- these indications as the "start rule" and "end rule", respectively, and are
470
- -- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
471
- -- HTML to CSS is when the lexer encounters a "style" tag with a "type"
472
- -- attribute whose value is "text/css":
473
- --
474
- -- local css_tag = P('<style') * P(function(input, index)
475
- -- if input:find('^[^>]+type="text/css"', index) then
476
- -- return index
477
- -- end
478
- -- end)
479
- --
480
- -- This pattern looks for the beginning of a "style" tag and searches its
481
- -- attribute list for the text "`type="text/css"`". (In this simplified example,
482
- -- the Lua pattern does not consider whitespace around the '=' nor does it
483
- -- consider that using single quotes is valid.) If there is a match, the
484
- -- functional pattern returns a value instead of `nil`. In this case, the value
485
- -- returned does not matter because we ultimately want to style the "style" tag
486
- -- as an HTML tag, so the actual start rule looks like this:
487
- --
488
- -- local css_start_rule = #css_tag * tag
489
- --
490
- -- Now that the parent knows when to switch to the child, it needs to know when
491
- -- to switch back. In the case of HTML/CSS, the switch back occurs when the
492
- -- lexer encounters an ending "style" tag, though the lexer should still style
493
- -- the tag as an HTML tag:
494
- --
495
- -- local css_end_rule = #P('</style>') * tag
496
- --
497
- -- Once the parent loads the child lexer and defines the child's start and end
498
- -- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
499
- --
500
- -- l.embed_lexer(M, css, css_start_rule, css_end_rule)
501
- --
502
- -- The first parameter is the parent lexer object to embed the child in, which
503
- -- in this case is `M`. The other three parameters are the child lexer object
504
- -- loaded earlier followed by its start and end rules.
505
- --
506
- -- #### Child Lexer
507
- --
508
- -- The process for instructing a child lexer to embed itself into a parent is
509
- -- very similar to embedding a child into a parent: first, load the parent lexer
510
- -- into the child lexer with the [`lexer.load()`]() function and then create
511
- -- start and end rules for the child lexer. However, in this case, swap the
512
- -- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
513
- -- lexer:
514
- --
515
- -- local html = l.load('html')
516
- -- local php_start_rule = token('php_tag', '<?php ')
517
- -- local php_end_rule = token('php_tag', '?>')
518
- -- l.embed_lexer(html, M, php_start_rule, php_end_rule)
519
- --
520
- -- ## Code Folding
521
- --
522
- -- When reading source code, it is occasionally helpful to temporarily hide
523
- -- blocks of code like functions, classes, comments, etc. This is the concept of
524
- -- "folding". In the Textadept and SciTE editors for example, little indicators
525
- -- in the editor margins appear next to code that can be folded at places called
526
- -- "fold points". When the user clicks an indicator, the editor hides the code
527
- -- associated with the indicator until the user clicks the indicator again. The
528
- -- lexer specifies these fold points and what code exactly to fold.
529
- --
530
- -- The fold points for most languages occur on keywords or character sequences.
531
- -- Examples of fold keywords are "if" and "end" in Lua and examples of fold
532
- -- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
533
- -- comment delimiters, respectively. However, these fold points cannot occur
534
- -- just anywhere. For example, lexers should not recognize fold keywords that
535
- -- appear within strings or comments. The lexer's `_foldsymbols` table allows
536
- -- you to conveniently define fold points with such granularity. For example,
537
- -- consider C:
538
- --
539
- -- M._foldsymbols = {
540
- -- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
541
- -- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
542
- -- _patterns = {'[{}]', '/%*', '%*/'}
543
- -- }
544
- --
545
- -- The first assignment states that any '{' or '}' that the lexer recognized as
546
- -- an `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
547
- -- match is a beginning fold point and `-1` indicates the match is an ending
548
- -- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
549
- -- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
550
- -- The lexer does not consider any occurrences of these characters outside their
551
- -- defined tokens (such as in a string) as fold points. Finally, every
552
- -- `_foldsymbols` table must have a `_patterns` field that contains a list of
553
- -- [Lua patterns][] that match fold points. If the lexer encounters text that
554
- -- matches one of those patterns, the lexer looks up the matched text in its
555
- -- token's table to determine whether or not the text is a fold point. In the
556
- -- example above, the first Lua pattern matches any '{' or '}' characters. When
557
- -- the lexer comes across one of those characters, it checks if the match is an
558
- -- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
559
- -- point. The same idea applies for the other patterns. (The '%' is in the other
560
- -- patterns because '\*' is a special character in Lua patterns that needs
561
- -- escaping.) How do you specify fold keywords? Here is an example for Lua:
562
- --
563
- -- M._foldsymbols = {
564
- -- [l.KEYWORD] = {
565
- -- ['if'] = 1, ['do'] = 1, ['function'] = 1,
566
- -- ['end'] = -1, ['repeat'] = 1, ['until'] = -1
567
- -- },
568
- -- _patterns = {'%l+'}
569
- -- }
570
- --
571
- -- Any time the lexer encounters a lower case word, if that word is a
572
- -- `lexer.KEYWORD` token and in the associated list of fold points, the lexer
573
- -- identifies the word as a fold point.
574
- --
575
- -- If your lexer needs to do some additional processing to determine if a match
576
- -- is a fold point, assign a function that returns an integer. Returning `1` or
577
- -- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
578
- -- For example:
579
- --
580
- -- local function fold_strange_token(text, pos, line, s, match)
581
- -- if ... then
582
- -- return 1 -- beginning fold point
583
- -- elseif ... then
584
- -- return -1 -- ending fold point
585
- -- end
586
- -- return 0
587
- -- end
588
- --
589
- -- M._foldsymbols = {
590
- -- ['strange_token'] = {['|'] = fold_strange_token},
591
- -- _patterns = {'|'}
592
- -- }
593
- --
594
- -- Any time the lexer encounters a '|' that is a "strange_token", it calls the
595
- -- `fold_strange_token` function to determine if '|' is a fold point. The lexer
596
- -- calls these functions with the following arguments: the text to identify fold
597
- -- points in, the beginning position of the current line in the text to fold,
598
- -- the current line's text, the position in the current line the matched text
599
- -- starts at, and the matched text itself.
600
- --
601
- -- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
602
- --
603
- -- ## Using Lexers
604
- --
605
- -- ### Textadept
606
- --
607
- -- Put your lexer in your *~/.textadept/lexers/* directory so you do not
608
- -- overwrite it when upgrading Textadept. Also, lexers in this directory
609
- -- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
610
- -- the default *lua* lexer. This is convenient for tweaking a default lexer to
611
- -- your liking. Then add a [file type][] for your lexer if necessary.
612
- --
613
- -- [file type]: _M.textadept.file_types.html
614
- --
615
- -- ### SciTE
616
- --
617
- -- Create a *.properties* file for your lexer and `import` it in either your
618
- -- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the
619
- -- *.properties* file should contain:
620
- --
621
- -- file.patterns.[lexer_name]=[file_patterns]
622
- -- lexer.$(file.patterns.[lexer_name])=[lexer_name]
623
- --
624
- -- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
625
- -- and `[file_patterns]` is a set of file extensions to use your lexer for.
626
- --
627
- -- Please note that Lua lexers ignore any styling information in *.properties*
628
- -- files. Your theme file in the *lexers/themes/* directory contains styling
629
- -- information.
630
- --
631
- -- ## Considerations
632
- --
633
- -- ### Performance
634
- --
635
- -- There might be some slight overhead when initializing a lexer, but loading a
636
- -- file from disk into Scintilla is usually more expensive. On modern computer
637
- -- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
638
- -- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
639
- -- so that the most common rules match first. Do keep in mind that order matters
640
- -- for similar rules.
641
- --
642
- -- ### Limitations
643
- --
644
- -- Embedded preprocessor languages like PHP cannot completely embed in their
645
- -- parent languages in that the parent's tokens do not support start and end
646
- -- rules. This mostly goes unnoticed, but code like
647
- --
648
- -- <div id="<?php echo $id; ?>">
649
- --
650
- -- or
651
- --
652
- -- <div <?php if ($odd) { echo 'class="odd"'; } ?>>
653
- --
654
- -- will not style correctly.
655
- --
656
- -- ### Troubleshooting
657
- --
658
- -- Errors in lexers can be tricky to debug. Lexers print Lua errors to
659
- -- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
660
- -- from a terminal is the easiest way to see errors as they occur.
661
- --
662
- -- ### Risks
663
- --
664
- -- Poorly written lexers have the ability to crash Scintilla (and thus its
665
- -- containing application), so unsaved data might be lost. However, I have only
666
- -- observed these crashes in early lexer development, when syntax errors or
667
- -- pattern errors are present. Once the lexer actually starts styling text
668
- -- (either correctly or incorrectly, it does not matter), I have not observed
669
- -- any crashes.
670
- --
671
- -- ### Acknowledgements
672
- --
673
- -- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
674
- -- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
675
- --
676
- -- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
677
- -- @field LEXERPATH (string)
678
- -- The path used to search for a lexer to load.
679
- -- Identical in format to Lua's `package.path` string.
680
- -- The default value is `package.path`.
681
- -- @field DEFAULT (string)
682
- -- The token name for default tokens.
683
- -- @field WHITESPACE (string)
684
- -- The token name for whitespace tokens.
685
- -- @field COMMENT (string)
686
- -- The token name for comment tokens.
687
- -- @field STRING (string)
688
- -- The token name for string tokens.
689
- -- @field NUMBER (string)
690
- -- The token name for number tokens.
691
- -- @field KEYWORD (string)
692
- -- The token name for keyword tokens.
693
- -- @field IDENTIFIER (string)
694
- -- The token name for identifier tokens.
695
- -- @field OPERATOR (string)
696
- -- The token name for operator tokens.
697
- -- @field ERROR (string)
698
- -- The token name for error tokens.
699
- -- @field PREPROCESSOR (string)
700
- -- The token name for preprocessor tokens.
701
- -- @field CONSTANT (string)
702
- -- The token name for constant tokens.
703
- -- @field VARIABLE (string)
704
- -- The token name for variable tokens.
705
- -- @field FUNCTION (string)
706
- -- The token name for function tokens.
707
- -- @field CLASS (string)
708
- -- The token name for class tokens.
709
- -- @field TYPE (string)
710
- -- The token name for type tokens.
711
- -- @field LABEL (string)
712
- -- The token name for label tokens.
713
- -- @field REGEX (string)
714
- -- The token name for regex tokens.
715
- -- @field STYLE_CLASS (string)
716
- -- The style typically used for class definitions.
717
- -- @field STYLE_COMMENT (string)
718
- -- The style typically used for code comments.
719
- -- @field STYLE_CONSTANT (string)
720
- -- The style typically used for constants.
721
- -- @field STYLE_ERROR (string)
722
- -- The style typically used for erroneous syntax.
723
- -- @field STYLE_FUNCTION (string)
724
- -- The style typically used for function definitions.
725
- -- @field STYLE_KEYWORD (string)
726
- -- The style typically used for language keywords.
727
- -- @field STYLE_LABEL (string)
728
- -- The style typically used for labels.
729
- -- @field STYLE_NUMBER (string)
730
- -- The style typically used for numbers.
731
- -- @field STYLE_OPERATOR (string)
732
- -- The style typically used for operators.
733
- -- @field STYLE_REGEX (string)
734
- -- The style typically used for regular expression strings.
735
- -- @field STYLE_STRING (string)
736
- -- The style typically used for strings.
737
- -- @field STYLE_PREPROCESSOR (string)
738
- -- The style typically used for preprocessor statements.
739
- -- @field STYLE_TYPE (string)
740
- -- The style typically used for static types.
741
- -- @field STYLE_VARIABLE (string)
742
- -- The style typically used for variables.
743
- -- @field STYLE_WHITESPACE (string)
744
- -- The style typically used for whitespace.
745
- -- @field STYLE_EMBEDDED (string)
746
- -- The style typically used for embedded code.
747
- -- @field STYLE_IDENTIFIER (string)
748
- -- The style typically used for identifier words.
749
- -- @field STYLE_DEFAULT (string)
750
- -- The style all styles are based off of.
751
- -- @field STYLE_LINENUMBER (string)
752
- -- The style used for all margins except fold margins.
753
- -- @field STYLE_BRACELIGHT (string)
754
- -- The style used for highlighted brace characters.
755
- -- @field STYLE_BRACEBAD (string)
756
- -- The style used for unmatched brace characters.
757
- -- @field STYLE_CONTROLCHAR (string)
758
- -- The style used for control characters.
759
- -- Color attributes are ignored.
760
- -- @field STYLE_INDENTGUIDE (string)
761
- -- The style used for indentation guides.
762
- -- @field STYLE_CALLTIP (string)
763
- -- The style used by call tips if [`buffer.call_tip_use_style`]() is set.
764
- -- Only the font name, size, and color attributes are used.
765
- -- @field any (pattern)
766
- -- A pattern that matches any single character.
767
- -- @field ascii (pattern)
768
- -- A pattern that matches any ASCII character (codes 0 to 127).
769
- -- @field extend (pattern)
770
- -- A pattern that matches any ASCII extended character (codes 0 to 255).
771
- -- @field alpha (pattern)
772
- -- A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
773
- -- @field digit (pattern)
774
- -- A pattern that matches any digit ('0'-'9').
775
- -- @field alnum (pattern)
776
- -- A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
777
- -- '0'-'9').
778
- -- @field lower (pattern)
779
- -- A pattern that matches any lower case character ('a'-'z').
780
- -- @field upper (pattern)
781
- -- A pattern that matches any upper case character ('A'-'Z').
782
- -- @field xdigit (pattern)
783
- -- A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
784
- -- @field cntrl (pattern)
785
- -- A pattern that matches any control character (ASCII codes 0 to 31).
786
- -- @field graph (pattern)
787
- -- A pattern that matches any graphical character ('!' to '~').
788
- -- @field print (pattern)
789
- -- A pattern that matches any printable character (' ' to '~').
790
- -- @field punct (pattern)
791
- -- A pattern that matches any punctuation character ('!' to '/', ':' to '@',
792
- -- '[' to ''', '{' to '~').
793
- -- @field space (pattern)
794
- -- A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
795
- -- '\r', space).
796
- -- @field newline (pattern)
797
- -- A pattern that matches any set of end of line characters.
798
- -- @field nonnewline (pattern)
799
- -- A pattern that matches any single, non-newline character.
800
- -- @field nonnewline_esc (pattern)
801
- -- A pattern that matches any single, non-newline character or any set of end
802
- -- of line characters escaped with '\'.
803
- -- @field dec_num (pattern)
804
- -- A pattern that matches a decimal number.
805
- -- @field hex_num (pattern)
806
- -- A pattern that matches a hexadecimal number.
807
- -- @field oct_num (pattern)
808
- -- A pattern that matches an octal number.
809
- -- @field integer (pattern)
810
- -- A pattern that matches either a decimal, hexadecimal, or octal number.
811
- -- @field float (pattern)
812
- -- A pattern that matches a floating point number.
813
- -- @field word (pattern)
814
- -- A pattern that matches a typical word. Words begin with a letter or
815
- -- underscore and consist of alphanumeric and underscore characters.
816
- -- @field FOLD_BASE (number)
817
- -- The initial (root) fold level.
818
- -- @field FOLD_BLANK (number)
819
- -- Flag indicating that the line is blank.
820
- -- @field FOLD_HEADER (number)
821
- -- Flag indicating the line is a fold point.
822
- -- @field fold_level (table, Read-only)
823
- -- Table of fold level bit-masks for line numbers starting from zero.
824
- -- Fold level masks are composed of an integer level combined with any of the
825
- -- following bits:
826
- --
827
- -- * `lexer.FOLD_BASE`
828
- -- The initial fold level.
829
- -- * `lexer.FOLD_BLANK`
830
- -- The line is blank.
831
- -- * `lexer.FOLD_HEADER`
832
- -- The line is a header, or fold point.
833
- -- @field indent_amount (table, Read-only)
834
- -- Table of indentation amounts in character columns, for line numbers
835
- -- starting from zero.
836
- -- @field property (table)
837
- -- Map of key-value string pairs.
838
- -- @field property_expanded (table, Read-only)
839
- -- Map of key-value string pairs with `$()` and `%()` variable replacement
840
- -- performed in values.
841
- -- @field property_int (table, Read-only)
842
- -- Map of key-value pairs with values interpreted as numbers, or `0` if not
843
- -- found.
844
- -- @field style_at (table, Read-only)
845
- -- Table of style names at positions in the buffer starting from zero.
846
- module('lexer')]=]
847
-
848
- local lpeg = require('lpeg')
849
- local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
850
- local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
851
- local lpeg_Cmt, lpeg_C, lpeg_Cg = lpeg.Cmt, lpeg.C, lpeg.Cg
852
- local lpeg_match = lpeg.match
853
-
854
- M.LEXERPATH = package.path
855
-
856
- -- Table of loaded lexers.
857
- local lexers = {}
858
-
859
- -- Keep track of the last parent lexer loaded. This lexer's rules are used for
860
- -- proxy lexers (those that load parent and child lexers to embed) that do not
861
- -- declare a parent lexer.
862
- local parent_lexer
863
-
864
- if not package.searchpath then
865
- -- Searches for the given *name* in the given *path*.
866
- -- This is an implementation of Lua 5.2's `package.searchpath()` function for
867
- -- Lua 5.1.
868
- function package.searchpath(name, path)
869
- local tried = {}
870
- for part in path:gmatch('[^;]+') do
871
- local filename = part:gsub('%?', name)
872
- local f = io.open(filename, 'r')
873
- if f then f:close() return filename end
874
- tried[#tried + 1] = ("no file '%s'"):format(filename)
875
- end
876
- return nil, table.concat(tried, '\n')
877
- end
878
- end
879
-
880
- -- Adds a rule to a lexer's current ordered list of rules.
881
- -- @param lexer The lexer to add the given rule to.
882
- -- @param name The name associated with this rule. It is used for other lexers
883
- -- to access this particular rule from the lexer's `_RULES` table. It does not
884
- -- have to be the same as the name passed to `token`.
885
- -- @param rule The LPeg pattern of the rule.
886
- local function add_rule(lexer, id, rule)
887
- if not lexer._RULES then
888
- lexer._RULES = {}
889
- -- Contains an ordered list (by numerical index) of rule names. This is used
890
- -- in conjunction with lexer._RULES for building _TOKENRULE.
891
- lexer._RULEORDER = {}
892
- end
893
- lexer._RULES[id] = rule
894
- lexer._RULEORDER[#lexer._RULEORDER + 1] = id
895
- end
896
-
897
- -- Adds a new Scintilla style to Scintilla.
898
- -- @param lexer The lexer to add the given style to.
899
- -- @param token_name The name of the token associated with this style.
900
- -- @param style A Scintilla style created from `style()`.
901
- -- @see style
902
- local function add_style(lexer, token_name, style)
903
- local num_styles = lexer._numstyles
904
- if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
905
- if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
906
- lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
907
- lexer._EXTRASTYLES[token_name] = style
908
- end
909
-
910
- -- (Re)constructs `lexer._TOKENRULE`.
911
- -- @param parent The parent lexer.
912
- local function join_tokens(lexer)
913
- local patterns, order = lexer._RULES, lexer._RULEORDER
914
- local token_rule = patterns[order[1]]
915
- for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
916
- lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
917
- return lexer._TOKENRULE
918
- end
919
-
920
- -- Adds a given lexer and any of its embedded lexers to a given grammar.
921
- -- @param grammar The grammar to add the lexer to.
922
- -- @param lexer The lexer to add.
923
- local function add_lexer(grammar, lexer, token_rule)
924
- local token_rule = join_tokens(lexer)
925
- local lexer_name = lexer._NAME
926
- for _, child in ipairs(lexer._CHILDREN) do
927
- if child._CHILDREN then add_lexer(grammar, child) end
928
- local child_name = child._NAME
929
- local rules = child._EMBEDDEDRULES[lexer_name]
930
- local rules_token_rule = grammar['__'..child_name] or rules.token_rule
931
- grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
932
- rules.end_rule^-1 * lpeg_V(lexer_name)
933
- local embedded_child = '_'..child_name
934
- grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
935
- rules_token_rule)^0 * rules.end_rule^-1
936
- token_rule = lpeg_V(embedded_child) + token_rule
937
- end
938
- grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
939
- grammar[lexer_name] = token_rule^0
940
- end
941
-
942
- -- (Re)constructs `lexer._GRAMMAR`.
943
- -- @param lexer The parent lexer.
944
- -- @param initial_rule The name of the rule to start lexing with. The default
945
- -- value is `lexer._NAME`. Multilang lexers use this to start with a child
946
- -- rule if necessary.
947
- local function build_grammar(lexer, initial_rule)
948
- -- local children = lexer._CHILDREN
949
- -- if children then
950
- local lexer_name = lexer._NAME
951
- if not initial_rule then initial_rule = lexer_name end
952
- local grammar = {initial_rule}
953
- if not lexer._CHILDREN then lexer._CHILDREN={} end
954
- add_lexer(grammar, lexer)
955
- lexer._INITIALRULE = initial_rule
956
- lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
957
- -- else
958
- -- lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
959
- -- end
960
- end
961
-
962
- local string_upper = string.upper
963
- -- Default styles.
964
- local default = {
965
- 'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
966
- 'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
967
- 'function', 'class', 'type', 'label', 'regex', 'embedded'
968
- }
969
- for _, v in ipairs(default) do
970
- M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
971
- end
972
- -- Predefined styles.
973
- local predefined = {
974
- 'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
975
- 'indentguide', 'calltip'
976
- }
977
- for _, v in ipairs(predefined) do
978
- M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
979
- end
980
-
981
- ---
982
- -- Initializes or loads and returns the lexer of string name *name*.
983
- -- Scintilla calls this function to load a lexer. Parent lexers also call this
984
- -- function to load child lexers and vice-versa. The user calls this function
985
- -- to load a lexer when using Scintillua as a Lua library.
986
- -- @param name The name of the lexing language.
987
- -- @param alt_name The alternate name of the lexing language. This is useful for
988
- -- embedding the same child lexer with multiple sets of start and end tokens.
989
- -- @return lexer object
990
- -- @name load
991
- function M.load(name, alt_name)
992
- if lexers[alt_name or name] then return lexers[alt_name or name] end
993
- parent_lexer = nil -- reset
994
-
995
- -- When using Scintillua as a stand-alone module, the `property` and
996
- -- `property_int` tables do not exist (they are not useful). Create them to
997
- -- prevent errors from occurring.
998
- if not M.property then
999
- M.property, M.property_int = {}, setmetatable({}, {
1000
- __index = function(t, k)
1001
- return tostring(tonumber(M.property[k]) or 0)
1002
- end,
1003
- __newindex = function() error('read-only property') end
1004
- })
1005
- end
1006
-
1007
- -- Load the language lexer with its rules, styles, etc.
1008
- M.WHITESPACE = (alt_name or name)..'_whitespace'
1009
- local lexer_file, error = package.searchpath(name, M.LEXERPATH)
1010
- local ok, lexer = pcall(dofile, lexer_file or '')
1011
- if not ok then
1012
- _G.print(error or lexer) -- error message
1013
- lexer = {_NAME = alt_name or name}
1014
- end
1015
- if alt_name then lexer._NAME = alt_name end
1016
-
1017
- -- Create the initial maps for token names to style numbers and styles.
1018
- local token_styles = {}
1019
- for i = 1, #default do token_styles[default[i]] = i - 1 end
1020
- for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
1021
- lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
1022
- lexer._EXTRASTYLES = {}
1023
-
1024
- -- If the lexer is a proxy (loads parent and child lexers to embed) and does
1025
- -- not declare a parent, try and find one and use its rules.
1026
- if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end
1027
-
1028
- -- If the lexer is a proxy or a child that embedded itself, add its rules and
1029
- -- styles to the parent lexer. Then set the parent to be the main lexer.
1030
- if lexer._lexer then
1031
- local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
1032
- if not l._tokenstyles then l._tokenstyles = {} end
1033
- for _, r in ipairs(_r or {}) do
1034
- -- Prevent rule id clashes.
1035
- l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
1036
- end
1037
- for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
1038
- lexer = l
1039
- end
1040
-
1041
- -- Add the lexer's styles and build its grammar.
1042
- if lexer._rules then
1043
- for token, style in pairs(lexer._tokenstyles or {}) do
1044
- add_style(lexer, token, style)
1045
- end
1046
- for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
1047
- build_grammar(lexer)
1048
- end
1049
- -- Add the lexer's unique whitespace style.
1050
- add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)
1051
-
1052
- -- Process the lexer's fold symbols.
1053
- if lexer._foldsymbols and lexer._foldsymbols._patterns then
1054
- local patterns = lexer._foldsymbols._patterns
1055
- for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
1056
- end
1057
-
1058
- lexer.lex, lexer.fold = M.lex, M.fold
1059
- -- Immun.io copy over some of our helpful functions
1060
- if M.lex_recursive then lexer.lex_recursive = M.lex_recursive end
1061
- if M.unlex_rules then lexer.unlex_rules = M.unlex_rules end
1062
- lexers[alt_name or name] = lexer
1063
- return lexer
1064
- end
1065
-
1066
- ---
1067
- -- Lexes a chunk of text *text* (that has an initial style number of
1068
- -- *init_style*) with lexer *lexer*.
1069
- -- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
1070
- -- Otherwise the text is lexed as a whole.
1071
- -- @param lexer The lexer object to lex with.
1072
- -- @param text The text in the buffer to lex.
1073
- -- @param init_style The current style. Multiple-language lexers use this to
1074
- -- determine which language to start lexing in.
1075
- -- @return table of token names and positions.
1076
- -- @name lex
1077
- function M.lex(lexer, text, init_style)
1078
- if not lexer._LEXBYLINE then
1079
- -- For multilang lexers, build a new grammar whose initial_rule is the
1080
- -- current language.
1081
- if lexer._CHILDREN then
1082
- for style, style_num in pairs(lexer._TOKENSTYLES) do
1083
- if style_num == init_style then
1084
- local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
1085
- if lexer._INITIALRULE ~= lexer_name then
1086
- build_grammar(lexer, lexer_name)
1087
- end
1088
- break
1089
- end
1090
- end
1091
- end
1092
- return lpeg_match(lexer._GRAMMAR, text)
1093
- else
1094
- local tokens = {}
1095
- local function append(tokens, line_tokens, offset)
1096
- for i = 1, #line_tokens, 2 do
1097
- tokens[#tokens + 1] = line_tokens[i]
1098
- tokens[#tokens + 1] = line_tokens[i + 1] + offset
1099
- end
1100
- end
1101
- local offset = 0
1102
- local grammar = lexer._GRAMMAR
1103
- for line in text:gmatch('[^\r\n]*\r?\n?') do
1104
- local line_tokens = lpeg_match(grammar, line)
1105
- if line_tokens then append(tokens, line_tokens, offset) end
1106
- offset = offset + #line
1107
- -- Use the default style to the end of the line if none was specified.
1108
- if tokens[#tokens] ~= offset then
1109
- tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
1110
- end
1111
- end
1112
- return tokens
1113
- end
1114
- end
1115
-
1116
- ---
1117
- -- Folds a chunk of text *text* with lexer *lexer*.
1118
- -- Folds *text* starting at position *start_pos* on line number *start_line*
1119
- -- with a beginning fold level of *start_level* in the buffer. If *lexer* has a
1120
- -- a `_fold` function or a `_foldsymbols` table, that field is used to perform
1121
- -- folding. Otherwise, if a `fold.by.indentation` property is set, folding by
1122
- -- indentation is done.
1123
- -- @param lexer The lexer object to fold with.
1124
- -- @param text The text in the buffer to fold.
1125
- -- @param start_pos The position in the buffer *text* starts at.
1126
- -- @param start_line The line number *text* starts on.
1127
- -- @param start_level The fold level *text* starts on.
1128
- -- @return table of fold levels.
1129
- -- @name fold
1130
- function M.fold(lexer, text, start_pos, start_line, start_level)
1131
- local folds = {}
1132
- if text == '' then return folds end
1133
- local fold = M.property_int['fold'] > 0
1134
- local FOLD_BASE = M.FOLD_BASE
1135
- local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
1136
- if fold and lexer._fold then
1137
- return lexer._fold(text, start_pos, start_line, start_level)
1138
- elseif fold and lexer._foldsymbols then
1139
- local lines = {}
1140
- for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
1141
- lines[#lines + 1] = {p, l}
1142
- end
1143
- local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
1144
- local fold_symbols = lexer._foldsymbols
1145
- local fold_symbols_patterns = fold_symbols._patterns
1146
- local style_at, fold_level = M.style_at, M.fold_level
1147
- local line_num, prev_level = start_line, start_level
1148
- local current_level = prev_level
1149
- for i = 1, #lines do
1150
- local pos, line = lines[i][1], lines[i][2]
1151
- if line ~= '' then
1152
- local level_decreased = false
1153
- for j = 1, #fold_symbols_patterns do
1154
- for s, match in line:gmatch(fold_symbols_patterns[j]) do
1155
- local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
1156
- local l = symbols and symbols[match]
1157
- if type(l) == 'function' then l = l(text, pos, line, s, match) end
1158
- if type(l) == 'number' then
1159
- current_level = current_level + l
1160
- if l < 0 and current_level < prev_level then
1161
- -- Potential zero-sum line. If the level were to go back up on
1162
- -- the same line, the line may be marked as a fold header.
1163
- level_decreased = true
1164
- end
1165
- end
1166
- end
1167
- end
1168
- folds[line_num] = prev_level
1169
- if current_level > prev_level then
1170
- folds[line_num] = prev_level + FOLD_HEADER
1171
- elseif level_decreased and current_level == prev_level and
1172
- fold_zero_sum_lines then
1173
- if line_num > start_line then
1174
- folds[line_num] = prev_level - 1 + FOLD_HEADER
1175
- else
1176
- -- Typing within a zero-sum line.
1177
- local level = fold_level[line_num - 1] - 1
1178
- if level > FOLD_HEADER then level = level - FOLD_HEADER end
1179
- if level > FOLD_BLANK then level = level - FOLD_BLANK end
1180
- folds[line_num] = level + FOLD_HEADER
1181
- current_level = current_level + 1
1182
- end
1183
- end
1184
- if current_level < FOLD_BASE then current_level = FOLD_BASE end
1185
- prev_level = current_level
1186
- else
1187
- folds[line_num] = prev_level + FOLD_BLANK
1188
- end
1189
- line_num = line_num + 1
1190
- end
1191
- elseif fold and M.property_int['fold.by.indentation'] > 0 then
1192
- -- Indentation based folding.
1193
- -- Calculate indentation per line.
1194
- local indentation = {}
1195
- for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
1196
- indentation[#indentation + 1] = line ~= '' and #indent
1197
- end
1198
- -- Make line before start_line a fold header if necessary.
1199
- if start_line > 0 and indentation[1] then
1200
- local indent = M.indent_amount[start_line - 1]
1201
- if indentation[1] > indent then
1202
- folds[start_line - 1] = FOLD_BASE + indent + FOLD_HEADER
1203
- end
1204
- end
1205
- -- Iterate over lines, setting fold numbers and fold flags.
1206
- local line_num, prev_level = start_line, FOLD_BASE + (indentation[1] or 0)
1207
- local current_level = prev_level
1208
- for i = 1, #indentation do
1209
- if indentation[i] then
1210
- for j = i + 1, #indentation do
1211
- if indentation[j] then
1212
- current_level = FOLD_BASE + indentation[j]
1213
- break
1214
- end
1215
- end
1216
- folds[line_num] = prev_level
1217
- if current_level > prev_level then
1218
- folds[line_num] = prev_level + FOLD_HEADER
1219
- end
1220
- prev_level = current_level
1221
- else
1222
- folds[line_num] = prev_level + FOLD_BLANK
1223
- end
1224
- line_num = line_num + 1
1225
- end
1226
- else
1227
- -- No folding, reset fold levels if necessary.
1228
- local current_line = start_line
1229
- for _ in text:gmatch('\r?\n') do
1230
- folds[current_line] = start_level
1231
- current_line = current_line + 1
1232
- end
1233
- end
1234
- return folds
1235
- end
1236
-
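-- A hedged sketch of driving the fold paths above from a host application.
-- It assumes the property tables are available and writable (as when the
-- host, or a prior load() in stand-alone use, has created them); `some_lexer`
-- and `text` are placeholders:
--   M.property['fold'] = '1'
--   M.property['fold.by.indentation'] = '1'
--   local folds = some_lexer:fold(text, 0, 0, M.FOLD_BASE)
-- `folds` then maps line numbers to fold levels, with the FOLD_HEADER and
-- FOLD_BLANK flags added as computed above.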
1237
- -- The following are utility functions lexers will have access to.
1238
-
1239
- -- Common patterns.
1240
- M.any = lpeg_P(1)
1241
- M.ascii = lpeg_R('\000\127')
1242
- M.extend = lpeg_R('\000\255')
1243
- M.alpha = lpeg_R('AZ', 'az')
1244
- M.digit = lpeg_R('09')
1245
- M.alnum = lpeg_R('AZ', 'az', '09')
1246
- M.lower = lpeg_R('az')
1247
- M.upper = lpeg_R('AZ')
1248
- M.xdigit = lpeg_R('09', 'AF', 'af')
1249
- M.cntrl = lpeg_R('\000\031')
1250
- M.graph = lpeg_R('!~')
1251
- M.print = lpeg_R(' ~')
1252
- M.punct = lpeg_R('!/', ':@', '[\'', '{~')
1253
- M.space = lpeg_S('\t\v\f\n\r ')
1254
-
1255
- M.newline = lpeg_S('\r\n\f')^1
1256
- M.nonnewline = 1 - M.newline
1257
- M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any
1258
-
1259
- M.dec_num = M.digit^1
1260
- M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
1261
- M.oct_num = '0' * lpeg_R('07')^1
1262
- M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
1263
- M.float = lpeg_S('+-')^-1 *
1264
- (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 +
1265
- M.digit^1) *
1266
- lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1
1267
- M.word = (M.alpha + '_') * (M.alnum + '_')^0
1268
-
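-- These common patterns compose directly with LPeg operators inside a lexer
-- file (assuming the module loads as 'lexer'); for example, a hypothetical
-- token for CSS-style hex colours, whose 'hex_color' name would need an entry
-- in that lexer's _tokenstyles:
local l = require('lexer')
local hex_color = l.token('hex_color',
  '#' * l.xdigit * l.xdigit * l.xdigit * (l.xdigit * l.xdigit * l.xdigit)^-1)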
1269
- ---
1270
- -- Creates and returns a token pattern with token name *name* and pattern
1271
- -- *patt*.
1272
- -- If *name* is not a predefined token name, its style must be defined in the
1273
- -- lexer's `_tokenstyles` table.
1274
- -- @param name The name of the token. If this name is not a predefined token name,
1275
- -- then a style needs to be associated with it in the lexer's `_tokenstyles`
1276
- -- table.
1277
- -- @param patt The LPeg pattern associated with the token.
1278
- -- @return pattern
1279
- -- @usage local ws = token(l.WHITESPACE, l.space^1)
1280
- -- @usage local annotation = token('annotation', '@' * l.word)
1281
- -- @name token
1282
- function M.token(name, patt)
1283
- --return lpeg_Cg(patt, name)
1284
- return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_C(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1285
- end
1286
-
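-- A quick illustration of the capture shape produced by the modified token()
-- above (the locals are only for this example):
local lpeg = require('lpeg')
local l = require('lexer')
local ws = l.token(l.WHITESPACE, l.space^1)
local t = lpeg.match(ws, '  \t')
-- t.token == 'whitespace', t.val == '  \t', t.pos == 4 (position after the match)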
1287
- function M.parent_token(name, patt)
1288
- --return lpeg_Cg(patt, name)
1289
- return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_Ct(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1290
- end
1291
-
1292
- ---
1293
- -- Creates and returns a pattern that matches a range of text bounded by
1294
- -- *chars* characters.
1295
- -- This is a convenience function for matching more complicated delimited ranges
1296
- -- like strings with escape characters and balanced parentheses. *single_line*
1297
- -- indicates whether or not the range must be on a single line, *no_escape*
1298
- -- indicates whether or not to ignore '\' as an escape character, and *balanced*
1299
- -- indicates whether or not to handle balanced ranges like parentheses and
1300
- -- requires *chars* to be composed of two characters.
1301
- -- @param chars The character(s) that bound the matched range.
1302
- -- @param single_line Optional flag indicating whether or not the range must be
1303
- -- on a single line.
1304
- -- @param no_escape Optional flag indicating whether or not to ignore '\\' as
1305
- -- an escape character; when `true`, the range end character cannot be escaped.
1306
- -- @param balanced Optional flag indicating whether or not to match a balanced
1307
- -- range, like the "%b" Lua pattern. This flag only applies if *chars*
1308
- -- consists of two different characters (e.g. "()").
1309
- -- @return pattern
1310
- -- @usage local dq_str_escapes = l.delimited_range('"')
1311
- -- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
1312
- -- @usage local unbalanced_parens = l.delimited_range('()')
1313
- -- @usage local balanced_parens = l.delimited_range('()', false, false, true)
1314
- -- @see nested_pair
1315
- -- @name delimited_range
1316
- function M.delimited_range(chars, single_line, no_escape, balanced)
1317
- local s = chars:sub(1, 1)
1318
- local e = #chars == 2 and chars:sub(2, 2) or s
1319
- local range
1320
- local b = balanced and s or ''
1321
- local n = single_line and '\n' or ''
1322
- if no_escape then
1323
- local invalid = lpeg_S(e..n..b)
1324
- range = M.any - invalid
1325
- else
1326
- local invalid = lpeg_S(e..n..b) + '\\'
1327
- range = M.any - invalid + '\\' * M.any
1328
- end
1329
- if balanced and s ~= e then
1330
- return lpeg_P{s * (range + lpeg_V(1))^0 * e}
1331
- else
1332
- return s * range^0 * lpeg_P(e)^-1
1333
- end
1334
- end
1335
-
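-- An illustrative match showing the default escape handling of
-- delimited_range() (the locals are only for this example):
local lpeg = require('lpeg')
local l = require('lexer')
local dq_str = l.delimited_range('"')
print(lpeg.match(lpeg.C(dq_str), '"a \\"quoted\\" word" tail'))
-- --> "a \"quoted\" word"  (the escaped quotes do not terminate the range)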
1336
- ---
1337
- -- Creates and returns a pattern that matches pattern *patt* only at the
1338
- -- beginning of a line.
1339
- -- @param patt The LPeg pattern to match on the beginning of a line.
1340
- -- @return pattern
1341
- -- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
1342
- -- l.nonnewline^0)
1343
- -- @name starts_line
1344
- function M.starts_line(patt)
1345
- return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
1346
- local pos = index - #match
1347
- if pos == 1 then return index, ... end
1348
- local char = input:sub(pos - 1, pos - 1)
1349
- if char == '\n' or char == '\r' or char == '\f' then return index, ... end
1350
- end)
1351
- end
1352
-
1353
- ---
1354
- -- Creates and returns a pattern that verifies that string set *s* contains the
1355
- -- first non-whitespace character behind the current match position.
1356
- -- @param s String character set like one passed to `lpeg.S()`.
1357
- -- @return pattern
1358
- -- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
1359
- -- l.delimited_range('/')
1360
- -- @name last_char_includes
1361
- function M.last_char_includes(s)
1362
- s = '['..s:gsub('[-%%%[]', '%%%1')..']'
1363
- return lpeg_P(function(input, index)
1364
- if index == 1 then return index end
1365
- local i = index
1366
- while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
1367
- if input:sub(i - 1, i - 1):match(s) then return index end
1368
- end)
1369
- end
1370
-
1371
- ---
1372
- -- Returns a pattern that matches a balanced range of text that starts with
1373
- -- string *start_chars* and ends with string *end_chars*.
1374
- -- With single-character delimiters, this function is identical to
1375
- -- `delimited_range(start_chars..end_chars, false, true, true)`.
1376
- -- @param start_chars The string starting a nested sequence.
1377
- -- @param end_chars The string ending a nested sequence.
1378
- -- @return pattern
1379
- -- @usage local nested_comment = l.nested_pair('/*', '*/')
1380
- -- @see delimited_range
1381
- -- @name nested_pair
1382
- function M.nested_pair(start_chars, end_chars)
1383
- local s, e = start_chars, lpeg_P(end_chars)^-1
1384
- return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
1385
- end
1386
-
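-- An illustrative match showing that nested_pair() balances its delimiters
-- (the locals are only for this example):
local lpeg = require('lpeg')
local l = require('lexer')
local nested_comment = l.nested_pair('/*', '*/')
print(lpeg.match(nested_comment, '/* a /* nested */ comment */ tail'))  --> 29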
1387
- ---
1388
- -- Creates and returns a pattern that matches any single word in list *words*.
1389
- -- Words consist of alphanumeric and underscore characters, as well as the
1390
- -- characters in string set *word_chars*. *case_insensitive* indicates whether
1391
- -- or not to ignore case when matching words.
1392
- -- This is a convenience function for simplifying a set of ordered choice word
1393
- -- patterns.
1394
- -- @param words A table of words.
1395
- -- @param word_chars Optional string of additional characters considered to be
1396
- -- part of a word. By default, word characters are alphanumerics and
1397
- -- underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
1398
- -- to indicate no additional word characters.
1399
- -- @param case_insensitive Optional boolean flag indicating whether or not the
1400
- -- word match is case-insensitive. The default is `false`.
1401
- -- @return pattern
1402
- -- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
1403
- -- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
1404
- -- 'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
1405
- -- @name word_match
1406
- function M.word_match(words, word_chars, case_insensitive)
1407
- local word_list = {}
1408
- for _, word in ipairs(words) do
1409
- word_list[case_insensitive and word:lower() or word] = true
1410
- end
1411
- local chars = M.alnum + '_'
1412
- if word_chars then chars = chars + lpeg_S(word_chars) end
1413
- return lpeg_Cmt(chars^1, function(input, index, word)
1414
- if case_insensitive then word = word:lower() end
1415
- return word_list[word] and index or nil
1416
- end)
1417
- end
1418
-
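-- An illustrative match showing that word_match() only accepts whole words
-- from the list (the locals are only for this example):
local lpeg = require('lpeg')
local l = require('lexer')
local kw = l.word_match({'and', 'or', 'not'}, nil, true)
print(lpeg.match(kw, 'AND'))      --> 4   (case-insensitive match)
print(lpeg.match(kw, 'android'))  --> nil (not in the word list)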
1419
- ---
1420
- -- Embeds child lexer *child* in parent lexer *parent* using patterns
1421
- -- *start_rule* and *end_rule*, which signal the beginning and end of the
1422
- -- embedded lexer, respectively.
1423
- -- @param parent The parent lexer.
1424
- -- @param child The child lexer.
1425
- -- @param start_rule The pattern that signals the beginning of the embedded
1426
- -- lexer.
1427
- -- @param end_rule The pattern that signals the end of the embedded lexer.
1428
- -- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
1429
- -- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
1430
- -- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
1431
- -- @name embed_lexer
1432
- function M.embed_lexer(parent, child, start_rule, end_rule)
1433
- -- Add child rules.
1434
- if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
1435
- if not child._RULES then -- creating a child lexer to be embedded
1436
- if not child._rules then error('Cannot embed language with no rules') end
1437
- for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
1438
- end
1439
- child._EMBEDDEDRULES[parent._NAME] = {
1440
- ['start_rule'] = start_rule,
1441
- token_rule = join_tokens(child),
1442
- ['end_rule'] = end_rule
1443
- }
1444
- if not parent._CHILDREN then parent._CHILDREN = {} end
1445
- local children = parent._CHILDREN
1446
- children[#children + 1] = child
1447
- -- Add child styles.
1448
- if not parent._tokenstyles then parent._tokenstyles = {} end
1449
- local tokenstyles = parent._tokenstyles
1450
- tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
1451
- for token, style in pairs(child._tokenstyles or {}) do
1452
- tokenstyles[token] = style
1453
- end
1454
- child._lexer = parent -- use parent's tokens if child is embedding itself
1455
- parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
1456
- end
1457
-
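-- A hedged sketch of how embed_lexer() is typically called from inside a
-- parent lexer file before that lexer is loaded; the token name and patterns
-- are illustrative only, and `M` here refers to that parent lexer's own
-- module table, not this one:
--   local css = l.load('css')
--   local css_start_rule = token('embedded', '<style' * (l.any - '>')^0 * '>')
--   local css_end_rule = token('embedded', '</style>')
--   l.embed_lexer(M, css, css_start_rule, css_end_rule)
-- The 'embedded' token would also need an entry in the parent's _tokenstyles
-- (e.g. l.STYLE_EMBEDDED).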
1458
- -- Determines if the previous line is a comment.
1459
- -- This is used for determining if the current comment line is a fold point.
1460
- -- @param prefix The prefix string defining a comment.
1461
- -- @param text The text passed to a fold function.
1462
- -- @param pos The pos passed to a fold function.
1463
- -- @param line The line passed to a fold function.
1464
- -- @param s The s passed to a fold function.
1465
- local function prev_line_is_comment(prefix, text, pos, line, s)
1466
- local start = line:find('%S')
1467
- if start < s and not line:find(prefix, start, true) then return false end
1468
- local p = pos - 1
1469
- if text:sub(p, p) == '\n' then
1470
- p = p - 1
1471
- if text:sub(p, p) == '\r' then p = p - 1 end
1472
- if text:sub(p, p) ~= '\n' then
1473
- while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
1474
- while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1475
- return text:sub(p, p + #prefix - 1) == prefix
1476
- end
1477
- end
1478
- return false
1479
- end
1480
-
1481
- -- Determines if the next line is a comment.
1482
- -- This is used for determining if the current comment line is a fold point.
1483
- -- @param prefix The prefix string defining a comment.
1484
- -- @param text The text passed to a fold function.
1485
- -- @param pos The pos passed to a fold function.
1486
- -- @param line The line passed to a fold function.
1487
- -- @param s The s passed to a fold function.
1488
- local function next_line_is_comment(prefix, text, pos, line, s)
1489
- local p = text:find('\n', pos + s)
1490
- if p then
1491
- p = p + 1
1492
- while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1493
- return text:sub(p, p + #prefix - 1) == prefix
1494
- end
1495
- return false
1496
- end
1497
-
1498
- ---
1499
- -- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
1500
- -- that folds consecutive line comments that start with string *prefix*.
1501
- -- @param prefix The prefix string defining a line comment.
1502
- -- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
1503
- -- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
1504
- -- @name fold_line_comments
1505
- function M.fold_line_comments(prefix)
1506
- local property_int = M.property_int
1507
- return function(text, pos, line, s)
1508
- if property_int['fold.line.comments'] == 0 then return 0 end
1509
- if s > 1 and line:match('^%s*()') < s then return 0 end
1510
- local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
1511
- local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
1512
- if not prev_line_comment and next_line_comment then return 1 end
1513
- if prev_line_comment and not next_line_comment then return -1 end
1514
- return 0
1515
- end
1516
- end
1517
-
1518
- M.property_expanded = setmetatable({}, {
1519
- -- Returns the string property value associated with string property *key*,
1520
- -- replacing any "$()" and "%()" expressions with the values of their keys.
1521
- __index = function(t, key)
1522
- return M.property[key]:gsub('[$%%]%b()', function(key)
1523
- return t[key:sub(3, -2)]
1524
- end)
1525
- end,
1526
- __newindex = function() error('read-only property') end
1527
- })
1528
-
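-- A small illustration of the expansion above, assuming the property table
-- has been populated (by the host application, or during stand-alone use):
--   M.property['fold'] = '1'
--   M.property['fold.all'] = '$(fold)'
--   print(M.property_expanded['fold.all'])  --> 1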
1529
- --[[ The functions and fields below were defined in C.
1530
-
1531
- ---
1532
- -- Individual fields for a lexer instance.
1533
- -- @field _NAME The string name of the lexer.
1534
- -- @field _rules An ordered list of rules for a lexer grammar.
1535
- -- Each rule is a table containing an arbitrary rule name and the LPeg pattern
1536
- -- associated with the rule. The order of rules is important as rules are
1537
- -- matched sequentially.
1538
- -- Child lexers should not use this table to access and/or modify their
1539
- -- parent's rules and vice-versa. Use the `_RULES` table instead.
1540
- -- @field _tokenstyles A map of non-predefined token names to styles.
1541
- -- Remember to use token names, not rule names. It is recommended to use
1542
- -- predefined styles or color-agnostic styles derived from predefined styles
1543
- -- to ensure compatibility with user color themes.
1544
- -- @field _foldsymbols A table of recognized fold points for the lexer.
1545
- -- Keys are token names with table values defining fold points. Those table
1546
- -- values have string keys of keywords or characters that indicate a fold
1547
- -- point whose values are integers. A value of `1` indicates a beginning fold
1548
- -- point and a value of `-1` indicates an ending fold point. Values can also
1549
- -- be functions that return `1`, `-1`, or `0` (indicating no fold point) for
1550
- -- keys which need additional processing.
1551
- -- There is also a required `_patterns` key whose value is a table containing
1552
- -- Lua pattern strings that match all fold points (the string keys contained
1553
- -- in token name table values). When the lexer encounters text that matches
1554
- -- one of those patterns, the matched text is looked up in its token's table
1555
- -- to determine whether or not it is a fold point.
1556
- -- @field _fold If this function exists in the lexer, it is called for folding
1557
- -- the document instead of using `_foldsymbols` or indentation.
1558
- -- @field _lexer The parent lexer object whose rules should be used. This field
1559
- -- is only necessary to disambiguate a proxy lexer that loaded parent and
1560
- -- child lexers for embedding and ended up having multiple parents loaded.
1561
- -- @field _RULES A map of rule name keys with their associated LPeg pattern
1562
- -- values for the lexer.
1563
- -- This is constructed from the lexer's `_rules` table and accessible to other
1564
- -- lexers for embedded lexer applications like modifying parent or child
1565
- -- rules.
1566
- -- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
1567
- -- (instead of an arbitrary chunk of text) at a time.
1568
- -- The default value is `false`. Line lexers cannot look ahead to subsequent
1569
- -- lines.
1570
- -- @class table
1571
- -- @name lexer
1572
- local lexer
1573
- ]]
1574
-
1575
- return M