immunio 1.0.4 → 1.0.5
- checksums.yaml +4 -4
- data/lib/immunio/agent.rb +9 -9
- data/lib/immunio/authentication.rb +1 -1
- data/lib/immunio/channel.rb +15 -15
- data/lib/immunio/plugins/active_record.rb +3 -3
- data/lib/immunio/plugins/authlogic.rb +3 -3
- data/lib/immunio/plugins/csrf.rb +1 -1
- data/lib/immunio/plugins/devise.rb +1 -1
- data/lib/immunio/plugins/eval.rb +1 -1
- data/lib/immunio/plugins/http_finisher.rb +2 -2
- data/lib/immunio/plugins/http_tracker.rb +1 -1
- data/lib/immunio/plugins/io.rb +7 -7
- data/lib/immunio/plugins/redirect.rb +2 -2
- data/lib/immunio/plugins/warden.rb +5 -6
- data/lib/immunio/processor.rb +7 -7
- data/lib/immunio/request.rb +3 -3
- data/lib/immunio/version.rb +1 -1
- data/lib/immunio/vm.rb +6 -6
- data/lua-hooks/Makefile +49 -6
- data/lua-hooks/lib/boot.lua +49 -277
- metadata +2 -11
- data/lua-hooks/lib/encode.lua +0 -4
- data/lua-hooks/lib/lexers/LICENSE +0 -21
- data/lua-hooks/lib/lexers/bash.lua +0 -134
- data/lua-hooks/lib/lexers/bash_dqstr.lua +0 -59
- data/lua-hooks/lib/lexers/css.lua +0 -101
- data/lua-hooks/lib/lexers/css_attr.lua +0 -13
- data/lua-hooks/lib/lexers/html.lua +0 -113
- data/lua-hooks/lib/lexers/javascript.lua +0 -68
- data/lua-hooks/lib/lexers/lexer.lua +0 -1575
data/lua-hooks/lib/lexers/javascript.lua
@@ -1,68 +0,0 @@
-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
-- JavaScript LPeg lexer.

local l = require('lexer')
local token, word_match = l.token, l.word_match
local P, R, S = lpeg.P, lpeg.R, lpeg.S

local M = {_NAME = 'javascript'}

-- Whitespace.
local ws = token(l.WHITESPACE, l.space^1)

-- Comments.
local line_comment = '//' * l.nonnewline_esc^0
local block_comment = '/*' * (l.any - '*/')^0 * P('*/')^-1
local comment = token(l.COMMENT, line_comment + block_comment)

-- Strings.
local sq_str = l.delimited_range("'")
local dq_str = l.delimited_range('"')
local regex = token( "regex", l.last_char_includes('+-*%^!=&|?:;,([{<>') *
  l.delimited_range('/', true) * S('igm')^0 )
local string = token(l.STRING, sq_str + dq_str) --+ token(l.REGEX, regex_str)

-- Numbers.
local number = token(l.NUMBER, l.float + l.integer)

-- Keywords.
local keyword = token(l.KEYWORD, word_match{
  'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
  'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
  'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
  'function', 'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
  'interface', 'let', 'long', 'native', 'new', 'null', 'package', 'private',
  'protected', 'public', 'return', 'short', 'static', 'super', 'switch',
  'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
  'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield'
})

-- Identifiers.
local identifier = token(l.IDENTIFIER, l.word)

-- Operators.
local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))

-- Immunio marker
local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')


M._rules = {
  {'whitespace', ws},
  {'marker', marker},
  {'keyword', keyword},
  {'identifier', identifier},
  {'comment', comment},
  {'number', number},
  {'string', string},
  {'regex', regex},
  {'operator', operator},
}

M._foldsymbols = {
  _patterns = {'[{}]', '/%*', '%*/', '//'},
  [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
  [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')}
}

return M
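The `marker` rule is the only Immunio-specific addition to this otherwise stock Scintillua JavaScript lexer: it matches `{immunio-var:N:HEX}` placeholders, which the agent appears to use to tag values interpolated into monitored code. Below is a minimal standalone sketch of that pattern; it is not part of the gem, and `integer` and `xdigit` are simplified stand-ins for the lexer module's `l.integer` and `l.xdigit`.

    local lpeg = require('lpeg')
    local P, R = lpeg.P, lpeg.R

    local xdigit = R('09', 'af', 'AF')
    local integer = R('09')^1  -- simplified: l.integer also accepts hex/octal forms
    local marker = P('{immunio-var:') * integer * P(':') * xdigit^1 * P('}')

    -- lpeg's match returns the position one past the end of the match, or nil.
    assert(marker:match('{immunio-var:1:deadbeef}') == 25)
    assert(marker:match('{immunio-var:1:zzz}') == nil)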
@@ -1,1575 +0,0 @@
|
|
1
|
-
-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
|
2
|
-
|
3
|
-
local M = {}
|
4
|
-
|
5
|
-
--[=[ This comment is for LuaDoc.
|
6
|
-
---
|
7
|
-
-- Lexes Scintilla documents with Lua and LPeg.
|
8
|
-
--
|
9
|
-
-- ## Overview
|
10
|
-
--
|
11
|
-
-- Lexers highlight the syntax of source code. Scintilla (the editing component
|
12
|
-
-- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
|
13
|
-
-- lexers which are notoriously difficult to create and/or extend. On the other
|
14
|
-
-- hand, Lua makes it easy to to rapidly create new lexers, extend existing
|
15
|
-
-- ones, and embed lexers within one another. Lua lexers tend to be more
|
16
|
-
-- readable than C++ lexers too.
|
17
|
-
--
|
18
|
-
-- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
|
19
|
-
-- [LPeg library][]. The following table comes from the LPeg documentation and
|
20
|
-
-- summarizes all you need to know about constructing basic LPeg patterns. This
|
21
|
-
-- module provides convenience functions for creating and working with other
|
22
|
-
-- more advanced patterns and concepts.
|
23
|
-
--
|
24
|
-
-- Operator | Description
|
25
|
-
-- ---------------------|------------
|
26
|
-
-- `lpeg.P(string)` | Matches `string` literally.
|
27
|
-
-- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters.
|
28
|
-
-- `lpeg.S(string)` | Matches any character in set `string`.
|
29
|
-
-- `lpeg.R("`_`xy`_`")` | Matches any character between range `x` and `y`.
|
30
|
-
-- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`.
|
31
|
-
-- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`.
|
32
|
-
-- `patt1 * patt2` | Matches `patt1` followed by `patt2`.
|
33
|
-
-- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice).
|
34
|
-
-- `patt1 - patt2` | Matches `patt1` if `patt2` does not match.
|
35
|
-
-- `-patt` | Equivalent to `("" - patt)`.
|
36
|
-
-- `#patt` | Matches `patt` but consumes no input.
|
37
|
-
--
|
38
|
-
-- The first part of this document deals with rapidly constructing a simple
|
39
|
-
-- lexer. The next part deals with more advanced techniques, such as custom
|
40
|
-
-- coloring and embedding lexers within one another. Following that is a
|
41
|
-
-- discussion about code folding, or being able to tell Scintilla which code
|
42
|
-
-- blocks are "foldable" (temporarily hideable from view). After that are
|
43
|
-
-- instructions on how to use LPeg lexers with the aforementioned Textadept and
|
44
|
-
-- SciTE editors. Finally there are comments on lexer performance and
|
45
|
-
-- limitations.
|
46
|
-
--
|
47
|
-
-- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
|
48
|
-
-- [Textadept]: http://foicica.com/textadept
|
49
|
-
-- [SciTE]: http://scintilla.org/SciTE.html
|
50
|
-
--
|
51
|
-
-- ## Lexer Basics
|
52
|
-
--
|
53
|
-
-- The *lexers/* directory contains all lexers, including your new one. Before
|
54
|
-
-- attempting to write one from scratch though, first determine if your
|
55
|
-
-- programming language is similar to any of the 80+ languages supported. If so,
|
56
|
-
-- you may be able to copy and modify that lexer, saving some time and effort.
|
57
|
-
-- The filename of your lexer should be the name of your programming language in
|
58
|
-
-- lower case followed by a *.lua* extension. For example, a new Lua lexer has
|
59
|
-
-- the name *lua.lua*.
|
60
|
-
--
|
61
|
-
-- Note: Try to refrain from using one-character language names like "b", "c",
|
62
|
-
-- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
|
63
|
-
-- respectively.
|
64
|
-
--
|
65
|
-
-- ### New Lexer Template
|
66
|
-
--
|
67
|
-
-- There is a *lexers/template.txt* file that contains a simple template for a
|
68
|
-
-- new lexer. Feel free to use it, replacing the '?'s with the name of your
|
69
|
-
-- lexer:
|
70
|
-
--
|
71
|
-
-- -- ? LPeg lexer.
|
72
|
-
--
|
73
|
-
-- local l = require('lexer')
|
74
|
-
-- local token, word_match = l.token, l.word_match
|
75
|
-
-- local P, R, S = lpeg.P, lpeg.R, lpeg.S
|
76
|
-
--
|
77
|
-
-- local M = {_NAME = '?'}
|
78
|
-
--
|
79
|
-
-- -- Whitespace.
|
80
|
-
-- local ws = token(l.WHITESPACE, l.space^1)
|
81
|
-
--
|
82
|
-
-- M._rules = {
|
83
|
-
-- {'whitespace', ws},
|
84
|
-
-- }
|
85
|
-
--
|
86
|
-
-- M._tokenstyles = {
|
87
|
-
--
|
88
|
-
-- }
|
89
|
-
--
|
90
|
-
-- return M
|
91
|
-
--
|
92
|
-
-- The first 4 lines of code simply define often used convenience variables. The
|
93
|
-
-- 5th and last lines define and return the lexer object Scintilla uses; they
|
94
|
-
-- are very important and must be part of every lexer. The sixth line defines
|
95
|
-
-- something called a "token", an essential building block of lexers. You will
|
96
|
-
-- learn about tokens shortly. The rest of the code defines a set of grammar
|
97
|
-
-- rules and token styles. You will learn about those later. Note, however, the
|
98
|
-
-- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
|
99
|
-
-- belong to their respective lexers, but any non-local variables need the `M.`
|
100
|
-
-- prefix too so-as not to affect Lua's global environment. All in all, this is
|
101
|
-
-- a minimal, working lexer that you can build on.
|
102
|
-
--
|
103
|
-
-- ### Tokens
|
104
|
-
--
|
105
|
-
-- Take a moment to think about your programming language's structure. What kind
|
106
|
-
-- of key elements does it have? In the template shown earlier, one predefined
|
107
|
-
-- element all languages have is whitespace. Your language probably also has
|
108
|
-
-- elements like comments, strings, and keywords. Lexers refer to these elements
|
109
|
-
-- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
|
110
|
-
-- break down source code into tokens for coloring, which results in the syntax
|
111
|
-
-- highlighting familiar to you. It is up to you how specific your lexer is when
|
112
|
-
-- it comes to tokens. Perhaps only distinguishing between keywords and
|
113
|
-
-- identifiers is necessary, or maybe recognizing constants and built-in
|
114
|
-
-- functions, methods, or libraries is desirable. The Lua lexer, for example,
|
115
|
-
-- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
|
116
|
-
-- functions, constants, built-in libraries, identifiers, labels, and operators.
|
117
|
-
-- Even though constants, built-in functions, and built-in libraries are subsets
|
118
|
-
-- of identifiers, Lua programmers find it helpful for the lexer to distinguish
|
119
|
-
-- between them all. It is perfectly acceptable to just recognize keywords and
|
120
|
-
-- identifiers.
|
121
|
-
--
|
122
|
-
-- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
|
123
|
-
-- sequence of characters recognized as an instance of that token. Create tokens
|
124
|
-
-- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
|
125
|
-
-- defined in the template shown earlier:
|
126
|
-
--
|
127
|
-
-- local ws = token(l.WHITESPACE, l.space^1)
|
128
|
-
--
|
129
|
-
-- At first glance, the first argument does not appear to be a string name and
|
130
|
-
-- the second argument does not appear to be an LPeg pattern. Perhaps you
|
131
|
-
-- expected something like:
|
132
|
-
--
|
133
|
-
-- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
|
134
|
-
--
|
135
|
-
-- The `lexer` (`l`) module actually provides a convenient list of common token
|
136
|
-
-- names and common LPeg patterns for you to use. Token names include
|
137
|
-
-- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
|
138
|
-
-- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
|
139
|
-
-- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
|
140
|
-
-- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
|
141
|
-
-- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
|
142
|
-
-- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
|
143
|
-
-- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
|
144
|
-
-- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
|
145
|
-
-- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
|
146
|
-
-- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
|
147
|
-
-- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
|
148
|
-
-- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
|
149
|
-
-- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
|
150
|
-
-- none of the above fit your language, but an advantage to using predefined
|
151
|
-
-- token names is that your lexer's tokens will inherit the universal syntax
|
152
|
-
-- highlighting color theme used by your text editor.
|
153
|
-
--
|
154
|
-
-- #### Example Tokens
|
155
|
-
--
|
156
|
-
-- So, how might you define other tokens like comments, strings, and keywords?
|
157
|
-
-- Here are some examples.
|
158
|
-
--
|
159
|
-
-- **Comments**
|
160
|
-
--
|
161
|
-
-- Line-style comments with a prefix character(s) are easy to express with LPeg:
|
162
|
-
--
|
163
|
-
-- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
|
164
|
-
-- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
|
165
|
-
--
|
166
|
-
-- The comments above start with a '#' or "//" and go to the end of the line.
|
167
|
-
-- The second comment recognizes the next line also as a comment if the current
|
168
|
-
-- line ends with a '\' escape character.
|
169
|
-
--
|
170
|
-
-- C-style "block" comments with a start and end delimiter are also easy to
|
171
|
-
-- express:
|
172
|
-
--
|
173
|
-
-- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
|
174
|
-
--
|
175
|
-
-- This comment starts with a "/\*" sequence and contains anything up to and
|
176
|
-
-- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
|
177
|
-
-- can recognize unfinished comments as comments and highlight them properly.
|
178
|
-
--
|
179
|
-
-- **Strings**
|
180
|
-
--
|
181
|
-
-- It is tempting to think that a string is not much different from the block
|
182
|
-
-- comment shown above in that both have start and end delimiters:
|
183
|
-
--
|
184
|
-
-- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
|
185
|
-
-- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
|
186
|
-
-- local simple_string = token(l.STRING, dq_str + sq_str)
|
187
|
-
--
|
188
|
-
-- However, most programming languages allow escape sequences in strings such
|
189
|
-
-- that a sequence like "\\"" in a double-quoted string indicates that the
|
190
|
-
-- '"' is not the end of the string. The above token incorrectly matches
|
191
|
-
-- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
|
192
|
-
-- function.
|
193
|
-
--
|
194
|
-
-- local dq_str = l.delimited_range('"')
|
195
|
-
-- local sq_str = l.delimited_range("'")
|
196
|
-
-- local string = token(l.STRING, dq_str + sq_str)
|
197
|
-
--
|
198
|
-
-- In this case, the lexer treats '\' as an escape character in a string
|
199
|
-
-- sequence.
|
200
|
-
--
|
201
|
-
-- **Keywords**
|
202
|
-
--
|
203
|
-
-- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
|
204
|
-
-- choices, use another convenience function: [`lexer.word_match()`](). It is
|
205
|
-
-- much easier and more efficient to write word matches like:
|
206
|
-
--
|
207
|
-
-- local keyword = token(l.KEYWORD, l.word_match{
|
208
|
-
-- 'keyword_1', 'keyword_2', ..., 'keyword_n'
|
209
|
-
-- })
|
210
|
-
--
|
211
|
-
-- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
|
212
|
-
-- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
|
213
|
-
-- }, nil, true))
|
214
|
-
--
|
215
|
-
-- local hyphened_keyword = token(l.KEYWORD, l.word_match({
|
216
|
-
-- 'keyword-1', 'keyword-2', ..., 'keyword-n'
|
217
|
-
-- }, '-'))
|
218
|
-
--
|
219
|
-
-- By default, characters considered to be in keywords are in the set of
|
220
|
-
-- alphanumeric characters and underscores. The last token demonstrates how to
|
221
|
-
-- allow '-' (hyphen) characters to be in keywords as well.
|
222
|
-
--
|
223
|
-
-- **Numbers**
|
224
|
-
--
|
225
|
-
-- Most programming languages have the same format for integer and float tokens,
|
226
|
-
-- so it might be as simple as using a couple of predefined LPeg patterns:
|
227
|
-
--
|
228
|
-
-- local number = token(l.NUMBER, l.float + l.integer)
|
229
|
-
--
|
230
|
-
-- However, some languages allow postfix characters on integers.
|
231
|
-
--
|
232
|
-
-- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
|
233
|
-
-- local number = token(l.NUMBER, l.float + l.hex_num + integer)
|
234
|
-
--
|
235
|
-
-- Your language may need other tweaks, but it is up to you how fine-grained you
|
236
|
-
-- want your highlighting to be. After all, you are not writing a compiler or
|
237
|
-
-- interpreter!
|
238
|
-
--
|
239
|
-
-- ### Rules
|
240
|
-
--
|
241
|
-
-- Programming languages have grammars, which specify valid token structure. For
|
242
|
-
-- example, comments usually cannot appear within a string. Grammars consist of
|
243
|
-
-- rules, which are simply combinations of tokens. Recall from the lexer
|
244
|
-
-- template the `_rules` table, which defines all the rules used by the lexer
|
245
|
-
-- grammar:
|
246
|
-
--
|
247
|
-
-- M._rules = {
|
248
|
-
-- {'whitespace', ws},
|
249
|
-
-- }
|
250
|
-
--
|
251
|
-
-- Each entry in a lexer's `_rules` table consists of a rule name and its
|
252
|
-
-- associated pattern. Rule names are completely arbitrary and serve only to
|
253
|
-
-- identify and distinguish between different rules. Rule order is important: if
|
254
|
-
-- text does not match the first rule, the lexer tries the second rule, and so
|
255
|
-
-- on. This simple grammar says to match whitespace tokens under a rule named
|
256
|
-
-- "whitespace".
|
257
|
-
--
|
258
|
-
-- To illustrate the importance of rule order, here is an example of a
|
259
|
-
-- simplified Lua grammar:
|
260
|
-
--
|
261
|
-
-- M._rules = {
|
262
|
-
-- {'whitespace', ws},
|
263
|
-
-- {'keyword', keyword},
|
264
|
-
-- {'identifier', identifier},
|
265
|
-
-- {'string', string},
|
266
|
-
-- {'comment', comment},
|
267
|
-
-- {'number', number},
|
268
|
-
-- {'label', label},
|
269
|
-
-- {'operator', operator},
|
270
|
-
-- }
|
271
|
-
--
|
272
|
-
-- Note how identifiers come after keywords. In Lua, as with most programming
|
273
|
-
-- languages, the characters allowed in keywords and identifiers are in the same
|
274
|
-
-- set (alphanumerics plus underscores). If the lexer specified the "identifier"
|
275
|
-
-- rule before the "keyword" rule, all keywords would match identifiers and thus
|
276
|
-
-- incorrectly highlight as identifiers instead of keywords. The same idea
|
277
|
-
-- applies to function, constant, etc. tokens that you may want to distinguish
|
278
|
-
-- between: their rules should come before identifiers.
|
279
|
-
--
|
280
|
-
-- So what about text that does not match any rules? For example in Lua, the '!'
|
281
|
-
-- character is meaningless outside a string or comment. Normally the lexer
|
282
|
-
-- skips over such text. If instead you want to highlight these "syntax errors",
|
283
|
-
-- add an additional end rule:
|
284
|
-
--
|
285
|
-
-- M._rules = {
|
286
|
-
-- {'whitespace', ws},
|
287
|
-
-- {'error', token(l.ERROR, l.any)},
|
288
|
-
-- }
|
289
|
-
--
|
290
|
-
-- This identifies and highlights any character not matched by an existing
|
291
|
-
-- rule as an `lexer.ERROR` token.
|
292
|
-
--
|
293
|
-
-- Even though the rules defined in the examples above contain a single token,
|
294
|
-
-- rules may consist of multiple tokens. For example, a rule for an HTML tag
|
295
|
-
-- could consist of a tag token followed by an arbitrary number of attribute
|
296
|
-
-- tokens, allowing the lexer to highlight all tokens separately. The rule might
|
297
|
-
-- look something like this:
|
298
|
-
--
|
299
|
-
-- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
|
300
|
-
--
|
301
|
-
-- Note however that lexers with complex rules like these are more prone to lose
|
302
|
-
-- track of their state.
|
303
|
-
--
|
304
|
-
-- ### Summary
|
305
|
-
--
|
306
|
-
-- Lexers primarily consist of tokens and grammar rules. At your disposal are a
|
307
|
-
-- number of convenience patterns and functions for rapidly creating a lexer. If
|
308
|
-
-- you choose to use predefined token names for your tokens, you do not have to
|
309
|
-
-- define how the lexer highlights them. The tokens will inherit the default
|
310
|
-
-- syntax highlighting color theme your editor uses.
|
311
|
-
--
|
312
|
-
-- ## Advanced Techniques
|
313
|
-
--
|
314
|
-
-- ### Styles and Styling
|
315
|
-
--
|
316
|
-
-- The most basic form of syntax highlighting is assigning different colors to
|
317
|
-
-- different tokens. Instead of highlighting with just colors, Scintilla allows
|
318
|
-
-- for more rich highlighting, or "styling", with different fonts, font sizes,
|
319
|
-
-- font attributes, and foreground and background colors, just to name a few.
|
320
|
-
-- The unit of this rich highlighting is called a "style". Styles are simply
|
321
|
-
-- strings of comma-separated property settings. By default, lexers associate
|
322
|
-
-- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
|
323
|
-
-- `lexer.STRING`, etc. with particular styles as part of a universal color
|
324
|
-
-- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
|
325
|
-
-- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
|
326
|
-
-- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
|
327
|
-
-- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
|
328
|
-
-- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
|
329
|
-
-- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
|
330
|
-
-- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
|
331
|
-
-- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
|
332
|
-
-- predefined token names and LPeg patterns, you may define your own styles. At
|
333
|
-
-- their core, styles are just strings, so you may create new ones and/or modify
|
334
|
-
-- existing ones. Each style consists of the following comma-separated settings:
|
335
|
-
--
|
336
|
-
-- Setting | Description
|
337
|
-
-- ---------------|------------
|
338
|
-
-- font:_name_ | The name of the font the style uses.
|
339
|
-
-- size:_int_ | The size of the font the style uses.
|
340
|
-
-- [not]bold | Whether or not the font face is bold.
|
341
|
-
-- [not]italics | Whether or not the font face is italic.
|
342
|
-
-- [not]underlined| Whether or not the font face is underlined.
|
343
|
-
-- fore:_color_ | The foreground color of the font face.
|
344
|
-
-- back:_color_ | The background color of the font face.
|
345
|
-
-- [not]eolfilled | Does the background color extend to the end of the line?
|
346
|
-
-- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
|
347
|
-
-- [not]visible | Whether or not the text is visible.
|
348
|
-
-- [not]changeable| Whether the text is changeable or read-only.
|
349
|
-
-- [not]hotspot | Whether or not the text is clickable.
|
350
|
-
--
|
351
|
-
-- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
|
352
|
-
-- decimal equivalent of the latter. As with token names, LPeg patterns, and
|
353
|
-
-- styles, there is a set of predefined color names, but they vary depending on
|
354
|
-
-- the current color theme in use. Therefore, it is generally not a good idea to
|
355
|
-
-- manually define colors within styles in your lexer since they might not fit
|
356
|
-
-- into a user's chosen color theme. Try to refrain from even using predefined
|
357
|
-
-- colors in a style because that color may be theme-specific. Instead, the best
|
358
|
-
-- practice is to either use predefined styles or derive new color-agnostic
|
359
|
-
-- styles from predefined ones. For example, Lua "longstring" tokens use the
|
360
|
-
-- existing `lexer.STYLE_STRING` style instead of defining a new one.
|
361
|
-
--
|
362
|
-
-- #### Example Styles
|
363
|
-
--
|
364
|
-
-- Defining styles is pretty straightforward. An empty style that inherits the
|
365
|
-
-- default theme settings is simply an empty string:
|
366
|
-
--
|
367
|
-
-- local style_nothing = ''
|
368
|
-
--
|
369
|
-
-- A similar style but with a bold font face looks like this:
|
370
|
-
--
|
371
|
-
-- local style_bold = 'bold'
|
372
|
-
--
|
373
|
-
-- If you want the same style, but also with an italic font face, define the new
|
374
|
-
-- style in terms of the old one:
|
375
|
-
--
|
376
|
-
-- local style_bold_italic = style_bold..',italics'
|
377
|
-
--
|
378
|
-
-- This allows you to derive new styles from predefined ones without having to
|
379
|
-
-- rewrite them. This operation leaves the old style unchanged. Thus if you
|
380
|
-
-- had a "static variable" token whose style you wanted to base off of
|
381
|
-
-- `lexer.STYLE_VARIABLE`, it would probably look like:
|
382
|
-
--
|
383
|
-
-- local style_static_var = l.STYLE_VARIABLE..',italics'
|
384
|
-
--
|
385
|
-
-- The color theme files in the *lexers/themes/* folder give more examples of
|
386
|
-
-- style definitions.
|
387
|
-
--
|
388
|
-
-- ### Token Styles
|
389
|
-
--
|
390
|
-
-- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
|
391
|
-
-- Recall the token definition and `_tokenstyles` table from the lexer template:
|
392
|
-
--
|
393
|
-
-- local ws = token(l.WHITESPACE, l.space^1)
|
394
|
-
--
|
395
|
-
-- ...
|
396
|
-
--
|
397
|
-
-- M._tokenstyles = {
|
398
|
-
--
|
399
|
-
-- }
|
400
|
-
--
|
401
|
-
-- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
|
402
|
-
-- earlier, lexers automatically associate tokens that use predefined token
|
403
|
-
-- names with a particular style. Only tokens with custom token names need
|
404
|
-
-- manual style associations. As an example, consider a custom whitespace token:
|
405
|
-
--
|
406
|
-
-- local ws = token('custom_whitespace', l.space^1)
|
407
|
-
--
|
408
|
-
-- Assigning a style to this token looks like:
|
409
|
-
--
|
410
|
-
-- M._tokenstyles = {
|
411
|
-
-- custom_whitespace = l.STYLE_WHITESPACE
|
412
|
-
-- }
|
413
|
-
--
|
414
|
-
-- Do not confuse token names with rule names. They are completely different
|
415
|
-
-- entities. In the example above, the lexer assigns the "custom_whitespace"
|
416
|
-
-- token the existing style for `WHITESPACE` tokens. If instead you want to
|
417
|
-
-- color the background of whitespace a shade of grey, it might look like:
|
418
|
-
--
|
419
|
-
-- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
|
420
|
-
-- M._tokenstyles = {
|
421
|
-
-- custom_whitespace = custom_style
|
422
|
-
-- }
|
423
|
-
--
|
424
|
-
-- Notice that the lexer peforms Scintilla/SciTE-style "$()" property expansion.
|
425
|
-
-- You may also use "%()". Remember to refrain from assigning specific colors in
|
426
|
-
-- styles, but in this case, all user color themes probably define the
|
427
|
-
-- "color.grey" property.
|
428
|
-
--
|
429
|
-
-- ### Line Lexers
|
430
|
-
--
|
431
|
-
-- By default, lexers match the arbitrary chunks of text passed to them by
|
432
|
-
-- Scintilla. These chunks may be a full document, only the visible part of a
|
433
|
-
-- document, or even just portions of lines. Some lexers need to match whole
|
434
|
-
-- lines. For example, a lexer for the output of a file "diff" needs to know if
|
435
|
-
-- the line started with a '+' or '-' and then style the entire line
|
436
|
-
-- accordingly. To indicate that your lexer matches by line, use the
|
437
|
-
-- `_LEXBYLINE` field:
|
438
|
-
--
|
439
|
-
-- M._LEXBYLINE = true
|
440
|
-
--
|
441
|
-
-- Now the input text for the lexer is a single line at a time. Keep in mind
|
442
|
-
-- that line lexers do not have the ability to look ahead at subsequent lines.
|
443
|
-
--
|
444
|
-
-- ### Embedded Lexers
|
445
|
-
--
|
446
|
-
-- Lexers embed within one another very easily, requiring minimal effort. In the
|
447
|
-
-- following sections, the lexer being embedded is called the "child" lexer and
|
448
|
-
-- the lexer a child is being embedded in is called the "parent". For example,
|
449
|
-
-- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
|
450
|
-
-- their respective HTML and CSS files. However, CSS can be embedded inside
|
451
|
-
-- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
|
452
|
-
-- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
|
453
|
-
-- sounds a lot like the case with CSS, but there is a subtle difference: PHP
|
454
|
-
-- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
|
455
|
-
-- difference results in two types of embedded lexers: a parent lexer that
|
456
|
-
-- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
|
457
|
-
-- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
|
458
|
-
--
|
459
|
-
-- #### Parent Lexer
|
460
|
-
--
|
461
|
-
-- Before embedding a child lexer into a parent lexer, the parent lexer needs to
|
462
|
-
-- load the child lexer. This is done with the [`lexer.load()`]() function. For
|
463
|
-
-- example, loading the CSS lexer within the HTML lexer looks like:
|
464
|
-
--
|
465
|
-
-- local css = l.load('css')
|
466
|
-
--
|
467
|
-
-- The next part of the embedding process is telling the parent lexer when to
|
468
|
-
-- switch over to the child lexer and when to switch back. The lexer refers to
|
469
|
-
-- these indications as the "start rule" and "end rule", respectively, and are
|
470
|
-
-- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
|
471
|
-
-- HTML to CSS is when the lexer encounters a "style" tag with a "type"
|
472
|
-
-- attribute whose value is "text/css":
|
473
|
-
--
|
474
|
-
-- local css_tag = P('<style') * P(function(input, index)
|
475
|
-
-- if input:find('^[^>]+type="text/css"', index) then
|
476
|
-
-- return index
|
477
|
-
-- end
|
478
|
-
-- end)
|
479
|
-
--
|
480
|
-
-- This pattern looks for the beginning of a "style" tag and searches its
|
481
|
-
-- attribute list for the text "`type="text/css"`". (In this simplified example,
|
482
|
-
-- the Lua pattern does not consider whitespace between the '=' nor does it
|
483
|
-
-- consider that using single quotes is valid.) If there is a match, the
|
484
|
-
-- functional pattern returns a value instead of `nil`. In this case, the value
|
485
|
-
-- returned does not matter because we ultimately want to style the "style" tag
|
486
|
-
-- as an HTML tag, so the actual start rule looks like this:
|
487
|
-
--
|
488
|
-
-- local css_start_rule = #css_tag * tag
|
489
|
-
--
|
490
|
-
-- Now that the parent knows when to switch to the child, it needs to know when
|
491
|
-
-- to switch back. In the case of HTML/CSS, the switch back occurs when the
|
492
|
-
-- lexer encounters an ending "style" tag, though the lexer should still style
|
493
|
-
-- the tag as an HTML tag:
|
494
|
-
--
|
495
|
-
-- local css_end_rule = #P('</style>') * tag
|
496
|
-
--
|
497
|
-
-- Once the parent loads the child lexer and defines the child's start and end
|
498
|
-
-- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
|
499
|
-
--
|
500
|
-
-- l.embed_lexer(M, css, css_start_rule, css_end_rule)
|
501
|
-
--
|
502
|
-
-- The first parameter is the parent lexer object to embed the child in, which
|
503
|
-
-- in this case is `M`. The other three parameters are the child lexer object
|
504
|
-
-- loaded earlier followed by its start and end rules.
|
505
|
-
--
|
506
|
-
-- #### Child Lexer
|
507
|
-
--
|
508
|
-
-- The process for instructing a child lexer to embed itself into a parent is
|
509
|
-
-- very similar to embedding a child into a parent: first, load the parent lexer
|
510
|
-
-- into the child lexer with the [`lexer.load()`]() function and then create
|
511
|
-
-- start and end rules for the child lexer. However, in this case, swap the
|
512
|
-
-- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
|
513
|
-
-- lexer:
|
514
|
-
--
|
515
|
-
-- local html = l.load('html')
|
516
|
-
-- local php_start_rule = token('php_tag', '<?php ')
|
517
|
-
-- local php_end_rule = token('php_tag', '?>')
|
518
|
-
-- l.embed_lexer(html, M, php_start_rule, php_end_rule)
|
519
|
-
--
|
520
|
-
-- ## Code Folding
|
521
|
-
--
|
522
|
-
-- When reading source code, it is occasionally helpful to temporarily hide
|
523
|
-
-- blocks of code like functions, classes, comments, etc. This is the concept of
|
524
|
-
-- "folding". In the Textadept and SciTE editors for example, little indicators
|
525
|
-
-- in the editor margins appear next to code that can be folded at places called
|
526
|
-
-- "fold points". When the user clicks an indicator, the editor hides the code
|
527
|
-
-- associated with the indicator until the user clicks the indicator again. The
|
528
|
-
-- lexer specifies these fold points and what code exactly to fold.
|
529
|
-
--
|
530
|
-
-- The fold points for most languages occur on keywords or character sequences.
|
531
|
-
-- Examples of fold keywords are "if" and "end" in Lua and examples of fold
|
532
|
-
-- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
|
533
|
-
-- comment delimiters, respectively. However, these fold points cannot occur
|
534
|
-
-- just anywhere. For example, lexers should not recognize fold keywords that
|
535
|
-
-- appear within strings or comments. The lexer's `_foldsymbols` table allows
|
536
|
-
-- you to conveniently define fold points with such granularity. For example,
|
537
|
-
-- consider C:
|
538
|
-
--
|
539
|
-
-- M._foldsymbols = {
|
540
|
-
-- [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
|
541
|
-
-- [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
|
542
|
-
-- _patterns = {'[{}]', '/%*', '%*/'}
|
543
|
-
-- }
|
544
|
-
--
|
545
|
-
-- The first assignment states that any '{' or '}' that the lexer recognized as
|
546
|
-
-- an `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
|
547
|
-
-- match is a beginning fold point and `-1` indicates the match is an ending
|
548
|
-
-- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
|
549
|
-
-- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
|
550
|
-
-- The lexer does not consider any occurences of these characters outside their
|
551
|
-
-- defined tokens (such as in a string) as fold points. Finally, every
|
552
|
-
-- `_foldsymbols` table must have a `_patterns` field that contains a list of
|
553
|
-
-- [Lua patterns][] that match fold points. If the lexer encounters text that
|
554
|
-
-- matches one of those patterns, the lexer looks up the matched text in its
|
555
|
-
-- token's table to determine whether or not the text is a fold point. In the
|
556
|
-
-- example above, the first Lua pattern matches any '{' or '}' characters. When
|
557
|
-
-- the lexer comes across one of those characters, it checks if the match is an
|
558
|
-
-- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
|
559
|
-
-- point. The same idea applies for the other patterns. (The '%' is in the other
|
560
|
-
-- patterns because '\*' is a special character in Lua patterns that needs
|
561
|
-
-- escaping.) How do you specify fold keywords? Here is an example for Lua:
|
562
|
-
--
|
563
|
-
-- M._foldsymbols = {
|
564
|
-
-- [l.KEYWORD] = {
|
565
|
-
-- ['if'] = 1, ['do'] = 1, ['function'] = 1,
|
566
|
-
-- ['end'] = -1, ['repeat'] = 1, ['until'] = -1
|
567
|
-
-- },
|
568
|
-
-- _patterns = {'%l+'}
|
569
|
-
-- }
|
570
|
-
--
|
571
|
-
-- Any time the lexer encounters a lower case word, if that word is a
|
572
|
-
-- `lexer.KEYWORD` token and in the associated list of fold points, the lexer
|
573
|
-
-- identifies the word as a fold point.
|
574
|
-
--
|
575
|
-
-- If your lexer needs to do some additional processing to determine if a match
|
576
|
-
-- is a fold point, assign a function that returns an integer. Returning `1` or
|
577
|
-
-- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
|
578
|
-
-- For example:
|
579
|
-
--
|
580
|
-
-- local function fold_strange_token(text, pos, line, s, match)
|
581
|
-
-- if ... then
|
582
|
-
-- return 1 -- beginning fold point
|
583
|
-
-- elseif ... then
|
584
|
-
-- return -1 -- ending fold point
|
585
|
-
-- end
|
586
|
-
-- return 0
|
587
|
-
-- end
|
588
|
-
--
|
589
|
-
-- M._foldsymbols = {
|
590
|
-
-- ['strange_token'] = {['|'] = fold_strange_token},
|
591
|
-
-- _patterns = {'|'}
|
592
|
-
-- }
|
593
|
-
--
|
594
|
-
-- Any time the lexer encounters a '|' that is a "strange_token", it calls the
|
595
|
-
-- `fold_strange_token` function to determine if '|' is a fold point. The lexer
|
596
|
-
-- calls these functions with the following arguments: the text to identify fold
|
597
|
-
-- points in, the beginning position of the current line in the text to fold,
|
598
|
-
-- the current line's text, the position in the current line the matched text
|
599
|
-
-- starts at, and the matched text itself.
|
600
|
-
--
|
601
|
-
-- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
|
602
|
-
--
|
603
|
-
-- ## Using Lexers
|
604
|
-
--
|
605
|
-
-- ### Textadept
|
606
|
-
--
|
607
|
-
-- Put your lexer in your *~/.textadept/lexers/* directory so you do not
|
608
|
-
-- overwrite it when upgrading Textadept. Also, lexers in this directory
|
609
|
-
-- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
|
610
|
-
-- the default *lua* lexer. This is convenient for tweaking a default lexer to
|
611
|
-
-- your liking. Then add a [file type][] for your lexer if necessary.
|
612
|
-
--
|
613
|
-
-- [file type]: _M.textadept.file_types.html
|
614
|
-
--
|
615
|
-
-- ### SciTE
|
616
|
-
--
|
617
|
-
-- Create a *.properties* file for your lexer and `import` it in either your
|
618
|
-
-- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the
|
619
|
-
-- *.properties* file should contain:
|
620
|
-
--
|
621
|
-
-- file.patterns.[lexer_name]=[file_patterns]
|
622
|
-
-- lexer.$(file.patterns.[lexer_name])=[lexer_name]
|
623
|
-
--
|
624
|
-
-- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
|
625
|
-
-- and `[file_patterns]` is a set of file extensions to use your lexer for.
|
626
|
-
--
|
627
|
-
-- Please note that Lua lexers ignore any styling information in *.properties*
|
628
|
-
-- files. Your theme file in the *lexers/themes/* directory contains styling
|
629
|
-
-- information.
|
630
|
-
--
|
631
|
-
-- ## Considerations
|
632
|
-
--
|
633
|
-
-- ### Performance
|
634
|
-
--
|
635
|
-
-- There might be some slight overhead when initializing a lexer, but loading a
|
636
|
-
-- file from disk into Scintilla is usually more expensive. On modern computer
|
637
|
-
-- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
|
638
|
-
-- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
|
639
|
-
-- so that the most common rules match first. Do keep in mind that order matters
|
640
|
-
-- for similar rules.
|
641
|
-
--
|
642
|
-
-- ### Limitations
|
643
|
-
--
|
644
|
-
-- Embedded preprocessor languages like PHP cannot completely embed in their
|
645
|
-
-- parent languages in that the parent's tokens do not support start and end
|
646
|
-
-- rules. This mostly goes unnoticed, but code like
|
647
|
-
--
|
648
|
-
-- <div id="<?php echo $id; ?>">
|
649
|
-
--
|
650
|
-
-- or
|
651
|
-
--
|
652
|
-
-- <div <?php if ($odd) { echo 'class="odd"'; } ?>>
|
653
|
-
--
|
654
|
-
-- will not style correctly.
|
655
|
-
--
|
656
|
-
-- ### Troubleshooting
|
657
|
-
--
|
658
|
-
-- Errors in lexers can be tricky to debug. Lexers print Lua errors to
|
659
|
-
-- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
|
660
|
-
-- from a terminal is the easiest way to see errors as they occur.
|
661
|
-
--
|
662
|
-
-- ### Risks
|
663
|
-
--
|
664
|
-
-- Poorly written lexers have the ability to crash Scintilla (and thus its
|
665
|
-
-- containing application), so unsaved data might be lost. However, I have only
|
666
|
-
-- observed these crashes in early lexer development, when syntax errors or
|
667
|
-
-- pattern errors are present. Once the lexer actually starts styling text
|
668
|
-
-- (either correctly or incorrectly, it does not matter), I have not observed
|
669
|
-
-- any crashes.
|
670
|
-
--
|
671
|
-
-- ### Acknowledgements
|
672
|
-
--
|
673
|
-
-- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
|
674
|
-
-- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
|
675
|
-
--
|
676
|
-
-- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
|
677
|
-
-- @field LEXERPATH (string)
|
678
|
-
-- The path used to search for a lexer to load.
|
679
|
-
-- Identical in format to Lua's `package.path` string.
|
680
|
-
-- The default value is `package.path`.
|
681
|
-
-- @field DEFAULT (string)
|
682
|
-
-- The token name for default tokens.
|
683
|
-
-- @field WHITESPACE (string)
|
684
|
-
-- The token name for whitespace tokens.
|
685
|
-
-- @field COMMENT (string)
|
686
|
-
-- The token name for comment tokens.
|
687
|
-
-- @field STRING (string)
|
688
|
-
-- The token name for string tokens.
|
689
|
-
-- @field NUMBER (string)
|
690
|
-
-- The token name for number tokens.
|
691
|
-
-- @field KEYWORD (string)
|
692
|
-
-- The token name for keyword tokens.
|
693
|
-
-- @field IDENTIFIER (string)
|
694
|
-
-- The token name for identifier tokens.
|
695
|
-
-- @field OPERATOR (string)
|
696
|
-
-- The token name for operator tokens.
|
697
|
-
-- @field ERROR (string)
|
698
|
-
-- The token name for error tokens.
|
699
|
-
-- @field PREPROCESSOR (string)
|
700
|
-
-- The token name for preprocessor tokens.
|
701
|
-
-- @field CONSTANT (string)
|
702
|
-
-- The token name for constant tokens.
|
703
|
-
-- @field VARIABLE (string)
|
704
|
-
-- The token name for variable tokens.
|
705
|
-
-- @field FUNCTION (string)
|
706
|
-
-- The token name for function tokens.
|
707
|
-
-- @field CLASS (string)
|
708
|
-
-- The token name for class tokens.
|
709
|
-
-- @field TYPE (string)
|
710
|
-
-- The token name for type tokens.
|
711
|
-
-- @field LABEL (string)
|
712
|
-
-- The token name for label tokens.
|
713
|
-
-- @field REGEX (string)
|
714
|
-
-- The token name for regex tokens.
|
715
|
-
-- @field STYLE_CLASS (string)
|
716
|
-
-- The style typically used for class definitions.
|
717
|
-
-- @field STYLE_COMMENT (string)
|
718
|
-
-- The style typically used for code comments.
|
719
|
-
-- @field STYLE_CONSTANT (string)
|
720
|
-
-- The style typically used for constants.
|
721
|
-
-- @field STYLE_ERROR (string)
|
722
|
-
-- The style typically used for erroneous syntax.
|
723
|
-
-- @field STYLE_FUNCTION (string)
|
724
|
-
-- The style typically used for function definitions.
|
725
|
-
-- @field STYLE_KEYWORD (string)
|
726
|
-
-- The style typically used for language keywords.
|
727
|
-
-- @field STYLE_LABEL (string)
|
728
|
-
-- The style typically used for labels.
|
729
|
-
-- @field STYLE_NUMBER (string)
|
730
|
-
-- The style typically used for numbers.
|
731
|
-
-- @field STYLE_OPERATOR (string)
|
732
|
-
-- The style typically used for operators.
|
733
|
-
-- @field STYLE_REGEX (string)
|
734
|
-
-- The style typically used for regular expression strings.
|
735
|
-
-- @field STYLE_STRING (string)
|
736
|
-
-- The style typically used for strings.
|
737
|
-
-- @field STYLE_PREPROCESSOR (string)
|
738
|
-
-- The style typically used for preprocessor statements.
|
739
|
-
-- @field STYLE_TYPE (string)
|
740
|
-
-- The style typically used for static types.
|
741
|
-
-- @field STYLE_VARIABLE (string)
|
742
|
-
-- The style typically used for variables.
|
743
|
-
-- @field STYLE_WHITESPACE (string)
|
744
|
-
-- The style typically used for whitespace.
|
745
|
-
-- @field STYLE_EMBEDDED (string)
|
746
|
-
-- The style typically used for embedded code.
|
747
|
-
-- @field STYLE_IDENTIFIER (string)
|
748
|
-
-- The style typically used for identifier words.
|
749
|
-
-- @field STYLE_DEFAULT (string)
|
750
|
-
-- The style all styles are based off of.
|
751
|
-
-- @field STYLE_LINENUMBER (string)
|
752
|
-
-- The style used for all margins except fold margins.
|
753
|
-
-- @field STYLE_BRACELIGHT (string)
|
754
|
-
-- The style used for highlighted brace characters.
|
755
|
-
-- @field STYLE_BRACEBAD (string)
|
756
|
-
-- The style used for unmatched brace characters.
|
757
|
-
-- @field STYLE_CONTROLCHAR (string)
|
758
|
-
-- The style used for control characters.
|
759
|
-
-- Color attributes are ignored.
|
760
|
-
-- @field STYLE_INDENTGUIDE (string)
|
761
|
-
-- The style used for indentation guides.
|
762
|
-
-- @field STYLE_CALLTIP (string)
|
763
|
-
-- The style used by call tips if [`buffer.call_tip_use_style`]() is set.
|
764
|
-
-- Only the font name, size, and color attributes are used.
|
765
|
-
-- @field any (pattern)
|
766
|
-
-- A pattern that matches any single character.
|
767
|
-
-- @field ascii (pattern)
|
768
|
-
-- A pattern that matches any ASCII character (codes 0 to 127).
|
769
|
-
-- @field extend (pattern)
|
770
|
-
-- A pattern that matches any ASCII extended character (codes 0 to 255).
|
771
|
-
-- @field alpha (pattern)
|
772
|
-
-- A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
|
773
|
-
-- @field digit (pattern)
|
774
|
-
-- A pattern that matches any digit ('0'-'9').
|
775
|
-
-- @field alnum (pattern)
|
776
|
-
-- A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
|
777
|
-
-- '0'-'9').
|
778
|
-
-- @field lower (pattern)
|
779
|
-
-- A pattern that matches any lower case character ('a'-'z').
|
780
|
-
-- @field upper (pattern)
|
781
|
-
-- A pattern that matches any upper case character ('A'-'Z').
|
782
|
-
-- @field xdigit (pattern)
|
783
|
-
-- A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
|
784
|
-
-- @field cntrl (pattern)
|
785
|
-
-- A pattern that matches any control character (ASCII codes 0 to 31).
|
786
|
-
-- @field graph (pattern)
|
787
|
-
-- A pattern that matches any graphical character ('!' to '~').
|
788
|
-
-- @field print (pattern)
|
789
|
-
-- A pattern that matches any printable character (' ' to '~').
|
790
|
-
-- @field punct (pattern)
|
791
|
-
-- A pattern that matches any punctuation character ('!' to '/', ':' to '@',
|
792
|
-
-- '[' to ''', '{' to '~').
|
793
|
-
-- @field space (pattern)
|
794
|
-
-- A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
|
795
|
-
-- '\r', space).
|
796
|
-
-- @field newline (pattern)
|
797
|
-
-- A pattern that matches any set of end of line characters.
|
798
|
-
-- @field nonnewline (pattern)
|
799
|
-
-- A pattern that matches any single, non-newline character.
|
800
|
-
-- @field nonnewline_esc (pattern)
|
801
|
-
-- A pattern that matches any single, non-newline character or any set of end
|
802
|
-
-- of line characters escaped with '\'.
|
803
|
-
-- @field dec_num (pattern)
|
804
|
-
-- A pattern that matches a decimal number.
|
805
|
-
-- @field hex_num (pattern)
|
806
|
-
-- A pattern that matches a hexadecimal number.
|
807
|
-
-- @field oct_num (pattern)
|
808
|
-
-- A pattern that matches an octal number.
|
809
|
-
-- @field integer (pattern)
|
810
|
-
-- A pattern that matches either a decimal, hexadecimal, or octal number.
|
811
|
-
-- @field float (pattern)
|
812
|
-
-- A pattern that matches a floating point number.
|
813
|
-
-- @field word (pattern)
|
814
|
-
-- A pattern that matches a typical word. Words begin with a letter or
|
815
|
-
-- underscore and consist of alphanumeric and underscore characters.
|
816
|
-
-- @field FOLD_BASE (number)
|
817
|
-
-- The initial (root) fold level.
|
818
|
-
-- @field FOLD_BLANK (number)
|
819
|
-
-- Flag indicating that the line is blank.
|
820
|
-
-- @field FOLD_HEADER (number)
|
821
|
-
-- Flag indicating the line is fold point.
|
822
|
-
-- @field fold_level (table, Read-only)
|
823
|
-
-- Table of fold level bit-masks for line numbers starting from zero.
|
824
|
-
-- Fold level masks are composed of an integer level combined with any of the
|
825
|
-
-- following bits:
|
826
|
-
--
|
827
|
-
-- * `lexer.FOLD_BASE`
|
828
|
-
-- The initial fold level.
|
829
|
-
-- * `lexer.FOLD_BLANK`
|
830
|
-
-- The line is blank.
|
831
|
-
-- * `lexer.FOLD_HEADER`
|
832
|
-
-- The line is a header, or fold point.
|
833
|
-
-- @field indent_amount (table, Read-only)
|
834
|
-
-- Table of indentation amounts in character columns, for line numbers
|
835
|
-
-- starting from zero.
|
836
|
-
-- @field property (table)
|
837
|
-
-- Map of key-value string pairs.
|
838
|
-
-- @field property_expanded (table, Read-only)
|
839
|
-
-- Map of key-value string pairs with `$()` and `%()` variable replacement
|
840
|
-
-- performed in values.
|
841
|
-
-- @field property_int (table, Read-only)
|
842
|
-
-- Map of key-value pairs with values interpreted as numbers, or `0` if not
|
843
|
-
-- found.
|
844
|
-
-- @field style_at (table, Read-only)
|
845
|
-
-- Table of style names at positions in the buffer starting from zero.
|
846
|
-
module('lexer')]=]
|
847
|
-
|
848
|
-
local lpeg = require('lpeg')
|
849
|
-
local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
|
850
|
-
local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
|
851
|
-
local lpeg_Cmt, lpeg_C, lpeg_Cg = lpeg.Cmt, lpeg.C, lpeg.Cg
|
852
|
-
local lpeg_match = lpeg.match
|
853
|
-
|
854
|
-
M.LEXERPATH = package.path
|
855
|
-
|
856
|
-
-- Table of loaded lexers.
|
857
|
-
local lexers = {}
|
858
|
-
|
859
|
-
-- Keep track of the last parent lexer loaded. This lexer's rules are used for
|
860
|
-
-- proxy lexers (those that load parent and child lexers to embed) that do not
|
861
|
-
-- declare a parent lexer.
|
862
|
-
local parent_lexer
|
863
|
-
|
864
|
-
if not package.searchpath then
|
865
|
-
-- Searches for the given *name* in the given *path*.
|
866
|
-
-- This is an implementation of Lua 5.2's `package.searchpath()` function for
|
867
|
-
-- Lua 5.1.
|
868
|
-
function package.searchpath(name, path)
|
869
|
-
local tried = {}
|
870
|
-
for part in path:gmatch('[^;]+') do
|
871
|
-
local filename = part:gsub('%?', name)
|
872
|
-
local f = io.open(filename, 'r')
|
873
|
-
if f then f:close() return filename end
|
874
|
-
tried[#tried + 1] = ("no file '%s'"):format(filename)
|
875
|
-
end
|
876
|
-
return nil, table.concat(tried, '\n')
|
877
|
-
end
|
878
|
-
end
|
879
|
-
|
880
|
-
-- Adds a rule to a lexer's current ordered list of rules.
|
881
|
-
-- @param lexer The lexer to add the given rule to.
|
882
|
-
-- @param name The name associated with this rule. It is used for other lexers
|
883
|
-
-- to access this particular rule from the lexer's `_RULES` table. It does not
|
884
|
-
-- have to be the same as the name passed to `token`.
|
885
|
-
-- @param rule The LPeg pattern of the rule.
|
886
|
-
local function add_rule(lexer, id, rule)
|
887
|
-
if not lexer._RULES then
|
888
|
-
lexer._RULES = {}
|
889
|
-
-- Contains an ordered list (by numerical index) of rule names. This is used
|
890
|
-
-- in conjunction with lexer._RULES for building _TOKENRULE.
|
891
|
-
lexer._RULEORDER = {}
|
892
|
-
end
|
893
|
-
lexer._RULES[id] = rule
|
894
|
-
lexer._RULEORDER[#lexer._RULEORDER + 1] = id
|
895
|
-
end
|
896
|
-
|
897
|
-
-- Adds a new Scintilla style to Scintilla.
|
898
|
-
-- @param lexer The lexer to add the given style to.
|
899
|
-
-- @param token_name The name of the token associated with this style.
|
900
|
-
-- @param style A Scintilla style created from `style()`.
|
901
|
-
-- @see style
|
902
|
-
local function add_style(lexer, token_name, style)
|
903
|
-
local num_styles = lexer._numstyles
|
904
|
-
if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
|
905
|
-
if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
|
906
|
-
lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
|
907
|
-
lexer._EXTRASTYLES[token_name] = style
|
908
|
-
end
|
909
|
-
|
910
|
-
-- (Re)constructs `lexer._TOKENRULE`.
|
911
|
-
-- @param parent The parent lexer.
|
912
|
-
local function join_tokens(lexer)
|
913
|
-
local patterns, order = lexer._RULES, lexer._RULEORDER
|
914
|
-
local token_rule = patterns[order[1]]
|
915
|
-
for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
|
916
|
-
lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
|
917
|
-
return lexer._TOKENRULE
|
918
|
-
end
|
919
|
-
|
920
|
-
-- Adds a given lexer and any of its embedded lexers to a given grammar.
|
921
|
-
-- @param grammar The grammar to add the lexer to.
|
922
|
-
-- @param lexer The lexer to add.
|
923
|
-
local function add_lexer(grammar, lexer, token_rule)
|
924
|
-
local token_rule = join_tokens(lexer)
|
925
|
-
local lexer_name = lexer._NAME
|
926
|
-
for _, child in ipairs(lexer._CHILDREN) do
|
927
|
-
if child._CHILDREN then add_lexer(grammar, child) end
|
928
|
-
local child_name = child._NAME
|
929
|
-
local rules = child._EMBEDDEDRULES[lexer_name]
|
930
|
-
local rules_token_rule = grammar['__'..child_name] or rules.token_rule
|
931
|
-
grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
|
932
|
-
rules.end_rule^-1 * lpeg_V(lexer_name)
|
933
|
-
local embedded_child = '_'..child_name
|
934
|
-
grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
|
935
|
-
rules_token_rule)^0 * rules.end_rule^-1
|
936
|
-
token_rule = lpeg_V(embedded_child) + token_rule
|
937
|
-
end
|
938
|
-
grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
|
939
|
-
grammar[lexer_name] = token_rule^0
|
940
|
-
end
|
941
|
-
|
942
|
-
-- (Re)constructs `lexer._GRAMMAR`.
|
943
|
-
-- @param lexer The parent lexer.
|
944
|
-
-- @param initial_rule The name of the rule to start lexing with. The default
|
945
|
-
-- value is `lexer._NAME`. Multilang lexers use this to start with a child
|
946
|
-
-- rule if necessary.
|
947
|
-
local function build_grammar(lexer, initial_rule)
|
948
|
-
-- local children = lexer._CHILDREN
|
949
|
-
-- if children then
|
950
|
-
local lexer_name = lexer._NAME
|
951
|
-
if not initial_rule then initial_rule = lexer_name end
|
952
|
-
local grammar = {initial_rule}
|
953
|
-
if not lexer._CHILDREN then lexer._CHILDREN={} end
|
954
|
-
add_lexer(grammar, lexer)
|
955
|
-
lexer._INITIALRULE = initial_rule
|
956
|
-
lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
|
957
|
-
-- else
|
958
|
-
-- lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
|
959
|
-
-- end
|
960
|
-
end
|
961
|
-
|
962
|
-
local string_upper = string.upper
|
963
|
-
-- Default styles.
|
964
|
-
local default = {
|
965
|
-
'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
|
966
|
-
'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
|
967
|
-
'function', 'class', 'type', 'label', 'regex', 'embedded'
|
968
|
-
}
|
969
|
-
for _, v in ipairs(default) do
|
970
|
-
M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
|
971
|
-
end
|
972
|
-
-- Predefined styles.
|
973
|
-
local predefined = {
|
974
|
-
'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
|
975
|
-
'indentguide', 'calltip'
|
976
|
-
}
|
977
|
-
for _, v in ipairs(predefined) do
|
978
|
-
M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
|
979
|
-
end
|
980 | -
981 | - ---
982 | - -- Initializes or loads and returns the lexer of string name *name*.
983 | - -- Scintilla calls this function to load a lexer. Parent lexers also call this
984 | - -- function to load child lexers and vice-versa. The user calls this function
985 | - -- to load a lexer when using Scintillua as a Lua library.
986 | - -- @param name The name of the lexing language.
987 | - -- @param alt_name The alternate name of the lexing language. This is useful for
988 | - --   embedding the same child lexer with multiple sets of start and end tokens.
989 | - -- @return lexer object
990 | - -- @name load
991 | - function M.load(name, alt_name)
992 | -   if lexers[alt_name or name] then return lexers[alt_name or name] end
993 | -   parent_lexer = nil -- reset
994 | -
995 | -   -- When using Scintillua as a stand-alone module, the `property` and
996 | -   -- `property_int` tables do not exist (they are not useful). Create them to
997 | -   -- prevent errors from occurring.
998 | -   if not M.property then
999 | -     M.property, M.property_int = {}, setmetatable({}, {
1000 | -       __index = function(t, k)
1001 | -         return tostring(tonumber(M.property[k]) or 0)
1002 | -       end,
1003 | -       __newindex = function() error('read-only property') end
1004 | -     })
1005 | -   end
1006 | -
1007 | -   -- Load the language lexer with its rules, styles, etc.
1008 | -   M.WHITESPACE = (alt_name or name)..'_whitespace'
1009 | -   local lexer_file, error = package.searchpath(name, M.LEXERPATH)
1010 | -   local ok, lexer = pcall(dofile, lexer_file or '')
1011 | -   if not ok then
1012 | -     _G.print(error or lexer) -- error message
1013 | -     lexer = {_NAME = alt_name or name}
1014 | -   end
1015 | -   if alt_name then lexer._NAME = alt_name end
1016 | -
1017 | -   -- Create the initial maps for token names to style numbers and styles.
1018 | -   local token_styles = {}
1019 | -   for i = 1, #default do token_styles[default[i]] = i - 1 end
1020 | -   for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
1021 | -   lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
1022 | -   lexer._EXTRASTYLES = {}
1023 | -
1024 | -   -- If the lexer is a proxy (loads parent and child lexers to embed) and does
1025 | -   -- not declare a parent, try and find one and use its rules.
1026 | -   if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end
1027 | -
1028 | -   -- If the lexer is a proxy or a child that embedded itself, add its rules and
1029 | -   -- styles to the parent lexer. Then set the parent to be the main lexer.
1030 | -   if lexer._lexer then
1031 | -     local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
1032 | -     if not l._tokenstyles then l._tokenstyles = {} end
1033 | -     for _, r in ipairs(_r or {}) do
1034 | -       -- Prevent rule id clashes.
1035 | -       l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
1036 | -     end
1037 | -     for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
1038 | -     lexer = l
1039 | -   end
1040 | -
1041 | -   -- Add the lexer's styles and build its grammar.
1042 | -   if lexer._rules then
1043 | -     for token, style in pairs(lexer._tokenstyles or {}) do
1044 | -       add_style(lexer, token, style)
1045 | -     end
1046 | -     for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
1047 | -     build_grammar(lexer)
1048 | -   end
1049 | -   -- Add the lexer's unique whitespace style.
1050 | -   add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)
1051 | -
1052 | -   -- Process the lexer's fold symbols.
1053 | -   if lexer._foldsymbols and lexer._foldsymbols._patterns then
1054 | -     local patterns = lexer._foldsymbols._patterns
1055 | -     for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
1056 | -   end
1057 | -
1058 | -   lexer.lex, lexer.fold = M.lex, M.fold
1059 | -   -- Immun.io: copy over some of our helpful functions.
1060 | -   if M.lex_recursive then lexer.lex_recursive = M.lex_recursive end
1061 | -   if M.unlex_rules then lexer.unlex_rules = M.unlex_rules end
1062 | -   lexers[alt_name or name] = lexer
1063 | -   return lexer
1064 | - end
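
`load` is also the entry point when Scintillua is used as a plain Lua library. A minimal sketch (assumes this file is requireable as `lexer` and a `lua` lexer file sits on `LEXERPATH`):

    local l = require('lexer')
    local lua_lexer = l.load('lua')
    print(lua_lexer._NAME)              --> lua
    -- A second load of the same name returns the cached lexer object:
    assert(l.load('lua') == lua_lexer)
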
1065 | -
1066 | - ---
1067 | - -- Lexes a chunk of text *text* (that has an initial style number of
1068 | - -- *init_style*) with lexer *lexer*.
1069 | - -- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
1070 | - -- Otherwise the text is lexed as a whole.
1071 | - -- @param lexer The lexer object to lex with.
1072 | - -- @param text The text in the buffer to lex.
1073 | - -- @param init_style The current style. Multiple-language lexers use this to
1074 | - --   determine which language to start lexing in.
1075 | - -- @return table of token names and positions.
1076 | - -- @name lex
1077 | - function M.lex(lexer, text, init_style)
1078 | -   if not lexer._LEXBYLINE then
1079 | -     -- For multilang lexers, build a new grammar whose initial_rule is the
1080 | -     -- current language.
1081 | -     if lexer._CHILDREN then
1082 | -       for style, style_num in pairs(lexer._TOKENSTYLES) do
1083 | -         if style_num == init_style then
1084 | -           local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
1085 | -           if lexer._INITIALRULE ~= lexer_name then
1086 | -             build_grammar(lexer, lexer_name)
1087 | -           end
1088 | -           break
1089 | -         end
1090 | -       end
1091 | -     end
1092 | -     return lpeg_match(lexer._GRAMMAR, text)
1093 | -   else
1094 | -     local tokens = {}
1095 | -     local function append(tokens, line_tokens, offset)
1096 | -       for i = 1, #line_tokens, 2 do
1097 | -         tokens[#tokens + 1] = line_tokens[i]
1098 | -         tokens[#tokens + 1] = line_tokens[i + 1] + offset
1099 | -       end
1100 | -     end
1101 | -     local offset = 0
1102 | -     local grammar = lexer._GRAMMAR
1103 | -     for line in text:gmatch('[^\r\n]*\r?\n?') do
1104 | -       local line_tokens = lpeg_match(grammar, line)
1105 | -       if line_tokens then append(tokens, line_tokens, offset) end
1106 | -       offset = offset + #line
1107 | -       -- Use the default style to the end of the line if none was specified.
1108 | -       if tokens[#tokens] ~= offset then
1109 | -         tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
1110 | -       end
1111 | -     end
1112 | -     return tokens
1113 | -   end
1114 | - end
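
Because this copy of the file redefines `M.token` to build one capture table per token (see `M.token` further down), `lex` returns an array of `{token = ..., val = ..., pos = ...}` tables rather than the stock name/position pairs. A sketch (same stand-alone assumptions as above):

    local l = require('lexer')
    local lua_lexer = l.load('lua')
    for _, t in ipairs(lua_lexer:lex('local x = 1\n')) do
      -- t.pos is the position just past the matched text.
      print(t.token, t.val, t.pos)
    end
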
1115 | -
1116 | - ---
1117 | - -- Folds a chunk of text *text* with lexer *lexer*.
1118 | - -- Folds *text* starting at position *start_pos* on line number *start_line*
1119 | - -- with a beginning fold level of *start_level* in the buffer. If *lexer* has
1120 | - -- a `_fold` function or a `_foldsymbols` table, that field is used to perform
1121 | - -- folding. Otherwise, if a `fold.by.indentation` property is set, folding by
1122 | - -- indentation is done.
1123 | - -- @param lexer The lexer object to fold with.
1124 | - -- @param text The text in the buffer to fold.
1125 | - -- @param start_pos The position in the buffer *text* starts at.
1126 | - -- @param start_line The line number *text* starts on.
1127 | - -- @param start_level The fold level *text* starts on.
1128 | - -- @return table of fold levels.
1129 | - -- @name fold
1130 | - function M.fold(lexer, text, start_pos, start_line, start_level)
1131 | -   local folds = {}
1132 | -   if text == '' then return folds end
1133 | -   local fold = M.property_int['fold'] > 0
1134 | -   local FOLD_BASE = M.FOLD_BASE
1135 | -   local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
1136 | -   if fold and lexer._fold then
1137 | -     return lexer._fold(text, start_pos, start_line, start_level)
1138 | -   elseif fold and lexer._foldsymbols then
1139 | -     local lines = {}
1140 | -     for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
1141 | -       lines[#lines + 1] = {p, l}
1142 | -     end
1143 | -     local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
1144 | -     local fold_symbols = lexer._foldsymbols
1145 | -     local fold_symbols_patterns = fold_symbols._patterns
1146 | -     local style_at, fold_level = M.style_at, M.fold_level
1147 | -     local line_num, prev_level = start_line, start_level
1148 | -     local current_level = prev_level
1149 | -     for i = 1, #lines do
1150 | -       local pos, line = lines[i][1], lines[i][2]
1151 | -       if line ~= '' then
1152 | -         local level_decreased = false
1153 | -         for j = 1, #fold_symbols_patterns do
1154 | -           for s, match in line:gmatch(fold_symbols_patterns[j]) do
1155 | -             local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
1156 | -             local l = symbols and symbols[match]
1157 | -             if type(l) == 'function' then l = l(text, pos, line, s, match) end
1158 | -             if type(l) == 'number' then
1159 | -               current_level = current_level + l
1160 | -               if l < 0 and current_level < prev_level then
1161 | -                 -- Potential zero-sum line. If the level were to go back up on
1162 | -                 -- the same line, the line may be marked as a fold header.
1163 | -                 level_decreased = true
1164 | -               end
1165 | -             end
1166 | -           end
1167 | -         end
1168 | -         folds[line_num] = prev_level
1169 | -         if current_level > prev_level then
1170 | -           folds[line_num] = prev_level + FOLD_HEADER
1171 | -         elseif level_decreased and current_level == prev_level and
1172 | -                fold_zero_sum_lines then
1173 | -           if line_num > start_line then
1174 | -             folds[line_num] = prev_level - 1 + FOLD_HEADER
1175 | -           else
1176 | -             -- Typing within a zero-sum line.
1177 | -             local level = fold_level[line_num - 1] - 1
1178 | -             if level > FOLD_HEADER then level = level - FOLD_HEADER end
1179 | -             if level > FOLD_BLANK then level = level - FOLD_BLANK end
1180 | -             folds[line_num] = level + FOLD_HEADER
1181 | -             current_level = current_level + 1
1182 | -           end
1183 | -         end
1184 | -         if current_level < FOLD_BASE then current_level = FOLD_BASE end
1185 | -         prev_level = current_level
1186 | -       else
1187 | -         folds[line_num] = prev_level + FOLD_BLANK
1188 | -       end
1189 | -       line_num = line_num + 1
1190 | -     end
1191 | -   elseif fold and M.property_int['fold.by.indentation'] > 0 then
1192 | -     -- Indentation based folding.
1193 | -     -- Calculate indentation per line.
1194 | -     local indentation = {}
1195 | -     for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
1196 | -       indentation[#indentation + 1] = line ~= '' and #indent
1197 | -     end
1198 | -     -- Make line before start_line a fold header if necessary.
1199 | -     if start_line > 0 and indentation[1] then
1200 | -       local indent = M.indent_amount[start_line - 1]
1201 | -       if indentation[1] > indent then
1202 | -         folds[start_line - 1] = FOLD_BASE + indent + FOLD_HEADER
1203 | -       end
1204 | -     end
1205 | -     -- Iterate over lines, setting fold numbers and fold flags.
1206 | -     local line_num, prev_level = start_line, FOLD_BASE + (indentation[1] or 0)
1207 | -     local current_level = prev_level
1208 | -     for i = 1, #indentation do
1209 | -       if indentation[i] then
1210 | -         for j = i + 1, #indentation do
1211 | -           if indentation[j] then
1212 | -             current_level = FOLD_BASE + indentation[j]
1213 | -             break
1214 | -           end
1215 | -         end
1216 | -         folds[line_num] = prev_level
1217 | -         if current_level > prev_level then
1218 | -           folds[line_num] = prev_level + FOLD_HEADER
1219 | -         end
1220 | -         prev_level = current_level
1221 | -       else
1222 | -         folds[line_num] = prev_level + FOLD_BLANK
1223 | -       end
1224 | -       line_num = line_num + 1
1225 | -     end
1226 | -   else
1227 | -     -- No folding, reset fold levels if necessary.
1228 | -     local current_line = start_line
1229 | -     for _ in text:gmatch('\r?\n') do
1230 | -       folds[current_line] = start_level
1231 | -       current_line = current_line + 1
1232 | -     end
1233 | -   end
1234 | -   return folds
1235 | - end
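
Inside Scintilla, `property`, `property_int`, `style_at`, and the `FOLD_*` constants are supplied in C. A stand-alone sketch of indentation folding is possible if those are stubbed; note that the `property_int` fallback created in `load` wraps its result in `tostring`, so the sketch installs a numeric reader, and the `FOLD_*` values below are Scintilla's `SC_FOLDLEVEL*` constants (both are assumptions, not part of this file):

    local l = require('lexer')
    local lua_lexer = l.load('lua')
    l.FOLD_BASE, l.FOLD_HEADER, l.FOLD_BLANK = 0x400, 0x2000, 0x1000
    l.property['fold'] = '1'
    l.property['fold.by.indentation'] = '1'
    l.property_int = setmetatable({}, {  -- numeric reader for stand-alone use
      __index = function(_, k) return tonumber(l.property[k]) or 0 end
    })
    local levels = lua_lexer:fold('if x then\n  y()\nend\n', 1, 0, l.FOLD_BASE)
    print(levels[0], levels[1], levels[2])  --> 9216  1026  1024
    -- levels[0] carries the FOLD_HEADER flag because line 1 is more indented.
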
1236 | -
1237 | - -- The following are utility functions lexers will have access to.
1238 | -
1239 | - -- Common patterns.
1240 | - M.any = lpeg_P(1)
1241 | - M.ascii = lpeg_R('\000\127')
1242 | - M.extend = lpeg_R('\000\255')
1243 | - M.alpha = lpeg_R('AZ', 'az')
1244 | - M.digit = lpeg_R('09')
1245 | - M.alnum = lpeg_R('AZ', 'az', '09')
1246 | - M.lower = lpeg_R('az')
1247 | - M.upper = lpeg_R('AZ')
1248 | - M.xdigit = lpeg_R('09', 'AF', 'af')
1249 | - M.cntrl = lpeg_R('\000\031')
1250 | - M.graph = lpeg_R('!~')
1251 | - M.print = lpeg_R(' ~')
1252 | - M.punct = lpeg_R('!/', ':@', '[\'', '{~')
1253 | - M.space = lpeg_S('\t\v\f\n\r ')
1254 | -
1255 | - M.newline = lpeg_S('\r\n\f')^1
1256 | - M.nonnewline = 1 - M.newline
1257 | - M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any
1258 | -
1259 | - M.dec_num = M.digit^1
1260 | - M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
1261 | - M.oct_num = '0' * lpeg_R('07')^1
1262 | - M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
1263 | - M.float = lpeg_S('+-')^-1 *
1264 | -           (M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0 +
1265 | -            M.digit^1) *
1266 | -           lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1
1267 | - M.word = (M.alpha + '_') * (M.alnum + '_')^0
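
These shortcuts compose directly with ordinary LPeg patterns, e.g. (stand-alone sketch):

    local l = require('lexer')
    local lpeg = require('lpeg')
    print(lpeg.match(l.word, 'foo_bar1'))   --> 9  (one past the match)
    print(lpeg.match(l.integer, '0x1F'))    --> 5
    print(lpeg.match(l.dec_num, 'abc'))     --> nil
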
1268 | -
1269 | - ---
1270 | - -- Creates and returns a token pattern with token name *name* and pattern
1271 | - -- *patt*.
1272 | - -- If *name* is not a predefined token name, its style must be defined in the
1273 | - -- lexer's `_tokenstyles` table.
1274 | - -- @param name The name of the token. If this name is not a predefined token
1275 | - --   name, then a style needs to be associated with it in the lexer's
1276 | - --   `_tokenstyles` table.
1277 | - -- @param patt The LPeg pattern associated with the token.
1278 | - -- @return pattern
1279 | - -- @usage local ws = token(l.WHITESPACE, l.space^1)
1280 | - -- @usage local annotation = token('annotation', '@' * l.word)
1281 | - -- @name token
1282 | - function M.token(name, patt)
1283 | -   -- return lpeg_Cg(patt, name)
1284 | -   return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_C(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1285 | - end
1286 | -
1287 | - function M.parent_token(name, patt)
1288 | -   -- return lpeg_Cg(patt, name)
1289 | -   return lpeg_Ct( lpeg_Cg( lpeg_Cc(name), 'token' ) * lpeg_Cg( lpeg_Ct(patt), 'val' ) * lpeg_Cg( lpeg_Cp(), 'pos' ) )
1290 | - end
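
Matching one of these token patterns therefore yields a single table describing the token; `pos` is the position just past the matched text. For example:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local kw = l.token(l.KEYWORD, lpeg.P('local'))
    local t = lpeg.match(kw, 'local x')
    print(t.token, t.val, t.pos)  --> keyword  local  6
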
1291 | -
1292 | - ---
1293 | - -- Creates and returns a pattern that matches a range of text bounded by
1294 | - -- *chars* characters.
1295 | - -- This is a convenience function for matching more complicated delimited ranges
1296 | - -- like strings with escape characters and balanced parentheses. *single_line*
1297 | - -- indicates whether or not the range must be on a single line, *no_escape*
1298 | - -- indicates whether or not to ignore '\' as an escape character, and *balanced*
1299 | - -- indicates whether or not to handle balanced ranges like parentheses and
1300 | - -- requires *chars* to be composed of two characters.
1301 | - -- @param chars The character(s) that bound the matched range.
1302 | - -- @param single_line Optional flag indicating whether or not the range must be
1303 | - --   on a single line.
1304 | - -- @param no_escape Optional flag indicating whether or not the range end
1305 | - --   character may be escaped by a '\\' character.
1306 | - -- @param balanced Optional flag indicating whether or not to match a balanced
1307 | - --   range, like the "%b" Lua pattern. This flag only applies if *chars*
1308 | - --   consists of two different characters (e.g. "()").
1309 | - -- @return pattern
1310 | - -- @usage local dq_str_escapes = l.delimited_range('"')
1311 | - -- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
1312 | - -- @usage local unbalanced_parens = l.delimited_range('()')
1313 | - -- @usage local balanced_parens = l.delimited_range('()', false, false, true)
1314 | - -- @see nested_pair
1315 | - -- @name delimited_range
1316 | - function M.delimited_range(chars, single_line, no_escape, balanced)
1317 | -   local s = chars:sub(1, 1)
1318 | -   local e = #chars == 2 and chars:sub(2, 2) or s
1319 | -   local range
1320 | -   local b = balanced and s or ''
1321 | -   local n = single_line and '\n' or ''
1322 | -   if no_escape then
1323 | -     local invalid = lpeg_S(e..n..b)
1324 | -     range = M.any - invalid
1325 | -   else
1326 | -     local invalid = lpeg_S(e..n..b) + '\\'
1327 | -     range = M.any - invalid + '\\' * M.any
1328 | -   end
1329 | -   if balanced and s ~= e then
1330 | -     return lpeg_P{s * (range + lpeg_V(1))^0 * e}
1331 | -   else
1332 | -     return s * range^0 * lpeg_P(e)^-1
1333 | -   end
1334 | - end
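
For instance, the default (escape-aware) double-quoted string pattern consumes an escaped quote rather than terminating on it:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local dq_str = l.delimited_range('"')
    print(lpeg.match(dq_str, '"a\\"b" tail'))  --> 7
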
1335 | -
1336 | - ---
1337 | - -- Creates and returns a pattern that matches pattern *patt* only at the
1338 | - -- beginning of a line.
1339 | - -- @param patt The LPeg pattern to match on the beginning of a line.
1340 | - -- @return pattern
1341 | - -- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
1342 | - --   l.nonnewline^0)
1343 | - -- @name starts_line
1344 | - function M.starts_line(patt)
1345 | -   return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
1346 | -     local pos = index - #match
1347 | -     if pos == 1 then return index, ... end
1348 | -     local char = input:sub(pos - 1, pos - 1)
1349 | -     if char == '\n' or char == '\r' or char == '\f' then return index, ... end
1350 | -   end)
1351 | - end
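
A quick stand-alone illustration of the line-start check:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local hash = l.starts_line('#')
    print(lpeg.match(hash, '#if'))       --> 2
    print(lpeg.match(hash, 'x#if', 2))   --> nil  (not at a line start)
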
1352 | -
1353 | - ---
1354 | - -- Creates and returns a pattern that verifies that string set *s* contains the
1355 | - -- first non-whitespace character behind the current match position.
1356 | - -- @param s String character set like one passed to `lpeg.S()`.
1357 | - -- @return pattern
1358 | - -- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
1359 | - --   l.delimited_range('/')
1360 | - -- @name last_char_includes
1361 | - function M.last_char_includes(s)
1362 | -   s = '['..s:gsub('[-%%%[]', '%%%1')..']'
1363 | -   return lpeg_P(function(input, index)
1364 | -     if index == 1 then return index end
1365 | -     local i = index
1366 | -     while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
1367 | -     if input:sub(i - 1, i - 1):match(s) then return index end
1368 | -   end)
1369 | - end
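
This is the standard trick for deciding, say, that `/` begins a regex literal only after an operator. Reusing the `@usage` pattern above:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local regex = l.last_char_includes('+-*!%^&|=,([{') *
                  l.delimited_range('/', true)
    print(lpeg.match(regex, '= /ab/', 3))  --> 7   (follows '=')
    print(lpeg.match(regex, 'x /ab/', 3))  --> nil (follows an identifier)
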
1370 | -
1371 | - ---
1372 | - -- Returns a pattern that matches a balanced range of text that starts with
1373 | - -- string *start_chars* and ends with string *end_chars*.
1374 | - -- With single-character delimiters, this function is identical to
1375 | - -- `delimited_range(start_chars..end_chars, false, true, true)`.
1376 | - -- @param start_chars The string starting a nested sequence.
1377 | - -- @param end_chars The string ending a nested sequence.
1378 | - -- @return pattern
1379 | - -- @usage local nested_comment = l.nested_pair('/*', '*/')
1380 | - -- @see delimited_range
1381 | - -- @name nested_pair
1382 | - function M.nested_pair(start_chars, end_chars)
1383 | -   local s, e = start_chars, lpeg_P(end_chars)^-1
1384 | -   return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
1385 | - end
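
For example, the nested comment pattern from the `@usage` line balances inner pairs:

    local l = require('lexer')
    local lpeg = require('lpeg')
    local nested_comment = l.nested_pair('/*', '*/')
    print(lpeg.match(nested_comment, '/* a /* b */ c */ d'))  --> 18
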
1386 | -
1387 | - ---
1388 | - -- Creates and returns a pattern that matches any single word in list *words*.
1389 | - -- Words consist of alphanumeric and underscore characters, as well as the
1390 | - -- characters in string set *word_chars*. *case_insensitive* indicates whether
1391 | - -- or not to ignore case when matching words.
1392 | - -- This is a convenience function for simplifying a set of ordered choice word
1393 | - -- patterns.
1394 | - -- @param words A table of words.
1395 | - -- @param word_chars Optional string of additional characters considered to be
1396 | - --   part of a word. By default, word characters are alphanumerics and
1397 | - --   underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
1398 | - --   to indicate no additional word characters.
1399 | - -- @param case_insensitive Optional boolean flag indicating whether or not the
1400 | - --   word match is case-insensitive. The default is `false`.
1401 | - -- @return pattern
1402 | - -- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
1403 | - -- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
1404 | - --   'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
1405 | - -- @name word_match
1406 | - function M.word_match(words, word_chars, case_insensitive)
1407 | -   local word_list = {}
1408 | -   for _, word in ipairs(words) do
1409 | -     word_list[case_insensitive and word:lower() or word] = true
1410 | -   end
1411 | -   local chars = M.alnum + '_'
1412 | -   if word_chars then chars = chars + lpeg_S(word_chars) end
1413 | -   return lpeg_Cmt(chars^1, function(input, index, word)
1414 | -     if case_insensitive then word = word:lower() end
1415 | -     return word_list[word] and index or nil
1416 | -   end)
1417 | - end
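
In a lexer file this replaces a long ordered choice of literal words, e.g. (sketch, mirroring the `@usage` lines):

    local l = require('lexer')
    local token, word_match = l.token, l.word_match
    local keyword = token(l.KEYWORD, word_match({'if', 'then', 'else', 'end'}))
    -- Case-insensitive variant with '-' treated as a word character:
    local directive = token(l.KEYWORD, word_match({'end-if', 'else-if'}, '-', true))
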
1418 | -
1419 | - ---
1420 | - -- Embeds child lexer *child* in parent lexer *parent* using patterns
1421 | - -- *start_rule* and *end_rule*, which signal the beginning and end of the
1422 | - -- embedded lexer, respectively.
1423 | - -- @param parent The parent lexer.
1424 | - -- @param child The child lexer.
1425 | - -- @param start_rule The pattern that signals the beginning of the embedded
1426 | - --   lexer.
1427 | - -- @param end_rule The pattern that signals the end of the embedded lexer.
1428 | - -- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
1429 | - -- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
1430 | - -- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
1431 | - -- @name embed_lexer
1432 | - function M.embed_lexer(parent, child, start_rule, end_rule)
1433 | -   -- Add child rules.
1434 | -   if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
1435 | -   if not child._RULES then -- creating a child lexer to be embedded
1436 | -     if not child._rules then error('Cannot embed language with no rules') end
1437 | -     for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
1438 | -   end
1439 | -   child._EMBEDDEDRULES[parent._NAME] = {
1440 | -     ['start_rule'] = start_rule,
1441 | -     token_rule = join_tokens(child),
1442 | -     ['end_rule'] = end_rule
1443 | -   }
1444 | -   if not parent._CHILDREN then parent._CHILDREN = {} end
1445 | -   local children = parent._CHILDREN
1446 | -   children[#children + 1] = child
1447 | -   -- Add child styles.
1448 | -   if not parent._tokenstyles then parent._tokenstyles = {} end
1449 | -   local tokenstyles = parent._tokenstyles
1450 | -   tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
1451 | -   for token, style in pairs(child._tokenstyles or {}) do
1452 | -     tokenstyles[token] = style
1453 | -   end
1454 | -   child._lexer = parent -- use parent's tokens if child is embedding itself
1455 | -   parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
1456 | - end
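
A multilang lexer file typically loads the child, defines start and end token patterns, and embeds it. A hedged sketch (the `htmlish` name and the `<style>` tag patterns are illustrative, not from this gem; assumes a `css` lexer is installed):

    local l = require('lexer')
    local lpeg = require('lpeg')
    local M = {_NAME = 'htmlish'}
    M._rules = {{'whitespace', l.token(l.WHITESPACE, l.space^1)}}
    local css = l.load('css')
    local css_start = l.token('css_tag', lpeg.P('<style>'))
    local css_end = l.token('css_tag', lpeg.P('</style>'))
    M._tokenstyles = {css_tag = l.STYLE_EMBEDDED}
    l.embed_lexer(M, css, css_start, css_end)
    return M
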
1457 | -
1458 | - -- Determines if the previous line is a comment.
1459 | - -- This is used for determining if the current comment line is a fold point.
1460 | - -- @param prefix The prefix string defining a comment.
1461 | - -- @param text The text passed to a fold function.
1462 | - -- @param pos The pos passed to a fold function.
1463 | - -- @param line The line passed to a fold function.
1464 | - -- @param s The s passed to a fold function.
1465 | - local function prev_line_is_comment(prefix, text, pos, line, s)
1466 | -   local start = line:find('%S')
1467 | -   if start < s and not line:find(prefix, start, true) then return false end
1468 | -   local p = pos - 1
1469 | -   if text:sub(p, p) == '\n' then
1470 | -     p = p - 1
1471 | -     if text:sub(p, p) == '\r' then p = p - 1 end
1472 | -     if text:sub(p, p) ~= '\n' then
1473 | -       while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
1474 | -       while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1475 | -       return text:sub(p, p + #prefix - 1) == prefix
1476 | -     end
1477 | -   end
1478 | -   return false
1479 | - end
1480 | -
1481 | - -- Determines if the next line is a comment.
1482 | - -- This is used for determining if the current comment line is a fold point.
1483 | - -- @param prefix The prefix string defining a comment.
1484 | - -- @param text The text passed to a fold function.
1485 | - -- @param pos The pos passed to a fold function.
1486 | - -- @param line The line passed to a fold function.
1487 | - -- @param s The s passed to a fold function.
1488 | - local function next_line_is_comment(prefix, text, pos, line, s)
1489 | -   local p = text:find('\n', pos + s)
1490 | -   if p then
1491 | -     p = p + 1
1492 | -     while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
1493 | -     return text:sub(p, p + #prefix - 1) == prefix
1494 | -   end
1495 | -   return false
1496 | - end
1497 | -
1498 | - ---
1499 | - -- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
1500 | - -- that folds consecutive line comments that start with string *prefix*.
1501 | - -- @param prefix The prefix string defining a line comment.
1502 | - -- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
1503 | - -- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
1504 | - -- @name fold_line_comments
1505 | - function M.fold_line_comments(prefix)
1506 | -   local property_int = M.property_int
1507 | -   return function(text, pos, line, s)
1508 | -     if property_int['fold.line.comments'] == 0 then return 0 end
1509 | -     if s > 1 and line:match('^%s*()') < s then return 0 end
1510 | -     local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
1511 | -     local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
1512 | -     if not prev_line_comment and next_line_comment then return 1 end
1513 | -     if prev_line_comment and not next_line_comment then return -1 end
1514 | -     return 0
1515 | -   end
1516 | - end
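
Wired into a lexer's `_foldsymbols` table per the `@usage` lines, this might look like the following sketch for a Lua-style `--` comment (the `_patterns` entries are illustrative):

    M._foldsymbols = {
      _patterns = {'%l+', '%-%-'},
      [l.KEYWORD] = {['if'] = 1, ['end'] = -1},
      [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
    }
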
1517 | -
1518 | - M.property_expanded = setmetatable({}, {
1519 | -   -- Returns the string property value associated with string property *key*,
1520 | -   -- replacing any "$()" and "%()" expressions with the values of their keys.
1521 | -   __index = function(t, key)
1522 | -     return M.property[key]:gsub('[$%%]%b()', function(key)
1523 | -       return t[key:sub(3, -2)]
1524 | -     end)
1525 | -   end,
1526 | -   __newindex = function() error('read-only property') end
1527 | - })
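
For example, one style string can be derived from another through `$()` substitution (stand-alone sketch; `load` ensures `l.property` exists):

    local l = require('lexer')
    l.load('lua')
    l.property['style.comment'] = 'fore:#008000'
    l.property['style.doc'] = '$(style.comment),italics'
    print(l.property_expanded['style.doc'])  --> fore:#008000,italics
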
1528 | -
1529 | - --[[ The functions and fields below were defined in C.
1530 | -
1531 | - ---
1532 | - -- Individual fields for a lexer instance.
1533 | - -- @field _NAME The string name of the lexer.
1534 | - -- @field _rules An ordered list of rules for a lexer grammar.
1535 | - --   Each rule is a table containing an arbitrary rule name and the LPeg pattern
1536 | - --   associated with the rule. The order of rules is important as rules are
1537 | - --   matched sequentially.
1538 | - --   Child lexers should not use this table to access and/or modify their
1539 | - --   parent's rules and vice-versa. Use the `_RULES` table instead.
1540 | - -- @field _tokenstyles A map of non-predefined token names to styles.
1541 | - --   Remember to use token names, not rule names. It is recommended to use
1542 | - --   predefined styles or color-agnostic styles derived from predefined styles
1543 | - --   to ensure compatibility with user color themes.
1544 | - -- @field _foldsymbols A table of recognized fold points for the lexer.
1545 | - --   Keys are token names with table values defining fold points. Those table
1546 | - --   values have string keys of keywords or characters that indicate a fold
1547 | - --   point whose values are integers. A value of `1` indicates a beginning fold
1548 | - --   point and a value of `-1` indicates an ending fold point. Values can also
1549 | - --   be functions that return `1`, `-1`, or `0` (indicating no fold point) for
1550 | - --   keys which need additional processing.
1551 | - --   There is also a required `_patterns` key whose value is a table containing
1552 | - --   Lua pattern strings that match all fold points (the string keys contained
1553 | - --   in token name table values). When the lexer encounters text that matches
1554 | - --   one of those patterns, the matched text is looked up in its token's table
1555 | - --   to determine whether or not it is a fold point.
1556 | - -- @field _fold If this function exists in the lexer, it is called for folding
1557 | - --   the document instead of using `_foldsymbols` or indentation.
1558 | - -- @field _lexer The parent lexer object whose rules should be used. This field
1559 | - --   is only necessary to disambiguate a proxy lexer that loaded parent and
1560 | - --   child lexers for embedding and ended up having multiple parents loaded.
1561 | - -- @field _RULES A map of rule name keys with their associated LPeg pattern
1562 | - --   values for the lexer.
1563 | - --   This is constructed from the lexer's `_rules` table and accessible to other
1564 | - --   lexers for embedded lexer applications like modifying parent or child
1565 | - --   rules.
1566 | - -- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
1567 | - --   (instead of an arbitrary chunk of text) at a time.
1568 | - --   The default value is `false`. Line lexers cannot look ahead to subsequent
1569 | - --   lines.
1570 | - -- @class table
1571 | - -- @name lexer
1572 | - local lexer
1573 | - ]]
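
Taken together, those fields are the contract a lexer file fulfills. A minimal self-contained lexer honoring it might read (sketch; not shipped with this gem):

    local l = require('lexer')
    local token = l.token
    local M = {_NAME = 'mini'}
    local ws = token(l.WHITESPACE, l.space^1)
    local comment = token(l.COMMENT, '#' * l.nonnewline^0)
    local number = token(l.NUMBER, l.float + l.integer)
    M._rules = {
      {'whitespace', ws},
      {'comment', comment},
      {'number', number},
    }
    return M
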
1574 | -
1575 | - return M