immunio 0.15.2
- checksums.yaml +7 -0
- data/LICENSE +234 -0
- data/README.md +147 -0
- data/bin/immunio +5 -0
- data/lib/immunio.rb +29 -0
- data/lib/immunio/agent.rb +260 -0
- data/lib/immunio/authentication.rb +96 -0
- data/lib/immunio/blocked_app.rb +38 -0
- data/lib/immunio/channel.rb +432 -0
- data/lib/immunio/cli.rb +39 -0
- data/lib/immunio/context.rb +114 -0
- data/lib/immunio/errors.rb +43 -0
- data/lib/immunio/immunio_ca.crt +45 -0
- data/lib/immunio/logger.rb +87 -0
- data/lib/immunio/plugins/action_dispatch.rb +45 -0
- data/lib/immunio/plugins/action_view.rb +431 -0
- data/lib/immunio/plugins/active_record.rb +707 -0
- data/lib/immunio/plugins/active_record_relation.rb +370 -0
- data/lib/immunio/plugins/authlogic.rb +80 -0
- data/lib/immunio/plugins/csrf.rb +24 -0
- data/lib/immunio/plugins/devise.rb +40 -0
- data/lib/immunio/plugins/environment_reporter.rb +69 -0
- data/lib/immunio/plugins/eval.rb +51 -0
- data/lib/immunio/plugins/exception_handler.rb +55 -0
- data/lib/immunio/plugins/gems_tracker.rb +5 -0
- data/lib/immunio/plugins/haml.rb +36 -0
- data/lib/immunio/plugins/http_finisher.rb +50 -0
- data/lib/immunio/plugins/http_tracker.rb +203 -0
- data/lib/immunio/plugins/io.rb +96 -0
- data/lib/immunio/plugins/redirect.rb +42 -0
- data/lib/immunio/plugins/warden.rb +66 -0
- data/lib/immunio/processor.rb +234 -0
- data/lib/immunio/rails.rb +26 -0
- data/lib/immunio/request.rb +139 -0
- data/lib/immunio/rufus_lua_ext/ref.rb +27 -0
- data/lib/immunio/rufus_lua_ext/state.rb +157 -0
- data/lib/immunio/rufus_lua_ext/table.rb +137 -0
- data/lib/immunio/rufus_lua_ext/utils.rb +13 -0
- data/lib/immunio/version.rb +5 -0
- data/lib/immunio/vm.rb +291 -0
- data/lua-hooks/ext/all.c +78 -0
- data/lua-hooks/ext/bitop/README +22 -0
- data/lua-hooks/ext/bitop/bit.c +189 -0
- data/lua-hooks/ext/extconf.rb +38 -0
- data/lua-hooks/ext/libinjection/COPYING +37 -0
- data/lua-hooks/ext/libinjection/libinjection.h +65 -0
- data/lua-hooks/ext/libinjection/libinjection_html5.c +847 -0
- data/lua-hooks/ext/libinjection/libinjection_html5.h +54 -0
- data/lua-hooks/ext/libinjection/libinjection_sqli.c +2301 -0
- data/lua-hooks/ext/libinjection/libinjection_sqli.h +295 -0
- data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +9349 -0
- data/lua-hooks/ext/libinjection/libinjection_xss.c +531 -0
- data/lua-hooks/ext/libinjection/libinjection_xss.h +21 -0
- data/lua-hooks/ext/libinjection/lualib.c +109 -0
- data/lua-hooks/ext/lpeg/HISTORY +90 -0
- data/lua-hooks/ext/lpeg/lpcap.c +537 -0
- data/lua-hooks/ext/lpeg/lpcap.h +43 -0
- data/lua-hooks/ext/lpeg/lpcode.c +986 -0
- data/lua-hooks/ext/lpeg/lpcode.h +34 -0
- data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
- data/lua-hooks/ext/lpeg/lpeg.html +1429 -0
- data/lua-hooks/ext/lpeg/lpprint.c +244 -0
- data/lua-hooks/ext/lpeg/lpprint.h +35 -0
- data/lua-hooks/ext/lpeg/lptree.c +1238 -0
- data/lua-hooks/ext/lpeg/lptree.h +77 -0
- data/lua-hooks/ext/lpeg/lptypes.h +149 -0
- data/lua-hooks/ext/lpeg/lpvm.c +355 -0
- data/lua-hooks/ext/lpeg/lpvm.h +58 -0
- data/lua-hooks/ext/lpeg/makefile +55 -0
- data/lua-hooks/ext/lpeg/re.html +498 -0
- data/lua-hooks/ext/lpeg/test.lua +1409 -0
- data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +45 -0
- data/lua-hooks/ext/lua-cmsgpack/README.md +115 -0
- data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +957 -0
- data/lua-hooks/ext/lua-cmsgpack/test.lua +570 -0
- data/lua-hooks/ext/lua-snapshot/LICENSE +7 -0
- data/lua-hooks/ext/lua-snapshot/Makefile +12 -0
- data/lua-hooks/ext/lua-snapshot/README.md +18 -0
- data/lua-hooks/ext/lua-snapshot/dump.lua +15 -0
- data/lua-hooks/ext/lua-snapshot/snapshot.c +455 -0
- data/lua-hooks/ext/lua/COPYRIGHT +34 -0
- data/lua-hooks/ext/lua/lapi.c +1087 -0
- data/lua-hooks/ext/lua/lapi.h +16 -0
- data/lua-hooks/ext/lua/lauxlib.c +652 -0
- data/lua-hooks/ext/lua/lauxlib.h +174 -0
- data/lua-hooks/ext/lua/lbaselib.c +659 -0
- data/lua-hooks/ext/lua/lcode.c +831 -0
- data/lua-hooks/ext/lua/lcode.h +76 -0
- data/lua-hooks/ext/lua/ldblib.c +398 -0
- data/lua-hooks/ext/lua/ldebug.c +638 -0
- data/lua-hooks/ext/lua/ldebug.h +33 -0
- data/lua-hooks/ext/lua/ldo.c +519 -0
- data/lua-hooks/ext/lua/ldo.h +57 -0
- data/lua-hooks/ext/lua/ldump.c +164 -0
- data/lua-hooks/ext/lua/lfunc.c +174 -0
- data/lua-hooks/ext/lua/lfunc.h +34 -0
- data/lua-hooks/ext/lua/lgc.c +710 -0
- data/lua-hooks/ext/lua/lgc.h +110 -0
- data/lua-hooks/ext/lua/linit.c +38 -0
- data/lua-hooks/ext/lua/liolib.c +556 -0
- data/lua-hooks/ext/lua/llex.c +463 -0
- data/lua-hooks/ext/lua/llex.h +81 -0
- data/lua-hooks/ext/lua/llimits.h +128 -0
- data/lua-hooks/ext/lua/lmathlib.c +263 -0
- data/lua-hooks/ext/lua/lmem.c +86 -0
- data/lua-hooks/ext/lua/lmem.h +49 -0
- data/lua-hooks/ext/lua/loadlib.c +705 -0
- data/lua-hooks/ext/lua/loadlib_rel.c +760 -0
- data/lua-hooks/ext/lua/lobject.c +214 -0
- data/lua-hooks/ext/lua/lobject.h +381 -0
- data/lua-hooks/ext/lua/lopcodes.c +102 -0
- data/lua-hooks/ext/lua/lopcodes.h +268 -0
- data/lua-hooks/ext/lua/loslib.c +243 -0
- data/lua-hooks/ext/lua/lparser.c +1339 -0
- data/lua-hooks/ext/lua/lparser.h +82 -0
- data/lua-hooks/ext/lua/lstate.c +214 -0
- data/lua-hooks/ext/lua/lstate.h +169 -0
- data/lua-hooks/ext/lua/lstring.c +111 -0
- data/lua-hooks/ext/lua/lstring.h +31 -0
- data/lua-hooks/ext/lua/lstrlib.c +871 -0
- data/lua-hooks/ext/lua/ltable.c +588 -0
- data/lua-hooks/ext/lua/ltable.h +40 -0
- data/lua-hooks/ext/lua/ltablib.c +287 -0
- data/lua-hooks/ext/lua/ltm.c +75 -0
- data/lua-hooks/ext/lua/ltm.h +54 -0
- data/lua-hooks/ext/lua/lua.c +392 -0
- data/lua-hooks/ext/lua/lua.def +131 -0
- data/lua-hooks/ext/lua/lua.h +388 -0
- data/lua-hooks/ext/lua/lua.rc +28 -0
- data/lua-hooks/ext/lua/lua_dll.rc +26 -0
- data/lua-hooks/ext/lua/luac.c +200 -0
- data/lua-hooks/ext/lua/luac.rc +1 -0
- data/lua-hooks/ext/lua/luaconf.h +763 -0
- data/lua-hooks/ext/lua/luaconf.h.in +724 -0
- data/lua-hooks/ext/lua/luaconf.h.orig +763 -0
- data/lua-hooks/ext/lua/lualib.h +53 -0
- data/lua-hooks/ext/lua/lundump.c +227 -0
- data/lua-hooks/ext/lua/lundump.h +36 -0
- data/lua-hooks/ext/lua/lvm.c +767 -0
- data/lua-hooks/ext/lua/lvm.h +36 -0
- data/lua-hooks/ext/lua/lzio.c +82 -0
- data/lua-hooks/ext/lua/lzio.h +67 -0
- data/lua-hooks/ext/lua/print.c +227 -0
- data/lua-hooks/ext/luautf8/README.md +152 -0
- data/lua-hooks/ext/luautf8/lutf8lib.c +1274 -0
- data/lua-hooks/ext/luautf8/unidata.h +3064 -0
- data/lua-hooks/lib/boot.lua +254 -0
- data/lua-hooks/lib/encode.lua +4 -0
- data/lua-hooks/lib/lexers/LICENSE +21 -0
- data/lua-hooks/lib/lexers/bash.lua +134 -0
- data/lua-hooks/lib/lexers/bash_dqstr.lua +62 -0
- data/lua-hooks/lib/lexers/css.lua +216 -0
- data/lua-hooks/lib/lexers/html.lua +106 -0
- data/lua-hooks/lib/lexers/javascript.lua +68 -0
- data/lua-hooks/lib/lexers/lexer.lua +1575 -0
- data/lua-hooks/lib/lexers/markers.lua +33 -0
- metadata +308 -0
+++ data/lua-hooks/lib/lexers/css.lua
@@ -0,0 +1,216 @@
+-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
+-- CSS LPeg lexer.
+
+local l = require('lexer')
+local token, word_match = l.token, l.word_match
+local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+
+local M = {_NAME = 'css'}
+
+-- Whitespace.
+local ws = token(l.WHITESPACE, l.space^1)
+
+-- Comments.
+local comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
+
+-- Strings.
+local sq_str = l.delimited_range("'")
+local dq_str = l.delimited_range('"')
+local string = token(l.STRING, sq_str + dq_str)
+
+-- Numbers.
+local number = token(l.NUMBER, l.digit^1)
+
+-- Keywords.
+local css1_property = word_match({
+  'color', 'background-color', 'background-image', 'background-repeat',
+  'background-attachment', 'background-position', 'background', 'font-family',
+  'font-style', 'font-variant', 'font-weight', 'font-size', 'font',
+  'word-spacing', 'letter-spacing', 'text-decoration', 'vertical-align',
+  'text-transform', 'text-align', 'text-indent', 'line-height', 'margin-top',
+  'margin-right', 'margin-bottom', 'margin-left', 'margin', 'padding-top',
+  'padding-right', 'padding-bottom', 'padding-left', 'padding',
+  'border-top-width', 'border-right-width', 'border-bottom-width',
+  'border-left-width', 'border-width', 'border-top', 'border-right',
+  'border-bottom', 'border-left', 'border', 'border-color', 'border-style',
+  'width', 'height', 'float', 'clear', 'display', 'white-space',
+  'list-style-type', 'list-style-image', 'list-style-position', 'list-style'
+}, '-')
+local css1_value = word_match({
+  'auto', 'none', 'normal', 'italic', 'oblique', 'small-caps', 'bold', 'bolder',
+  'lighter', 'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large',
+  'xx-large', 'larger', 'smaller', 'transparent', 'repeat', 'repeat-x',
+  'repeat-y', 'no-repeat', 'scroll', 'fixed', 'top', 'bottom', 'left', 'center',
+  'right', 'justify', 'both', 'underline', 'overline', 'line-through', 'blink',
+  'baseline', 'sub', 'super', 'text-top', 'middle', 'text-bottom', 'capitalize',
+  'uppercase', 'lowercase', 'thin', 'medium', 'thick', 'dotted', 'dashed',
+  'solid', 'double', 'groove', 'ridge', 'inset', 'outset', 'block', 'inline',
+  'list-item', 'pre', 'no-wrap', 'inside', 'outside', 'disc', 'circle',
+  'square', 'decimal', 'lower-roman', 'upper-roman', 'lower-alpha',
+  'upper-alpha', 'aqua', 'black', 'blue', 'fuchsia', 'gray', 'green', 'lime',
+  'maroon', 'navy', 'olive', 'purple', 'red', 'silver', 'teal', 'white',
+  'yellow'
+}, '-')
+local css2_property = word_match({
+  'border-top-color', 'border-right-color', 'border-bottom-color',
+  'border-left-color', 'border-color', 'border-top-style', 'border-right-style',
+  'border-bottom-style', 'border-left-style', 'border-style', 'top', 'right',
+  'bottom', 'left', 'position', 'z-index', 'direction', 'unicode-bidi',
+  'min-width', 'max-width', 'min-height', 'max-height', 'overflow', 'clip',
+  'visibility', 'content', 'quotes', 'counter-reset', 'counter-increment',
+  'marker-offset', 'size', 'marks', 'page-break-before', 'page-break-after',
+  'page-break-inside', 'page', 'orphans', 'widows', 'font-stretch',
+  'font-size-adjust', 'unicode-range', 'units-per-em', 'src', 'panose-1',
+  'stemv', 'stemh', 'slope', 'cap-height', 'x-height', 'ascent', 'descent',
+  'widths', 'bbox', 'definition-src', 'baseline', 'centerline', 'mathline',
+  'topline', 'text-shadow', 'caption-side', 'table-layout', 'border-collapse',
+  'border-spacing', 'empty-cells', 'speak-header', 'cursor', 'outline',
+  'outline-width', 'outline-style', 'outline-color', 'volume', 'speak',
+  'pause-before', 'pause-after', 'pause', 'cue-before', 'cue-after', 'cue',
+  'play-during', 'azimuth', 'elevation', 'speech-rate', 'voice-family', 'pitch',
+  'pitch-range', 'stress', 'richness', 'speak-punctuation', 'speak-numeral'
+}, '-')
+local css2_value = word_match({
+  'inherit', 'run-in', 'compact', 'marker', 'table', 'inline-table',
+  'table-row-group', 'table-header-group', 'table-footer-group', 'table-row',
+  'table-column-group', 'table-column', 'table-cell', 'table-caption', 'static',
+  'relative', 'absolute', 'fixed', 'ltr', 'rtl', 'embed', 'bidi-override',
+  'visible', 'hidden', 'scroll', 'collapse', 'open-quote', 'close-quote',
+  'no-open-quote', 'no-close-quote', 'decimal-leading-zero', 'lower-greek',
+  'lower-latin', 'upper-latin', 'hebrew', 'armenian', 'georgian',
+  'cjk-ideographic', 'hiragana', 'katakana', 'hiragana-iroha', 'katakana-iroha',
+  'landscape', 'portrait', 'crop', 'cross', 'always', 'avoid', 'wider',
+  'narrower', 'ultra-condensed', 'extra-condensed', 'condensed',
+  'semi-condensed', 'semi-expanded', 'expanded', 'extra-expanded',
+  'ultra-expanded', 'caption', 'icon', 'menu', 'message-box', 'small-caption',
+  'status-bar', 'separate', 'show', 'hide', 'once', 'crosshair', 'default',
+  'pointer', 'move', 'text', 'wait', 'help', 'e-resize', 'ne-resize',
+  'nw-resize', 'n-resize', 'se-resize', 'sw-resize', 's-resize', 'w-resize',
+  'ActiveBorder', 'ActiveCaption', 'AppWorkspace', 'Background', 'ButtonFace',
+  'ButtonHighlight', 'ButtonShadow', 'InactiveCaptionText', 'ButtonText',
+  'CaptionText', 'GrayText', 'Highlight', 'HighlightText', 'InactiveBorder',
+  'InactiveCaption', 'InfoBackground', 'InfoText', 'Menu', 'MenuText',
+  'Scrollbar', 'ThreeDDarkShadow', 'ThreeDFace', 'ThreeDHighlight',
+  'ThreeDLightShadow', 'ThreeDShadow', 'Window', 'WindowFrame', 'WindowText',
+  'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud', 'spell-out', 'mix',
+  'left-side', 'far-left', 'center-left', 'center-right', 'far-right',
+  'right-side', 'behind', 'leftwards', 'rightwards', 'below', 'level', 'above',
+  'higher', 'lower', 'x-slow', 'slow', 'medium', 'fast', 'x-fast', 'faster',
+  'slower', 'male', 'female', 'child', 'x-low', 'low', 'high', 'x-high', 'code',
+  'digits', 'continous'
+}, '-')
+
+local css3_property = word_match({
+  'align-content', 'align-items', 'align-self', 'alignment-adjust',
+  'alignment-baseline', 'all', 'anchor-point', 'animation', 'animation-delay',
+  'animation-direction', 'animation-duration', 'animation-fill-mode',
+  'animation-iteration-count', 'animation-name', 'animation-play-state',
+  'animation-timing-function', 'backface-visibility', 'background-clip',
+  'background-origin', 'background-size', 'baseline-shift', 'binding', 'bleed',
+  'bookmark-label', 'bookmark-level', 'bookmark-state', 'border-bottom-left-radius',
+  'border-bottom-right-radius', 'border-image', 'border-image-outset',
+  'border-image-repeat', 'border-image-slice', 'border-image-source',
+  'border-image-width', 'border-radius', 'border-top-left-radius',
+  'border-top-right-radius', 'box-decoration-break', 'box-shadow', 'box-sizing',
+  'box-snap', 'box-suppress', 'break-after', 'break-before', 'break-inside',
+  'chains', 'clip-path', 'clip-rule', 'color-interpolation-filters', 'column-count',
+  'column-fill', 'column-gap', 'column-rule', 'column-rule-color', 'column-rule-style',
+  'column-rule-width', 'column-span', 'column-width', 'columns', 'contain',
+  'counter-set', 'crop', 'display-inside', 'display-list', 'display-outside',
+  'dominant-baseline', 'filter', 'flex', 'flex-basis', 'flex-direction', 'flex-flow',
+  'flex-grow', 'flex-shrink', 'flex-wrap', 'float-offset', 'flood-color',
+  'flood-opacity', 'flow-from', 'flow-into', 'font-feature-settings', 'font-kerning',
+  'font-language-override', 'font-synthesis', 'font-variant-alternates',
+  'font-variant-caps', 'font-variant-east-asian', 'font-variant-ligatures',
+  'font-variant-numeric', 'font-variant-position', 'grid', 'grid-area',
+  'grid-auto-columns', 'grid-auto-flow', 'grid-auto-rows', 'grid-column',
+  'grid-column-end', 'grid-column-start', 'grid-row', 'grid-row-end', 'grid-row-start',
+  'grid-template', 'grid-template-areas', 'grid-template-columns', 'grid-template-rows',
+  'hanging-punctuation', 'hyphens', 'icon', 'image-orientation', 'image-resolution',
+  'ime-mode', 'initial-letters', 'inline-box-align', 'justify-content', 'justify-items',
+  'justify-self', 'lighting-color', 'line-box-contain', 'line-break', 'line-grid',
+  'line-snap', 'line-stacking', 'line-stacking-ruby', 'line-stacking-shift',
+  'line-stacking-strategy', 'marker-side', 'mask', 'mask-box', 'mask-box-outset',
+  'mask-box-repeat', 'mask-box-slice', 'mask-box-source', 'mask-box-width',
+  'mask-clip', 'mask-image', 'mask-origin', 'mask-position', 'mask-repeat', 'mask-size',
+  'mask-source-type', 'mask-type', 'max-lines', 'move-to', 'nav-down', 'nav-index',
+  'nav-left', 'nav-right', 'nav-up', 'object-fit', 'object-position', 'opacity',
+  'order', 'outline-offset', 'overflow-wrap', 'overflow-x', 'overflow-y', 'page-policy',
+  'perspective', 'perspective-origin', 'presentation-level', 'region-fragment',
+  'resize', 'rest', 'rest-after', 'rest-before', 'rotation', 'rotation-point',
+  'ruby-align', 'ruby-merge', 'ruby-position', 'shape-image-threshold', 'shape-outside',
+  'shape-margin', 'speak-as', 'string-set', 'tab-size', 'text-align-last',
+  'text-combine-upright', 'text-decoration-color', 'text-decoration-line',
+  'text-decoration-skip', 'text-decoration-style', 'text-emphasis', 'text-emphasis-color',
+  'text-emphasis-color', 'text-emphasis-style', 'text-height', 'text-justify',
+  'text-orientation', 'text-overflow', 'text-space-collapse', 'text-underline-position',
+  'text-wrap', 'transform', 'transform-origin', 'transform-style', 'transition',
+  'transition-delay', 'transition-duration', 'transition-property',
+  'transition-timing-function', 'voice-balance', 'voice-duration', 'voice-pitch',
+  'voice-range', 'voice-rate', 'voice-stress', 'voice-volume', 'will-change',
+  'word-break', 'word-wrap', 'wrap-flow', 'wrap-through', 'writing-mode',
+})
+
+
+local property = token('property', css1_property + css2_property + css3_property)
+local value = token('value', css1_value + css2_value)
+local keyword = property + value
+
+-- Identifiers.
+local identifier = token(l.IDENTIFIER, l.alpha * (l.alnum + S('_-'))^0)
+
+-- Operators.
+local operator = token(l.OPERATOR, S('~!#*>+=|.,:;()[]{}'))
+
+-- At rule.
+local at_rule = token('at_rule', P('@') * word_match{
+  'charset', 'font-face', 'media', 'page', 'import'
+})
+
+-- Colors.
+local xdigit = l.xdigit
+local hex_color = '#' * xdigit * xdigit * xdigit * (xdigit * xdigit * xdigit)^-1
+local color_name = word_match{
+  'aqua', 'black', 'blue', 'fuchsia', 'gray', 'green', 'lime', 'maroon', 'navy',
+  'olive', 'orange', 'purple', 'red', 'silver', 'teal', 'white', 'yellow'
+}
+local color = token('color', hex_color + color_name)
+
+-- Pseudo.
+local pseudo = token(l.CONSTANT, word_match({
+  -- Pseudo elements.
+  'first-line', 'first-letter', 'before', 'after',
+  -- Pseudo classes.
+  'first-child', 'link', 'visited', 'hover', 'active', 'focus', 'lang',
+}, '-'))
+
+-- Units.
+local unit = token('unit', word_match{
+  'em', 'ex', 'px', 'pt', 'pc', 'in', 'ft', 'mm', 'cm', 'kHz', 'Hz', 'deg',
+  'rad', 'grad', 'ms', 's'
+} + '%')
+
+-- Immunio marker
+local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
+
+M._rules = {
+  {'whitespace', ws},
+  {'marker', marker},
+  {'keyword', keyword},
+  {'pseudo', pseudo},
+  {'color', color},
+  {'identifier', identifier},
+  {'string', string},
+  {'comment', comment},
+  {'number', number * unit^-1},
+  {'operator', operator},
+  {'at_rule', at_rule},
+}
+
+M._tokenstyles = {
+}
+
+M._foldsymbols = {
+}
+
+return M
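The `marker` rule above is the Immunio-specific addition to the stock Scintillua CSS lexer: it recognizes `{immunio-var:N:HEX}` placeholders, which appear to stand in for interpolated variables during the agent's XSS analysis of rendered output. A minimal standalone sketch of the same pattern, assuming only plain lpeg and simplifying `l.integer` to decimal digits:

    local lpeg = require('lpeg')
    local P, R = lpeg.P, lpeg.R
    local integer = R('09')^1            -- simplified stand-in for l.integer
    local xdigit = R('09', 'af', 'AF')   -- hex digit, as in l.xdigit
    local marker = P('{immunio-var:') * integer * ':' * xdigit^1 * '}'
    assert(marker:match('{immunio-var:12:beef}'))    -- well-formed placeholder
    assert(not marker:match('{immunio-var:12:zz}'))  -- 'zz' is not hex; no match

The same marker rule reappears in the JavaScript lexer below, so placeholders are tokenized uniformly in either context.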
+++ data/lua-hooks/lib/lexers/html.lua
@@ -0,0 +1,106 @@
+-- Copyright (C) 2015 Immunio, Inc.
+
+-- HTML: Simple h5 like HTML lexer for Immun.io.
+
+-- NOTE: not covered by Scintillua MIT license in this directory.
+
+local l = require('lexer')
+local token, parent_token, word_match = l.token, l.parent_token, l.word_match
+local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+
+local M = {_NAME = 'html'}
+
+local case_insensitive_tags = true
+
+-- Whitespace.
+local ws = l.space^1
+-- This is broad to both accept our placeholders and be very liberal about what may be
+-- interpreted as an attribute to ensure we escape attributes fairly aggressively.
+local element_chars = (l.any - '<' - '>' - '=' - '"' - "'" - ws)^1
+-- Comments.
+local comment = token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->')^-1)
+-- XXX add h5 bogus comment 1 and bogus comment 2?
+
+-- Strings.
+local sq_str = l.delimited_range("'")
+local dq_str = l.delimited_range('"')
+local string = sq_str + dq_str
+
+-- Attributes. Individual recognition is handled in our XSS processing code.
+local attr_name = token('attr_name', element_chars - '=')
+local attr_value = token('attr_value', string + element_chars)
+local attribute = parent_token('attribute', attr_name * '=' * attr_value)
+
+-- Tags.
+local tag_name = token('tag_name', element_chars - '/')
+local tag_data = token('tag_data', (l.any - l.space - '>')^1 ) -- crap in a tag
+
+-- XXX how should we handle void tags... right now they are an unmatched tag_open
+local tag_open = parent_token('tag_open', P('<') * tag_name * ( (ws * attribute) + ( tag_data ) + ws )^0 * (P('>') + '/>') )
+local tag_close = parent_token('tag_close', P('</') * tag_name * ( ( tag_data ) + ws )^0 * '>')
+
+-- Special case for script and style tags.
+local style_tag_name = token("tag_name", word_match({'style'}, nil, case_insensitive_tags))
+local style_tag_open = parent_token("tag_open", P('<') * style_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
+local style_tag_close = parent_token("tag_close", P('</') * style_tag_name * tag_data^0 * '>')
+local style_data = token("style_data", (l.any - style_tag_close)^0)
+local style_tag = parent_token('style_tag', style_tag_open * style_data * style_tag_close)
+
+local script_tag_name = token("tag_name", word_match({'script'}, nil, case_insensitive_tags))
+local script_tag_open = parent_token("tag_open", P('<') * script_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
+local script_tag_close = parent_token("tag_close", P('</') * script_tag_name * tag_data^0 * '>')
+local script_data = token("script_data", (l.any - script_tag_close)^0)
+local script_tag = parent_token('script_tag', script_tag_open * script_data * script_tag_close)
+
+-- Top level rules
+
+-- Note: the ordering is important here as <script> and <style> have to supercede tag_open...
+local tag = style_tag + script_tag + tag_open + tag_close
+
+-- Entities.
+local entity = token('entity', '&' * (l.any - l.space - ';' - '<' - '>' - "'" - '"' - "/" )^1 * ';')
+
+-- Doctype.
+local doctype = token('doctype', '<!' *
+  word_match({'doctype'}, nil, case_insensitive_tags) *
+  (l.any - '>')^1 * '>')
+
+-- Data between tags
+local data = token('data', (l.any - '<')^1)
+
+M._rules = {
+  {'comment', comment},
+  {'doctype', doctype},
+  {'tag', tag},
+  {'entity', entity},
+  {'data', data},
+}
+
+M._tokenstyles = {
+}
+
+M._foldsymbols = {
+}
+
+M.unlex_rules = {
+  ["tag_open"] = {
+    ["prefix"] = "<",
+    ["suffix"] = ">",
+  },
+  ["tag_close"] = {
+    ["prefix"] = "</",
+    ["suffix"] = ">",
+  },
+  ["attribute"] = {
+    ["prefix"] = " ",
+  },
+  ["tag_data"] = {
+    ["prefix"] = " ",
+  },
+  ["attr_name"] = {
+    ["suffix"] = "=",
+  },
+}


+return M
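The ordering note in the `tag` rule above is load-bearing: `+` is LPeg's ordered choice, so the specific `style_tag`/`script_tag` rules must get the first chance to match before the generic `tag_open` consumes the opening tag. A small sketch of the pitfall, assuming plain lpeg and hypothetical simplified patterns rather than the lexer's actual rules:

    local lpeg = require('lpeg')
    local P, Cc = lpeg.P, lpeg.Cc
    local generic = P('<') * (1 - P('>'))^1 * P('>') * Cc('generic')
    local style   = P('<style>') * Cc('style')
    -- Specific alternative first: '<style>' is classified as a style tag.
    assert((style + generic):match('<style>') == 'style')
    -- Generic alternative first: it consumes '<style>' before the style rule
    -- ever runs, so the more specific classification is lost.
    assert((generic + style):match('<style>') == 'generic')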
+++ data/lua-hooks/lib/lexers/javascript.lua
@@ -0,0 +1,68 @@
+-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
+-- JavaScript LPeg lexer.
+
+local l = require('lexer')
+local token, word_match = l.token, l.word_match
+local P, R, S = lpeg.P, lpeg.R, lpeg.S
+
+local M = {_NAME = 'javascript'}
+
+-- Whitespace.
+local ws = token(l.WHITESPACE, l.space^1)
+
+-- Comments.
+local line_comment = '//' * l.nonnewline_esc^0
+local block_comment = '/*' * (l.any - '*/')^0 * P('*/')^-1
+local comment = token(l.COMMENT, line_comment + block_comment)
+
+-- Strings.
+local sq_str = l.delimited_range("'")
+local dq_str = l.delimited_range('"')
+local regex = token( "regex", l.last_char_includes('+-*%^!=&|?:;,([{<>') *
+  l.delimited_range('/', true) * S('igm')^0 )
+local string = token(l.STRING, sq_str + dq_str) --+ token(l.REGEX, regex_str)
+
+-- Numbers.
+local number = token(l.NUMBER, l.float + l.integer)
+
+-- Keywords.
+local keyword = token(l.KEYWORD, word_match{
+  'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
+  'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
+  'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
+  'function', 'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
+  'interface', 'let', 'long', 'native', 'new', 'null', 'package', 'private',
+  'protected', 'public', 'return', 'short', 'static', 'super', 'switch',
+  'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
+  'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield'
+})
+
+-- Identifiers.
+local identifier = token(l.IDENTIFIER, l.word)
+
+-- Operators.
+local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))
+
+-- Immunio marker
+local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
+
+
+M._rules = {
+  {'whitespace', ws},
+  {'marker', marker},
+  {'keyword', keyword},
+  {'identifier', identifier},
+  {'comment', comment},
+  {'number', number},
+  {'string', string},
+  {'regex', regex},
+  {'operator', operator},
+}
+
+M._foldsymbols = {
+  _patterns = {'[{}]', '/%*', '%*/', '//'},
+  [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
+  [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')}
+}
+
+return M
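The `regex` rule above addresses JavaScript's classic `/` ambiguity: a slash begins a regex literal only where an expression is expected (after an operator or opening bracket, per the character set passed to `l.last_char_includes`), and is the division operator elsewhere. A rough standalone sketch of the idea, assuming plain lpeg and a hypothetical `last_char_in` helper rather than the lexer module's implementation:

    local lpeg = require('lpeg')
    local P = lpeg.P
    -- Succeed without consuming input iff the previous non-space character
    -- falls in the given Lua character class.
    local function last_char_in(class)
      return P(function(input, index)
        if index == 1 then return index end
        local i = index - 1
        while i > 1 and input:sub(i, i):match('%s') do i = i - 1 end
        if input:sub(i, i):match(class) then return index end
      end)
    end
    local regex = last_char_in('[=(,]') * P('/') * (1 - P('/'))^1 * P('/')
    assert(lpeg.match(P('x = ') * regex, 'x = /ab/'))  -- '/' after '=': regex
    assert(not lpeg.match(P('x ') * regex, 'x / 2'))   -- '/' after 'x': division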
@@ -0,0 +1,1575 @@
|
|
1
|
+
-- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
|
2
|
+
|
3
|
+
local M = {}
|
4
|
+
|
5
|
+
--[=[ This comment is for LuaDoc.
|
6
|
+
---
|
7
|
+
-- Lexes Scintilla documents with Lua and LPeg.
|
8
|
+
--
|
9
|
+
-- ## Overview
|
10
|
+
--
|
11
|
+
-- Lexers highlight the syntax of source code. Scintilla (the editing component
|
12
|
+
-- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
|
13
|
+
-- lexers which are notoriously difficult to create and/or extend. On the other
|
14
|
+
-- hand, Lua makes it easy to to rapidly create new lexers, extend existing
|
15
|
+
-- ones, and embed lexers within one another. Lua lexers tend to be more
|
16
|
+
-- readable than C++ lexers too.
|
17
|
+
--
|
18
|
+
-- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
|
19
|
+
-- [LPeg library][]. The following table comes from the LPeg documentation and
|
20
|
+
-- summarizes all you need to know about constructing basic LPeg patterns. This
|
21
|
+
-- module provides convenience functions for creating and working with other
|
22
|
+
-- more advanced patterns and concepts.
|
23
|
+
--
|
24
|
+
-- Operator | Description
|
25
|
+
-- ---------------------|------------
|
26
|
+
-- `lpeg.P(string)` | Matches `string` literally.
|
27
|
+
-- `lpeg.P(`_`n`_`)` | Matches exactly _`n`_ characters.
|
28
|
+
-- `lpeg.S(string)` | Matches any character in set `string`.
|
29
|
+
-- `lpeg.R("`_`xy`_`")` | Matches any character between range `x` and `y`.
|
30
|
+
-- `patt^`_`n`_ | Matches at least _`n`_ repetitions of `patt`.
|
31
|
+
-- `patt^-`_`n`_ | Matches at most _`n`_ repetitions of `patt`.
|
32
|
+
-- `patt1 * patt2` | Matches `patt1` followed by `patt2`.
|
33
|
+
-- `patt1 + patt2` | Matches `patt1` or `patt2` (ordered choice).
|
34
|
+
-- `patt1 - patt2` | Matches `patt1` if `patt2` does not match.
|
35
|
+
-- `-patt` | Equivalent to `("" - patt)`.
|
36
|
+
-- `#patt` | Matches `patt` but consumes no input.
|
37
|
+
--
|
38
|
+
-- The first part of this document deals with rapidly constructing a simple
|
39
|
+
-- lexer. The next part deals with more advanced techniques, such as custom
|
40
|
+
-- coloring and embedding lexers within one another. Following that is a
|
41
|
+
-- discussion about code folding, or being able to tell Scintilla which code
|
42
|
+
-- blocks are "foldable" (temporarily hideable from view). After that are
|
43
|
+
-- instructions on how to use LPeg lexers with the aforementioned Textadept and
|
44
|
+
-- SciTE editors. Finally there are comments on lexer performance and
|
45
|
+
-- limitations.
|
46
|
+
--
|
47
|
+
-- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
|
48
|
+
-- [Textadept]: http://foicica.com/textadept
|
49
|
+
-- [SciTE]: http://scintilla.org/SciTE.html
|
50
|
+
--
|
51
|
+
-- ## Lexer Basics
|
52
|
+
--
|
53
|
+
-- The *lexers/* directory contains all lexers, including your new one. Before
|
54
|
+
-- attempting to write one from scratch though, first determine if your
|
55
|
+
-- programming language is similar to any of the 80+ languages supported. If so,
|
56
|
+
-- you may be able to copy and modify that lexer, saving some time and effort.
|
57
|
+
-- The filename of your lexer should be the name of your programming language in
|
58
|
+
-- lower case followed by a *.lua* extension. For example, a new Lua lexer has
|
59
|
+
-- the name *lua.lua*.
|
60
|
+
--
|
61
|
+
-- Note: Try to refrain from using one-character language names like "b", "c",
|
62
|
+
-- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
|
63
|
+
-- respectively.
|
64
|
+
--
|
65
|
+
-- ### New Lexer Template
|
66
|
+
--
|
67
|
+
-- There is a *lexers/template.txt* file that contains a simple template for a
|
68
|
+
-- new lexer. Feel free to use it, replacing the '?'s with the name of your
|
69
|
+
-- lexer:
|
70
|
+
--
|
71
|
+
-- -- ? LPeg lexer.
|
72
|
+
--
|
73
|
+
-- local l = require('lexer')
|
74
|
+
-- local token, word_match = l.token, l.word_match
|
75
|
+
-- local P, R, S = lpeg.P, lpeg.R, lpeg.S
|
76
|
+
--
|
77
|
+
-- local M = {_NAME = '?'}
|
78
|
+
--
|
79
|
+
-- -- Whitespace.
|
80
|
+
-- local ws = token(l.WHITESPACE, l.space^1)
|
81
|
+
--
|
82
|
+
-- M._rules = {
|
83
|
+
-- {'whitespace', ws},
|
84
|
+
-- }
|
85
|
+
--
|
86
|
+
-- M._tokenstyles = {
|
87
|
+
--
|
88
|
+
-- }
|
89
|
+
--
|
90
|
+
-- return M
|
91
|
+
--
|
92
|
+
-- The first 4 lines of code simply define often used convenience variables. The
|
93
|
+
-- 5th and last lines define and return the lexer object Scintilla uses; they
|
94
|
+
-- are very important and must be part of every lexer. The sixth line defines
|
95
|
+
-- something called a "token", an essential building block of lexers. You will
|
96
|
+
-- learn about tokens shortly. The rest of the code defines a set of grammar
|
97
|
+
-- rules and token styles. You will learn about those later. Note, however, the
|
98
|
+
-- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
|
99
|
+
-- belong to their respective lexers, but any non-local variables need the `M.`
|
100
|
+
-- prefix too so-as not to affect Lua's global environment. All in all, this is
|
101
|
+
-- a minimal, working lexer that you can build on.
|
102
|
+
--
|
103
|
+
-- ### Tokens
|
104
|
+
--
|
105
|
+
-- Take a moment to think about your programming language's structure. What kind
|
106
|
+
-- of key elements does it have? In the template shown earlier, one predefined
|
107
|
+
-- element all languages have is whitespace. Your language probably also has
|
108
|
+
-- elements like comments, strings, and keywords. Lexers refer to these elements
|
109
|
+
-- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
|
110
|
+
-- break down source code into tokens for coloring, which results in the syntax
|
111
|
+
-- highlighting familiar to you. It is up to you how specific your lexer is when
|
112
|
+
-- it comes to tokens. Perhaps only distinguishing between keywords and
|
113
|
+
-- identifiers is necessary, or maybe recognizing constants and built-in
|
114
|
+
-- functions, methods, or libraries is desirable. The Lua lexer, for example,
|
115
|
+
-- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
|
116
|
+
-- functions, constants, built-in libraries, identifiers, labels, and operators.
|
117
|
+
-- Even though constants, built-in functions, and built-in libraries are subsets
|
118
|
+
-- of identifiers, Lua programmers find it helpful for the lexer to distinguish
|
119
|
+
-- between them all. It is perfectly acceptable to just recognize keywords and
|
120
|
+
-- identifiers.
|
121
|
+
--
|
122
|
+
-- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
|
123
|
+
-- sequence of characters recognized as an instance of that token. Create tokens
|
124
|
+
-- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
|
125
|
+
-- defined in the template shown earlier:
|
126
|
+
--
|
127
|
+
-- local ws = token(l.WHITESPACE, l.space^1)
|
128
|
+
--
|
129
|
+
-- At first glance, the first argument does not appear to be a string name and
|
130
|
+
-- the second argument does not appear to be an LPeg pattern. Perhaps you
|
131
|
+
-- expected something like:
|
132
|
+
--
|
133
|
+
-- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
|
134
|
+
--
|
135
|
+
-- The `lexer` (`l`) module actually provides a convenient list of common token
|
136
|
+
-- names and common LPeg patterns for you to use. Token names include
|
137
|
+
-- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
|
138
|
+
-- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
|
139
|
+
-- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
|
140
|
+
-- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
|
141
|
+
-- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
|
142
|
+
-- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
|
143
|
+
-- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
|
144
|
+
-- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
|
145
|
+
-- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
|
146
|
+
-- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
|
147
|
+
-- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
|
148
|
+
-- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
|
149
|
+
-- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
|
150
|
+
-- none of the above fit your language, but an advantage to using predefined
|
151
|
+
-- token names is that your lexer's tokens will inherit the universal syntax
|
152
|
+
-- highlighting color theme used by your text editor.
|
153
|
+
--
|
154
|
+
-- #### Example Tokens
|
155
|
+
--
|
156
|
+
-- So, how might you define other tokens like comments, strings, and keywords?
|
157
|
+
-- Here are some examples.
|
158
|
+
--
|
159
|
+
-- **Comments**
|
160
|
+
--
|
161
|
+
-- Line-style comments with a prefix character(s) are easy to express with LPeg:
|
162
|
+
--
|
163
|
+
-- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
|
164
|
+
-- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
|
165
|
+
--
|
166
|
+
-- The comments above start with a '#' or "//" and go to the end of the line.
|
167
|
+
-- The second comment recognizes the next line also as a comment if the current
|
168
|
+
-- line ends with a '\' escape character.
|
169
|
+
--
|
170
|
+
-- C-style "block" comments with a start and end delimiter are also easy to
|
171
|
+
-- express:
|
172
|
+
--
|
173
|
+
-- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
|
174
|
+
--
|
175
|
+
-- This comment starts with a "/\*" sequence and contains anything up to and
|
176
|
+
-- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
|
177
|
+
-- can recognize unfinished comments as comments and highlight them properly.
|
178
|
+
--
|
179
|
+
-- **Strings**
|
180
|
+
--
|
181
|
+
-- It is tempting to think that a string is not much different from the block
|
182
|
+
-- comment shown above in that both have start and end delimiters:
|
183
|
+
--
|
184
|
+
-- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
|
185
|
+
-- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
|
186
|
+
-- local simple_string = token(l.STRING, dq_str + sq_str)
|
187
|
+
--
|
188
|
+
-- However, most programming languages allow escape sequences in strings such
|
189
|
+
-- that a sequence like "\\"" in a double-quoted string indicates that the
|
190
|
+
-- '"' is not the end of the string. The above token incorrectly matches
|
191
|
+
-- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
|
192
|
+
-- function.
|
193
|
+
--
|
194
|
+
-- local dq_str = l.delimited_range('"')
|
195
|
+
-- local sq_str = l.delimited_range("'")
|
196
|
+
-- local string = token(l.STRING, dq_str + sq_str)
|
197
|
+
--
|
198
|
+
-- In this case, the lexer treats '\' as an escape character in a string
|
199
|
+
-- sequence.
|
200
|
+
--
|
201
|
+
-- **Keywords**
|
202
|
+
--
|
203
|
+
-- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
|
204
|
+
-- choices, use another convenience function: [`lexer.word_match()`](). It is
|
205
|
+
-- much easier and more efficient to write word matches like:
|
206
|
+
--
|
207
|
+
-- local keyword = token(l.KEYWORD, l.word_match{
|
208
|
+
-- 'keyword_1', 'keyword_2', ..., 'keyword_n'
|
209
|
+
-- })
|
210
|
+
--
|
211
|
+
-- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
|
212
|
+
-- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
|
213
|
+
-- }, nil, true))
|
214
|
+
--
|
215
|
+
-- local hyphened_keyword = token(l.KEYWORD, l.word_match({
|
216
|
+
-- 'keyword-1', 'keyword-2', ..., 'keyword-n'
|
217
|
+
-- }, '-'))
|
218
|
+
--
|
219
|
+
-- By default, characters considered to be in keywords are in the set of
|
220
|
+
-- alphanumeric characters and underscores. The last token demonstrates how to
|
221
|
+
-- allow '-' (hyphen) characters to be in keywords as well.
|
222
|
+
--
|
223
|
+
-- **Numbers**
|
224
|
+
--
|
225
|
+
-- Most programming languages have the same format for integer and float tokens,
|
226
|
+
-- so it might be as simple as using a couple of predefined LPeg patterns:
|
227
|
+
--
|
228
|
+
-- local number = token(l.NUMBER, l.float + l.integer)
|
229
|
+
--
|
230
|
+
-- However, some languages allow postfix characters on integers.
|
231
|
+
--
|
232
|
+
-- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
|
233
|
+
-- local number = token(l.NUMBER, l.float + l.hex_num + integer)
|
234
|
+
--
|
235
|
+
-- Your language may need other tweaks, but it is up to you how fine-grained you
|
236
|
+
-- want your highlighting to be. After all, you are not writing a compiler or
|
237
|
+
-- interpreter!
|
238
|
+
--
|
239
|
+
-- ### Rules
|
240
|
+
--
|
241
|
+
-- Programming languages have grammars, which specify valid token structure. For
|
242
|
+
-- example, comments usually cannot appear within a string. Grammars consist of
|
243
|
+
-- rules, which are simply combinations of tokens. Recall from the lexer
|
244
|
+
-- template the `_rules` table, which defines all the rules used by the lexer
|
245
|
+
-- grammar:
|
246
|
+
--
|
247
|
+
-- M._rules = {
|
248
|
+
-- {'whitespace', ws},
|
249
|
+
-- }
|
250
|
+
--
|
251
|
+
-- Each entry in a lexer's `_rules` table consists of a rule name and its
|
252
|
+
-- associated pattern. Rule names are completely arbitrary and serve only to
|
253
|
+
-- identify and distinguish between different rules. Rule order is important: if
|
254
|
+
-- text does not match the first rule, the lexer tries the second rule, and so
|
255
|
+
-- on. This simple grammar says to match whitespace tokens under a rule named
|
256
|
+
-- "whitespace".
|
257
|
+
--
|
258
|
+
-- To illustrate the importance of rule order, here is an example of a
|
259
|
+
-- simplified Lua grammar:
|
260
|
+
--
|
261
|
+
-- M._rules = {
|
262
|
+
-- {'whitespace', ws},
|
263
|
+
-- {'keyword', keyword},
|
264
|
+
-- {'identifier', identifier},
|
265
|
+
-- {'string', string},
|
266
|
+
-- {'comment', comment},
|
267
|
+
-- {'number', number},
|
268
|
+
-- {'label', label},
|
269
|
+
-- {'operator', operator},
|
270
|
+
-- }
|
271
|
+
--
|
272
|
+
-- Note how identifiers come after keywords. In Lua, as with most programming
|
273
|
+
-- languages, the characters allowed in keywords and identifiers are in the same
|
274
|
+
-- set (alphanumerics plus underscores). If the lexer specified the "identifier"
|
275
|
+
-- rule before the "keyword" rule, all keywords would match identifiers and thus
|
276
|
+
-- incorrectly highlight as identifiers instead of keywords. The same idea
|
277
|
+
-- applies to function, constant, etc. tokens that you may want to distinguish
|
278
|
+
-- between: their rules should come before identifiers.
|
279
|
+
--
|
280
|
+
-- So what about text that does not match any rules? For example in Lua, the '!'
|
281
|
+
-- character is meaningless outside a string or comment. Normally the lexer
|
282
|
+
-- skips over such text. If instead you want to highlight these "syntax errors",
|
283
|
+
-- add an additional end rule:
|
284
|
+
--
|
285
|
+
-- M._rules = {
|
286
|
+
-- {'whitespace', ws},
|
287
|
+
-- {'error', token(l.ERROR, l.any)},
|
288
|
+
-- }
|
289
|
+
--
|
290
|
+
-- This identifies and highlights any character not matched by an existing
|
291
|
+
-- rule as an `lexer.ERROR` token.
|
292
|
+
--
|
293
|
+
-- Even though the rules defined in the examples above contain a single token,
|
294
|
+
-- rules may consist of multiple tokens. For example, a rule for an HTML tag
|
295
|
+
-- could consist of a tag token followed by an arbitrary number of attribute
|
296
|
+
-- tokens, allowing the lexer to highlight all tokens separately. The rule might
|
297
|
+
-- look something like this:
|
298
|
+
--
|
299
|
+
-- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
|
300
|
+
--
|
301
|
+
-- Note however that lexers with complex rules like these are more prone to lose
|
302
|
+
-- track of their state.
|
303
|
+
--
|
304
|
+
-- ### Summary
|
305
|
+
--
|
306
|
+
-- Lexers primarily consist of tokens and grammar rules. At your disposal are a
|
307
|
+
-- number of convenience patterns and functions for rapidly creating a lexer. If
|
308
|
+
-- you choose to use predefined token names for your tokens, you do not have to
|
309
|
+
-- define how the lexer highlights them. The tokens will inherit the default
|
310
|
+
-- syntax highlighting color theme your editor uses.
|
311
|
+
--
|
312
|
+
-- ## Advanced Techniques
|
313
|
+
--
|
314
|
+
-- ### Styles and Styling
|
315
|
+
--
|
316
|
+
-- The most basic form of syntax highlighting is assigning different colors to
|
317
|
+
-- different tokens. Instead of highlighting with just colors, Scintilla allows
|
318
|
+
-- for more rich highlighting, or "styling", with different fonts, font sizes,
|
319
|
+
-- font attributes, and foreground and background colors, just to name a few.
|
320
|
+
-- The unit of this rich highlighting is called a "style". Styles are simply
|
321
|
+
-- strings of comma-separated property settings. By default, lexers associate
|
322
|
+
-- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
|
323
|
+
-- `lexer.STRING`, etc. with particular styles as part of a universal color
|
324
|
+
-- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
|
325
|
+
-- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
|
326
|
+
-- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
|
327
|
+
-- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
|
328
|
+
-- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
|
329
|
+
-- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
|
330
|
+
-- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
|
331
|
+
-- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
|
332
|
+
-- predefined token names and LPeg patterns, you may define your own styles. At
|
333
|
+
-- their core, styles are just strings, so you may create new ones and/or modify
|
334
|
+
-- existing ones. Each style consists of the following comma-separated settings:
|
335
|
+
--
|
336
|
+
-- Setting | Description
|
337
|
+
-- ---------------|------------
|
338
|
+
-- font:_name_ | The name of the font the style uses.
|
339
|
+
-- size:_int_ | The size of the font the style uses.
|
340
|
+
-- [not]bold | Whether or not the font face is bold.
|
341
|
+
-- [not]italics | Whether or not the font face is italic.
|
342
|
+
-- [not]underlined| Whether or not the font face is underlined.
|
343
|
+
-- fore:_color_ | The foreground color of the font face.
|
344
|
+
-- back:_color_ | The background color of the font face.
|
345
|
+
-- [not]eolfilled | Does the background color extend to the end of the line?
|
346
|
+
-- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
|
347
|
+
-- [not]visible | Whether or not the text is visible.
|
348
|
+
-- [not]changeable| Whether the text is changeable or read-only.
|
349
|
+
-- [not]hotspot | Whether or not the text is clickable.
|
350
|
+
--
|
351
|
+
-- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
|
352
|
+
-- decimal equivalent of the latter. As with token names, LPeg patterns, and
|
353
|
+
-- styles, there is a set of predefined color names, but they vary depending on
|
354
|
+
-- the current color theme in use. Therefore, it is generally not a good idea to
|
355
|
+
-- manually define colors within styles in your lexer since they might not fit
|
356
|
+
-- into a user's chosen color theme. Try to refrain from even using predefined
|
357
|
+
-- colors in a style because that color may be theme-specific. Instead, the best
|
358
|
+
-- practice is to either use predefined styles or derive new color-agnostic
|
359
|
+
-- styles from predefined ones. For example, Lua "longstring" tokens use the
|
360
|
+
-- existing `lexer.STYLE_STRING` style instead of defining a new one.
|
361
|
+
--
|
362
|
+
-- #### Example Styles
|
363
|
+
--
|
364
|
+
-- Defining styles is pretty straightforward. An empty style that inherits the
|
365
|
+
-- default theme settings is simply an empty string:
|
366
|
+
--
|
367
|
+
-- local style_nothing = ''
|
368
|
+
--
|
369
|
+
-- A similar style but with a bold font face looks like this:
|
370
|
+
--
|
371
|
+
-- local style_bold = 'bold'
|
372
|
+
--
|
373
|
+
-- If you want the same style, but also with an italic font face, define the new
|
374
|
+
-- style in terms of the old one:
|
375
|
+
--
|
376
|
+
-- local style_bold_italic = style_bold..',italics'
|
377
|
+
--
|
378
|
+
-- This allows you to derive new styles from predefined ones without having to
|
379
|
+
-- rewrite them. This operation leaves the old style unchanged. Thus if you
|
380
|
+
-- had a "static variable" token whose style you wanted to base off of
|
381
|
+
-- `lexer.STYLE_VARIABLE`, it would probably look like:
|
382
|
+
--
|
383
|
+
-- local style_static_var = l.STYLE_VARIABLE..',italics'
|
384
|
+
--
|
385
|
+
-- The color theme files in the *lexers/themes/* folder give more examples of
|
386
|
+
-- style definitions.
|
387
|
+
--
|
388
|
+
-- ### Token Styles
|
389
|
+
--
|
390
|
+
-- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
|
391
|
+
-- Recall the token definition and `_tokenstyles` table from the lexer template:
|
392
|
+
--
|
393
|
+
-- local ws = token(l.WHITESPACE, l.space^1)
|
394
|
+
--
|
395
|
+
-- ...
|
396
|
+
--
|
397
|
+
-- M._tokenstyles = {
|
398
|
+
--
|
399
|
+
-- }
|
400
|
+
--
|
401
|
+
-- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
|
402
|
+
-- earlier, lexers automatically associate tokens that use predefined token
|
403
|
+
-- names with a particular style. Only tokens with custom token names need
|
404
|
+
-- manual style associations. As an example, consider a custom whitespace token:
|
405
|
+
--
|
406
|
+
-- local ws = token('custom_whitespace', l.space^1)
|
407
|
+
--
|
408
|
+
-- Assigning a style to this token looks like:
|
409
|
+
--
|
410
|
+
-- M._tokenstyles = {
|
411
|
+
-- custom_whitespace = l.STYLE_WHITESPACE
|
412
|
+
-- }
|
413
|
+
--
|
414
|
+
-- Do not confuse token names with rule names. They are completely different
|
415
|
+
-- entities. In the example above, the lexer assigns the "custom_whitespace"
|
416
|
+
-- token the existing style for `WHITESPACE` tokens. If instead you want to
|
417
|
+
-- color the background of whitespace a shade of grey, it might look like:
|
418
|
+
--
|
419
|
+
-- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
|
420
|
+
-- M._tokenstyles = {
|
421
|
+
-- custom_whitespace = custom_style
|
422
|
+
-- }
|
423
|
+
--
|
424
|
+
-- Notice that the lexer peforms Scintilla/SciTE-style "$()" property expansion.
|
425
|
+
-- You may also use "%()". Remember to refrain from assigning specific colors in
|
426
|
+
-- styles, but in this case, all user color themes probably define the
|
427
|
+
-- "color.grey" property.
|
428
|
+
--
|
429
|
+
-- ### Line Lexers
|
430
|
+
--
|
431
|
+
-- By default, lexers match the arbitrary chunks of text passed to them by
|
432
|
+
-- Scintilla. These chunks may be a full document, only the visible part of a
|
433
|
+
-- document, or even just portions of lines. Some lexers need to match whole
|
434
|
+
-- lines. For example, a lexer for the output of a file "diff" needs to know if
|
435
|
+
-- the line started with a '+' or '-' and then style the entire line
|
436
|
+
-- accordingly. To indicate that your lexer matches by line, use the
|
437
|
+
-- `_LEXBYLINE` field:
|
438
|
+
--
|
439
|
+
-- M._LEXBYLINE = true
|
440
|
+
--
|
441
|
+
-- Now the input text for the lexer is a single line at a time. Keep in mind
|
442
|
+
-- that line lexers do not have the ability to look ahead at subsequent lines.
|
443
|
+
--
|
444
|
+
-- ### Embedded Lexers
|
445
|
+
--
|
446
|
+
-- Lexers embed within one another very easily, requiring minimal effort. In the
|
447
|
+
-- following sections, the lexer being embedded is called the "child" lexer and
|
448
|
+
-- the lexer a child is being embedded in is called the "parent". For example,
|
449
|
+
-- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
|
450
|
+
-- their respective HTML and CSS files. However, CSS can be embedded inside
|
451
|
+
-- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
|
452
|
+
-- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
|
453
|
+
-- sounds a lot like the case with CSS, but there is a subtle difference: PHP
|
454
|
+
-- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
|
455
|
+
-- difference results in two types of embedded lexers: a parent lexer that
|
456
|
+
-- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
|
457
|
+
-- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
|
458
|
+
--
|
459
|
+
-- #### Parent Lexer
|
460
|
+
--
|
461
|
+
-- Before embedding a child lexer into a parent lexer, the parent lexer needs to
|
462
|
+
-- load the child lexer. This is done with the [`lexer.load()`]() function. For
|
463
|
+
-- example, loading the CSS lexer within the HTML lexer looks like:
|
464
|
+
--
|
465
|
+
-- local css = l.load('css')
|
466
|
+
--
|
467
|
+
-- The next part of the embedding process is telling the parent lexer when to
|
468
|
+
-- switch over to the child lexer and when to switch back. The lexer refers to
|
469
|
+
-- these indications as the "start rule" and "end rule", respectively, and are
|
470
|
+
-- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
|
471
|
+
-- HTML to CSS is when the lexer encounters a "style" tag with a "type"
|
472
|
+
-- attribute whose value is "text/css":
|
473
|
+
--
|
474
|
+
-- local css_tag = P('<style') * P(function(input, index)
|
475
|
+
-- if input:find('^[^>]+type="text/css"', index) then
|
476
|
+
-- return index
|
477
|
+
-- end
|
478
|
+
-- end)
|
479
|
+
--
|
480
|
+
-- This pattern looks for the beginning of a "style" tag and searches its
|
481
|
+
-- attribute list for the text "`type="text/css"`". (In this simplified example,
|
482
|
+
-- the Lua pattern does not consider whitespace between the '=' nor does it
|
483
|
+
-- consider that using single quotes is valid.) If there is a match, the
|
484
|
+
-- functional pattern returns a value instead of `nil`. In this case, the value
|
485
|
+
-- returned does not matter because we ultimately want to style the "style" tag
|
486
|
+
-- as an HTML tag, so the actual start rule looks like this:
|
487
|
+
--
|
488
|
+
-- local css_start_rule = #css_tag * tag
|
489
|
+
--
|
490
|
+
-- Now that the parent knows when to switch to the child, it needs to know when
|
491
|
+
-- to switch back. In the case of HTML/CSS, the switch back occurs when the
|
492
|
+
-- lexer encounters an ending "style" tag, though the lexer should still style
|
493
|
+
-- the tag as an HTML tag:
|
494
|
+
--
|
495
|
+
-- local css_end_rule = #P('</style>') * tag
|
496
|
+
--
|
497
|
+
-- Once the parent loads the child lexer and defines the child's start and end
|
498
|
+
-- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
|
499
|
+
--
|
500
|
+
-- l.embed_lexer(M, css, css_start_rule, css_end_rule)
|
501
|
+
--
|
502
|
+
-- The first parameter is the parent lexer object to embed the child in, which
|
503
|
+
-- in this case is `M`. The other three parameters are the child lexer object
|
504
|
+
-- loaded earlier followed by its start and end rules.
|
505
|
+
--
|
506
|
+
-- #### Child Lexer
|
507
|
+
--
|
508
|
+
-- The process for instructing a child lexer to embed itself into a parent is
|
509
|
+
-- very similar to embedding a child into a parent: first, load the parent lexer
|
510
|
+
-- into the child lexer with the [`lexer.load()`]() function and then create
|
511
|
+
-- start and end rules for the child lexer. However, in this case, swap the
|
512
|
+
-- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
|
513
|
+
-- lexer:
|
514
|
+
--
|
515
|
+
-- local html = l.load('html')
|
516
|
+
-- local php_start_rule = token('php_tag', '<?php ')
|
517
|
+
-- local php_end_rule = token('php_tag', '?>')
|
518
|
+
-- l.embed_lexer(html, M, php_start_rule, php_end_rule)
--
-- ## Code Folding
--
-- When reading source code, it is occasionally helpful to temporarily hide
-- blocks of code like functions, classes, comments, etc. This is the concept of
-- "folding". In the Textadept and SciTE editors for example, little indicators
-- in the editor margins appear next to code that can be folded at places called
-- "fold points". When the user clicks an indicator, the editor hides the code
-- associated with the indicator until the user clicks the indicator again. The
-- lexer specifies these fold points and what code exactly to fold.
--
-- The fold points for most languages occur on keywords or character sequences.
-- Examples of fold keywords are "if" and "end" in Lua and examples of fold
-- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
-- comment delimiters, respectively. However, these fold points cannot occur
-- just anywhere. For example, lexers should not recognize fold keywords that
-- appear within strings or comments. The lexer's `_foldsymbols` table allows
-- you to conveniently define fold points with such granularity. For example,
-- consider C:
--
--     M._foldsymbols = {
--       [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
--       [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
--       _patterns = {'[{}]', '/%*', '%*/'}
--     }
--
-- The first assignment states that any '{' or '}' that the lexer recognizes as
-- a `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
-- match is a beginning fold point and `-1` indicates the match is an ending
-- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
-- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
-- The lexer does not consider any occurrences of these characters outside their
-- defined tokens (such as in a string) as fold points. Finally, every
-- `_foldsymbols` table must have a `_patterns` field that contains a list of
-- [Lua patterns][] that match fold points. If the lexer encounters text that
-- matches one of those patterns, the lexer looks up the matched text in its
-- token's table to determine whether or not the text is a fold point. In the
-- example above, the first Lua pattern matches any '{' or '}' characters. When
-- the lexer comes across one of those characters, it checks if the match is an
-- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
-- point. The same idea applies for the other patterns. (The '%' is in the other
-- patterns because '\*' is a special character in Lua patterns that needs
-- escaping.) How do you specify fold keywords? Here is an example for Lua:
--
--     M._foldsymbols = {
--       [l.KEYWORD] = {
--         ['if'] = 1, ['do'] = 1, ['function'] = 1,
--         ['end'] = -1, ['repeat'] = 1, ['until'] = -1
--       },
--       _patterns = {'%l+'}
--     }
--
-- Any time the lexer encounters a lower case word, if that word is a
-- `lexer.KEYWORD` token and is in the associated list of fold points, the lexer
-- identifies the word as a fold point.
--
-- If your lexer needs to do some additional processing to determine if a match
-- is a fold point, assign a function that returns an integer. Returning `1` or
-- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
-- For example:
--
--     local function fold_strange_token(text, pos, line, s, match)
--       if ... then
--         return 1 -- beginning fold point
--       elseif ... then
--         return -1 -- ending fold point
--       end
--       return 0
--     end
--
--     M._foldsymbols = {
--       ['strange_token'] = {['|'] = fold_strange_token},
--       _patterns = {'|'}
--     }
--
-- Any time the lexer encounters a '|' that is a "strange_token", it calls the
-- `fold_strange_token` function to determine if '|' is a fold point. The lexer
-- calls these functions with the following arguments: the text to identify fold
-- points in, the beginning position of the current line in the text to fold,
-- the current line's text, the position in the current line the matched text
-- starts at, and the matched text itself.
--
-- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
--
-- ## Using Lexers
--
-- ### Textadept
--
-- Put your lexer in your *~/.textadept/lexers/* directory so you do not
-- overwrite it when upgrading Textadept. Also, lexers in this directory
-- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
-- the default *lua* lexer. This is convenient for tweaking a default lexer to
-- your liking. Then add a [file type][] for your lexer if necessary.
--
-- [file type]: _M.textadept.file_types.html
--
-- ### SciTE
--
-- Create a *.properties* file for your lexer and `import` it in either your
-- *SciTEUser.properties* or *SciTEGlobal.properties*. The contents of the
-- *.properties* file should contain:
--
--     file.patterns.[lexer_name]=[file_patterns]
--     lexer.$(file.patterns.[lexer_name])=[lexer_name]
--
-- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
-- and `[file_patterns]` is a set of file extensions to use your lexer for.
--
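-- For example, a hypothetical "rhtml" lexer for *.rhtml* files might be wired
-- up with:
--
--     file.patterns.rhtml=*.rhtml
--     lexer.$(file.patterns.rhtml)=rhtml
--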
-- Please note that Lua lexers ignore any styling information in *.properties*
-- files. Your theme file in the *lexers/themes/* directory contains styling
-- information.
--
-- ## Considerations
--
-- ### Performance
--
-- There might be some slight overhead when initializing a lexer, but loading a
-- file from disk into Scintilla is usually more expensive. On modern computer
-- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
-- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
-- so that the most common rules match first. Do keep in mind that order matters
-- for similar rules.
--
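-- As a sketch (assuming the usual `ws`, `keyword`, and `identifier` token
-- patterns), listing the most frequently matched rules first looks like:
--
--     M._rules = {
--       {'whitespace', ws},
--       {'keyword', keyword},
--       {'identifier', identifier},
--     }
--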
-- ### Limitations
--
-- Embedded preprocessor languages like PHP cannot be embedded completely in
-- their parent languages because the parent's tokens do not support start and
-- end rules. This mostly goes unnoticed, but code like
--
--     <div id="<?php echo $id; ?>">
--
-- or
--
--     <div <?php if ($odd) { echo 'class="odd"'; } ?>>
--
-- will not style correctly.
--
-- ### Troubleshooting
--
-- Errors in lexers can be tricky to debug. Lexers print Lua errors to
-- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
-- from a terminal is the easiest way to see errors as they occur.
--
-- ### Risks
--
-- Poorly written lexers can crash Scintilla (and thus its containing
-- application), so unsaved data might be lost. However, I have only observed
-- these crashes in early lexer development, when syntax errors or pattern
-- errors are present. Once the lexer actually starts styling text (either
-- correctly or incorrectly, it does not matter), I have not observed any
-- crashes.
--
-- ### Acknowledgements
--
-- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
-- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
--
-- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
-- @field LEXERPATH (string)
--   The path used to search for a lexer to load.
--   Identical in format to Lua's `package.path` string.
--   The default value is `package.path`.
-- @field DEFAULT (string)
--   The token name for default tokens.
-- @field WHITESPACE (string)
--   The token name for whitespace tokens.
-- @field COMMENT (string)
--   The token name for comment tokens.
-- @field STRING (string)
--   The token name for string tokens.
-- @field NUMBER (string)
--   The token name for number tokens.
-- @field KEYWORD (string)
--   The token name for keyword tokens.
-- @field IDENTIFIER (string)
--   The token name for identifier tokens.
-- @field OPERATOR (string)
--   The token name for operator tokens.
-- @field ERROR (string)
--   The token name for error tokens.
-- @field PREPROCESSOR (string)
--   The token name for preprocessor tokens.
-- @field CONSTANT (string)
--   The token name for constant tokens.
-- @field VARIABLE (string)
--   The token name for variable tokens.
-- @field FUNCTION (string)
--   The token name for function tokens.
-- @field CLASS (string)
--   The token name for class tokens.
-- @field TYPE (string)
--   The token name for type tokens.
-- @field LABEL (string)
--   The token name for label tokens.
-- @field REGEX (string)
--   The token name for regex tokens.
-- @field STYLE_CLASS (string)
--   The style typically used for class definitions.
-- @field STYLE_COMMENT (string)
--   The style typically used for code comments.
-- @field STYLE_CONSTANT (string)
--   The style typically used for constants.
-- @field STYLE_ERROR (string)
--   The style typically used for erroneous syntax.
-- @field STYLE_FUNCTION (string)
--   The style typically used for function definitions.
-- @field STYLE_KEYWORD (string)
--   The style typically used for language keywords.
-- @field STYLE_LABEL (string)
--   The style typically used for labels.
-- @field STYLE_NUMBER (string)
--   The style typically used for numbers.
-- @field STYLE_OPERATOR (string)
--   The style typically used for operators.
-- @field STYLE_REGEX (string)
--   The style typically used for regular expression strings.
-- @field STYLE_STRING (string)
--   The style typically used for strings.
-- @field STYLE_PREPROCESSOR (string)
--   The style typically used for preprocessor statements.
-- @field STYLE_TYPE (string)
--   The style typically used for static types.
-- @field STYLE_VARIABLE (string)
--   The style typically used for variables.
-- @field STYLE_WHITESPACE (string)
--   The style typically used for whitespace.
-- @field STYLE_EMBEDDED (string)
--   The style typically used for embedded code.
-- @field STYLE_IDENTIFIER (string)
--   The style typically used for identifier words.
-- @field STYLE_DEFAULT (string)
--   The style that all other styles are based on.
-- @field STYLE_LINENUMBER (string)
--   The style used for all margins except fold margins.
-- @field STYLE_BRACELIGHT (string)
--   The style used for highlighted brace characters.
-- @field STYLE_BRACEBAD (string)
--   The style used for unmatched brace characters.
-- @field STYLE_CONTROLCHAR (string)
--   The style used for control characters.
--   Color attributes are ignored.
-- @field STYLE_INDENTGUIDE (string)
--   The style used for indentation guides.
-- @field STYLE_CALLTIP (string)
--   The style used by call tips if [`buffer.call_tip_use_style`]() is set.
--   Only the font name, size, and color attributes are used.
-- @field any (pattern)
--   A pattern that matches any single character.
-- @field ascii (pattern)
--   A pattern that matches any ASCII character (codes 0 to 127).
-- @field extend (pattern)
--   A pattern that matches any ASCII extended character (codes 0 to 255).
-- @field alpha (pattern)
--   A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
-- @field digit (pattern)
--   A pattern that matches any digit ('0'-'9').
-- @field alnum (pattern)
--   A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
--   '0'-'9').
-- @field lower (pattern)
--   A pattern that matches any lower case character ('a'-'z').
-- @field upper (pattern)
--   A pattern that matches any upper case character ('A'-'Z').
-- @field xdigit (pattern)
--   A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
-- @field cntrl (pattern)
--   A pattern that matches any control character (ASCII codes 0 to 31).
-- @field graph (pattern)
--   A pattern that matches any graphical character ('!' to '~').
-- @field print (pattern)
--   A pattern that matches any printable character (' ' to '~').
-- @field punct (pattern)
--   A pattern that matches any punctuation character ('!' to '/', ':' to '@',
--   '[' to '`', '{' to '~').
-- @field space (pattern)
--   A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
--   '\r', space).
-- @field newline (pattern)
--   A pattern that matches any set of end of line characters.
-- @field nonnewline (pattern)
--   A pattern that matches any single, non-newline character.
-- @field nonnewline_esc (pattern)
--   A pattern that matches any single, non-newline character or any set of end
--   of line characters escaped with '\'.
-- @field dec_num (pattern)
--   A pattern that matches a decimal number.
-- @field hex_num (pattern)
--   A pattern that matches a hexadecimal number.
-- @field oct_num (pattern)
--   A pattern that matches an octal number.
-- @field integer (pattern)
--   A pattern that matches either a decimal, hexadecimal, or octal number.
-- @field float (pattern)
--   A pattern that matches a floating point number.
-- @field word (pattern)
--   A pattern that matches a typical word. Words begin with a letter or
--   underscore and consist of alphanumeric and underscore characters.
-- @field FOLD_BASE (number)
--   The initial (root) fold level.
-- @field FOLD_BLANK (number)
--   Flag indicating that the line is blank.
-- @field FOLD_HEADER (number)
--   Flag indicating that the line is a fold point.
-- @field fold_level (table, Read-only)
--   Table of fold level bit-masks for line numbers starting from zero.
--   Fold level masks are composed of an integer level combined with any of the
--   following bits:
--
--   * `lexer.FOLD_BASE`
--     The initial fold level.
--   * `lexer.FOLD_BLANK`
--     The line is blank.
--   * `lexer.FOLD_HEADER`
--     The line is a header, or fold point.
-- @field indent_amount (table, Read-only)
--   Table of indentation amounts in character columns, for line numbers
--   starting from zero.
-- @field property (table)
--   Map of key-value string pairs.
-- @field property_expanded (table, Read-only)
--   Map of key-value string pairs with `$()` and `%()` variable replacement
--   performed in values.
-- @field property_int (table, Read-only)
--   Map of key-value pairs with values interpreted as numbers, or `0` if not
--   found.
-- @field style_at (table, Read-only)
--   Table of style names at positions in the buffer starting from zero.
module('lexer')]=]

local lpeg = require('lpeg')
local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
local lpeg_Cmt, lpeg_C, lpeg_Cg = lpeg.Cmt, lpeg.C, lpeg.Cg
local lpeg_match = lpeg.match

M.LEXERPATH = package.path

-- Table of loaded lexers.
local lexers = {}

-- Keep track of the last parent lexer loaded. This lexer's rules are used for
-- proxy lexers (those that load parent and child lexers to embed) that do not
-- declare a parent lexer.
local parent_lexer

if not package.searchpath then
  -- Searches for the given *name* in the given *path*.
  -- This is an implementation of Lua 5.2's `package.searchpath()` function for
  -- Lua 5.1.
  function package.searchpath(name, path)
    local tried = {}
    for part in path:gmatch('[^;]+') do
      local filename = part:gsub('%?', name)
      local f = io.open(filename, 'r')
      if f then f:close() return filename end
      tried[#tried + 1] = ("no file '%s'"):format(filename)
    end
    return nil, table.concat(tried, '\n')
  end
end
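
-- Usage sketch (hypothetical lexer name): locate a lexer file on the search
-- path:
--   local path, errmsg = package.searchpath('lua', M.LEXERPATH)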

-- Adds a rule to a lexer's current ordered list of rules.
-- @param lexer The lexer to add the given rule to.
-- @param id The name associated with this rule. It is used for other lexers
--   to access this particular rule from the lexer's `_RULES` table. It does not
--   have to be the same as the name passed to `token`.
-- @param rule The LPeg pattern of the rule.
local function add_rule(lexer, id, rule)
  if not lexer._RULES then
    lexer._RULES = {}
    -- Contains an ordered list (by numerical index) of rule names. This is used
    -- in conjunction with lexer._RULES for building _TOKENRULE.
    lexer._RULEORDER = {}
  end
  lexer._RULES[id] = rule
  lexer._RULEORDER[#lexer._RULEORDER + 1] = id
end
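
-- For instance, add_rule(lexer, 'whitespace', ws) (with a hypothetical `ws`
-- pattern) stores the pattern in lexer._RULES['whitespace'] and appends
-- 'whitespace' to lexer._RULEORDER so join_tokens() can later build the
-- ordered choice of rules.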

-- Adds a new Scintilla style to Scintilla.
-- @param lexer The lexer to add the given style to.
-- @param token_name The name of the token associated with this style.
-- @param style A Scintilla style created from `style()`.
-- @see style
local function add_style(lexer, token_name, style)
  local num_styles = lexer._numstyles
  if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
  if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
  lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
  lexer._EXTRASTYLES[token_name] = style
end

-- (Re)constructs `lexer._TOKENRULE`.
-- @param lexer The parent lexer.
local function join_tokens(lexer)
  local patterns, order = lexer._RULES, lexer._RULEORDER
  local token_rule = patterns[order[1]]
  for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
  lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
  return lexer._TOKENRULE
end
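
-- The result is the ordered choice rule1 + rule2 + ... + token(DEFAULT, any),
-- so any character that no rule matches still lexes as a 'default' token.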

-- Adds a given lexer and any of its embedded lexers to a given grammar.
-- @param grammar The grammar to add the lexer to.
-- @param lexer The lexer to add.
local function add_lexer(grammar, lexer)
  local token_rule = join_tokens(lexer)
  local lexer_name = lexer._NAME
  for _, child in ipairs(lexer._CHILDREN) do
    if child._CHILDREN then add_lexer(grammar, child) end
    local child_name = child._NAME
    local rules = child._EMBEDDEDRULES[lexer_name]
    local rules_token_rule = grammar['__'..child_name] or rules.token_rule
    grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
                          rules.end_rule^-1 * lpeg_V(lexer_name)
    local embedded_child = '_'..child_name
    grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
                              rules_token_rule)^0 * rules.end_rule^-1
    token_rule = lpeg_V(embedded_child) + token_rule
  end
  grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
  grammar[lexer_name] = token_rule^0
end
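
-- In the resulting grammar, rule '_'..child_name matches one embedded region
-- (start rule, child tokens, then an optional end rule), while rule child_name
-- lexes child tokens until the end rule matches and then jumps back to the
-- parent's rule.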

-- (Re)constructs `lexer._GRAMMAR`.
-- @param lexer The parent lexer.
-- @param initial_rule The name of the rule to start lexing with. The default
--   value is `lexer._NAME`. Multilang lexers use this to start with a child
--   rule if necessary.
local function build_grammar(lexer, initial_rule)
  -- local children = lexer._CHILDREN
  -- if children then
  local lexer_name = lexer._NAME
  if not initial_rule then initial_rule = lexer_name end
  local grammar = {initial_rule}
  if not lexer._CHILDREN then lexer._CHILDREN = {} end
  add_lexer(grammar, lexer)
  lexer._INITIALRULE = initial_rule
  lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
  -- else
  --   lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
  -- end
end
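
-- Note: the grammar is wrapped in lpeg.Ct(), so a successful match of
-- lexer._GRAMMAR returns a single table holding every token capture produced
-- by M.token() and M.parent_token().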

local string_upper = string.upper
-- Default styles.
local default = {
  'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
  'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
  'function', 'class', 'type', 'label', 'regex', 'embedded'
}
for _, v in ipairs(default) do
  M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
end
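-- e.g. M.COMMENT == 'comment' and M.STYLE_COMMENT == '$(style.comment)'.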
-- Predefined styles.
local predefined = {
  'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
  'indentguide', 'calltip'
}
for _, v in ipairs(predefined) do
  M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
end

---
-- Initializes or loads and returns the lexer of string name *name*.
-- Scintilla calls this function to load a lexer. Parent lexers also call this
-- function to load child lexers and vice-versa. The user calls this function
-- to load a lexer when using Scintillua as a Lua library.
-- @param name The name of the lexing language.
-- @param alt_name The alternate name of the lexing language. This is useful for
--   embedding the same child lexer with multiple sets of start and end tokens.
-- @return lexer object
-- @name load
function M.load(name, alt_name)
  if lexers[alt_name or name] then return lexers[alt_name or name] end
  parent_lexer = nil -- reset

  -- When using Scintillua as a stand-alone module, the `property` and
  -- `property_int` tables do not exist (they are not useful). Create them to
  -- prevent errors from occurring.
  if not M.property then
    M.property, M.property_int = {}, setmetatable({}, {
      __index = function(t, k)
        return tostring(tonumber(M.property[k]) or 0)
      end,
      __newindex = function() error('read-only property') end
    })
  end

  -- Load the language lexer with its rules, styles, etc.
  M.WHITESPACE = (alt_name or name)..'_whitespace'
  local lexer_file, error = package.searchpath(name, M.LEXERPATH)
  local ok, lexer = pcall(dofile, lexer_file or '')
  if not ok then
    _G.print(error or lexer) -- error message
    lexer = {_NAME = alt_name or name}
  end
  if alt_name then lexer._NAME = alt_name end

  -- Create the initial maps for token names to style numbers and styles.
  local token_styles = {}
  for i = 1, #default do token_styles[default[i]] = i - 1 end
  for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
  lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
  lexer._EXTRASTYLES = {}

  -- If the lexer is a proxy (loads parent and child lexers to embed) and does
  -- not declare a parent, try and find one and use its rules.
  if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end

  -- If the lexer is a proxy or a child that embedded itself, add its rules and
  -- styles to the parent lexer. Then set the parent to be the main lexer.
  if lexer._lexer then
    local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
    if not l._tokenstyles then l._tokenstyles = {} end
    for _, r in ipairs(_r or {}) do
      -- Prevent rule id clashes.
      l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
    end
    for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
    lexer = l
  end

  -- Add the lexer's styles and build its grammar.
  if lexer._rules then
    for token, style in pairs(lexer._tokenstyles or {}) do
      add_style(lexer, token, style)
    end
    for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
    build_grammar(lexer)
  end
  -- Add the lexer's unique whitespace style.
  add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)

  -- Process the lexer's fold symbols.
  if lexer._foldsymbols and lexer._foldsymbols._patterns then
    local patterns = lexer._foldsymbols._patterns
    for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
  end

  lexer.lex, lexer.fold = M.lex, M.fold
  -- Immun.io: copy over some of our helper functions.
  if M.lex_recursive then lexer.lex_recursive = M.lex_recursive end
  if M.unlex_rules then lexer.unlex_rules = M.unlex_rules end
  lexers[alt_name or name] = lexer
  return lexer
end

---
-- Lexes a chunk of text *text* (that has an initial style number of
-- *init_style*) with lexer *lexer*.
-- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
-- Otherwise the text is lexed as a whole.
-- @param lexer The lexer object to lex with.
-- @param text The text in the buffer to lex.
-- @param init_style The current style. Multiple-language lexers use this to
--   determine which language to start lexing in.
-- @return table of token names and positions.
-- @name lex
function M.lex(lexer, text, init_style)
  if not lexer._LEXBYLINE then
    -- For multilang lexers, build a new grammar whose initial_rule is the
    -- current language.
    if lexer._CHILDREN then
      for style, style_num in pairs(lexer._TOKENSTYLES) do
        if style_num == init_style then
          local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
          if lexer._INITIALRULE ~= lexer_name then
            build_grammar(lexer, lexer_name)
          end
          break
        end
      end
    end
    return lpeg_match(lexer._GRAMMAR, text)
  else
    local tokens = {}
    local function append(tokens, line_tokens, offset)
      for i = 1, #line_tokens, 2 do
        tokens[#tokens + 1] = line_tokens[i]
        tokens[#tokens + 1] = line_tokens[i + 1] + offset
      end
    end
    local offset = 0
    local grammar = lexer._GRAMMAR
    for line in text:gmatch('[^\r\n]*\r?\n?') do
      local line_tokens = lpeg_match(grammar, line)
      if line_tokens then append(tokens, line_tokens, offset) end
      offset = offset + #line
      -- Use the default style to the end of the line if none was specified.
      if tokens[#tokens] ~= offset then
        tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
      end
    end
    return tokens
  end
end
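
-- Usage sketch (hypothetical, assuming a "lua" lexer file is available on
-- M.LEXERPATH):
--   local lua_lexer = M.load('lua')
--   local tokens = M.lex(lua_lexer, 'local x = 1', 0)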

---
-- Folds a chunk of text *text* with lexer *lexer*.
-- Folds *text* starting at position *start_pos* on line number *start_line*
-- with a beginning fold level of *start_level* in the buffer. If *lexer* has
-- a `_fold` function or a `_foldsymbols` table, that field is used to perform
-- folding. Otherwise, if a `fold.by.indentation` property is set, folding by
-- indentation is done.
-- @param lexer The lexer object to fold with.
-- @param text The text in the buffer to fold.
-- @param start_pos The position in the buffer *text* starts at.
-- @param start_line The line number *text* starts on.
-- @param start_level The fold level *text* starts on.
-- @return table of fold levels.
-- @name fold
function M.fold(lexer, text, start_pos, start_line, start_level)
  local folds = {}
  if text == '' then return folds end
  local fold = M.property_int['fold'] > 0
  local FOLD_BASE = M.FOLD_BASE
  local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
  if fold and lexer._fold then
    return lexer._fold(text, start_pos, start_line, start_level)
  elseif fold and lexer._foldsymbols then
    local lines = {}
    for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
      lines[#lines + 1] = {p, l}
    end
    local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
    local fold_symbols = lexer._foldsymbols
    local fold_symbols_patterns = fold_symbols._patterns
    local style_at, fold_level = M.style_at, M.fold_level
    local line_num, prev_level = start_line, start_level
    local current_level = prev_level
    for i = 1, #lines do
      local pos, line = lines[i][1], lines[i][2]
      if line ~= '' then
        local level_decreased = false
        for j = 1, #fold_symbols_patterns do
          for s, match in line:gmatch(fold_symbols_patterns[j]) do
            local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
            local l = symbols and symbols[match]
            if type(l) == 'function' then l = l(text, pos, line, s, match) end
            if type(l) == 'number' then
              current_level = current_level + l
              if l < 0 and current_level < prev_level then
                -- Potential zero-sum line. If the level were to go back up on
                -- the same line, the line may be marked as a fold header.
                level_decreased = true
              end
            end
          end
        end
        folds[line_num] = prev_level
        if current_level > prev_level then
          folds[line_num] = prev_level + FOLD_HEADER
        elseif level_decreased and current_level == prev_level and
               fold_zero_sum_lines then
          if line_num > start_line then
            folds[line_num] = prev_level - 1 + FOLD_HEADER
          else
            -- Typing within a zero-sum line.
            local level = fold_level[line_num - 1] - 1
            if level > FOLD_HEADER then level = level - FOLD_HEADER end
            if level > FOLD_BLANK then level = level - FOLD_BLANK end
            folds[line_num] = level + FOLD_HEADER
            current_level = current_level + 1
          end
        end
        if current_level < FOLD_BASE then current_level = FOLD_BASE end
        prev_level = current_level
      else
        folds[line_num] = prev_level + FOLD_BLANK
      end
      line_num = line_num + 1
    end
  elseif fold and M.property_int['fold.by.indentation'] > 0 then
    -- Indentation based folding.
    -- Calculate indentation per line.
    local indentation = {}
    for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
      indentation[#indentation + 1] = line ~= '' and #indent
    end
    -- Make line before start_line a fold header if necessary.
    if start_line > 0 and indentation[1] then
      local indent = M.indent_amount[start_line - 1]
      if indentation[1] > indent then
        folds[start_line - 1] = FOLD_BASE + indent + FOLD_HEADER
      end
    end
    -- Iterate over lines, setting fold numbers and fold flags.
    local line_num, prev_level = start_line, FOLD_BASE + (indentation[1] or 0)
    local current_level = prev_level
    for i = 1, #indentation do
      if indentation[i] then
        for j = i + 1, #indentation do
          if indentation[j] then
            current_level = FOLD_BASE + indentation[j]
            break
          end
        end
        folds[line_num] = prev_level
        if current_level > prev_level then
          folds[line_num] = prev_level + FOLD_HEADER
        end
        prev_level = current_level
      else
        folds[line_num] = prev_level + FOLD_BLANK
      end
      line_num = line_num + 1
    end
  else
    -- No folding; reset fold levels if necessary.
    local current_line = start_line
    for _ in text:gmatch('\r?\n') do
      folds[current_line] = start_level
      current_line = current_line + 1
    end
  end
  return folds
end

-- The following are utility functions lexers will have access to.

-- Common patterns.
M.any = lpeg_P(1)
M.ascii = lpeg_R('\000\127')
M.extend = lpeg_R('\000\255')
M.alpha = lpeg_R('AZ', 'az')
M.digit = lpeg_R('09')
M.alnum = lpeg_R('AZ', 'az', '09')
M.lower = lpeg_R('az')
M.upper = lpeg_R('AZ')
M.xdigit = lpeg_R('09', 'AF', 'af')
M.cntrl = lpeg_R('\000\031')
M.graph = lpeg_R('!~')
M.print = lpeg_R(' ~')
M.punct = lpeg_R('!/', ':@', '[`', '{~')
M.space = lpeg_S('\t\v\f\n\r ')

M.newline = lpeg_S('\r\n\f')^1
M.nonnewline = 1 - M.newline
M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any

M.dec_num = M.digit^1
M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
M.oct_num = '0' * lpeg_R('07')^1
M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
M.float = lpeg_S('+-')^-1 *
          ((M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0) *
           (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 +
           M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)
M.word = (M.alpha + '_') * (M.alnum + '_')^0
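
-- Example matches (sketch): M.integer accepts '42', '0x1F', and '0755';
-- M.float accepts '1.5', '.5', '1e10', and '3.14e-2', but not a bare '42'.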

---
-- Creates and returns a token pattern with token name *name* and pattern
-- *patt*.
-- If *name* is not a predefined token name, its style must be defined in the
-- lexer's `_tokenstyles` table.
-- @param name The name of token. If this name is not a predefined token name,
--   then a style needs to be associated with it in the lexer's `_tokenstyles`
--   table.
-- @param patt The LPeg pattern associated with the token.
-- @return pattern
-- @usage local ws = token(l.WHITESPACE, l.space^1)
-- @usage local annotation = token('annotation', '@' * l.word)
-- @name token
function M.token(name, patt)
  --return lpeg_Cg(patt, name)
  return lpeg_Ct(lpeg_Cg(lpeg_Cc(name), 'token') *
                 lpeg_Cg(lpeg_C(patt), 'val') * lpeg_Cg(lpeg_Cp(), 'pos'))
end
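
-- Capture shape sketch: matching token('identifier', l.word) against 'foo'
-- yields {token = 'identifier', val = 'foo', pos = 4}, where pos comes from
-- lpeg.Cp() and points just past the match.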

function M.parent_token(name, patt)
  --return lpeg_Cg(patt, name)
  return lpeg_Ct(lpeg_Cg(lpeg_Cc(name), 'token') *
                 lpeg_Cg(lpeg_Ct(patt), 'val') * lpeg_Cg(lpeg_Cp(), 'pos'))
end

---
-- Creates and returns a pattern that matches a range of text bounded by
-- *chars* characters.
-- This is a convenience function for matching more complicated delimited ranges
-- like strings with escape characters and balanced parentheses. *single_line*
-- indicates whether or not the range must be on a single line, *no_escape*
-- indicates whether or not to ignore '\' as an escape character, and *balanced*
-- indicates whether or not to handle balanced ranges like parentheses and
-- requires *chars* to be composed of two characters.
-- @param chars The character(s) that bound the matched range.
-- @param single_line Optional flag indicating whether or not the range must be
--   on a single line.
-- @param no_escape Optional flag indicating whether or not the range end
--   character may be escaped by a '\\' character.
-- @param balanced Optional flag indicating whether or not to match a balanced
--   range, like the "%b" Lua pattern. This flag only applies if *chars*
--   consists of two different characters (e.g. "()").
-- @return pattern
-- @usage local dq_str_escapes = l.delimited_range('"')
-- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
-- @usage local unbalanced_parens = l.delimited_range('()')
-- @usage local balanced_parens = l.delimited_range('()', false, false, true)
-- @see nested_pair
-- @name delimited_range
function M.delimited_range(chars, single_line, no_escape, balanced)
  local s = chars:sub(1, 1)
  local e = #chars == 2 and chars:sub(2, 2) or s
  local range
  local b = balanced and s or ''
  local n = single_line and '\n' or ''
  if no_escape then
    local invalid = lpeg_S(e..n..b)
    range = M.any - invalid
  else
    local invalid = lpeg_S(e..n..b) + '\\'
    range = M.any - invalid + '\\' * M.any
  end
  if balanced and s ~= e then
    return lpeg_P{s * (range + lpeg_V(1))^0 * e}
  else
    return s * range^0 * lpeg_P(e)^-1
  end
end
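
-- Behavior notes: by default the '\\' branch lets an escaped delimiter pass
-- through, so l.delimited_range('"') matches all of '"a\"b"' as one range,
-- and the closing delimiter is optional (lpeg_P(e)^-1), so an unterminated
-- range still matches to the end of the input.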

---
-- Creates and returns a pattern that matches pattern *patt* only at the
-- beginning of a line.
-- @param patt The LPeg pattern to match on the beginning of a line.
-- @return pattern
-- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
--   l.nonnewline^0)
-- @name starts_line
function M.starts_line(patt)
  return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
    local pos = index - #match
    if pos == 1 then return index, ... end
    local char = input:sub(pos - 1, pos - 1)
    if char == '\n' or char == '\r' or char == '\f' then return index, ... end
  end)
end

---
-- Creates and returns a pattern that verifies that string set *s* contains the
-- first non-whitespace character behind the current match position.
-- @param s String character set like one passed to `lpeg.S()`.
-- @return pattern
-- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
--   l.delimited_range('/')
-- @name last_char_includes
function M.last_char_includes(s)
  s = '['..s:gsub('[-%%%[]', '%%%1')..']'
  return lpeg_P(function(input, index)
    if index == 1 then return index end
    local i = index
    while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
    if input:sub(i - 1, i - 1):match(s) then return index end
  end)
end

---
-- Returns a pattern that matches a balanced range of text that starts with
-- string *start_chars* and ends with string *end_chars*.
-- With single-character delimiters, this function is identical to
-- `delimited_range(start_chars..end_chars, false, true, true)`.
-- @param start_chars The string starting a nested sequence.
-- @param end_chars The string ending a nested sequence.
-- @return pattern
-- @usage local nested_comment = l.nested_pair('/*', '*/')
-- @see delimited_range
-- @name nested_pair
function M.nested_pair(start_chars, end_chars)
  local s, e = start_chars, lpeg_P(end_chars)^-1
  return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
end

---
-- Creates and returns a pattern that matches any single word in list *words*.
-- Words consist of alphanumeric and underscore characters, as well as the
-- characters in string set *word_chars*. *case_insensitive* indicates whether
-- or not to ignore case when matching words.
-- This is a convenience function for simplifying a set of ordered choice word
-- patterns.
-- @param words A table of words.
-- @param word_chars Optional string of additional characters considered to be
--   part of a word. By default, word characters are alphanumerics and
--   underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
--   to indicate no additional word characters.
-- @param case_insensitive Optional boolean flag indicating whether or not the
--   word match is case-insensitive. The default is `false`.
-- @return pattern
-- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
-- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
--   'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
-- @name word_match
function M.word_match(words, word_chars, case_insensitive)
  local word_list = {}
  for _, word in ipairs(words) do
    word_list[case_insensitive and word:lower() or word] = true
  end
  local chars = M.alnum + '_'
  if word_chars then chars = chars + lpeg_S(word_chars) end
  return lpeg_Cmt(chars^1, function(input, index, word)
    if case_insensitive then word = word:lower() end
    return word_list[word] and index or nil
  end)
end

---
-- Embeds child lexer *child* in parent lexer *parent* using patterns
-- *start_rule* and *end_rule*, which signal the beginning and end of the
-- embedded lexer, respectively.
-- @param parent The parent lexer.
-- @param child The child lexer.
-- @param start_rule The pattern that signals the beginning of the embedded
--   lexer.
-- @param end_rule The pattern that signals the end of the embedded lexer.
-- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
-- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
-- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
-- @name embed_lexer
function M.embed_lexer(parent, child, start_rule, end_rule)
  -- Add child rules.
  if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
  if not child._RULES then -- creating a child lexer to be embedded
    if not child._rules then error('Cannot embed language with no rules') end
    for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
  end
  child._EMBEDDEDRULES[parent._NAME] = {
    start_rule = start_rule,
    token_rule = join_tokens(child),
    end_rule = end_rule
  }
  if not parent._CHILDREN then parent._CHILDREN = {} end
  local children = parent._CHILDREN
  children[#children + 1] = child
  -- Add child styles.
  if not parent._tokenstyles then parent._tokenstyles = {} end
  local tokenstyles = parent._tokenstyles
  tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
  for token, style in pairs(child._tokenstyles or {}) do
    tokenstyles[token] = style
  end
  child._lexer = parent -- use parent's tokens if child is embedding itself
  parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
end

-- Determines if the previous line is a comment.
-- This is used for determining if the current comment line is a fold point.
-- @param prefix The prefix string defining a comment.
-- @param text The text passed to a fold function.
-- @param pos The pos passed to a fold function.
-- @param line The line passed to a fold function.
-- @param s The s passed to a fold function.
local function prev_line_is_comment(prefix, text, pos, line, s)
  local start = line:find('%S')
  if start < s and not line:find(prefix, start, true) then return false end
  local p = pos - 1
  if text:sub(p, p) == '\n' then
    p = p - 1
    if text:sub(p, p) == '\r' then p = p - 1 end
    if text:sub(p, p) ~= '\n' then
      while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
      while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
      return text:sub(p, p + #prefix - 1) == prefix
    end
  end
  return false
end

-- Determines if the next line is a comment.
-- This is used for determining if the current comment line is a fold point.
-- @param prefix The prefix string defining a comment.
-- @param text The text passed to a fold function.
-- @param pos The pos passed to a fold function.
-- @param line The line passed to a fold function.
-- @param s The s passed to a fold function.
local function next_line_is_comment(prefix, text, pos, line, s)
  local p = text:find('\n', pos + s)
  if p then
    p = p + 1
    while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
    return text:sub(p, p + #prefix - 1) == prefix
  end
  return false
end

---
-- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
-- that folds consecutive line comments that start with string *prefix*.
-- @param prefix The prefix string defining a line comment.
-- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
-- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
-- @name fold_line_comments
function M.fold_line_comments(prefix)
  local property_int = M.property_int
  return function(text, pos, line, s)
    if property_int['fold.line.comments'] == 0 then return 0 end
    if s > 1 and line:match('^%s*()') < s then return 0 end
    local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
    local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
    if not prev_line_comment and next_line_comment then return 1 end
    if prev_line_comment and not next_line_comment then return -1 end
    return 0
  end
end

M.property_expanded = setmetatable({}, {
  -- Returns the string property value associated with string property *key*,
  -- replacing any "$()" and "%()" expressions with the values of their keys.
  __index = function(t, key)
    return M.property[key]:gsub('[$%%]%b()', function(key)
      return t[key:sub(3, -2)]
    end)
  end,
  __newindex = function() error('read-only property') end
})
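
-- Usage sketch (hypothetical property values): if property['style.default']
-- is 'fore:$(color.fore)' and property['color.fore'] is '0x000000', then
-- property_expanded['style.default'] yields 'fore:0x000000'.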

--[[ The functions and fields below were defined in C.

---
-- Individual fields for a lexer instance.
-- @field _NAME The string name of the lexer.
-- @field _rules An ordered list of rules for a lexer grammar.
--   Each rule is a table containing an arbitrary rule name and the LPeg pattern
--   associated with the rule. The order of rules is important as rules are
--   matched sequentially.
--   Child lexers should not use this table to access and/or modify their
--   parent's rules and vice-versa. Use the `_RULES` table instead.
-- @field _tokenstyles A map of non-predefined token names to styles.
--   Remember to use token names, not rule names. It is recommended to use
--   predefined styles or color-agnostic styles derived from predefined styles
--   to ensure compatibility with user color themes.
-- @field _foldsymbols A table of recognized fold points for the lexer.
--   Keys are token names with table values defining fold points. Those table
--   values have string keys of keywords or characters that indicate a fold
--   point whose values are integers. A value of `1` indicates a beginning fold
--   point and a value of `-1` indicates an ending fold point. Values can also
--   be functions that return `1`, `-1`, or `0` (indicating no fold point) for
--   keys which need additional processing.
--   There is also a required `_patterns` key whose value is a table containing
--   Lua pattern strings that match all fold points (the string keys contained
--   in token name table values). When the lexer encounters text that matches
--   one of those patterns, the matched text is looked up in its token's table
--   to determine whether or not it is a fold point.
-- @field _fold If this function exists in the lexer, it is called for folding
--   the document instead of using `_foldsymbols` or indentation.
-- @field _lexer The parent lexer object whose rules should be used. This field
--   is only necessary to disambiguate a proxy lexer that loaded parent and
--   child lexers for embedding and ended up having multiple parents loaded.
-- @field _RULES A map of rule name keys with their associated LPeg pattern
--   values for the lexer.
--   This is constructed from the lexer's `_rules` table and accessible to other
--   lexers for embedded lexer applications like modifying parent or child
--   rules.
-- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
--   (instead of an arbitrary chunk of text) at a time.
--   The default value is `false`. Line lexers cannot look ahead to subsequent
--   lines.
-- @class table
-- @name lexer
local lexer
]]

return M