immunio 0.15.2

Files changed (157)
  1. checksums.yaml +7 -0
  2. data/LICENSE +234 -0
  3. data/README.md +147 -0
  4. data/bin/immunio +5 -0
  5. data/lib/immunio.rb +29 -0
  6. data/lib/immunio/agent.rb +260 -0
  7. data/lib/immunio/authentication.rb +96 -0
  8. data/lib/immunio/blocked_app.rb +38 -0
  9. data/lib/immunio/channel.rb +432 -0
  10. data/lib/immunio/cli.rb +39 -0
  11. data/lib/immunio/context.rb +114 -0
  12. data/lib/immunio/errors.rb +43 -0
  13. data/lib/immunio/immunio_ca.crt +45 -0
  14. data/lib/immunio/logger.rb +87 -0
  15. data/lib/immunio/plugins/action_dispatch.rb +45 -0
  16. data/lib/immunio/plugins/action_view.rb +431 -0
  17. data/lib/immunio/plugins/active_record.rb +707 -0
  18. data/lib/immunio/plugins/active_record_relation.rb +370 -0
  19. data/lib/immunio/plugins/authlogic.rb +80 -0
  20. data/lib/immunio/plugins/csrf.rb +24 -0
  21. data/lib/immunio/plugins/devise.rb +40 -0
  22. data/lib/immunio/plugins/environment_reporter.rb +69 -0
  23. data/lib/immunio/plugins/eval.rb +51 -0
  24. data/lib/immunio/plugins/exception_handler.rb +55 -0
  25. data/lib/immunio/plugins/gems_tracker.rb +5 -0
  26. data/lib/immunio/plugins/haml.rb +36 -0
  27. data/lib/immunio/plugins/http_finisher.rb +50 -0
  28. data/lib/immunio/plugins/http_tracker.rb +203 -0
  29. data/lib/immunio/plugins/io.rb +96 -0
  30. data/lib/immunio/plugins/redirect.rb +42 -0
  31. data/lib/immunio/plugins/warden.rb +66 -0
  32. data/lib/immunio/processor.rb +234 -0
  33. data/lib/immunio/rails.rb +26 -0
  34. data/lib/immunio/request.rb +139 -0
  35. data/lib/immunio/rufus_lua_ext/ref.rb +27 -0
  36. data/lib/immunio/rufus_lua_ext/state.rb +157 -0
  37. data/lib/immunio/rufus_lua_ext/table.rb +137 -0
  38. data/lib/immunio/rufus_lua_ext/utils.rb +13 -0
  39. data/lib/immunio/version.rb +5 -0
  40. data/lib/immunio/vm.rb +291 -0
  41. data/lua-hooks/ext/all.c +78 -0
  42. data/lua-hooks/ext/bitop/README +22 -0
  43. data/lua-hooks/ext/bitop/bit.c +189 -0
  44. data/lua-hooks/ext/extconf.rb +38 -0
  45. data/lua-hooks/ext/libinjection/COPYING +37 -0
  46. data/lua-hooks/ext/libinjection/libinjection.h +65 -0
  47. data/lua-hooks/ext/libinjection/libinjection_html5.c +847 -0
  48. data/lua-hooks/ext/libinjection/libinjection_html5.h +54 -0
  49. data/lua-hooks/ext/libinjection/libinjection_sqli.c +2301 -0
  50. data/lua-hooks/ext/libinjection/libinjection_sqli.h +295 -0
  51. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +9349 -0
  52. data/lua-hooks/ext/libinjection/libinjection_xss.c +531 -0
  53. data/lua-hooks/ext/libinjection/libinjection_xss.h +21 -0
  54. data/lua-hooks/ext/libinjection/lualib.c +109 -0
  55. data/lua-hooks/ext/lpeg/HISTORY +90 -0
  56. data/lua-hooks/ext/lpeg/lpcap.c +537 -0
  57. data/lua-hooks/ext/lpeg/lpcap.h +43 -0
  58. data/lua-hooks/ext/lpeg/lpcode.c +986 -0
  59. data/lua-hooks/ext/lpeg/lpcode.h +34 -0
  60. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  61. data/lua-hooks/ext/lpeg/lpeg.html +1429 -0
  62. data/lua-hooks/ext/lpeg/lpprint.c +244 -0
  63. data/lua-hooks/ext/lpeg/lpprint.h +35 -0
  64. data/lua-hooks/ext/lpeg/lptree.c +1238 -0
  65. data/lua-hooks/ext/lpeg/lptree.h +77 -0
  66. data/lua-hooks/ext/lpeg/lptypes.h +149 -0
  67. data/lua-hooks/ext/lpeg/lpvm.c +355 -0
  68. data/lua-hooks/ext/lpeg/lpvm.h +58 -0
  69. data/lua-hooks/ext/lpeg/makefile +55 -0
  70. data/lua-hooks/ext/lpeg/re.html +498 -0
  71. data/lua-hooks/ext/lpeg/test.lua +1409 -0
  72. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +45 -0
  73. data/lua-hooks/ext/lua-cmsgpack/README.md +115 -0
  74. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +957 -0
  75. data/lua-hooks/ext/lua-cmsgpack/test.lua +570 -0
  76. data/lua-hooks/ext/lua-snapshot/LICENSE +7 -0
  77. data/lua-hooks/ext/lua-snapshot/Makefile +12 -0
  78. data/lua-hooks/ext/lua-snapshot/README.md +18 -0
  79. data/lua-hooks/ext/lua-snapshot/dump.lua +15 -0
  80. data/lua-hooks/ext/lua-snapshot/snapshot.c +455 -0
  81. data/lua-hooks/ext/lua/COPYRIGHT +34 -0
  82. data/lua-hooks/ext/lua/lapi.c +1087 -0
  83. data/lua-hooks/ext/lua/lapi.h +16 -0
  84. data/lua-hooks/ext/lua/lauxlib.c +652 -0
  85. data/lua-hooks/ext/lua/lauxlib.h +174 -0
  86. data/lua-hooks/ext/lua/lbaselib.c +659 -0
  87. data/lua-hooks/ext/lua/lcode.c +831 -0
  88. data/lua-hooks/ext/lua/lcode.h +76 -0
  89. data/lua-hooks/ext/lua/ldblib.c +398 -0
  90. data/lua-hooks/ext/lua/ldebug.c +638 -0
  91. data/lua-hooks/ext/lua/ldebug.h +33 -0
  92. data/lua-hooks/ext/lua/ldo.c +519 -0
  93. data/lua-hooks/ext/lua/ldo.h +57 -0
  94. data/lua-hooks/ext/lua/ldump.c +164 -0
  95. data/lua-hooks/ext/lua/lfunc.c +174 -0
  96. data/lua-hooks/ext/lua/lfunc.h +34 -0
  97. data/lua-hooks/ext/lua/lgc.c +710 -0
  98. data/lua-hooks/ext/lua/lgc.h +110 -0
  99. data/lua-hooks/ext/lua/linit.c +38 -0
  100. data/lua-hooks/ext/lua/liolib.c +556 -0
  101. data/lua-hooks/ext/lua/llex.c +463 -0
  102. data/lua-hooks/ext/lua/llex.h +81 -0
  103. data/lua-hooks/ext/lua/llimits.h +128 -0
  104. data/lua-hooks/ext/lua/lmathlib.c +263 -0
  105. data/lua-hooks/ext/lua/lmem.c +86 -0
  106. data/lua-hooks/ext/lua/lmem.h +49 -0
  107. data/lua-hooks/ext/lua/loadlib.c +705 -0
  108. data/lua-hooks/ext/lua/loadlib_rel.c +760 -0
  109. data/lua-hooks/ext/lua/lobject.c +214 -0
  110. data/lua-hooks/ext/lua/lobject.h +381 -0
  111. data/lua-hooks/ext/lua/lopcodes.c +102 -0
  112. data/lua-hooks/ext/lua/lopcodes.h +268 -0
  113. data/lua-hooks/ext/lua/loslib.c +243 -0
  114. data/lua-hooks/ext/lua/lparser.c +1339 -0
  115. data/lua-hooks/ext/lua/lparser.h +82 -0
  116. data/lua-hooks/ext/lua/lstate.c +214 -0
  117. data/lua-hooks/ext/lua/lstate.h +169 -0
  118. data/lua-hooks/ext/lua/lstring.c +111 -0
  119. data/lua-hooks/ext/lua/lstring.h +31 -0
  120. data/lua-hooks/ext/lua/lstrlib.c +871 -0
  121. data/lua-hooks/ext/lua/ltable.c +588 -0
  122. data/lua-hooks/ext/lua/ltable.h +40 -0
  123. data/lua-hooks/ext/lua/ltablib.c +287 -0
  124. data/lua-hooks/ext/lua/ltm.c +75 -0
  125. data/lua-hooks/ext/lua/ltm.h +54 -0
  126. data/lua-hooks/ext/lua/lua.c +392 -0
  127. data/lua-hooks/ext/lua/lua.def +131 -0
  128. data/lua-hooks/ext/lua/lua.h +388 -0
  129. data/lua-hooks/ext/lua/lua.rc +28 -0
  130. data/lua-hooks/ext/lua/lua_dll.rc +26 -0
  131. data/lua-hooks/ext/lua/luac.c +200 -0
  132. data/lua-hooks/ext/lua/luac.rc +1 -0
  133. data/lua-hooks/ext/lua/luaconf.h +763 -0
  134. data/lua-hooks/ext/lua/luaconf.h.in +724 -0
  135. data/lua-hooks/ext/lua/luaconf.h.orig +763 -0
  136. data/lua-hooks/ext/lua/lualib.h +53 -0
  137. data/lua-hooks/ext/lua/lundump.c +227 -0
  138. data/lua-hooks/ext/lua/lundump.h +36 -0
  139. data/lua-hooks/ext/lua/lvm.c +767 -0
  140. data/lua-hooks/ext/lua/lvm.h +36 -0
  141. data/lua-hooks/ext/lua/lzio.c +82 -0
  142. data/lua-hooks/ext/lua/lzio.h +67 -0
  143. data/lua-hooks/ext/lua/print.c +227 -0
  144. data/lua-hooks/ext/luautf8/README.md +152 -0
  145. data/lua-hooks/ext/luautf8/lutf8lib.c +1274 -0
  146. data/lua-hooks/ext/luautf8/unidata.h +3064 -0
  147. data/lua-hooks/lib/boot.lua +254 -0
  148. data/lua-hooks/lib/encode.lua +4 -0
  149. data/lua-hooks/lib/lexers/LICENSE +21 -0
  150. data/lua-hooks/lib/lexers/bash.lua +134 -0
  151. data/lua-hooks/lib/lexers/bash_dqstr.lua +62 -0
  152. data/lua-hooks/lib/lexers/css.lua +216 -0
  153. data/lua-hooks/lib/lexers/html.lua +106 -0
  154. data/lua-hooks/lib/lexers/javascript.lua +68 -0
  155. data/lua-hooks/lib/lexers/lexer.lua +1575 -0
  156. data/lua-hooks/lib/lexers/markers.lua +33 -0
  157. metadata +308 -0
data/lua-hooks/lib/lexers/css.lua +216 -0
@@ -0,0 +1,216 @@
+ -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
+ -- CSS LPeg lexer.
+
+ local l = require('lexer')
+ local token, word_match = l.token, l.word_match
+ local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+
+ local M = {_NAME = 'css'}
+
+ -- Whitespace.
+ local ws = token(l.WHITESPACE, l.space^1)
+
+ -- Comments.
+ local comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
+
+ -- Strings.
+ local sq_str = l.delimited_range("'")
+ local dq_str = l.delimited_range('"')
+ local string = token(l.STRING, sq_str + dq_str)
+
+ -- Numbers.
+ local number = token(l.NUMBER, l.digit^1)
+
+ -- Keywords.
+ local css1_property = word_match({
+   'color', 'background-color', 'background-image', 'background-repeat',
+   'background-attachment', 'background-position', 'background', 'font-family',
+   'font-style', 'font-variant', 'font-weight', 'font-size', 'font',
+   'word-spacing', 'letter-spacing', 'text-decoration', 'vertical-align',
+   'text-transform', 'text-align', 'text-indent', 'line-height', 'margin-top',
+   'margin-right', 'margin-bottom', 'margin-left', 'margin', 'padding-top',
+   'padding-right', 'padding-bottom', 'padding-left', 'padding',
+   'border-top-width', 'border-right-width', 'border-bottom-width',
+   'border-left-width', 'border-width', 'border-top', 'border-right',
+   'border-bottom', 'border-left', 'border', 'border-color', 'border-style',
+   'width', 'height', 'float', 'clear', 'display', 'white-space',
+   'list-style-type', 'list-style-image', 'list-style-position', 'list-style'
+ }, '-')
+ local css1_value = word_match({
+   'auto', 'none', 'normal', 'italic', 'oblique', 'small-caps', 'bold', 'bolder',
+   'lighter', 'xx-small', 'x-small', 'small', 'medium', 'large', 'x-large',
+   'xx-large', 'larger', 'smaller', 'transparent', 'repeat', 'repeat-x',
+   'repeat-y', 'no-repeat', 'scroll', 'fixed', 'top', 'bottom', 'left', 'center',
+   'right', 'justify', 'both', 'underline', 'overline', 'line-through', 'blink',
+   'baseline', 'sub', 'super', 'text-top', 'middle', 'text-bottom', 'capitalize',
+   'uppercase', 'lowercase', 'thin', 'medium', 'thick', 'dotted', 'dashed',
+   'solid', 'double', 'groove', 'ridge', 'inset', 'outset', 'block', 'inline',
+   'list-item', 'pre', 'no-wrap', 'inside', 'outside', 'disc', 'circle',
+   'square', 'decimal', 'lower-roman', 'upper-roman', 'lower-alpha',
+   'upper-alpha', 'aqua', 'black', 'blue', 'fuchsia', 'gray', 'green', 'lime',
+   'maroon', 'navy', 'olive', 'purple', 'red', 'silver', 'teal', 'white',
+   'yellow'
+ }, '-')
+ local css2_property = word_match({
+   'border-top-color', 'border-right-color', 'border-bottom-color',
+   'border-left-color', 'border-color', 'border-top-style', 'border-right-style',
+   'border-bottom-style', 'border-left-style', 'border-style', 'top', 'right',
+   'bottom', 'left', 'position', 'z-index', 'direction', 'unicode-bidi',
+   'min-width', 'max-width', 'min-height', 'max-height', 'overflow', 'clip',
+   'visibility', 'content', 'quotes', 'counter-reset', 'counter-increment',
+   'marker-offset', 'size', 'marks', 'page-break-before', 'page-break-after',
+   'page-break-inside', 'page', 'orphans', 'widows', 'font-stretch',
+   'font-size-adjust', 'unicode-range', 'units-per-em', 'src', 'panose-1',
+   'stemv', 'stemh', 'slope', 'cap-height', 'x-height', 'ascent', 'descent',
+   'widths', 'bbox', 'definition-src', 'baseline', 'centerline', 'mathline',
+   'topline', 'text-shadow', 'caption-side', 'table-layout', 'border-collapse',
+   'border-spacing', 'empty-cells', 'speak-header', 'cursor', 'outline',
+   'outline-width', 'outline-style', 'outline-color', 'volume', 'speak',
+   'pause-before', 'pause-after', 'pause', 'cue-before', 'cue-after', 'cue',
+   'play-during', 'azimuth', 'elevation', 'speech-rate', 'voice-family', 'pitch',
+   'pitch-range', 'stress', 'richness', 'speak-punctuation', 'speak-numeral'
+ }, '-')
+ local css2_value = word_match({
+   'inherit', 'run-in', 'compact', 'marker', 'table', 'inline-table',
+   'table-row-group', 'table-header-group', 'table-footer-group', 'table-row',
+   'table-column-group', 'table-column', 'table-cell', 'table-caption', 'static',
+   'relative', 'absolute', 'fixed', 'ltr', 'rtl', 'embed', 'bidi-override',
+   'visible', 'hidden', 'scroll', 'collapse', 'open-quote', 'close-quote',
+   'no-open-quote', 'no-close-quote', 'decimal-leading-zero', 'lower-greek',
+   'lower-latin', 'upper-latin', 'hebrew', 'armenian', 'georgian',
+   'cjk-ideographic', 'hiragana', 'katakana', 'hiragana-iroha', 'katakana-iroha',
+   'landscape', 'portrait', 'crop', 'cross', 'always', 'avoid', 'wider',
+   'narrower', 'ultra-condensed', 'extra-condensed', 'condensed',
+   'semi-condensed', 'semi-expanded', 'expanded', 'extra-expanded',
+   'ultra-expanded', 'caption', 'icon', 'menu', 'message-box', 'small-caption',
+   'status-bar', 'separate', 'show', 'hide', 'once', 'crosshair', 'default',
+   'pointer', 'move', 'text', 'wait', 'help', 'e-resize', 'ne-resize',
+   'nw-resize', 'n-resize', 'se-resize', 'sw-resize', 's-resize', 'w-resize',
+   'ActiveBorder', 'ActiveCaption', 'AppWorkspace', 'Background', 'ButtonFace',
+   'ButtonHighlight', 'ButtonShadow', 'InactiveCaptionText', 'ButtonText',
+   'CaptionText', 'GrayText', 'Highlight', 'HighlightText', 'InactiveBorder',
+   'InactiveCaption', 'InfoBackground', 'InfoText', 'Menu', 'MenuText',
+   'Scrollbar', 'ThreeDDarkShadow', 'ThreeDFace', 'ThreeDHighlight',
+   'ThreeDLightShadow', 'ThreeDShadow', 'Window', 'WindowFrame', 'WindowText',
+   'silent', 'x-soft', 'soft', 'medium', 'loud', 'x-loud', 'spell-out', 'mix',
+   'left-side', 'far-left', 'center-left', 'center-right', 'far-right',
+   'right-side', 'behind', 'leftwards', 'rightwards', 'below', 'level', 'above',
+   'higher', 'lower', 'x-slow', 'slow', 'medium', 'fast', 'x-fast', 'faster',
+   'slower', 'male', 'female', 'child', 'x-low', 'low', 'high', 'x-high', 'code',
+   'digits', 'continuous'
+ }, '-')
+
+ local css3_property = word_match({
+   'align-content', 'align-items', 'align-self', 'alignment-adjust',
+   'alignment-baseline', 'all', 'anchor-point', 'animation', 'animation-delay',
+   'animation-direction', 'animation-duration', 'animation-fill-mode',
+   'animation-iteration-count', 'animation-name', 'animation-play-state',
+   'animation-timing-function', 'backface-visibility', 'background-clip',
+   'background-origin', 'background-size', 'baseline-shift', 'binding', 'bleed',
+   'bookmark-label', 'bookmark-level', 'bookmark-state', 'border-bottom-left-radius',
+   'border-bottom-right-radius', 'border-image', 'border-image-outset',
+   'border-image-repeat', 'border-image-slice', 'border-image-source',
+   'border-image-width', 'border-radius', 'border-top-left-radius',
+   'border-top-right-radius', 'box-decoration-break', 'box-shadow', 'box-sizing',
+   'box-snap', 'box-suppress', 'break-after', 'break-before', 'break-inside',
+   'chains', 'clip-path', 'clip-rule', 'color-interpolation-filters', 'column-count',
+   'column-fill', 'column-gap', 'column-rule', 'column-rule-color', 'column-rule-style',
+   'column-rule-width', 'column-span', 'column-width', 'columns', 'contain',
+   'counter-set', 'crop', 'display-inside', 'display-list', 'display-outside',
+   'dominant-baseline', 'filter', 'flex', 'flex-basis', 'flex-direction', 'flex-flow',
+   'flex-grow', 'flex-shrink', 'flex-wrap', 'float-offset', 'flood-color',
+   'flood-opacity', 'flow-from', 'flow-into', 'font-feature-settings', 'font-kerning',
+   'font-language-override', 'font-synthesis', 'font-variant-alternates',
+   'font-variant-caps', 'font-variant-east-asian', 'font-variant-ligatures',
+   'font-variant-numeric', 'font-variant-position', 'grid', 'grid-area',
+   'grid-auto-columns', 'grid-auto-flow', 'grid-auto-rows', 'grid-column',
+   'grid-column-end', 'grid-column-start', 'grid-row', 'grid-row-end', 'grid-row-start',
+   'grid-template', 'grid-template-areas', 'grid-template-columns', 'grid-template-rows',
+   'hanging-punctuation', 'hyphens', 'icon', 'image-orientation', 'image-resolution',
+   'ime-mode', 'initial-letters', 'inline-box-align', 'justify-content', 'justify-items',
+   'justify-self', 'lighting-color', 'line-box-contain', 'line-break', 'line-grid',
+   'line-snap', 'line-stacking', 'line-stacking-ruby', 'line-stacking-shift',
+   'line-stacking-strategy', 'marker-side', 'mask', 'mask-box', 'mask-box-outset',
+   'mask-box-repeat', 'mask-box-slice', 'mask-box-source', 'mask-box-width',
+   'mask-clip', 'mask-image', 'mask-origin', 'mask-position', 'mask-repeat', 'mask-size',
+   'mask-source-type', 'mask-type', 'max-lines', 'move-to', 'nav-down', 'nav-index',
+   'nav-left', 'nav-right', 'nav-up', 'object-fit', 'object-position', 'opacity',
+   'order', 'outline-offset', 'overflow-wrap', 'overflow-x', 'overflow-y', 'page-policy',
+   'perspective', 'perspective-origin', 'presentation-level', 'region-fragment',
+   'resize', 'rest', 'rest-after', 'rest-before', 'rotation', 'rotation-point',
+   'ruby-align', 'ruby-merge', 'ruby-position', 'shape-image-threshold', 'shape-outside',
+   'shape-margin', 'speak-as', 'string-set', 'tab-size', 'text-align-last',
+   'text-combine-upright', 'text-decoration-color', 'text-decoration-line',
+   'text-decoration-skip', 'text-decoration-style', 'text-emphasis',
+   'text-emphasis-color', 'text-emphasis-style', 'text-height', 'text-justify',
+   'text-orientation', 'text-overflow', 'text-space-collapse', 'text-underline-position',
+   'text-wrap', 'transform', 'transform-origin', 'transform-style', 'transition',
+   'transition-delay', 'transition-duration', 'transition-property',
+   'transition-timing-function', 'voice-balance', 'voice-duration', 'voice-pitch',
+   'voice-range', 'voice-rate', 'voice-stress', 'voice-volume', 'will-change',
+   'word-break', 'word-wrap', 'wrap-flow', 'wrap-through', 'writing-mode'
+ }, '-')
+
+
+ local property = token('property', css1_property + css2_property + css3_property)
+ local value = token('value', css1_value + css2_value)
+ local keyword = property + value
+
+ -- Identifiers.
+ local identifier = token(l.IDENTIFIER, l.alpha * (l.alnum + S('_-'))^0)
+
+ -- Operators.
+ local operator = token(l.OPERATOR, S('~!#*>+=|.,:;()[]{}'))
+
+ -- At rule.
+ local at_rule = token('at_rule', P('@') * word_match{
+   'charset', 'font-face', 'media', 'page', 'import'
+ })
+
+ -- Colors.
+ local xdigit = l.xdigit
+ local hex_color = '#' * xdigit * xdigit * xdigit * (xdigit * xdigit * xdigit)^-1
+ local color_name = word_match{
+   'aqua', 'black', 'blue', 'fuchsia', 'gray', 'green', 'lime', 'maroon', 'navy',
+   'olive', 'orange', 'purple', 'red', 'silver', 'teal', 'white', 'yellow'
+ }
+ local color = token('color', hex_color + color_name)
+
+ -- Pseudo.
+ local pseudo = token(l.CONSTANT, word_match({
+   -- Pseudo elements.
+   'first-line', 'first-letter', 'before', 'after',
+   -- Pseudo classes.
+   'first-child', 'link', 'visited', 'hover', 'active', 'focus', 'lang',
+ }, '-'))
+
+ -- Units.
+ local unit = token('unit', word_match{
+   'em', 'ex', 'px', 'pt', 'pc', 'in', 'ft', 'mm', 'cm', 'kHz', 'Hz', 'deg',
+   'rad', 'grad', 'ms', 's'
+ } + '%')
+
+ -- Immunio marker
+ local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
+
+ M._rules = {
+   {'whitespace', ws},
+   {'marker', marker},
+   {'keyword', keyword},
+   {'pseudo', pseudo},
+   {'color', color},
+   {'identifier', identifier},
+   {'string', string},
+   {'comment', comment},
+   {'number', number * unit^-1},
+   {'operator', operator},
+   {'at_rule', at_rule},
+ }
+
+ M._tokenstyles = {
+ }
+
+ M._foldsymbols = {
+ }
+
+ return M
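For orientation, a lexer module like the one above is not run standalone: the host loads it through the bundled `lexer` module that it requires (note the lexer files use a global `lpeg`, so the host must provide one). A minimal driver sketch, assuming the Scintillua-style `lexer.load()`/`lexer.lex()` API that the bundled lexer.lua below exposes; the exact shape of the returned token table is an assumption here:

    lpeg = require('lpeg')     -- the lexers expect a global 'lpeg'
    local l = require('lexer')
    local css = l.load('css')  -- compiles css.lua's rules into a grammar
    -- lex() is assumed to return a flat table of alternating token names
    -- and ending positions, e.g. {'identifier', 2, 'whitespace', 3, ...}:
    local tokens = l.lex(css, 'p { color: red }')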
data/lua-hooks/lib/lexers/html.lua +106 -0
@@ -0,0 +1,106 @@
+ -- Copyright (C) 2015 Immunio, Inc.
+
+ -- HTML: Simple h5-like HTML lexer for Immun.io.
+
+ -- NOTE: not covered by Scintillua MIT license in this directory.
+
+ local l = require('lexer')
+ local token, parent_token, word_match = l.token, l.parent_token, l.word_match
+ local P, R, S, V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+
+ local M = {_NAME = 'html'}
+
+ local case_insensitive_tags = true
+
+ -- Whitespace.
+ local ws = l.space^1
+ -- This is broad, to both accept our placeholders and be very liberal about what may be
+ -- interpreted as an attribute, to ensure we escape attributes fairly aggressively.
+ local element_chars = (l.any - '<' - '>' - '=' - '"' - "'" - ws)^1
+ -- Comments.
+ local comment = token(l.COMMENT, '<!--' * (l.any - '-->')^0 * P('-->')^-1)
+ -- XXX add h5 bogus comment 1 and bogus comment 2?
+
+ -- Strings.
+ local sq_str = l.delimited_range("'")
+ local dq_str = l.delimited_range('"')
+ local string = sq_str + dq_str
+
+ -- Attributes. Individual recognition is handled in our XSS processing code.
+ local attr_name = token('attr_name', element_chars - '=')
+ local attr_value = token('attr_value', string + element_chars)
+ local attribute = parent_token('attribute', attr_name * '=' * attr_value)
+
+ -- Tags.
+ local tag_name = token('tag_name', element_chars - '/')
+ local tag_data = token('tag_data', (l.any - l.space - '>')^1 ) -- crap in a tag
+
+ -- XXX how should we handle void tags... right now they are an unmatched tag_open
+ local tag_open = parent_token('tag_open', P('<') * tag_name * ( (ws * attribute) + ( tag_data ) + ws )^0 * (P('>') + '/>') )
+ local tag_close = parent_token('tag_close', P('</') * tag_name * ( ( tag_data ) + ws )^0 * '>')
+
+ -- Special case for script and style tags.
+ local style_tag_name = token("tag_name", word_match({'style'}, nil, case_insensitive_tags))
+ local style_tag_open = parent_token("tag_open", P('<') * style_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
+ local style_tag_close = parent_token("tag_close", P('</') * style_tag_name * tag_data^0 * '>')
+ local style_data = token("style_data", (l.any - style_tag_close)^0)
+ local style_tag = parent_token('style_tag', style_tag_open * style_data * style_tag_close)
+
+ local script_tag_name = token("tag_name", word_match({'script'}, nil, case_insensitive_tags))
+ local script_tag_open = parent_token("tag_open", P('<') * script_tag_name * ((ws * attribute) + tag_data)^0 * P('>'))
+ local script_tag_close = parent_token("tag_close", P('</') * script_tag_name * tag_data^0 * '>')
+ local script_data = token("script_data", (l.any - script_tag_close)^0)
+ local script_tag = parent_token('script_tag', script_tag_open * script_data * script_tag_close)
+
+ -- Top level rules
+
+ -- Note: the ordering is important here, as <script> and <style> have to supersede tag_open...
+ local tag = style_tag + script_tag + tag_open + tag_close
+
+ -- Entities.
+ local entity = token('entity', '&' * (l.any - l.space - ';' - '<' - '>' - "'" - '"' - "/" )^1 * ';')
+
+ -- Doctype.
+ local doctype = token('doctype', '<!' *
+   word_match({'doctype'}, nil, case_insensitive_tags) *
+   (l.any - '>')^1 * '>')
+
+ -- Data between tags
+ local data = token('data', (l.any - '<')^1)
+
+ M._rules = {
+   {'comment', comment},
+   {'doctype', doctype},
+   {'tag', tag},
+   {'entity', entity},
+   {'data', data},
+ }
+
+ M._tokenstyles = {
+ }
+
+ M._foldsymbols = {
+ }
+
+ M.unlex_rules = {
+   ["tag_open"] = {
+     ["prefix"] = "<",
+     ["suffix"] = ">",
+   },
+   ["tag_close"] = {
+     ["prefix"] = "</",
+     ["suffix"] = ">",
+   },
+   ["attribute"] = {
+     ["prefix"] = " ",
+   },
+   ["tag_data"] = {
+     ["prefix"] = " ",
+   },
+   ["attr_name"] = {
+     ["suffix"] = "=",
+   },
+ }
+
+
+ return M
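The ordering note above ("<script> and <style> have to supersede tag_open") matters because LPeg's `+` is ordered choice: the first alternative that matches wins. A self-contained sketch with hypothetical mini-patterns (much simpler than the real tokens above):

    local lpeg = require('lpeg')
    local P = lpeg.P
    local any_tag = '<' * (1 - P('>'))^1 * '>'                     -- generic tag
    local style_el = '<style>' * (1 - P('</style>'))^0 * '</style>'
    local good = style_el + any_tag  -- style first, as in 'tag' above
    local bad = any_tag + style_el   -- generic tag would win
    print(good:match('<style>p{}</style>'))  --> 19 (whole element consumed)
    print(bad:match('<style>p{}</style>'))   --> 8  (only '<style>' consumed)

With the wrong order, the raw CSS body would then be lexed as ordinary document data instead of `style_data`.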
data/lua-hooks/lib/lexers/javascript.lua +68 -0
@@ -0,0 +1,68 @@
+ -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
+ -- JavaScript LPeg lexer.
+
+ local l = require('lexer')
+ local token, word_match = l.token, l.word_match
+ local P, R, S = lpeg.P, lpeg.R, lpeg.S
+
+ local M = {_NAME = 'javascript'}
+
+ -- Whitespace.
+ local ws = token(l.WHITESPACE, l.space^1)
+
+ -- Comments.
+ local line_comment = '//' * l.nonnewline_esc^0
+ local block_comment = '/*' * (l.any - '*/')^0 * P('*/')^-1
+ local comment = token(l.COMMENT, line_comment + block_comment)
+
+ -- Strings.
+ local sq_str = l.delimited_range("'")
+ local dq_str = l.delimited_range('"')
+ local regex = token( "regex", l.last_char_includes('+-*%^!=&|?:;,([{<>') *
+   l.delimited_range('/', true) * S('igm')^0 )
+ local string = token(l.STRING, sq_str + dq_str) --+ token(l.REGEX, regex_str)
+
+ -- Numbers.
+ local number = token(l.NUMBER, l.float + l.integer)
+
+ -- Keywords.
+ local keyword = token(l.KEYWORD, word_match{
+   'abstract', 'boolean', 'break', 'byte', 'case', 'catch', 'char', 'class',
+   'const', 'continue', 'debugger', 'default', 'delete', 'do', 'double', 'else',
+   'enum', 'export', 'extends', 'false', 'final', 'finally', 'float', 'for',
+   'function', 'goto', 'if', 'implements', 'import', 'in', 'instanceof', 'int',
+   'interface', 'let', 'long', 'native', 'new', 'null', 'package', 'private',
+   'protected', 'public', 'return', 'short', 'static', 'super', 'switch',
+   'synchronized', 'this', 'throw', 'throws', 'transient', 'true', 'try',
+   'typeof', 'var', 'void', 'volatile', 'while', 'with', 'yield'
+ })
+
+ -- Identifiers.
+ local identifier = token(l.IDENTIFIER, l.word)
+
+ -- Operators.
+ local operator = token(l.OPERATOR, S('+-/*%^!=&|?:;,.()[]{}<>'))
+
+ -- Immunio marker
+ local marker = l.token('marker', P('{immunio-var:') * l.integer * ':' * l.xdigit^1 * '}')
+
+
+ M._rules = {
+   {'whitespace', ws},
+   {'marker', marker},
+   {'keyword', keyword},
+   {'identifier', identifier},
+   {'comment', comment},
+   {'number', number},
+   {'string', string},
+   {'regex', regex},
+   {'operator', operator},
+ }
+
+ M._foldsymbols = {
+   _patterns = {'[{}]', '/%*', '%*/', '//'},
+   [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
+   [l.COMMENT] = {['/*'] = 1, ['*/'] = -1, ['//'] = l.fold_line_comments('//')}
+ }
+
+ return M
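This `marker` token (also defined, identically, in css.lua above) recognizes Immunio's interpolated-variable placeholders. A standalone sketch of what it accepts, with `l.integer` and `l.xdigit` approximated by plain character classes (an assumption; the real definitions live in the bundled lexer.lua, and `l.integer` is somewhat broader):

    local lpeg = require('lpeg')
    local P, R = lpeg.P, lpeg.R
    local integer = R('09')^1           -- stand-in for l.integer
    local xdigit = R('09', 'af', 'AF')  -- stand-in for l.xdigit
    local marker = P('{immunio-var:') * integer * ':' * xdigit^1 * P('}')
    print(marker:match('{immunio-var:12:3fb4}'))  --> 22 (a match)
    print(marker:match('{immunio-var:12}'))       --> nil (no hex part)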
data/lua-hooks/lib/lexers/lexer.lua +1575 -0
@@ -0,0 +1,1575 @@
+ -- Copyright 2006-2015 Mitchell mitchell.att.foicica.com. See LICENSE.
+
+ local M = {}
+
+ --[=[ This comment is for LuaDoc.
+ ---
+ -- Lexes Scintilla documents with Lua and LPeg.
+ --
+ -- ## Overview
+ --
+ -- Lexers highlight the syntax of source code. Scintilla (the editing component
+ -- behind [Textadept][] and [SciTE][]) traditionally uses static, compiled C++
+ -- lexers which are notoriously difficult to create and/or extend. On the other
+ -- hand, Lua makes it easy to rapidly create new lexers, extend existing
+ -- ones, and embed lexers within one another. Lua lexers tend to be more
+ -- readable than C++ lexers too.
+ --
+ -- Lexers are Parsing Expression Grammars, or PEGs, composed with the Lua
+ -- [LPeg library][]. The following table comes from the LPeg documentation and
+ -- summarizes all you need to know about constructing basic LPeg patterns. This
+ -- module provides convenience functions for creating and working with other
+ -- more advanced patterns and concepts.
+ --
+ -- Operator             | Description
+ -- ---------------------|------------
+ -- `lpeg.P(string)`     | Matches `string` literally.
+ -- `lpeg.P(`_`n`_`)`    | Matches exactly _`n`_ characters.
+ -- `lpeg.S(string)`     | Matches any character in set `string`.
+ -- `lpeg.R("`_`xy`_`")` | Matches any character between range `x` and `y`.
+ -- `patt^`_`n`_         | Matches at least _`n`_ repetitions of `patt`.
+ -- `patt^-`_`n`_        | Matches at most _`n`_ repetitions of `patt`.
+ -- `patt1 * patt2`      | Matches `patt1` followed by `patt2`.
+ -- `patt1 + patt2`      | Matches `patt1` or `patt2` (ordered choice).
+ -- `patt1 - patt2`      | Matches `patt1` if `patt2` does not match.
+ -- `-patt`              | Equivalent to `("" - patt)`.
+ -- `#patt`              | Matches `patt` but consumes no input.
+ --
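+ -- As a quick worked example (this one is not from the LPeg documentation),
+ -- these operators compose into larger patterns. A C-style hexadecimal
+ -- number could be written as:
+ --
+ --     local hex_num = lpeg.P('0') * lpeg.S('xX') * lpeg.R('09', 'af', 'AF')^1
+ --     hex_num:match('0x1F')  --> 5 (the position just past the match)
+ --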
38
+ -- The first part of this document deals with rapidly constructing a simple
39
+ -- lexer. The next part deals with more advanced techniques, such as custom
40
+ -- coloring and embedding lexers within one another. Following that is a
41
+ -- discussion about code folding, or being able to tell Scintilla which code
42
+ -- blocks are "foldable" (temporarily hideable from view). After that are
43
+ -- instructions on how to use LPeg lexers with the aforementioned Textadept and
44
+ -- SciTE editors. Finally there are comments on lexer performance and
45
+ -- limitations.
46
+ --
47
+ -- [LPeg library]: http://www.inf.puc-rio.br/~roberto/lpeg/lpeg.html
48
+ -- [Textadept]: http://foicica.com/textadept
49
+ -- [SciTE]: http://scintilla.org/SciTE.html
50
+ --
51
+ -- ## Lexer Basics
52
+ --
53
+ -- The *lexers/* directory contains all lexers, including your new one. Before
54
+ -- attempting to write one from scratch though, first determine if your
55
+ -- programming language is similar to any of the 80+ languages supported. If so,
56
+ -- you may be able to copy and modify that lexer, saving some time and effort.
57
+ -- The filename of your lexer should be the name of your programming language in
58
+ -- lower case followed by a *.lua* extension. For example, a new Lua lexer has
59
+ -- the name *lua.lua*.
60
+ --
61
+ -- Note: Try to refrain from using one-character language names like "b", "c",
62
+ -- or "d". For example, Scintillua uses "b_lang", "cpp", and "dmd",
63
+ -- respectively.
64
+ --
65
+ -- ### New Lexer Template
66
+ --
67
+ -- There is a *lexers/template.txt* file that contains a simple template for a
68
+ -- new lexer. Feel free to use it, replacing the '?'s with the name of your
69
+ -- lexer:
70
+ --
71
+ -- -- ? LPeg lexer.
72
+ --
73
+ -- local l = require('lexer')
74
+ -- local token, word_match = l.token, l.word_match
75
+ -- local P, R, S = lpeg.P, lpeg.R, lpeg.S
76
+ --
77
+ -- local M = {_NAME = '?'}
78
+ --
79
+ -- -- Whitespace.
80
+ -- local ws = token(l.WHITESPACE, l.space^1)
81
+ --
82
+ -- M._rules = {
83
+ -- {'whitespace', ws},
84
+ -- }
85
+ --
86
+ -- M._tokenstyles = {
87
+ --
88
+ -- }
89
+ --
90
+ -- return M
91
+ --
92
+ -- The first 4 lines of code simply define often used convenience variables. The
93
+ -- 5th and last lines define and return the lexer object Scintilla uses; they
94
+ -- are very important and must be part of every lexer. The sixth line defines
95
+ -- something called a "token", an essential building block of lexers. You will
96
+ -- learn about tokens shortly. The rest of the code defines a set of grammar
97
+ -- rules and token styles. You will learn about those later. Note, however, the
98
+ -- `M.` prefix in front of `_rules` and `_tokenstyles`: not only do these tables
99
+ -- belong to their respective lexers, but any non-local variables need the `M.`
100
+ -- prefix too so-as not to affect Lua's global environment. All in all, this is
101
+ -- a minimal, working lexer that you can build on.
102
+ --
103
+ -- ### Tokens
104
+ --
105
+ -- Take a moment to think about your programming language's structure. What kind
106
+ -- of key elements does it have? In the template shown earlier, one predefined
107
+ -- element all languages have is whitespace. Your language probably also has
108
+ -- elements like comments, strings, and keywords. Lexers refer to these elements
109
+ -- as "tokens". Tokens are the fundamental "building blocks" of lexers. Lexers
110
+ -- break down source code into tokens for coloring, which results in the syntax
111
+ -- highlighting familiar to you. It is up to you how specific your lexer is when
112
+ -- it comes to tokens. Perhaps only distinguishing between keywords and
113
+ -- identifiers is necessary, or maybe recognizing constants and built-in
114
+ -- functions, methods, or libraries is desirable. The Lua lexer, for example,
115
+ -- defines 11 tokens: whitespace, comments, strings, numbers, keywords, built-in
116
+ -- functions, constants, built-in libraries, identifiers, labels, and operators.
117
+ -- Even though constants, built-in functions, and built-in libraries are subsets
118
+ -- of identifiers, Lua programmers find it helpful for the lexer to distinguish
119
+ -- between them all. It is perfectly acceptable to just recognize keywords and
120
+ -- identifiers.
121
+ --
122
+ -- In a lexer, tokens consist of a token name and an LPeg pattern that matches a
123
+ -- sequence of characters recognized as an instance of that token. Create tokens
124
+ -- using the [`lexer.token()`]() function. Let us examine the "whitespace" token
125
+ -- defined in the template shown earlier:
126
+ --
127
+ -- local ws = token(l.WHITESPACE, l.space^1)
128
+ --
129
+ -- At first glance, the first argument does not appear to be a string name and
130
+ -- the second argument does not appear to be an LPeg pattern. Perhaps you
131
+ -- expected something like:
132
+ --
133
+ -- local ws = token('whitespace', S('\t\v\f\n\r ')^1)
134
+ --
135
+ -- The `lexer` (`l`) module actually provides a convenient list of common token
136
+ -- names and common LPeg patterns for you to use. Token names include
137
+ -- [`lexer.DEFAULT`](), [`lexer.WHITESPACE`](), [`lexer.COMMENT`](),
138
+ -- [`lexer.STRING`](), [`lexer.NUMBER`](), [`lexer.KEYWORD`](),
139
+ -- [`lexer.IDENTIFIER`](), [`lexer.OPERATOR`](), [`lexer.ERROR`](),
140
+ -- [`lexer.PREPROCESSOR`](), [`lexer.CONSTANT`](), [`lexer.VARIABLE`](),
141
+ -- [`lexer.FUNCTION`](), [`lexer.CLASS`](), [`lexer.TYPE`](), [`lexer.LABEL`](),
142
+ -- [`lexer.REGEX`](), and [`lexer.EMBEDDED`](). Patterns include
143
+ -- [`lexer.any`](), [`lexer.ascii`](), [`lexer.extend`](), [`lexer.alpha`](),
144
+ -- [`lexer.digit`](), [`lexer.alnum`](), [`lexer.lower`](), [`lexer.upper`](),
145
+ -- [`lexer.xdigit`](), [`lexer.cntrl`](), [`lexer.graph`](), [`lexer.print`](),
146
+ -- [`lexer.punct`](), [`lexer.space`](), [`lexer.newline`](),
147
+ -- [`lexer.nonnewline`](), [`lexer.nonnewline_esc`](), [`lexer.dec_num`](),
148
+ -- [`lexer.hex_num`](), [`lexer.oct_num`](), [`lexer.integer`](),
149
+ -- [`lexer.float`](), and [`lexer.word`](). You may use your own token names if
150
+ -- none of the above fit your language, but an advantage to using predefined
151
+ -- token names is that your lexer's tokens will inherit the universal syntax
152
+ -- highlighting color theme used by your text editor.
153
+ --
154
+ -- #### Example Tokens
155
+ --
156
+ -- So, how might you define other tokens like comments, strings, and keywords?
157
+ -- Here are some examples.
158
+ --
159
+ -- **Comments**
160
+ --
161
+ -- Line-style comments with a prefix character(s) are easy to express with LPeg:
162
+ --
163
+ -- local shell_comment = token(l.COMMENT, '#' * l.nonnewline^0)
164
+ -- local c_line_comment = token(l.COMMENT, '//' * l.nonnewline_esc^0)
165
+ --
166
+ -- The comments above start with a '#' or "//" and go to the end of the line.
167
+ -- The second comment recognizes the next line also as a comment if the current
168
+ -- line ends with a '\' escape character.
169
+ --
170
+ -- C-style "block" comments with a start and end delimiter are also easy to
171
+ -- express:
172
+ --
173
+ -- local c_comment = token(l.COMMENT, '/*' * (l.any - '*/')^0 * P('*/')^-1)
174
+ --
175
+ -- This comment starts with a "/\*" sequence and contains anything up to and
176
+ -- including an ending "\*/" sequence. The ending "\*/" is optional so the lexer
177
+ -- can recognize unfinished comments as comments and highlight them properly.
178
+ --
179
+ -- **Strings**
180
+ --
181
+ -- It is tempting to think that a string is not much different from the block
182
+ -- comment shown above in that both have start and end delimiters:
183
+ --
184
+ -- local dq_str = '"' * (l.any - '"')^0 * P('"')^-1
185
+ -- local sq_str = "'" * (l.any - "'")^0 * P("'")^-1
186
+ -- local simple_string = token(l.STRING, dq_str + sq_str)
187
+ --
188
+ -- However, most programming languages allow escape sequences in strings such
189
+ -- that a sequence like "\\&quot;" in a double-quoted string indicates that the
190
+ -- '&quot;' is not the end of the string. The above token incorrectly matches
191
+ -- such a string. Instead, use the [`lexer.delimited_range()`]() convenience
192
+ -- function.
193
+ --
194
+ -- local dq_str = l.delimited_range('"')
195
+ -- local sq_str = l.delimited_range("'")
196
+ -- local string = token(l.STRING, dq_str + sq_str)
197
+ --
198
+ -- In this case, the lexer treats '\' as an escape character in a string
199
+ -- sequence.
200
+ --
201
+ -- **Keywords**
202
+ --
203
+ -- Instead of matching _n_ keywords with _n_ `P('keyword_`_`n`_`')` ordered
204
+ -- choices, use another convenience function: [`lexer.word_match()`](). It is
205
+ -- much easier and more efficient to write word matches like:
206
+ --
207
+ -- local keyword = token(l.KEYWORD, l.word_match{
208
+ -- 'keyword_1', 'keyword_2', ..., 'keyword_n'
209
+ -- })
210
+ --
211
+ -- local case_insensitive_keyword = token(l.KEYWORD, l.word_match({
212
+ -- 'KEYWORD_1', 'keyword_2', ..., 'KEYword_n'
213
+ -- }, nil, true))
214
+ --
215
+ -- local hyphened_keyword = token(l.KEYWORD, l.word_match({
216
+ -- 'keyword-1', 'keyword-2', ..., 'keyword-n'
217
+ -- }, '-'))
218
+ --
219
+ -- By default, characters considered to be in keywords are in the set of
220
+ -- alphanumeric characters and underscores. The last token demonstrates how to
221
+ -- allow '-' (hyphen) characters to be in keywords as well.
222
+ --
223
+ -- **Numbers**
224
+ --
225
+ -- Most programming languages have the same format for integer and float tokens,
226
+ -- so it might be as simple as using a couple of predefined LPeg patterns:
227
+ --
228
+ -- local number = token(l.NUMBER, l.float + l.integer)
229
+ --
230
+ -- However, some languages allow postfix characters on integers.
231
+ --
232
+ -- local integer = P('-')^-1 * (l.dec_num * S('lL')^-1)
233
+ -- local number = token(l.NUMBER, l.float + l.hex_num + integer)
234
+ --
235
+ -- Your language may need other tweaks, but it is up to you how fine-grained you
236
+ -- want your highlighting to be. After all, you are not writing a compiler or
237
+ -- interpreter!
238
+ --
239
+ -- ### Rules
240
+ --
241
+ -- Programming languages have grammars, which specify valid token structure. For
242
+ -- example, comments usually cannot appear within a string. Grammars consist of
243
+ -- rules, which are simply combinations of tokens. Recall from the lexer
244
+ -- template the `_rules` table, which defines all the rules used by the lexer
245
+ -- grammar:
246
+ --
247
+ -- M._rules = {
248
+ -- {'whitespace', ws},
249
+ -- }
250
+ --
251
+ -- Each entry in a lexer's `_rules` table consists of a rule name and its
252
+ -- associated pattern. Rule names are completely arbitrary and serve only to
253
+ -- identify and distinguish between different rules. Rule order is important: if
254
+ -- text does not match the first rule, the lexer tries the second rule, and so
255
+ -- on. This simple grammar says to match whitespace tokens under a rule named
256
+ -- "whitespace".
257
+ --
258
+ -- To illustrate the importance of rule order, here is an example of a
259
+ -- simplified Lua grammar:
260
+ --
261
+ -- M._rules = {
262
+ -- {'whitespace', ws},
263
+ -- {'keyword', keyword},
264
+ -- {'identifier', identifier},
265
+ -- {'string', string},
266
+ -- {'comment', comment},
267
+ -- {'number', number},
268
+ -- {'label', label},
269
+ -- {'operator', operator},
270
+ -- }
271
+ --
272
+ -- Note how identifiers come after keywords. In Lua, as with most programming
273
+ -- languages, the characters allowed in keywords and identifiers are in the same
274
+ -- set (alphanumerics plus underscores). If the lexer specified the "identifier"
275
+ -- rule before the "keyword" rule, all keywords would match identifiers and thus
276
+ -- incorrectly highlight as identifiers instead of keywords. The same idea
277
+ -- applies to function, constant, etc. tokens that you may want to distinguish
278
+ -- between: their rules should come before identifiers.
279
+ --
280
+ -- So what about text that does not match any rules? For example in Lua, the '!'
281
+ -- character is meaningless outside a string or comment. Normally the lexer
282
+ -- skips over such text. If instead you want to highlight these "syntax errors",
283
+ -- add an additional end rule:
284
+ --
285
+ -- M._rules = {
286
+ -- {'whitespace', ws},
287
+ -- {'error', token(l.ERROR, l.any)},
288
+ -- }
289
+ --
290
+ -- This identifies and highlights any character not matched by an existing
291
+ -- rule as an `lexer.ERROR` token.
292
+ --
293
+ -- Even though the rules defined in the examples above contain a single token,
294
+ -- rules may consist of multiple tokens. For example, a rule for an HTML tag
295
+ -- could consist of a tag token followed by an arbitrary number of attribute
296
+ -- tokens, allowing the lexer to highlight all tokens separately. The rule might
297
+ -- look something like this:
298
+ --
299
+ -- {'tag', tag_start * (ws * attributes)^0 * tag_end^-1}
300
+ --
301
+ -- Note however that lexers with complex rules like these are more prone to lose
302
+ -- track of their state.
303
+ --
304
+ -- ### Summary
305
+ --
306
+ -- Lexers primarily consist of tokens and grammar rules. At your disposal are a
307
+ -- number of convenience patterns and functions for rapidly creating a lexer. If
308
+ -- you choose to use predefined token names for your tokens, you do not have to
309
+ -- define how the lexer highlights them. The tokens will inherit the default
310
+ -- syntax highlighting color theme your editor uses.
311
+ --
312
+ -- ## Advanced Techniques
313
+ --
314
+ -- ### Styles and Styling
315
+ --
316
+ -- The most basic form of syntax highlighting is assigning different colors to
317
+ -- different tokens. Instead of highlighting with just colors, Scintilla allows
318
+ -- for more rich highlighting, or "styling", with different fonts, font sizes,
319
+ -- font attributes, and foreground and background colors, just to name a few.
320
+ -- The unit of this rich highlighting is called a "style". Styles are simply
321
+ -- strings of comma-separated property settings. By default, lexers associate
322
+ -- predefined token names like `lexer.WHITESPACE`, `lexer.COMMENT`,
323
+ -- `lexer.STRING`, etc. with particular styles as part of a universal color
324
+ -- theme. These predefined styles include [`lexer.STYLE_CLASS`](),
325
+ -- [`lexer.STYLE_COMMENT`](), [`lexer.STYLE_CONSTANT`](),
326
+ -- [`lexer.STYLE_ERROR`](), [`lexer.STYLE_EMBEDDED`](),
327
+ -- [`lexer.STYLE_FUNCTION`](), [`lexer.STYLE_IDENTIFIER`](),
328
+ -- [`lexer.STYLE_KEYWORD`](), [`lexer.STYLE_LABEL`](), [`lexer.STYLE_NUMBER`](),
329
+ -- [`lexer.STYLE_OPERATOR`](), [`lexer.STYLE_PREPROCESSOR`](),
330
+ -- [`lexer.STYLE_REGEX`](), [`lexer.STYLE_STRING`](), [`lexer.STYLE_TYPE`](),
331
+ -- [`lexer.STYLE_VARIABLE`](), and [`lexer.STYLE_WHITESPACE`](). Like with
332
+ -- predefined token names and LPeg patterns, you may define your own styles. At
333
+ -- their core, styles are just strings, so you may create new ones and/or modify
334
+ -- existing ones. Each style consists of the following comma-separated settings:
335
+ --
336
+ -- Setting | Description
337
+ -- ---------------|------------
338
+ -- font:_name_ | The name of the font the style uses.
339
+ -- size:_int_ | The size of the font the style uses.
340
+ -- [not]bold | Whether or not the font face is bold.
341
+ -- [not]italics | Whether or not the font face is italic.
342
+ -- [not]underlined| Whether or not the font face is underlined.
343
+ -- fore:_color_ | The foreground color of the font face.
344
+ -- back:_color_ | The background color of the font face.
345
+ -- [not]eolfilled | Does the background color extend to the end of the line?
346
+ -- case:_char_ | The case of the font ('u': upper, 'l': lower, 'm': normal).
347
+ -- [not]visible | Whether or not the text is visible.
348
+ -- [not]changeable| Whether the text is changeable or read-only.
349
+ -- [not]hotspot | Whether or not the text is clickable.
350
+ --
351
+ -- Specify font colors in either "#RRGGBB" format, "0xBBGGRR" format, or the
352
+ -- decimal equivalent of the latter. As with token names, LPeg patterns, and
353
+ -- styles, there is a set of predefined color names, but they vary depending on
354
+ -- the current color theme in use. Therefore, it is generally not a good idea to
355
+ -- manually define colors within styles in your lexer since they might not fit
356
+ -- into a user's chosen color theme. Try to refrain from even using predefined
357
+ -- colors in a style because that color may be theme-specific. Instead, the best
358
+ -- practice is to either use predefined styles or derive new color-agnostic
359
+ -- styles from predefined ones. For example, Lua "longstring" tokens use the
360
+ -- existing `lexer.STYLE_STRING` style instead of defining a new one.
361
+ --
362
+ -- #### Example Styles
363
+ --
364
+ -- Defining styles is pretty straightforward. An empty style that inherits the
365
+ -- default theme settings is simply an empty string:
366
+ --
367
+ -- local style_nothing = ''
368
+ --
369
+ -- A similar style but with a bold font face looks like this:
370
+ --
371
+ -- local style_bold = 'bold'
372
+ --
373
+ -- If you want the same style, but also with an italic font face, define the new
374
+ -- style in terms of the old one:
375
+ --
376
+ -- local style_bold_italic = style_bold..',italics'
377
+ --
378
+ -- This allows you to derive new styles from predefined ones without having to
379
+ -- rewrite them. This operation leaves the old style unchanged. Thus if you
380
+ -- had a "static variable" token whose style you wanted to base off of
381
+ -- `lexer.STYLE_VARIABLE`, it would probably look like:
382
+ --
383
+ -- local style_static_var = l.STYLE_VARIABLE..',italics'
384
+ --
385
+ -- The color theme files in the *lexers/themes/* folder give more examples of
386
+ -- style definitions.
387
+ --
388
+ -- ### Token Styles
389
+ --
390
+ -- Lexers use the `_tokenstyles` table to assign tokens to particular styles.
391
+ -- Recall the token definition and `_tokenstyles` table from the lexer template:
392
+ --
393
+ -- local ws = token(l.WHITESPACE, l.space^1)
394
+ --
395
+ -- ...
396
+ --
397
+ -- M._tokenstyles = {
398
+ --
399
+ -- }
400
+ --
401
+ -- Why is a style not assigned to the `lexer.WHITESPACE` token? As mentioned
402
+ -- earlier, lexers automatically associate tokens that use predefined token
403
+ -- names with a particular style. Only tokens with custom token names need
404
+ -- manual style associations. As an example, consider a custom whitespace token:
405
+ --
406
+ -- local ws = token('custom_whitespace', l.space^1)
407
+ --
408
+ -- Assigning a style to this token looks like:
409
+ --
410
+ -- M._tokenstyles = {
411
+ -- custom_whitespace = l.STYLE_WHITESPACE
412
+ -- }
413
+ --
414
+ -- Do not confuse token names with rule names. They are completely different
415
+ -- entities. In the example above, the lexer assigns the "custom_whitespace"
416
+ -- token the existing style for `WHITESPACE` tokens. If instead you want to
417
+ -- color the background of whitespace a shade of grey, it might look like:
418
+ --
419
+ -- local custom_style = l.STYLE_WHITESPACE..',back:$(color.grey)'
420
+ -- M._tokenstyles = {
421
+ -- custom_whitespace = custom_style
422
+ -- }
423
+ --
424
+ -- Notice that the lexer peforms Scintilla/SciTE-style "$()" property expansion.
425
+ -- You may also use "%()". Remember to refrain from assigning specific colors in
426
+ -- styles, but in this case, all user color themes probably define the
427
+ -- "color.grey" property.
428
+ --
429
+ -- ### Line Lexers
430
+ --
431
+ -- By default, lexers match the arbitrary chunks of text passed to them by
432
+ -- Scintilla. These chunks may be a full document, only the visible part of a
433
+ -- document, or even just portions of lines. Some lexers need to match whole
434
+ -- lines. For example, a lexer for the output of a file "diff" needs to know if
435
+ -- the line started with a '+' or '-' and then style the entire line
436
+ -- accordingly. To indicate that your lexer matches by line, use the
437
+ -- `_LEXBYLINE` field:
438
+ --
439
+ -- M._LEXBYLINE = true
440
+ --
441
+ -- Now the input text for the lexer is a single line at a time. Keep in mind
442
+ -- that line lexers do not have the ability to look ahead at subsequent lines.
443
+ --
444
+ -- ### Embedded Lexers
445
+ --
446
+ -- Lexers embed within one another very easily, requiring minimal effort. In the
447
+ -- following sections, the lexer being embedded is called the "child" lexer and
448
+ -- the lexer a child is being embedded in is called the "parent". For example,
449
+ -- consider an HTML lexer and a CSS lexer. Either lexer stands alone for styling
450
+ -- their respective HTML and CSS files. However, CSS can be embedded inside
451
+ -- HTML. In this specific case, the CSS lexer is the "child" lexer with the HTML
452
+ -- lexer being the "parent". Now consider an HTML lexer and a PHP lexer. This
453
+ -- sounds a lot like the case with CSS, but there is a subtle difference: PHP
454
+ -- _embeds itself_ into HTML while CSS is _embedded in_ HTML. This fundamental
455
+ -- difference results in two types of embedded lexers: a parent lexer that
456
+ -- embeds other child lexers in it (like HTML embedding CSS), and a child lexer
457
+ -- that embeds itself within a parent lexer (like PHP embedding itself in HTML).
458
+ --
459
+ -- #### Parent Lexer
460
+ --
461
+ -- Before embedding a child lexer into a parent lexer, the parent lexer needs to
462
+ -- load the child lexer. This is done with the [`lexer.load()`]() function. For
463
+ -- example, loading the CSS lexer within the HTML lexer looks like:
464
+ --
465
+ -- local css = l.load('css')
466
+ --
467
+ -- The next part of the embedding process is telling the parent lexer when to
468
+ -- switch over to the child lexer and when to switch back. The lexer refers to
469
+ -- these indications as the "start rule" and "end rule", respectively, and are
470
+ -- just LPeg patterns. Continuing with the HTML/CSS example, the transition from
471
+ -- HTML to CSS is when the lexer encounters a "style" tag with a "type"
472
+ -- attribute whose value is "text/css":
473
+ --
474
+ -- local css_tag = P('<style') * P(function(input, index)
475
+ -- if input:find('^[^>]+type="text/css"', index) then
476
+ -- return index
477
+ -- end
478
+ -- end)
479
+ --
480
+ -- This pattern looks for the beginning of a "style" tag and searches its
481
+ -- attribute list for the text "`type="text/css"`". (In this simplified example,
482
+ -- the Lua pattern does not consider whitespace between the '=' nor does it
483
+ -- consider that using single quotes is valid.) If there is a match, the
484
+ -- functional pattern returns a value instead of `nil`. In this case, the value
485
+ -- returned does not matter because we ultimately want to style the "style" tag
486
+ -- as an HTML tag, so the actual start rule looks like this:
487
+ --
488
+ -- local css_start_rule = #css_tag * tag
489
+ --
490
+ -- Now that the parent knows when to switch to the child, it needs to know when
491
+ -- to switch back. In the case of HTML/CSS, the switch back occurs when the
492
+ -- lexer encounters an ending "style" tag, though the lexer should still style
493
+ -- the tag as an HTML tag:
494
+ --
495
+ -- local css_end_rule = #P('</style>') * tag
496
+ --
497
+ -- Once the parent loads the child lexer and defines the child's start and end
498
+ -- rules, it embeds the child with the [`lexer.embed_lexer()`]() function:
499
+ --
500
+ -- l.embed_lexer(M, css, css_start_rule, css_end_rule)
501
+ --
502
+ -- The first parameter is the parent lexer object to embed the child in, which
503
+ -- in this case is `M`. The other three parameters are the child lexer object
504
+ -- loaded earlier followed by its start and end rules.
505
+ --
506
+ -- #### Child Lexer
507
+ --
508
+ -- The process for instructing a child lexer to embed itself into a parent is
509
+ -- very similar to embedding a child into a parent: first, load the parent lexer
510
+ -- into the child lexer with the [`lexer.load()`]() function and then create
511
+ -- start and end rules for the child lexer. However, in this case, swap the
512
+ -- lexer object arguments to [`lexer.embed_lexer()`](). For example, in the PHP
513
+ -- lexer:
514
+ --
515
+ -- local html = l.load('html')
516
+ -- local php_start_rule = token('php_tag', '<?php ')
517
+ -- local php_end_rule = token('php_tag', '?>')
518
+ -- l.embed_lexer(html, M, php_start_rule, php_end_rule)
519
+ --
520
+ -- ## Code Folding
521
+ --
522
+ -- When reading source code, it is occasionally helpful to temporarily hide
523
+ -- blocks of code like functions, classes, comments, etc. This is the concept of
524
+ -- "folding". In the Textadept and SciTE editors for example, little indicators
+ -- in the editor margins appear next to code that can be folded at places called
+ -- "fold points". When the user clicks an indicator, the editor hides the code
+ -- associated with the indicator until the user clicks the indicator again. The
+ -- lexer specifies these fold points and what code exactly to fold.
+ --
+ -- The fold points for most languages occur on keywords or character sequences.
+ -- Examples of fold keywords are "if" and "end" in Lua, and examples of fold
+ -- character sequences are '{', '}', "/\*", and "\*/" in C for code block and
+ -- comment delimiters, respectively. However, these fold points cannot occur
+ -- just anywhere. For example, lexers should not recognize fold keywords that
+ -- appear within strings or comments. The lexer's `_foldsymbols` table allows
+ -- you to conveniently define fold points with such granularity. For example,
+ -- consider C:
+ --
+ --     M._foldsymbols = {
+ --       [l.OPERATOR] = {['{'] = 1, ['}'] = -1},
+ --       [l.COMMENT] = {['/*'] = 1, ['*/'] = -1},
+ --       _patterns = {'[{}]', '/%*', '%*/'}
+ --     }
+ --
+ -- The first assignment states that any '{' or '}' that the lexer recognizes as
+ -- a `lexer.OPERATOR` token is a fold point. The integer `1` indicates the
+ -- match is a beginning fold point and `-1` indicates the match is an ending
+ -- fold point. Likewise, the second assignment states that any "/\*" or "\*/"
+ -- that the lexer recognizes as part of a `lexer.COMMENT` token is a fold point.
+ -- The lexer does not consider any occurrences of these characters outside their
+ -- defined tokens (such as in a string) as fold points. Finally, every
+ -- `_foldsymbols` table must have a `_patterns` field that contains a list of
+ -- [Lua patterns][] that match fold points. If the lexer encounters text that
+ -- matches one of those patterns, the lexer looks up the matched text in its
+ -- token's table to determine whether or not the text is a fold point. In the
+ -- example above, the first Lua pattern matches any '{' or '}' characters. When
+ -- the lexer comes across one of those characters, it checks if the match is an
+ -- `lexer.OPERATOR` token. If so, the lexer identifies the match as a fold
+ -- point. The same idea applies for the other patterns. (The '%' is in the other
+ -- patterns because '\*' is a special character in Lua patterns that needs
+ -- escaping.) How do you specify fold keywords? Here is an example for Lua:
+ --
+ --     M._foldsymbols = {
+ --       [l.KEYWORD] = {
+ --         ['if'] = 1, ['do'] = 1, ['function'] = 1,
+ --         ['end'] = -1, ['repeat'] = 1, ['until'] = -1
+ --       },
+ --       _patterns = {'%l+'}
+ --     }
+ --
+ -- Any time the lexer encounters a lower case word, if that word is a
+ -- `lexer.KEYWORD` token and in the associated list of fold points, the lexer
+ -- identifies the word as a fold point.
+ --
+ -- If your lexer needs to do some additional processing to determine if a match
+ -- is a fold point, assign a function that returns an integer. Returning `1` or
+ -- `-1` indicates the match is a fold point. Returning `0` indicates it is not.
+ -- For example:
+ --
+ --     local function fold_strange_token(text, pos, line, s, match)
+ --       if ... then
+ --         return 1 -- beginning fold point
+ --       elseif ... then
+ --         return -1 -- ending fold point
+ --       end
+ --       return 0
+ --     end
+ --
+ --     M._foldsymbols = {
+ --       ['strange_token'] = {['|'] = fold_strange_token},
+ --       _patterns = {'|'}
+ --     }
+ --
+ -- Any time the lexer encounters a '|' that is a "strange_token", it calls the
+ -- `fold_strange_token` function to determine if '|' is a fold point. The lexer
+ -- calls these functions with the following arguments: the text to identify fold
+ -- points in, the beginning position of the current line in the text to fold,
+ -- the current line's text, the position in the current line the matched text
+ -- starts at, and the matched text itself.
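+ --
+ -- For instance, a concrete (hypothetical) version of `fold_strange_token`
+ -- could use the line position argument to fold only '|' characters that begin
+ -- or end a line:
+ --
+ --     local function fold_strange_token(text, pos, line, s, match)
+ --       if s == line:find('%S') then
+ --         return 1 -- '|' is the first non-whitespace character on the line
+ --       elseif s + #match - 1 == #line then
+ --         return -1 -- '|' is the last character on the line
+ --       end
+ --       return 0
+ --     end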
+ --
+ -- [Lua patterns]: http://www.lua.org/manual/5.2/manual.html#6.4.1
+ --
+ -- ## Using Lexers
+ --
+ -- ### Textadept
+ --
+ -- Put your lexer in your *~/.textadept/lexers/* directory so you do not
+ -- overwrite it when upgrading Textadept. Also, lexers in this directory
+ -- override default lexers. Thus, Textadept loads a user *lua* lexer instead of
+ -- the default *lua* lexer. This is convenient for tweaking a default lexer to
+ -- your liking. Then add a [file type][] for your lexer if necessary.
+ --
+ -- [file type]: _M.textadept.file_types.html
+ --
+ -- ### SciTE
+ --
+ -- Create a *.properties* file for your lexer and `import` it in either your
+ -- *SciTEUser.properties* or *SciTEGlobal.properties*. The *.properties* file
+ -- should contain:
+ --
+ --     file.patterns.[lexer_name]=[file_patterns]
+ --     lexer.$(file.patterns.[lexer_name])=[lexer_name]
+ --
+ -- where `[lexer_name]` is the name of your lexer (minus the *.lua* extension)
+ -- and `[file_patterns]` is a set of file extensions to use your lexer for.
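+ --
+ -- For example, a hypothetical *mylang* lexer that should handle "*.ml1" and
+ -- "*.ml2" files could use:
+ --
+ --     file.patterns.mylang=*.ml1;*.ml2
+ --     lexer.$(file.patterns.mylang)=mylang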
+ --
+ -- Please note that Lua lexers ignore any styling information in *.properties*
+ -- files. Your theme file in the *lexers/themes/* directory contains styling
+ -- information.
+ --
+ -- ## Considerations
+ --
+ -- ### Performance
+ --
+ -- There might be some slight overhead when initializing a lexer, but loading a
+ -- file from disk into Scintilla is usually more expensive. On modern computer
+ -- systems, I see no difference in speed between LPeg lexers and Scintilla's C++
+ -- ones. Optimize lexers for speed by re-arranging rules in the `_rules` table
+ -- so that the most common rules match first. Do keep in mind that order matters
+ -- for similar rules.
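+ --
+ -- As a sketch (assuming the token patterns are already defined), a C-like
+ -- lexer might order its `_rules` so the most frequent tokens match first,
+ -- while keeping 'keyword' ahead of 'identifier' since both match words:
+ --
+ --     M._rules = {
+ --       {'whitespace', ws},   -- usually the most common token
+ --       {'keyword', keyword}, -- must precede 'identifier'
+ --       {'identifier', identifier},
+ --       {'string', string},
+ --       {'comment', comment},
+ --       {'number', number},
+ --       {'operator', operator}
+ --     }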
+ --
+ -- ### Limitations
+ --
+ -- Embedded preprocessor languages like PHP cannot be embedded completely in
+ -- their parent languages because the parent's tokens do not support start and
+ -- end rules. This mostly goes unnoticed, but code like
+ --
+ --     <div id="<?php echo $id; ?>">
+ --
+ -- or
+ --
+ --     <div <?php if ($odd) { echo 'class="odd"'; } ?>>
+ --
+ -- will not style correctly.
+ --
+ -- ### Troubleshooting
+ --
+ -- Errors in lexers can be tricky to debug. Lexers print Lua errors to
+ -- `io.stderr` and `_G.print()` statements to `io.stdout`. Running your editor
+ -- from a terminal is the easiest way to see errors as they occur.
+ --
+ -- ### Risks
+ --
+ -- Poorly written lexers can crash Scintilla (and thus its containing
+ -- application), so unsaved data might be lost. However, I have only observed
+ -- these crashes in early lexer development, when syntax errors or pattern
+ -- errors are present. Once the lexer actually starts styling text (either
+ -- correctly or incorrectly, it does not matter), I have not observed any
+ -- crashes.
+ --
+ -- ### Acknowledgements
+ --
+ -- Thanks to Peter Odding for his [lexer post][] on the Lua mailing list
+ -- that inspired me, and thanks to Roberto Ierusalimschy for LPeg.
+ --
+ -- [lexer post]: http://lua-users.org/lists/lua-l/2007-04/msg00116.html
+ -- @field LEXERPATH (string)
+ --   The path used to search for a lexer to load.
+ --   Identical in format to Lua's `package.path` string.
+ --   The default value is `package.path`.
+ -- @field DEFAULT (string)
+ --   The token name for default tokens.
+ -- @field WHITESPACE (string)
+ --   The token name for whitespace tokens.
+ -- @field COMMENT (string)
+ --   The token name for comment tokens.
+ -- @field STRING (string)
+ --   The token name for string tokens.
+ -- @field NUMBER (string)
+ --   The token name for number tokens.
+ -- @field KEYWORD (string)
+ --   The token name for keyword tokens.
+ -- @field IDENTIFIER (string)
+ --   The token name for identifier tokens.
+ -- @field OPERATOR (string)
+ --   The token name for operator tokens.
+ -- @field ERROR (string)
+ --   The token name for error tokens.
+ -- @field PREPROCESSOR (string)
+ --   The token name for preprocessor tokens.
+ -- @field CONSTANT (string)
+ --   The token name for constant tokens.
+ -- @field VARIABLE (string)
+ --   The token name for variable tokens.
+ -- @field FUNCTION (string)
+ --   The token name for function tokens.
+ -- @field CLASS (string)
+ --   The token name for class tokens.
+ -- @field TYPE (string)
+ --   The token name for type tokens.
+ -- @field LABEL (string)
+ --   The token name for label tokens.
+ -- @field REGEX (string)
+ --   The token name for regex tokens.
+ -- @field STYLE_CLASS (string)
+ --   The style typically used for class definitions.
+ -- @field STYLE_COMMENT (string)
+ --   The style typically used for code comments.
+ -- @field STYLE_CONSTANT (string)
+ --   The style typically used for constants.
+ -- @field STYLE_ERROR (string)
+ --   The style typically used for erroneous syntax.
+ -- @field STYLE_FUNCTION (string)
+ --   The style typically used for function definitions.
+ -- @field STYLE_KEYWORD (string)
+ --   The style typically used for language keywords.
+ -- @field STYLE_LABEL (string)
+ --   The style typically used for labels.
+ -- @field STYLE_NUMBER (string)
+ --   The style typically used for numbers.
+ -- @field STYLE_OPERATOR (string)
+ --   The style typically used for operators.
+ -- @field STYLE_REGEX (string)
+ --   The style typically used for regular expression strings.
+ -- @field STYLE_STRING (string)
+ --   The style typically used for strings.
+ -- @field STYLE_PREPROCESSOR (string)
+ --   The style typically used for preprocessor statements.
+ -- @field STYLE_TYPE (string)
+ --   The style typically used for static types.
+ -- @field STYLE_VARIABLE (string)
+ --   The style typically used for variables.
+ -- @field STYLE_WHITESPACE (string)
+ --   The style typically used for whitespace.
+ -- @field STYLE_EMBEDDED (string)
+ --   The style typically used for embedded code.
+ -- @field STYLE_IDENTIFIER (string)
+ --   The style typically used for identifier words.
+ -- @field STYLE_DEFAULT (string)
+ --   The style all other styles are based on.
+ -- @field STYLE_LINENUMBER (string)
+ --   The style used for all margins except fold margins.
+ -- @field STYLE_BRACELIGHT (string)
+ --   The style used for highlighted brace characters.
+ -- @field STYLE_BRACEBAD (string)
+ --   The style used for unmatched brace characters.
+ -- @field STYLE_CONTROLCHAR (string)
+ --   The style used for control characters.
+ --   Color attributes are ignored.
+ -- @field STYLE_INDENTGUIDE (string)
+ --   The style used for indentation guides.
+ -- @field STYLE_CALLTIP (string)
+ --   The style used by call tips if [`buffer.call_tip_use_style`]() is set.
+ --   Only the font name, size, and color attributes are used.
+ -- @field any (pattern)
+ --   A pattern that matches any single character.
+ -- @field ascii (pattern)
+ --   A pattern that matches any ASCII character (codes 0 to 127).
+ -- @field extend (pattern)
+ --   A pattern that matches any ASCII extended character (codes 0 to 255).
+ -- @field alpha (pattern)
+ --   A pattern that matches any alphabetic character ('A'-'Z', 'a'-'z').
+ -- @field digit (pattern)
+ --   A pattern that matches any digit ('0'-'9').
+ -- @field alnum (pattern)
+ --   A pattern that matches any alphanumeric character ('A'-'Z', 'a'-'z',
+ --   '0'-'9').
+ -- @field lower (pattern)
+ --   A pattern that matches any lower case character ('a'-'z').
+ -- @field upper (pattern)
+ --   A pattern that matches any upper case character ('A'-'Z').
+ -- @field xdigit (pattern)
+ --   A pattern that matches any hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f').
+ -- @field cntrl (pattern)
+ --   A pattern that matches any control character (ASCII codes 0 to 31).
+ -- @field graph (pattern)
+ --   A pattern that matches any graphical character ('!' to '~').
+ -- @field print (pattern)
+ --   A pattern that matches any printable character (' ' to '~').
+ -- @field punct (pattern)
+ --   A pattern that matches any punctuation character ('!' to '/', ':' to '@',
+ --   '[' to '`', '{' to '~').
+ -- @field space (pattern)
+ --   A pattern that matches any whitespace character ('\t', '\v', '\f', '\n',
+ --   '\r', space).
+ -- @field newline (pattern)
+ --   A pattern that matches any set of end of line characters.
+ -- @field nonnewline (pattern)
+ --   A pattern that matches any single, non-newline character.
+ -- @field nonnewline_esc (pattern)
+ --   A pattern that matches any single, non-newline character or any set of end
+ --   of line characters escaped with '\'.
+ -- @field dec_num (pattern)
+ --   A pattern that matches a decimal number.
+ -- @field hex_num (pattern)
+ --   A pattern that matches a hexadecimal number.
+ -- @field oct_num (pattern)
+ --   A pattern that matches an octal number.
+ -- @field integer (pattern)
+ --   A pattern that matches either a decimal, hexadecimal, or octal number.
+ -- @field float (pattern)
+ --   A pattern that matches a floating point number.
+ -- @field word (pattern)
+ --   A pattern that matches a typical word. Words begin with a letter or
+ --   underscore and consist of alphanumeric and underscore characters.
+ -- @field FOLD_BASE (number)
+ --   The initial (root) fold level.
+ -- @field FOLD_BLANK (number)
+ --   Flag indicating that the line is blank.
+ -- @field FOLD_HEADER (number)
+ --   Flag indicating that the line is a fold point.
+ -- @field fold_level (table, Read-only)
+ --   Table of fold level bit-masks for line numbers starting from zero.
+ --   Fold level masks are composed of an integer level combined with any of the
+ --   following bits:
+ --
+ --   * `lexer.FOLD_BASE`
+ --     The initial fold level.
+ --   * `lexer.FOLD_BLANK`
+ --     The line is blank.
+ --   * `lexer.FOLD_HEADER`
+ --     The line is a header, or fold point.
+ -- @field indent_amount (table, Read-only)
+ --   Table of indentation amounts in character columns, for line numbers
+ --   starting from zero.
+ -- @field property (table)
+ --   Map of key-value string pairs.
+ -- @field property_expanded (table, Read-only)
+ --   Map of key-value string pairs with `$()` and `%()` variable replacement
+ --   performed in values.
+ -- @field property_int (table, Read-only)
+ --   Map of key-value pairs with values interpreted as numbers, or `0` if not
+ --   found.
+ -- @field style_at (table, Read-only)
+ --   Table of style names at positions in the buffer starting from zero.
+ module('lexer')]=]
+
+ local lpeg = require('lpeg')
+ local lpeg_P, lpeg_R, lpeg_S, lpeg_V = lpeg.P, lpeg.R, lpeg.S, lpeg.V
+ local lpeg_Ct, lpeg_Cc, lpeg_Cp = lpeg.Ct, lpeg.Cc, lpeg.Cp
+ local lpeg_Cmt, lpeg_C, lpeg_Cg = lpeg.Cmt, lpeg.C, lpeg.Cg
+ local lpeg_match = lpeg.match
+
+ M.LEXERPATH = package.path
+
+ -- Table of loaded lexers.
+ local lexers = {}
+
+ -- Keep track of the last parent lexer loaded. This lexer's rules are used for
+ -- proxy lexers (those that load parent and child lexers to embed) that do not
+ -- declare a parent lexer.
+ local parent_lexer
+
+ if not package.searchpath then
+   -- Searches for the given *name* in the given *path*.
+   -- This is an implementation of Lua 5.2's `package.searchpath()` function for
+   -- Lua 5.1.
+   function package.searchpath(name, path)
+     local tried = {}
+     for part in path:gmatch('[^;]+') do
+       local filename = part:gsub('%?', name)
+       local f = io.open(filename, 'r')
+       if f then f:close() return filename end
+       tried[#tried + 1] = ("no file '%s'"):format(filename)
+     end
+     return nil, table.concat(tried, '\n')
+   end
+ end
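+ -- A quick illustration of the shim above (hypothetical path): calling
+ -- `package.searchpath('lua', '/opt/lexers/?.lua')` returns
+ -- '/opt/lexers/lua.lua' if that file is readable, or nil plus a message
+ -- listing every filename tried.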
+
+ -- Adds a rule to a lexer's current ordered list of rules.
+ -- @param lexer The lexer to add the given rule to.
+ -- @param id The name associated with this rule. It is used for other lexers
+ --   to access this particular rule from the lexer's `_RULES` table. It does not
+ --   have to be the same as the name passed to `token`.
+ -- @param rule The LPeg pattern of the rule.
+ local function add_rule(lexer, id, rule)
+   if not lexer._RULES then
+     lexer._RULES = {}
+     -- Contains an ordered list (by numerical index) of rule names. This is used
+     -- in conjunction with lexer._RULES for building _TOKENRULE.
+     lexer._RULEORDER = {}
+   end
+   lexer._RULES[id] = rule
+   lexer._RULEORDER[#lexer._RULEORDER + 1] = id
+ end
+
+ -- Adds a new Scintilla style to Scintilla.
+ -- @param lexer The lexer to add the given style to.
+ -- @param token_name The name of the token associated with this style.
+ -- @param style A Scintilla style created from `style()`.
+ -- @see style
+ local function add_style(lexer, token_name, style)
+   local num_styles = lexer._numstyles
+   if num_styles == 32 then num_styles = num_styles + 8 end -- skip predefined
+   if num_styles >= 255 then print('Too many styles defined (255 MAX)') end
+   lexer._TOKENSTYLES[token_name], lexer._numstyles = num_styles, num_styles + 1
+   lexer._EXTRASTYLES[token_name] = style
+ end
+
+ -- (Re)constructs `lexer._TOKENRULE`.
+ -- @param lexer The lexer whose rules are joined into a single token rule.
+ local function join_tokens(lexer)
+   local patterns, order = lexer._RULES, lexer._RULEORDER
+   local token_rule = patterns[order[1]]
+   for i = 2, #order do token_rule = token_rule + patterns[order[i]] end
+   lexer._TOKENRULE = token_rule + M.token(M.DEFAULT, M.any)
+   return lexer._TOKENRULE
+ end
+
+ -- Adds a given lexer and any of its embedded lexers to a given grammar.
+ -- @param grammar The grammar to add the lexer to.
+ -- @param lexer The lexer to add.
+ local function add_lexer(grammar, lexer)
+   local token_rule = join_tokens(lexer)
+   local lexer_name = lexer._NAME
+   for _, child in ipairs(lexer._CHILDREN) do
+     if child._CHILDREN then add_lexer(grammar, child) end
+     local child_name = child._NAME
+     local rules = child._EMBEDDEDRULES[lexer_name]
+     local rules_token_rule = grammar['__'..child_name] or rules.token_rule
+     grammar[child_name] = (-rules.end_rule * rules_token_rule)^0 *
+                           rules.end_rule^-1 * lpeg_V(lexer_name)
+     local embedded_child = '_'..child_name
+     grammar[embedded_child] = rules.start_rule * (-rules.end_rule *
+                               rules_token_rule)^0 * rules.end_rule^-1
+     token_rule = lpeg_V(embedded_child) + token_rule
+   end
+   grammar['__'..lexer_name] = token_rule -- can contain embedded lexer rules
+   grammar[lexer_name] = token_rule^0
+ end
+
+ -- (Re)constructs `lexer._GRAMMAR`.
+ -- @param lexer The parent lexer.
+ -- @param initial_rule The name of the rule to start lexing with. The default
+ --   value is `lexer._NAME`. Multilang lexers use this to start with a child
+ --   rule if necessary.
+ local function build_grammar(lexer, initial_rule)
+   -- local children = lexer._CHILDREN
+   -- if children then
+   local lexer_name = lexer._NAME
+   if not initial_rule then initial_rule = lexer_name end
+   local grammar = {initial_rule}
+   if not lexer._CHILDREN then lexer._CHILDREN = {} end
+   add_lexer(grammar, lexer)
+   lexer._INITIALRULE = initial_rule
+   lexer._GRAMMAR = lpeg_Ct(lpeg_P(grammar))
+   -- else
+   --   lexer._GRAMMAR = lpeg_Ct(join_tokens(lexer)^0)
+   -- end
+ end
+
+ local string_upper = string.upper
+ -- Default styles.
+ local default = {
+   'nothing', 'whitespace', 'comment', 'string', 'number', 'keyword',
+   'identifier', 'operator', 'error', 'preprocessor', 'constant', 'variable',
+   'function', 'class', 'type', 'label', 'regex', 'embedded'
+ }
+ for _, v in ipairs(default) do
+   M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
+ end
+ -- Predefined styles.
+ local predefined = {
+   'default', 'linenumber', 'bracelight', 'bracebad', 'controlchar',
+   'indentguide', 'calltip'
+ }
+ for _, v in ipairs(predefined) do
+   M[string_upper(v)], M['STYLE_'..string_upper(v)] = v, '$(style.'..v..')'
+ end
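+ -- For example, the two loops above define `M.KEYWORD = 'keyword'` with
+ -- `M.STYLE_KEYWORD = '$(style.keyword)'`, and `M.DEFAULT = 'default'` with
+ -- `M.STYLE_DEFAULT = '$(style.default)'`.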
+
+ ---
+ -- Initializes or loads and returns the lexer of string name *name*.
+ -- Scintilla calls this function to load a lexer. Parent lexers also call this
+ -- function to load child lexers and vice-versa. The user calls this function
+ -- to load a lexer when using Scintillua as a Lua library.
+ -- @param name The name of the lexing language.
+ -- @param alt_name The alternate name of the lexing language. This is useful for
+ --   embedding the same child lexer with multiple sets of start and end tokens.
+ -- @return lexer object
+ -- @name load
+ function M.load(name, alt_name)
+   if lexers[alt_name or name] then return lexers[alt_name or name] end
+   parent_lexer = nil -- reset
+
+   -- When using Scintillua as a stand-alone module, the `property` and
+   -- `property_int` tables do not exist (they are not useful). Create them to
+   -- prevent errors from occurring.
+   if not M.property then
+     M.property, M.property_int = {}, setmetatable({}, {
+       __index = function(t, k)
+         -- Interpret values as numbers, per the `property_int` field doc, so
+         -- comparisons like `M.property_int['fold'] > 0` work stand-alone.
+         return tonumber(M.property[k]) or 0
+       end,
+       __newindex = function() error('read-only property') end
+     })
+   end
+
+   -- Load the language lexer with its rules, styles, etc.
+   M.WHITESPACE = (alt_name or name)..'_whitespace'
+   local lexer_file, error = package.searchpath(name, M.LEXERPATH)
+   local ok, lexer = pcall(dofile, lexer_file or '')
+   if not ok then
+     _G.print(error or lexer) -- error message
+     lexer = {_NAME = alt_name or name}
+   end
+   if alt_name then lexer._NAME = alt_name end
+
+   -- Create the initial maps for token names to style numbers and styles.
+   local token_styles = {}
+   for i = 1, #default do token_styles[default[i]] = i - 1 end
+   for i = 1, #predefined do token_styles[predefined[i]] = i + 31 end
+   lexer._TOKENSTYLES, lexer._numstyles = token_styles, #default
+   lexer._EXTRASTYLES = {}
+
+   -- If the lexer is a proxy (loads parent and child lexers to embed) and does
+   -- not declare a parent, try and find one and use its rules.
+   if not lexer._rules and not lexer._lexer then lexer._lexer = parent_lexer end
+
+   -- If the lexer is a proxy or a child that embedded itself, add its rules and
+   -- styles to the parent lexer. Then set the parent to be the main lexer.
+   if lexer._lexer then
+     local l, _r, _s = lexer._lexer, lexer._rules, lexer._tokenstyles
+     if not l._tokenstyles then l._tokenstyles = {} end
+     for _, r in ipairs(_r or {}) do
+       -- Prevent rule id clashes.
+       l._rules[#l._rules + 1] = {lexer._NAME..'_'..r[1], r[2]}
+     end
+     for token, style in pairs(_s or {}) do l._tokenstyles[token] = style end
+     lexer = l
+   end
+
+   -- Add the lexer's styles and build its grammar.
+   if lexer._rules then
+     for token, style in pairs(lexer._tokenstyles or {}) do
+       add_style(lexer, token, style)
+     end
+     for _, r in ipairs(lexer._rules) do add_rule(lexer, r[1], r[2]) end
+     build_grammar(lexer)
+   end
+   -- Add the lexer's unique whitespace style.
+   add_style(lexer, lexer._NAME..'_whitespace', M.STYLE_WHITESPACE)
+
+   -- Process the lexer's fold symbols.
+   if lexer._foldsymbols and lexer._foldsymbols._patterns then
+     local patterns = lexer._foldsymbols._patterns
+     for i = 1, #patterns do patterns[i] = '()('..patterns[i]..')' end
+   end
+
+   lexer.lex, lexer.fold = M.lex, M.fold
+   -- Immun.io: copy over some of our helper functions.
+   if M.lex_recursive then lexer.lex_recursive = M.lex_recursive end
+   if M.unlex_rules then lexer.unlex_rules = M.unlex_rules end
+   lexers[alt_name or name] = lexer
+   return lexer
+ end
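+ -- A minimal stand-alone usage sketch (assuming a 'lua' lexer file can be
+ -- found on `M.LEXERPATH`):
+ --
+ --   local l = require('lexer')
+ --   local lua_lexer = l.load('lua')
+ --   local tokens = lua_lexer:lex('print(1)', 0)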
+
+ ---
+ -- Lexes a chunk of text *text* (that has an initial style number of
+ -- *init_style*) with lexer *lexer*.
+ -- If *lexer* has a `_LEXBYLINE` flag set, the text is lexed one line at a time.
+ -- Otherwise the text is lexed as a whole.
+ -- @param lexer The lexer object to lex with.
+ -- @param text The text in the buffer to lex.
+ -- @param init_style The current style. Multiple-language lexers use this to
+ --   determine which language to start lexing in.
+ -- @return table of token names and positions.
+ -- @name lex
+ function M.lex(lexer, text, init_style)
+   if not lexer._LEXBYLINE then
+     -- For multilang lexers, build a new grammar whose initial_rule is the
+     -- current language.
+     if lexer._CHILDREN then
+       for style, style_num in pairs(lexer._TOKENSTYLES) do
+         if style_num == init_style then
+           local lexer_name = style:match('^(.+)_whitespace') or lexer._NAME
+           if lexer._INITIALRULE ~= lexer_name then
+             build_grammar(lexer, lexer_name)
+           end
+           break
+         end
+       end
+     end
+     return lpeg_match(lexer._GRAMMAR, text)
+   else
+     local tokens = {}
+     local function append(tokens, line_tokens, offset)
+       for i = 1, #line_tokens, 2 do
+         tokens[#tokens + 1] = line_tokens[i]
+         tokens[#tokens + 1] = line_tokens[i + 1] + offset
+       end
+     end
+     local offset = 0
+     local grammar = lexer._GRAMMAR
+     for line in text:gmatch('[^\r\n]*\r?\n?') do
+       local line_tokens = lpeg_match(grammar, line)
+       if line_tokens then append(tokens, line_tokens, offset) end
+       offset = offset + #line
+       -- Use the default style to the end of the line if none was specified.
+       if tokens[#tokens] ~= offset then
+         tokens[#tokens + 1], tokens[#tokens + 2] = 'default', offset + 1
+       end
+     end
+     return tokens
+   end
+ end
+
+ ---
+ -- Folds a chunk of text *text* with lexer *lexer*.
+ -- Folds *text* starting at position *start_pos* on line number *start_line*
+ -- with a beginning fold level of *start_level* in the buffer. If *lexer* has a
+ -- `_fold` function or a `_foldsymbols` table, that field is used to perform
+ -- folding. Otherwise, if a `fold.by.indentation` property is set, folding by
+ -- indentation is done.
+ -- @param lexer The lexer object to fold with.
+ -- @param text The text in the buffer to fold.
+ -- @param start_pos The position in the buffer *text* starts at.
+ -- @param start_line The line number *text* starts on.
+ -- @param start_level The fold level *text* starts on.
+ -- @return table of fold levels.
+ -- @name fold
+ function M.fold(lexer, text, start_pos, start_line, start_level)
+   local folds = {}
+   if text == '' then return folds end
+   local fold = M.property_int['fold'] > 0
+   local FOLD_BASE = M.FOLD_BASE
+   local FOLD_HEADER, FOLD_BLANK = M.FOLD_HEADER, M.FOLD_BLANK
+   if fold and lexer._fold then
+     return lexer._fold(text, start_pos, start_line, start_level)
+   elseif fold and lexer._foldsymbols then
+     local lines = {}
+     for p, l in (text..'\n'):gmatch('()(.-)\r?\n') do
+       lines[#lines + 1] = {p, l}
+     end
+     local fold_zero_sum_lines = M.property_int['fold.on.zero.sum.lines'] > 0
+     local fold_symbols = lexer._foldsymbols
+     local fold_symbols_patterns = fold_symbols._patterns
+     local style_at, fold_level = M.style_at, M.fold_level
+     local line_num, prev_level = start_line, start_level
+     local current_level = prev_level
+     for i = 1, #lines do
+       local pos, line = lines[i][1], lines[i][2]
+       if line ~= '' then
+         local level_decreased = false
+         for j = 1, #fold_symbols_patterns do
+           for s, match in line:gmatch(fold_symbols_patterns[j]) do
+             local symbols = fold_symbols[style_at[start_pos + pos + s - 1]]
+             local l = symbols and symbols[match]
+             if type(l) == 'function' then l = l(text, pos, line, s, match) end
+             if type(l) == 'number' then
+               current_level = current_level + l
+               if l < 0 and current_level < prev_level then
+                 -- Potential zero-sum line. If the level were to go back up on
+                 -- the same line, the line may be marked as a fold header.
+                 level_decreased = true
+               end
+             end
+           end
+         end
+         folds[line_num] = prev_level
+         if current_level > prev_level then
+           folds[line_num] = prev_level + FOLD_HEADER
+         elseif level_decreased and current_level == prev_level and
+                fold_zero_sum_lines then
+           if line_num > start_line then
+             folds[line_num] = prev_level - 1 + FOLD_HEADER
+           else
+             -- Typing within a zero-sum line.
+             local level = fold_level[line_num - 1] - 1
+             if level > FOLD_HEADER then level = level - FOLD_HEADER end
+             if level > FOLD_BLANK then level = level - FOLD_BLANK end
+             folds[line_num] = level + FOLD_HEADER
+             current_level = current_level + 1
+           end
+         end
+         if current_level < FOLD_BASE then current_level = FOLD_BASE end
+         prev_level = current_level
+       else
+         folds[line_num] = prev_level + FOLD_BLANK
+       end
+       line_num = line_num + 1
+     end
+   elseif fold and M.property_int['fold.by.indentation'] > 0 then
+     -- Indentation-based folding.
+     -- Calculate indentation per line.
+     local indentation = {}
+     for indent, line in (text..'\n'):gmatch('([\t ]*)([^\r\n]*)\r?\n') do
+       indentation[#indentation + 1] = line ~= '' and #indent
+     end
+     -- Make the line before start_line a fold header if necessary.
+     if start_line > 0 and indentation[1] then
+       local indent = M.indent_amount[start_line - 1]
+       if indentation[1] > indent then
+         folds[start_line - 1] = FOLD_BASE + indent + FOLD_HEADER
+       end
+     end
+     -- Iterate over lines, setting fold numbers and fold flags.
+     local line_num, prev_level = start_line, FOLD_BASE + (indentation[1] or 0)
+     local current_level = prev_level
+     for i = 1, #indentation do
+       if indentation[i] then
+         for j = i + 1, #indentation do
+           if indentation[j] then
+             current_level = FOLD_BASE + indentation[j]
+             break
+           end
+         end
+         folds[line_num] = prev_level
+         if current_level > prev_level then
+           folds[line_num] = prev_level + FOLD_HEADER
+         end
+         prev_level = current_level
+       else
+         folds[line_num] = prev_level + FOLD_BLANK
+       end
+       line_num = line_num + 1
+     end
+   else
+     -- No folding; reset fold levels if necessary.
+     local current_line = start_line
+     for _ in text:gmatch('\r?\n') do
+       folds[current_line] = start_level
+       current_line = current_line + 1
+     end
+   end
+   return folds
+ end
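+ -- For a sense of the returned shape: folding three lines where the first
+ -- line opens a block might produce (hypothetical values)
+ -- `{[0] = FOLD_BASE + FOLD_HEADER, [1] = FOLD_BASE + 1, [2] = FOLD_BASE + 1}`,
+ -- keyed by line number starting at *start_line*.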
+
+ -- The following are utility functions lexers will have access to.
+
+ -- Common patterns.
+ M.any = lpeg_P(1)
+ M.ascii = lpeg_R('\000\127')
+ M.extend = lpeg_R('\000\255')
+ M.alpha = lpeg_R('AZ', 'az')
+ M.digit = lpeg_R('09')
+ M.alnum = lpeg_R('AZ', 'az', '09')
+ M.lower = lpeg_R('az')
+ M.upper = lpeg_R('AZ')
+ M.xdigit = lpeg_R('09', 'AF', 'af')
+ M.cntrl = lpeg_R('\000\031')
+ M.graph = lpeg_R('!~')
+ M.print = lpeg_R(' ~')
+ M.punct = lpeg_R('!/', ':@', '[`', '{~') -- '[' through '`' is 0x5B-0x60
+ M.space = lpeg_S('\t\v\f\n\r ')
+
+ M.newline = lpeg_S('\r\n\f')^1
+ M.nonnewline = 1 - M.newline
+ M.nonnewline_esc = 1 - (M.newline + '\\') + '\\' * M.any
+
+ M.dec_num = M.digit^1
+ M.hex_num = '0' * lpeg_S('xX') * M.xdigit^1
+ M.oct_num = '0' * lpeg_R('07')^1
+ M.integer = lpeg_S('+-')^-1 * (M.hex_num + M.oct_num + M.dec_num)
+ -- A float needs a decimal point and/or an exponent; the exponent is optional
+ -- when a decimal point is present.
+ M.float = lpeg_S('+-')^-1 *
+           ((M.digit^0 * '.' * M.digit^1 + M.digit^1 * '.' * M.digit^0) *
+            (lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)^-1 +
+            M.digit^1 * lpeg_S('eE') * lpeg_S('+-')^-1 * M.digit^1)
+ M.word = (M.alpha + '_') * (M.alnum + '_')^0
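+ -- A few sanity checks for the patterns above (illustrative only):
+ --
+ --   assert(lpeg_match(M.word, 'foo_bar1') == 9) -- matched through position 8
+ --   assert(lpeg_match(M.word, '1foo') == nil)   -- words cannot start with a digit
+ --   assert(lpeg_match(M.float, '3.14') and lpeg_match(M.float, '1e10'))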
+
+ ---
+ -- Creates and returns a token pattern with token name *name* and pattern
+ -- *patt*.
+ -- If *name* is not a predefined token name, its style must be defined in the
+ -- lexer's `_tokenstyles` table.
+ -- @param name The name of the token. If this name is not a predefined token
+ --   name, then a style needs to be associated with it in the lexer's
+ --   `_tokenstyles` table.
+ -- @param patt The LPeg pattern associated with the token.
+ -- @return pattern
+ -- @usage local ws = token(l.WHITESPACE, l.space^1)
+ -- @usage local annotation = token('annotation', '@' * l.word)
+ -- @name token
+ function M.token(name, patt)
+   --return lpeg_Cg(patt, name)
+   return lpeg_Ct(lpeg_Cg(lpeg_Cc(name), 'token') *
+                  lpeg_Cg(lpeg_C(patt), 'val') * lpeg_Cg(lpeg_Cp(), 'pos'))
+ end
+
+ function M.parent_token(name, patt)
+   --return lpeg_Cg(patt, name)
+   return lpeg_Ct(lpeg_Cg(lpeg_Cc(name), 'token') *
+                  lpeg_Cg(lpeg_Ct(patt), 'val') * lpeg_Cg(lpeg_Cp(), 'pos'))
+ end
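+ -- With these Immun.io-modified token constructors, each match produces a
+ -- table capture rather than a flat name/position pair. For example
+ -- (hypothetical values), matching the keyword 'if' at the start of a buffer
+ -- yields `{token = 'keyword', val = 'if', pos = 3}`, where `pos` is the
+ -- position just past the matched text.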
+
+ ---
+ -- Creates and returns a pattern that matches a range of text bounded by
+ -- *chars* characters.
+ -- This is a convenience function for matching more complicated delimited ranges
+ -- like strings with escape characters and balanced parentheses. *single_line*
+ -- indicates whether or not the range must be on a single line, *no_escape*
+ -- indicates whether or not to ignore '\' as an escape character, and *balanced*
+ -- indicates whether or not to handle balanced ranges like parentheses and
+ -- requires *chars* to be composed of two characters.
+ -- @param chars The character(s) that bound the matched range.
+ -- @param single_line Optional flag indicating whether or not the range must be
+ --   on a single line.
+ -- @param no_escape Optional flag indicating whether or not the range end
+ --   character may be escaped by a '\\' character.
+ -- @param balanced Optional flag indicating whether or not to match a balanced
+ --   range, like the "%b" Lua pattern. This flag only applies if *chars*
+ --   consists of two different characters (e.g. "()").
+ -- @return pattern
+ -- @usage local dq_str_escapes = l.delimited_range('"')
+ -- @usage local dq_str_noescapes = l.delimited_range('"', false, true)
+ -- @usage local unbalanced_parens = l.delimited_range('()')
+ -- @usage local balanced_parens = l.delimited_range('()', false, false, true)
+ -- @see nested_pair
+ -- @name delimited_range
+ function M.delimited_range(chars, single_line, no_escape, balanced)
+   local s = chars:sub(1, 1)
+   local e = #chars == 2 and chars:sub(2, 2) or s
+   local range
+   local b = balanced and s or ''
+   local n = single_line and '\n' or ''
+   if no_escape then
+     local invalid = lpeg_S(e..n..b)
+     range = M.any - invalid
+   else
+     local invalid = lpeg_S(e..n..b) + '\\'
+     range = M.any - invalid + '\\' * M.any
+   end
+   if balanced and s ~= e then
+     return lpeg_P{s * (range + lpeg_V(1))^0 * e}
+   else
+     return s * range^0 * lpeg_P(e)^-1
+   end
+ end
+
+ ---
+ -- Creates and returns a pattern that matches pattern *patt* only at the
+ -- beginning of a line.
+ -- @param patt The LPeg pattern to match on the beginning of a line.
+ -- @return pattern
+ -- @usage local preproc = token(l.PREPROCESSOR, l.starts_line('#') *
+ --   l.nonnewline^0)
+ -- @name starts_line
+ function M.starts_line(patt)
+   return lpeg_Cmt(lpeg_C(patt), function(input, index, match, ...)
+     local pos = index - #match
+     if pos == 1 then return index, ... end
+     local char = input:sub(pos - 1, pos - 1)
+     if char == '\n' or char == '\r' or char == '\f' then return index, ... end
+   end)
+ end
+
+ ---
+ -- Creates and returns a pattern that verifies that string set *s* contains the
+ -- first non-whitespace character behind the current match position.
+ -- @param s String character set like one passed to `lpeg.S()`.
+ -- @return pattern
+ -- @usage local regex = l.last_char_includes('+-*!%^&|=,([{') *
+ --   l.delimited_range('/')
+ -- @name last_char_includes
+ function M.last_char_includes(s)
+   s = '['..s:gsub('[-%%%[]', '%%%1')..']'
+   return lpeg_P(function(input, index)
+     if index == 1 then return index end
+     local i = index
+     while input:sub(i - 1, i - 1):match('[ \t\r\n\f]') do i = i - 1 end
+     if input:sub(i - 1, i - 1):match(s) then return index end
+   end)
+ end
+
+ ---
+ -- Returns a pattern that matches a balanced range of text that starts with
+ -- string *start_chars* and ends with string *end_chars*.
+ -- With single-character delimiters, this function is identical to
+ -- `delimited_range(start_chars..end_chars, false, true, true)`.
+ -- @param start_chars The string starting a nested sequence.
+ -- @param end_chars The string ending a nested sequence.
+ -- @return pattern
+ -- @usage local nested_comment = l.nested_pair('/*', '*/')
+ -- @see delimited_range
+ -- @name nested_pair
+ function M.nested_pair(start_chars, end_chars)
+   local s, e = start_chars, lpeg_P(end_chars)^-1
+   return lpeg_P{s * (M.any - s - end_chars + lpeg_V(1))^0 * e}
+ end
+
+ ---
+ -- Creates and returns a pattern that matches any single word in list *words*.
+ -- Words consist of alphanumeric and underscore characters, as well as the
+ -- characters in string set *word_chars*. *case_insensitive* indicates whether
+ -- or not to ignore case when matching words.
+ -- This is a convenience function for simplifying a set of ordered choice word
+ -- patterns.
+ -- @param words A table of words.
+ -- @param word_chars Optional string of additional characters considered to be
+ --   part of a word. By default, word characters are alphanumerics and
+ --   underscores ("%w_" in Lua). This parameter may be `nil` or the empty string
+ --   to indicate no additional word characters.
+ -- @param case_insensitive Optional boolean flag indicating whether or not the
+ --   word match is case-insensitive. The default is `false`.
+ -- @return pattern
+ -- @usage local keyword = token(l.KEYWORD, word_match{'foo', 'bar', 'baz'})
+ -- @usage local keyword = token(l.KEYWORD, word_match({'foo-bar', 'foo-baz',
+ --   'bar-foo', 'bar-baz', 'baz-foo', 'baz-bar'}, '-', true))
+ -- @name word_match
+ function M.word_match(words, word_chars, case_insensitive)
+   local word_list = {}
+   for _, word in ipairs(words) do
+     word_list[case_insensitive and word:lower() or word] = true
+   end
+   local chars = M.alnum + '_'
+   if word_chars then chars = chars + lpeg_S(word_chars) end
+   return lpeg_Cmt(chars^1, function(input, index, word)
+     if case_insensitive then word = word:lower() end
+     return word_list[word] and index or nil
+   end)
+ end
+
+ ---
+ -- Embeds child lexer *child* in parent lexer *parent* using patterns
+ -- *start_rule* and *end_rule*, which signal the beginning and end of the
+ -- embedded lexer, respectively.
+ -- @param parent The parent lexer.
+ -- @param child The child lexer.
+ -- @param start_rule The pattern that signals the beginning of the embedded
+ --   lexer.
+ -- @param end_rule The pattern that signals the end of the embedded lexer.
+ -- @usage l.embed_lexer(M, css, css_start_rule, css_end_rule)
+ -- @usage l.embed_lexer(html, M, php_start_rule, php_end_rule)
+ -- @usage l.embed_lexer(html, ruby, ruby_start_rule, ruby_end_rule)
+ -- @name embed_lexer
+ function M.embed_lexer(parent, child, start_rule, end_rule)
+   -- Add child rules.
+   if not child._EMBEDDEDRULES then child._EMBEDDEDRULES = {} end
+   if not child._RULES then -- creating a child lexer to be embedded
+     if not child._rules then error('Cannot embed language with no rules') end
+     for _, r in ipairs(child._rules) do add_rule(child, r[1], r[2]) end
+   end
+   child._EMBEDDEDRULES[parent._NAME] = {
+     start_rule = start_rule,
+     token_rule = join_tokens(child),
+     end_rule = end_rule
+   }
+   if not parent._CHILDREN then parent._CHILDREN = {} end
+   local children = parent._CHILDREN
+   children[#children + 1] = child
+   -- Add child styles.
+   if not parent._tokenstyles then parent._tokenstyles = {} end
+   local tokenstyles = parent._tokenstyles
+   tokenstyles[child._NAME..'_whitespace'] = M.STYLE_WHITESPACE
+   for token, style in pairs(child._tokenstyles or {}) do
+     tokenstyles[token] = style
+   end
+   child._lexer = parent -- use parent's tokens if child is embedding itself
+   parent_lexer = parent -- use parent's tokens if the calling lexer is a proxy
+ end
+
+ -- Determines if the previous line is a comment.
+ -- This is used for determining if the current comment line is a fold point.
+ -- @param prefix The prefix string defining a comment.
+ -- @param text The text passed to a fold function.
+ -- @param pos The pos passed to a fold function.
+ -- @param line The line passed to a fold function.
+ -- @param s The s passed to a fold function.
+ local function prev_line_is_comment(prefix, text, pos, line, s)
+   local start = line:find('%S')
+   if start < s and not line:find(prefix, start, true) then return false end
+   local p = pos - 1
+   if text:sub(p, p) == '\n' then
+     p = p - 1
+     if text:sub(p, p) == '\r' then p = p - 1 end
+     if text:sub(p, p) ~= '\n' then
+       while p > 1 and text:sub(p - 1, p - 1) ~= '\n' do p = p - 1 end
+       while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
+       return text:sub(p, p + #prefix - 1) == prefix
+     end
+   end
+   return false
+ end
+
+ -- Determines if the next line is a comment.
+ -- This is used for determining if the current comment line is a fold point.
+ -- @param prefix The prefix string defining a comment.
+ -- @param text The text passed to a fold function.
+ -- @param pos The pos passed to a fold function.
+ -- @param line The line passed to a fold function.
+ -- @param s The s passed to a fold function.
+ local function next_line_is_comment(prefix, text, pos, line, s)
+   local p = text:find('\n', pos + s)
+   if p then
+     p = p + 1
+     while text:sub(p, p):find('^[\t ]$') do p = p + 1 end
+     return text:sub(p, p + #prefix - 1) == prefix
+   end
+   return false
+ end
+
+ ---
+ -- Returns a fold function (to be used within the lexer's `_foldsymbols` table)
+ -- that folds consecutive line comments that start with string *prefix*.
+ -- @param prefix The prefix string defining a line comment.
+ -- @usage [l.COMMENT] = {['--'] = l.fold_line_comments('--')}
+ -- @usage [l.COMMENT] = {['//'] = l.fold_line_comments('//')}
+ -- @name fold_line_comments
+ function M.fold_line_comments(prefix)
+   local property_int = M.property_int
+   return function(text, pos, line, s)
+     if property_int['fold.line.comments'] == 0 then return 0 end
+     if s > 1 and line:match('^%s*()') < s then return 0 end
+     local prev_line_comment = prev_line_is_comment(prefix, text, pos, line, s)
+     local next_line_comment = next_line_is_comment(prefix, text, pos, line, s)
+     if not prev_line_comment and next_line_comment then return 1 end
+     if prev_line_comment and not next_line_comment then return -1 end
+     return 0
+   end
+ end
+
+ M.property_expanded = setmetatable({}, {
+   -- Returns the string property value associated with string property *key*,
+   -- replacing any "$()" and "%()" expressions with the values of their keys.
+   __index = function(t, key)
+     return M.property[key]:gsub('[$%%]%b()', function(key)
+       return t[key:sub(3, -2)]
+     end)
+   end,
+   __newindex = function() error('read-only property') end
+ })
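+ -- For example (hypothetical property values): with
+ -- `M.property['color.red'] = '#FF0000'` and
+ -- `M.property['style.error'] = 'fore:$(color.red)'`, indexing
+ -- `M.property_expanded['style.error']` yields 'fore:#FF0000'.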
+
+ --[[ The functions and fields below were defined in C.
+
+ ---
+ -- Individual fields for a lexer instance.
+ -- @field _NAME The string name of the lexer.
+ -- @field _rules An ordered list of rules for a lexer grammar.
+ --   Each rule is a table containing an arbitrary rule name and the LPeg pattern
+ --   associated with the rule. The order of rules is important, as rules are
+ --   matched sequentially.
+ --   Child lexers should not use this table to access and/or modify their
+ --   parent's rules and vice-versa. Use the `_RULES` table instead.
+ -- @field _tokenstyles A map of non-predefined token names to styles.
+ --   Remember to use token names, not rule names. It is recommended to use
+ --   predefined styles or color-agnostic styles derived from predefined styles
+ --   to ensure compatibility with user color themes.
+ -- @field _foldsymbols A table of recognized fold points for the lexer.
+ --   Keys are token names with table values defining fold points. Those table
+ --   values have string keys of keywords or characters that indicate a fold
+ --   point whose values are integers. A value of `1` indicates a beginning fold
+ --   point and a value of `-1` indicates an ending fold point. Values can also
+ --   be functions that return `1`, `-1`, or `0` (indicating no fold point) for
+ --   keys which need additional processing.
+ --   There is also a required `_patterns` key whose value is a table containing
+ --   Lua pattern strings that match all fold points (the string keys contained
+ --   in token name table values). When the lexer encounters text that matches
+ --   one of those patterns, the matched text is looked up in its token's table
+ --   to determine whether or not it is a fold point.
+ -- @field _fold If this function exists in the lexer, it is called for folding
+ --   the document instead of using `_foldsymbols` or indentation.
+ -- @field _lexer The parent lexer object whose rules should be used. This field
+ --   is only necessary to disambiguate a proxy lexer that loaded parent and
+ --   child lexers for embedding and ended up having multiple parents loaded.
+ -- @field _RULES A map of rule name keys with their associated LPeg pattern
+ --   values for the lexer.
+ --   This is constructed from the lexer's `_rules` table and accessible to other
+ --   lexers for embedded lexer applications like modifying parent or child
+ --   rules.
+ -- @field _LEXBYLINE Indicates the lexer can only process one whole line of text
+ --   (instead of an arbitrary chunk of text) at a time.
+ --   The default value is `false`. Line lexers cannot look ahead to subsequent
+ --   lines.
+ -- @class table
+ -- @name lexer
+ local lexer
+ ]]
+
+ return M