immunio 0.15.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +234 -0
  3. data/README.md +147 -0
  4. data/bin/immunio +5 -0
  5. data/lib/immunio.rb +29 -0
  6. data/lib/immunio/agent.rb +260 -0
  7. data/lib/immunio/authentication.rb +96 -0
  8. data/lib/immunio/blocked_app.rb +38 -0
  9. data/lib/immunio/channel.rb +432 -0
  10. data/lib/immunio/cli.rb +39 -0
  11. data/lib/immunio/context.rb +114 -0
  12. data/lib/immunio/errors.rb +43 -0
  13. data/lib/immunio/immunio_ca.crt +45 -0
  14. data/lib/immunio/logger.rb +87 -0
  15. data/lib/immunio/plugins/action_dispatch.rb +45 -0
  16. data/lib/immunio/plugins/action_view.rb +431 -0
  17. data/lib/immunio/plugins/active_record.rb +707 -0
  18. data/lib/immunio/plugins/active_record_relation.rb +370 -0
  19. data/lib/immunio/plugins/authlogic.rb +80 -0
  20. data/lib/immunio/plugins/csrf.rb +24 -0
  21. data/lib/immunio/plugins/devise.rb +40 -0
  22. data/lib/immunio/plugins/environment_reporter.rb +69 -0
  23. data/lib/immunio/plugins/eval.rb +51 -0
  24. data/lib/immunio/plugins/exception_handler.rb +55 -0
  25. data/lib/immunio/plugins/gems_tracker.rb +5 -0
  26. data/lib/immunio/plugins/haml.rb +36 -0
  27. data/lib/immunio/plugins/http_finisher.rb +50 -0
  28. data/lib/immunio/plugins/http_tracker.rb +203 -0
  29. data/lib/immunio/plugins/io.rb +96 -0
  30. data/lib/immunio/plugins/redirect.rb +42 -0
  31. data/lib/immunio/plugins/warden.rb +66 -0
  32. data/lib/immunio/processor.rb +234 -0
  33. data/lib/immunio/rails.rb +26 -0
  34. data/lib/immunio/request.rb +139 -0
  35. data/lib/immunio/rufus_lua_ext/ref.rb +27 -0
  36. data/lib/immunio/rufus_lua_ext/state.rb +157 -0
  37. data/lib/immunio/rufus_lua_ext/table.rb +137 -0
  38. data/lib/immunio/rufus_lua_ext/utils.rb +13 -0
  39. data/lib/immunio/version.rb +5 -0
  40. data/lib/immunio/vm.rb +291 -0
  41. data/lua-hooks/ext/all.c +78 -0
  42. data/lua-hooks/ext/bitop/README +22 -0
  43. data/lua-hooks/ext/bitop/bit.c +189 -0
  44. data/lua-hooks/ext/extconf.rb +38 -0
  45. data/lua-hooks/ext/libinjection/COPYING +37 -0
  46. data/lua-hooks/ext/libinjection/libinjection.h +65 -0
  47. data/lua-hooks/ext/libinjection/libinjection_html5.c +847 -0
  48. data/lua-hooks/ext/libinjection/libinjection_html5.h +54 -0
  49. data/lua-hooks/ext/libinjection/libinjection_sqli.c +2301 -0
  50. data/lua-hooks/ext/libinjection/libinjection_sqli.h +295 -0
  51. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +9349 -0
  52. data/lua-hooks/ext/libinjection/libinjection_xss.c +531 -0
  53. data/lua-hooks/ext/libinjection/libinjection_xss.h +21 -0
  54. data/lua-hooks/ext/libinjection/lualib.c +109 -0
  55. data/lua-hooks/ext/lpeg/HISTORY +90 -0
  56. data/lua-hooks/ext/lpeg/lpcap.c +537 -0
  57. data/lua-hooks/ext/lpeg/lpcap.h +43 -0
  58. data/lua-hooks/ext/lpeg/lpcode.c +986 -0
  59. data/lua-hooks/ext/lpeg/lpcode.h +34 -0
  60. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  61. data/lua-hooks/ext/lpeg/lpeg.html +1429 -0
  62. data/lua-hooks/ext/lpeg/lpprint.c +244 -0
  63. data/lua-hooks/ext/lpeg/lpprint.h +35 -0
  64. data/lua-hooks/ext/lpeg/lptree.c +1238 -0
  65. data/lua-hooks/ext/lpeg/lptree.h +77 -0
  66. data/lua-hooks/ext/lpeg/lptypes.h +149 -0
  67. data/lua-hooks/ext/lpeg/lpvm.c +355 -0
  68. data/lua-hooks/ext/lpeg/lpvm.h +58 -0
  69. data/lua-hooks/ext/lpeg/makefile +55 -0
  70. data/lua-hooks/ext/lpeg/re.html +498 -0
  71. data/lua-hooks/ext/lpeg/test.lua +1409 -0
  72. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +45 -0
  73. data/lua-hooks/ext/lua-cmsgpack/README.md +115 -0
  74. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +957 -0
  75. data/lua-hooks/ext/lua-cmsgpack/test.lua +570 -0
  76. data/lua-hooks/ext/lua-snapshot/LICENSE +7 -0
  77. data/lua-hooks/ext/lua-snapshot/Makefile +12 -0
  78. data/lua-hooks/ext/lua-snapshot/README.md +18 -0
  79. data/lua-hooks/ext/lua-snapshot/dump.lua +15 -0
  80. data/lua-hooks/ext/lua-snapshot/snapshot.c +455 -0
  81. data/lua-hooks/ext/lua/COPYRIGHT +34 -0
  82. data/lua-hooks/ext/lua/lapi.c +1087 -0
  83. data/lua-hooks/ext/lua/lapi.h +16 -0
  84. data/lua-hooks/ext/lua/lauxlib.c +652 -0
  85. data/lua-hooks/ext/lua/lauxlib.h +174 -0
  86. data/lua-hooks/ext/lua/lbaselib.c +659 -0
  87. data/lua-hooks/ext/lua/lcode.c +831 -0
  88. data/lua-hooks/ext/lua/lcode.h +76 -0
  89. data/lua-hooks/ext/lua/ldblib.c +398 -0
  90. data/lua-hooks/ext/lua/ldebug.c +638 -0
  91. data/lua-hooks/ext/lua/ldebug.h +33 -0
  92. data/lua-hooks/ext/lua/ldo.c +519 -0
  93. data/lua-hooks/ext/lua/ldo.h +57 -0
  94. data/lua-hooks/ext/lua/ldump.c +164 -0
  95. data/lua-hooks/ext/lua/lfunc.c +174 -0
  96. data/lua-hooks/ext/lua/lfunc.h +34 -0
  97. data/lua-hooks/ext/lua/lgc.c +710 -0
  98. data/lua-hooks/ext/lua/lgc.h +110 -0
  99. data/lua-hooks/ext/lua/linit.c +38 -0
  100. data/lua-hooks/ext/lua/liolib.c +556 -0
  101. data/lua-hooks/ext/lua/llex.c +463 -0
  102. data/lua-hooks/ext/lua/llex.h +81 -0
  103. data/lua-hooks/ext/lua/llimits.h +128 -0
  104. data/lua-hooks/ext/lua/lmathlib.c +263 -0
  105. data/lua-hooks/ext/lua/lmem.c +86 -0
  106. data/lua-hooks/ext/lua/lmem.h +49 -0
  107. data/lua-hooks/ext/lua/loadlib.c +705 -0
  108. data/lua-hooks/ext/lua/loadlib_rel.c +760 -0
  109. data/lua-hooks/ext/lua/lobject.c +214 -0
  110. data/lua-hooks/ext/lua/lobject.h +381 -0
  111. data/lua-hooks/ext/lua/lopcodes.c +102 -0
  112. data/lua-hooks/ext/lua/lopcodes.h +268 -0
  113. data/lua-hooks/ext/lua/loslib.c +243 -0
  114. data/lua-hooks/ext/lua/lparser.c +1339 -0
  115. data/lua-hooks/ext/lua/lparser.h +82 -0
  116. data/lua-hooks/ext/lua/lstate.c +214 -0
  117. data/lua-hooks/ext/lua/lstate.h +169 -0
  118. data/lua-hooks/ext/lua/lstring.c +111 -0
  119. data/lua-hooks/ext/lua/lstring.h +31 -0
  120. data/lua-hooks/ext/lua/lstrlib.c +871 -0
  121. data/lua-hooks/ext/lua/ltable.c +588 -0
  122. data/lua-hooks/ext/lua/ltable.h +40 -0
  123. data/lua-hooks/ext/lua/ltablib.c +287 -0
  124. data/lua-hooks/ext/lua/ltm.c +75 -0
  125. data/lua-hooks/ext/lua/ltm.h +54 -0
  126. data/lua-hooks/ext/lua/lua.c +392 -0
  127. data/lua-hooks/ext/lua/lua.def +131 -0
  128. data/lua-hooks/ext/lua/lua.h +388 -0
  129. data/lua-hooks/ext/lua/lua.rc +28 -0
  130. data/lua-hooks/ext/lua/lua_dll.rc +26 -0
  131. data/lua-hooks/ext/lua/luac.c +200 -0
  132. data/lua-hooks/ext/lua/luac.rc +1 -0
  133. data/lua-hooks/ext/lua/luaconf.h +763 -0
  134. data/lua-hooks/ext/lua/luaconf.h.in +724 -0
  135. data/lua-hooks/ext/lua/luaconf.h.orig +763 -0
  136. data/lua-hooks/ext/lua/lualib.h +53 -0
  137. data/lua-hooks/ext/lua/lundump.c +227 -0
  138. data/lua-hooks/ext/lua/lundump.h +36 -0
  139. data/lua-hooks/ext/lua/lvm.c +767 -0
  140. data/lua-hooks/ext/lua/lvm.h +36 -0
  141. data/lua-hooks/ext/lua/lzio.c +82 -0
  142. data/lua-hooks/ext/lua/lzio.h +67 -0
  143. data/lua-hooks/ext/lua/print.c +227 -0
  144. data/lua-hooks/ext/luautf8/README.md +152 -0
  145. data/lua-hooks/ext/luautf8/lutf8lib.c +1274 -0
  146. data/lua-hooks/ext/luautf8/unidata.h +3064 -0
  147. data/lua-hooks/lib/boot.lua +254 -0
  148. data/lua-hooks/lib/encode.lua +4 -0
  149. data/lua-hooks/lib/lexers/LICENSE +21 -0
  150. data/lua-hooks/lib/lexers/bash.lua +134 -0
  151. data/lua-hooks/lib/lexers/bash_dqstr.lua +62 -0
  152. data/lua-hooks/lib/lexers/css.lua +216 -0
  153. data/lua-hooks/lib/lexers/html.lua +106 -0
  154. data/lua-hooks/lib/lexers/javascript.lua +68 -0
  155. data/lua-hooks/lib/lexers/lexer.lua +1575 -0
  156. data/lua-hooks/lib/lexers/markers.lua +33 -0
  157. metadata +308 -0
@@ -0,0 +1,34 @@
1
+ /*
2
+ ** $Id: lpcode.h,v 1.6 2013/11/28 14:56:02 roberto Exp $
3
+ */
4
+
5
+ #if !defined(lpcode_h)
6
+ #define lpcode_h
7
+
8
+ #include "../lua/lua.h"
9
+
10
+ #include "lptypes.h"
11
+ #include "lptree.h"
12
+ #include "lpvm.h"
13
+
14
+ int tocharset (TTree *tree, Charset *cs);
15
+ int checkaux (TTree *tree, int pred);
16
+ int fixedlenx (TTree *tree, int count, int len);
17
+ int hascaptures (TTree *tree);
18
+ int lp_gc (lua_State *L);
19
+ LpegInstruction *compile (lua_State *L, Pattern *p);
20
+ void realloccode (lua_State *L, Pattern *p, int nsize);
21
+ int sizei (const LpegInstruction *i);
22
+
23
+
24
+ #define PEnullable 0
25
+ #define PEnofail 1
26
+
27
+ #define nofail(t) checkaux(t, PEnofail)
28
+ #define nullable(t) checkaux(t, PEnullable)
29
+
30
+ #define fixedlen(t) fixedlenx(t, 0, 0)
31
+
32
+
33
+
34
+ #endif
@@ -0,0 +1,1429 @@
1
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4
+ <head>
5
+ <title>LPeg - Parsing Expression Grammars For Lua</title>
6
+ <link rel="stylesheet"
7
+ href="http://www.inf.puc-rio.br/~roberto/lpeg/doc.css"
8
+ type="text/css"/>
9
+ <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
10
+ </head>
11
+ <body>
12
+
13
+ <!-- $Id: lpeg.html,v 1.72 2014/12/12 17:11:35 roberto Exp $ -->
14
+
15
+ <div id="container">
16
+
17
+ <div id="product">
18
+ <div id="product_logo">
19
+ <a href="http://www.inf.puc-rio.br/~roberto/lpeg/">
20
+ <img alt="LPeg logo" src="lpeg-128.gif"/></a>
21
+
22
+ </div>
23
+ <div id="product_name"><big><strong>LPeg</strong></big></div>
24
+ <div id="product_description">
25
+ Parsing Expression Grammars For Lua, version 0.12
26
+ </div>
27
+ </div> <!-- id="product" -->
28
+
29
+ <div id="main">
30
+
31
+ <div id="navigation">
32
+ <h1>LPeg</h1>
33
+
34
+ <ul>
35
+ <li><strong>Home</strong>
36
+ <ul>
37
+ <li><a href="#intro">Introduction</a></li>
38
+ <li><a href="#func">Functions</a></li>
39
+ <li><a href="#basic">Basic Constructions</a></li>
40
+ <li><a href="#grammar">Grammars</a></li>
41
+ <li><a href="#captures">Captures</a></li>
42
+ <li><a href="#ex">Some Examples</a></li>
43
+ <li><a href="re.html">The <code>re</code> Module</a></li>
44
+ <li><a href="#download">Download</a></li>
45
+ <li><a href="#license">License</a></li>
46
+ </ul>
47
+ </li>
48
+ </ul>
49
+ </div> <!-- id="navigation" -->
50
+
51
+ <div id="content">
52
+
53
+
54
+ <h2><a name="intro">Introduction</a></h2>
55
+
56
+ <p>
57
+ <em>LPeg</em> is a new pattern-matching library for Lua,
58
+ based on
59
+ <a href="http://pdos.csail.mit.edu/%7Ebaford/packrat/">
60
+ Parsing Expression Grammars</a> (PEGs).
61
+ This text is a reference manual for the library.
62
+ For a more formal treatment of LPeg,
63
+ as well as some discussion about its implementation,
64
+ see
65
+ <a href="http://www.inf.puc-rio.br/~roberto/docs/peg.pdf">
66
+ A Text Pattern-Matching Tool based on Parsing Expression Grammars</a>.
67
+ (You may also be interested in my
68
+ <a href="http://vimeo.com/1485123">talk about LPeg</a>
69
+ given at the III Lua Workshop.)
70
+ </p>
71
+
72
+ <p>
73
+ Following the Snobol tradition,
74
+ LPeg defines patterns as first-class objects.
75
+ That is, patterns are regular Lua values
76
+ (represented by userdata).
77
+ The library offers several functions to create
78
+ and compose patterns.
79
+ With the use of metamethods,
80
+ several of these functions are provided as infix or prefix
81
+ operators.
82
+ On the one hand,
83
+ the result is usually much more verbose than the typical
84
+ encoding of patterns using the so-called
85
+ <em>regular expressions</em>
86
+ (which typically are not regular expressions in the formal sense).
87
+ On the other hand,
88
+ first-class patterns allow much better documentation
89
+ (as it is easy to comment the code,
90
+ to break complex definitions into smaller parts, etc.)
91
+ and are extensible,
92
+ as we can define new functions to create and compose patterns.
93
+ </p>
94
+
95
+ <p>
96
+ For a quick glance at the library,
97
+ the following table summarizes its basic operations
98
+ for creating patterns:
99
+ </p>
100
+ <table border="1">
101
+ <tbody><tr><td><b>Operator</b></td><td><b>Description</b></td></tr>
102
+ <tr><td><a href="#op-p"><code>lpeg.P(string)</code></a></td>
103
+ <td>Matches <code>string</code> literally</td></tr>
104
+ <tr><td><a href="#op-p"><code>lpeg.P(n)</code></a></td>
105
+ <td>Matches exactly <code>n</code> characters</td></tr>
106
+ <tr><td><a href="#op-s"><code>lpeg.S(string)</code></a></td>
107
+ <td>Matches any character in <code>string</code> (Set)</td></tr>
108
+ <tr><td><a href="#op-r"><code>lpeg.R("<em>xy</em>")</code></a></td>
109
+ <td>Matches any character between <em>x</em> and <em>y</em> (Range)</td></tr>
110
+ <tr><td><a href="#op-pow"><code>patt^n</code></a></td>
111
+ <td>Matches at least <code>n</code> repetitions of <code>patt</code></td></tr>
112
+ <tr><td><a href="#op-pow"><code>patt^-n</code></a></td>
113
+ <td>Matches at most <code>n</code> repetitions of <code>patt</code></td></tr>
114
+ <tr><td><a href="#op-mul"><code>patt1 * patt2</code></a></td>
115
+ <td>Matches <code>patt1</code> followed by <code>patt2</code></td></tr>
116
+ <tr><td><a href="#op-add"><code>patt1 + patt2</code></a></td>
117
+ <td>Matches <code>patt1</code> or <code>patt2</code>
118
+ (ordered choice)</td></tr>
119
+ <tr><td><a href="#op-sub"><code>patt1 - patt2</code></a></td>
120
+ <td>Matches <code>patt1</code> if <code>patt2</code> does not match</td></tr>
121
+ <tr><td><a href="#op-unm"><code>-patt</code></a></td>
122
+ <td>Equivalent to <code>("" - patt)</code></td></tr>
123
+ <tr><td><a href="#op-len"><code>#patt</code></a></td>
124
+ <td>Matches <code>patt</code> but consumes no input</td></tr>
125
+ <tr><td><a href="#op-behind"><code>lpeg.B(patt)</code></a></td>
126
+ <td>Matches <code>patt</code> behind the current position,
127
+ consuming no input</td></tr>
128
+ </tbody></table>
129
+
130
+ <p>As a very simple example,
131
+ <code>lpeg.R("09")^1</code> creates a pattern that
132
+ matches a non-empty sequence of digits.
133
+ As a not so simple example,
134
+ <code>-lpeg.P(1)</code>
135
+ (which can be written as <code>lpeg.P(-1)</code>,
136
+ or simply <code>-1</code> for operations expecting a pattern)
137
+ matches an empty string only if it cannot match a single character;
138
+ so, it succeeds only at the end of the subject.
139
+ </p>
140
+
141
+ <p>
142
+ LPeg also offers the <a href="re.html"><code>re</code> module</a>,
143
+ which implements patterns following a regular-expression style
144
+ (e.g., <code>[0-9]+</code>).
145
+ (This module is 260 lines of Lua code,
146
+ and of course it uses LPeg to parse regular expressions and
147
+ translate them to regular LPeg patterns.)
148
+ </p>
149
+
150
+
151
+ <h2><a name="func">Functions</a></h2>
152
+
153
+
154
+ <h3><a name="f-match"></a><code>lpeg.match (pattern, subject [, init])</code></h3>
155
+ <p>
156
+ The matching function.
157
+ It attempts to match the given pattern against the subject string.
158
+ If the match succeeds,
159
+ returns the index in the subject of the first character after the match,
160
+ or the <a href="#captures">captured values</a>
161
+ (if the pattern captured any value).
162
+ </p>
163
+
164
+ <p>
165
+ An optional numeric argument <code>init</code> makes the match
166
+ start at that position in the subject string.
167
+ As usual in Lua libraries,
168
+ a negative value counts from the end.
169
+ </p>
170
+
171
+ <p>
172
+ Unlike typical pattern-matching functions,
173
+ <code>match</code> works only in <em>anchored</em> mode;
174
+ that is, it tries to match the pattern with a prefix of
175
+ the given subject string (at position <code>init</code>),
176
+ not with an arbitrary substring of the subject.
177
+ So, if we want to find a pattern anywhere in a string,
178
+ we must either write a loop in Lua or write a pattern that
179
+ matches anywhere.
180
+ This second approach is easy and quite efficient;
181
+ see <a href="#ex">examples</a>.
182
+ </p>
183
+
184
+ <h3><a name="f-type"></a><code>lpeg.type (value)</code></h3>
185
+ <p>
186
+ If the given value is a pattern,
187
+ returns the string <code>"pattern"</code>.
188
+ Otherwise returns nil.
189
+ </p>
190
+
191
+ <h3><a name="f-version"></a><code>lpeg.version ()</code></h3>
192
+ <p>
193
+ Returns a string with the running version of LPeg.
194
+ </p>
195
+
196
+ <h3><a name="f-setstack"></a><code>lpeg.setmaxstack (max)</code></h3>
197
+ <p>
198
+ Sets the maximum size for the backtrack stack used by LPeg to
199
+ track calls and choices.
200
+ Most well-written patterns need few backtrack levels and
201
+ therefore you seldom need to change this maximum;
202
+ but a few useful patterns may need more space.
203
+ Before changing this maximum you should try to rewrite your
204
+ pattern to avoid the need for extra space.
205
+ </p>
206
+
207
+
208
+ <h2><a name="basic">Basic Constructions</a></h2>
209
+
210
+ <p>
211
+ The following operations build patterns.
212
+ All operations that expect a pattern as an argument
213
+ may also receive strings, tables, numbers, booleans, or functions,
214
+ which are translated to patterns according to
215
+ the rules of function <a href="#op-p"><code>lpeg.P</code></a>.
216
+ </p>
217
+
218
+
219
+
220
+ <h3><a name="op-p"></a><code>lpeg.P (value)</code></h3>
221
+ <p>
222
+ Converts the given value into a proper pattern,
223
+ according to the following rules:
224
+ </p>
225
+ <ul>
226
+
227
+ <li><p>
228
+ If the argument is a pattern,
229
+ it is returned unmodified.
230
+ </p></li>
231
+
232
+ <li><p>
233
+ If the argument is a string,
234
+ it is translated to a pattern that matches the string literally.
235
+ </p></li>
236
+
237
+ <li><p>
238
+ If the argument is a non-negative number <em>n</em>,
239
+ the result is a pattern that matches exactly <em>n</em> characters.
240
+ </p></li>
241
+
242
+ <li><p>
243
+ If the argument is a negative number <em>-n</em>,
244
+ the result is a pattern that
245
+ succeeds only if the input string has fewer than <em>n</em> characters left:
246
+ <code>lpeg.P(-n)</code>
247
+ is equivalent to <code>-lpeg.P(n)</code>
248
+ (see the <a href="#op-unm">unary minus operation</a>).
249
+ </p></li>
250
+
251
+ <li><p>
252
+ If the argument is a boolean,
253
+ the result is a pattern that always succeeds or always fails
254
+ (according to the boolean value),
255
+ without consuming any input.
256
+ </p></li>
257
+
258
+ <li><p>
259
+ If the argument is a table,
260
+ it is interpreted as a grammar
261
+ (see <a href="#grammar">Grammars</a>).
262
+ </p></li>
263
+
264
+ <li><p>
265
+ If the argument is a function,
266
+ returns a pattern equivalent to a
267
+ <a href="#matchtime">match-time capture</a> over the empty string.
268
+ </p></li>
269
+
270
+ </ul>
271
+
272
+
273
+ <h3><a name="op-behind"></a><code>lpeg.B(patt)</code></h3>
274
+ <p>
275
+ Returns a pattern that
276
+ matches only if the input string at the current position
277
+ is preceded by <code>patt</code>.
278
+ Pattern <code>patt</code> must match only strings
279
+ with some fixed length,
280
+ and it cannot contain captures.
281
+ </p>
282
+
283
+ <p>
284
+ Like the <a href="#op-len">and predicate</a>,
285
+ this pattern never consumes any input,
286
+ independently of success or failure.
287
+ </p>
288
+
289
+
290
+ <h3><a name="op-r"></a><code>lpeg.R ({range})</code></h3>
291
+ <p>
292
+ Returns a pattern that matches any single character
293
+ belonging to one of the given <em>ranges</em>.
294
+ Each <code>range</code> is a string <em>xy</em> of length 2,
295
+ representing all characters with code
296
+ between the codes of <em>x</em> and <em>y</em>
297
+ (both inclusive).
298
+ </p>
299
+
300
+ <p>
301
+ As an example, the pattern
302
+ <code>lpeg.R("09")</code> matches any digit,
303
+ and <code>lpeg.R("az", "AZ")</code> matches any ASCII letter.
304
+ </p>
305
+
306
+
307
+ <h3><a name="op-s"></a><code>lpeg.S (string)</code></h3>
308
+ <p>
309
+ Returns a pattern that matches any single character that
310
+ appears in the given string.
311
+ (The <code>S</code> stands for <em>Set</em>.)
312
+ </p>
313
+
314
+ <p>
315
+ As an example, the pattern
316
+ <code>lpeg.S("+-*/")</code> matches any arithmetic operator.
317
+ </p>
318
+
319
+ <p>
320
+ Note that, if <code>s</code> is a character
321
+ (that is, a string of length 1),
322
+ then <code>lpeg.P(s)</code> is equivalent to <code>lpeg.S(s)</code>
323
+ which is equivalent to <code>lpeg.R(s..s)</code>.
324
+ Note also that both <code>lpeg.S("")</code> and <code>lpeg.R()</code>
325
+ are patterns that always fail.
326
+ </p>
327
+
328
+
329
+ <h3><a name="op-v"></a><code>lpeg.V (v)</code></h3>
330
+ <p>
331
+ This operation creates a non-terminal (a <em>variable</em>)
332
+ for a grammar.
333
+ The created non-terminal refers to the rule indexed by <code>v</code>
334
+ in the enclosing grammar.
335
+ (See <a href="#grammar">Grammars</a> for details.)
336
+ </p>
337
+
338
+
339
+ <h3><a name="op-locale"></a><code>lpeg.locale ([table])</code></h3>
340
+ <p>
341
+ Returns a table with patterns for matching some character classes
342
+ according to the current locale.
343
+ The table has fields named
344
+ <code>alnum</code>,
345
+ <code>alpha</code>,
346
+ <code>cntrl</code>,
347
+ <code>digit</code>,
348
+ <code>graph</code>,
349
+ <code>lower</code>,
350
+ <code>print</code>,
351
+ <code>punct</code>,
352
+ <code>space</code>,
353
+ <code>upper</code>, and
354
+ <code>xdigit</code>,
355
+ each one containing a corresponding pattern.
356
+ Each pattern matches any single character that belongs to its class.
357
+ </p>
358
+
359
+ <p>
360
+ If called with an argument <code>table</code>,
361
+ then it creates those fields inside the given table and
362
+ returns that table.
363
+ </p>
364
+
365
+
366
+ <h3><a name="op-len"></a><code>#patt</code></h3>
367
+ <p>
368
+ Returns a pattern that
369
+ matches only if the input string matches <code>patt</code>,
370
+ but without consuming any input,
371
+ independently of success or failure.
372
+ (This pattern is called an <em>and predicate</em>
373
+ and it is equivalent to
374
+ <em>&amp;patt</em> in the original PEG notation.)
375
+ </p>
376
+
377
+
378
+ <p>
379
+ This pattern never produces any capture.
380
+ </p>
381
+
382
+
383
+ <h3><a name="op-unm"></a><code>-patt</code></h3>
384
+ <p>
385
+ Returns a pattern that
386
+ matches only if the input string does not match <code>patt</code>.
387
+ It does not consume any input,
388
+ independently of success or failure.
389
+ (This pattern is equivalent to
390
+ <em>!patt</em> in the original PEG notation.)
391
+ </p>
392
+
393
+ <p>
394
+ As an example, the pattern
395
+ <code>-lpeg.P(1)</code> matches only the end of string.
396
+ </p>
397
+
398
+ <p>
399
+ This pattern never produces any captures,
400
+ because either <code>patt</code> fails
401
+ or <code>-patt</code> fails.
402
+ (A failing pattern never produces captures.)
403
+ </p>
404
+
405
+
406
+ <h3><a name="op-add"></a><code>patt1 + patt2</code></h3>
407
+ <p>
408
+ Returns a pattern equivalent to an <em>ordered choice</em>
409
+ of <code>patt1</code> and <code>patt2</code>.
410
+ (This is denoted by <em>patt1 / patt2</em> in the original PEG notation,
411
+ not to be confused with the <code>/</code> operation in LPeg.)
412
+ It matches either <code>patt1</code> or <code>patt2</code>,
413
+ with no backtracking once one of them succeeds.
414
+ The identity element for this operation is the pattern
415
+ <code>lpeg.P(false)</code>,
416
+ which always fails.
417
+ </p>
418
+
419
+ <p>
420
+ If both <code>patt1</code> and <code>patt2</code> are
421
+ character sets,
422
+ this operation is equivalent to set union.
423
+ </p>
424
+ <pre class="example">
425
+ lower = lpeg.R("az")
426
+ upper = lpeg.R("AZ")
427
+ letter = lower + upper
428
+ </pre>
429
+
430
+
431
+ <h3><a name="op-sub"></a><code>patt1 - patt2</code></h3>
432
+ <p>
433
+ Returns a pattern equivalent to <em>!patt2 patt1</em>.
434
+ This pattern asserts that the input does not match
435
+ <code>patt2</code> and then matches <code>patt1</code>.
436
+ </p>
437
+
438
+ <p>
439
+ When successful,
440
+ this pattern produces all captures from <code>patt1</code>.
441
+ It never produces any capture from <code>patt2</code>
442
+ (as either <code>patt2</code> fails or
443
+ <code>patt1 - patt2</code> fails).
444
+ </p>
445
+
446
+ <p>
447
+ If both <code>patt1</code> and <code>patt2</code> are
448
+ character sets,
449
+ this operation is equivalent to set difference.
450
+ Note that <code>-patt</code> is equivalent to <code>"" - patt</code>
451
+ (or <code>0 - patt</code>).
452
+ If <code>patt</code> is a character set,
453
+ <code>1 - patt</code> is its complement.
454
+ </p>
455
+
456
+
457
+ <h3><a name="op-mul"></a><code>patt1 * patt2</code></h3>
458
+ <p>
459
+ Returns a pattern that matches <code>patt1</code>
460
+ and then matches <code>patt2</code>,
461
+ starting where <code>patt1</code> finished.
462
+ The identity element for this operation is the
463
+ pattern <code>lpeg.P(true)</code>,
464
+ which always succeeds.
465
+ </p>
466
+
467
+ <p>
468
+ (LPeg uses the <code>*</code> operator
469
+ [instead of the more obvious <code>..</code>]
470
+ both because it has
471
+ the right priority and because in formal languages it is
472
+ common to use a dot for denoting concatenation.)
473
+ </p>
474
+
475
+
476
+ <h3><a name="op-pow"></a><code>patt^n</code></h3>
477
+ <p>
478
+ If <code>n</code> is nonnegative,
479
+ this pattern is
480
+ equivalent to <em>patt<sup>n</sup> patt*</em>:
481
+ It matches <code>n</code> or more occurrences of <code>patt</code>.
482
+ </p>
483
+
484
+ <p>
485
+ Otherwise, when <code>n</code> is negative,
486
+ this pattern is equivalent to <em>(patt?)<sup>-n</sup></em>:
487
+ It matches at most <code>|n|</code>
488
+ occurrences of <code>patt</code>.
489
+ </p>
490
+
491
+ <p>
492
+ In particular, <code>patt^0</code> is equivalent to <em>patt*</em>,
493
+ <code>patt^1</code> is equivalent to <em>patt+</em>,
494
+ and <code>patt^-1</code> is equivalent to <em>patt?</em>
495
+ in the original PEG notation.
496
+ </p>
497
+
498
+ <p>
499
+ In all cases,
500
+ the resulting pattern is greedy with no backtracking
501
+ (also called a <em>possessive</em> repetition).
502
+ That is, it matches only the longest possible sequence
503
+ of matches for <code>patt</code>.
504
+ </p>
505
+
506
+
507
+
508
+ <h2><a name="grammar">Grammars</a></h2>
509
+
510
+ <p>
511
+ With the use of Lua variables,
512
+ it is possible to define patterns incrementally,
513
+ with each new pattern using previously defined ones.
514
+ However, this technique does not allow the definition of
515
+ recursive patterns.
516
+ For recursive patterns,
517
+ we need real grammars.
518
+ </p>
519
+
520
+ <p>
521
+ LPeg represents grammars with tables,
522
+ where each entry is a rule.
523
+ </p>
524
+
525
+ <p>
526
+ The call <code>lpeg.V(v)</code>
527
+ creates a pattern that represents the nonterminal
528
+ (or <em>variable</em>) with index <code>v</code> in a grammar.
529
+ Because the grammar still does not exist when
530
+ this function is evaluated,
531
+ the result is an <em>open reference</em> to the respective rule.
532
+ </p>
533
+
534
+ <p>
535
+ A table is <em>fixed</em> when it is converted to a pattern
536
+ (either by calling <code>lpeg.P</code> or by using it where a
537
+ pattern is expected).
538
+ Then every open reference created by <code>lpeg.V(v)</code>
539
+ is corrected to refer to the rule indexed by <code>v</code> in the table.
540
+ </p>
541
+
542
+ <p>
543
+ When a table is fixed,
544
+ the result is a pattern that matches its <em>initial rule</em>.
545
+ The entry with index 1 in the table defines its initial rule.
546
+ If that entry is a string,
547
+ it is assumed to be the name of the initial rule.
548
+ Otherwise, LPeg assumes that the entry 1 itself is the initial rule.
549
+ </p>
550
+
551
+ <p>
552
+ As an example,
553
+ the following grammar matches strings of a's and b's that
554
+ have the same number of a's and b's:
555
+ </p>
556
+ <pre class="example">
557
+ equalcount = lpeg.P{
558
+ "S"; -- initial rule name
559
+ S = "a" * lpeg.V"B" + "b" * lpeg.V"A" + "",
560
+ A = "a" * lpeg.V"S" + "b" * lpeg.V"A" * lpeg.V"A",
561
+ B = "b" * lpeg.V"S" + "a" * lpeg.V"B" * lpeg.V"B",
562
+ } * -1
563
+ </pre>
564
+ <p>
565
+ It is equivalent to the following grammar in standard PEG notation:
566
+ </p>
567
+ <pre class="example">
568
+ S &lt;- 'a' B / 'b' A / ''
569
+ A &lt;- 'a' S / 'b' A A
570
+ B &lt;- 'b' S / 'a' B B
571
+ </pre>
572
+
573
+
574
+ <h2><a name="captures">Captures</a></h2>
575
+
576
+ <p>
577
+ A <em>capture</em> is a pattern that creates values
578
+ (the so-called <em>semantic information</em>) when it matches.
579
+ LPeg offers several kinds of captures,
580
+ which produce values based on matches and combine these values to
581
+ produce new values.
582
+ Each capture may produce zero or more values.
583
+ </p>
584
+
585
+ <p>
586
+ The following table summarizes the basic captures:
587
+ </p>
588
+ <table border="1">
589
+ <tbody><tr><td><b>Operation</b></td><td><b>What it Produces</b></td></tr>
590
+ <tr><td><a href="#cap-c"><code>lpeg.C(patt)</code></a></td>
591
+ <td>the match for <code>patt</code> plus all captures
592
+ made by <code>patt</code></td></tr>
593
+ <tr><td><a href="#cap-arg"><code>lpeg.Carg(n)</code></a></td>
594
+ <td>the value of the n<sup>th</sup> extra argument to
595
+ <code>lpeg.match</code> (matches the empty string)</td></tr>
596
+ <tr><td><a href="#cap-b"><code>lpeg.Cb(name)</code></a></td>
597
+ <td>the values produced by the previous
598
+ group capture named <code>name</code>
599
+ (matches the empty string)</td></tr>
600
+ <tr><td><a href="#cap-cc"><code>lpeg.Cc(values)</code></a></td>
601
+ <td>the given values (matches the empty string)</td></tr>
602
+ <tr><td><a href="#cap-f"><code>lpeg.Cf(patt, func)</code></a></td>
603
+ <td>a <em>folding</em> of the captures from <code>patt</code></td></tr>
604
+ <tr><td><a href="#cap-g"><code>lpeg.Cg(patt [, name])</code></a></td>
605
+ <td>the values produced by <code>patt</code>,
606
+ optionally tagged with <code>name</code></td></tr>
607
+ <tr><td><a href="#cap-p"><code>lpeg.Cp()</code></a></td>
608
+ <td>the current position (matches the empty string)</td></tr>
609
+ <tr><td><a href="#cap-s"><code>lpeg.Cs(patt)</code></a></td>
610
+ <td>the match for <code>patt</code>
611
+ with the values from nested captures replacing their matches</td></tr>
612
+ <tr><td><a href="#cap-t"><code>lpeg.Ct(patt)</code></a></td>
613
+ <td>a table with all captures from <code>patt</code></td></tr>
614
+ <tr><td><a href="#cap-string"><code>patt / string</code></a></td>
615
+ <td><code>string</code>, with some marks replaced by captures
616
+ of <code>patt</code></td></tr>
617
+ <tr><td><a href="#cap-num"><code>patt / number</code></a></td>
618
+ <td>the n-th value captured by <code>patt</code>,
619
+ or no value when <code>number</code> is zero.</td></tr>
620
+ <tr><td><a href="#cap-query"><code>patt / table</code></a></td>
621
+ <td><code>table[c]</code>, where <code>c</code> is the (first)
622
+ capture of <code>patt</code></td></tr>
623
+ <tr><td><a href="#cap-func"><code>patt / function</code></a></td>
624
+ <td>the returns of <code>function</code> applied to the captures
625
+ of <code>patt</code></td></tr>
626
+ <tr><td><a href="#matchtime"><code>lpeg.Cmt(patt, function)</code></a></td>
627
+ <td>the returns of <code>function</code> applied to the captures
628
+ of <code>patt</code>; the application is done at match time</td></tr>
629
+ </tbody></table>
630
+
631
+ <p>
632
+ A capture pattern produces its values every time it succeeds.
633
+ For instance,
634
+ a capture inside a loop produces as many values as matched by the loop.
635
+ A capture produces a value only when it succeeds.
636
+ For instance,
637
+ the pattern <code>lpeg.C(lpeg.P"a"^-1)</code>
638
+ produces the empty string when there is no <code>"a"</code>
639
+ (because the pattern <code>"a"?</code> succeeds),
640
+ while the pattern <code>lpeg.C("a")^-1</code>
641
+ does not produce any value when there is no <code>"a"</code>
642
+ (because the pattern <code>"a"</code> fails).
643
+ </p>
644
+
645
+ <p>
646
+ Usually,
647
+ LPeg evaluates all captures only after (and if) the entire match succeeds.
648
+ During <em>match time</em> it only gathers enough information
649
+ to produce the capture values later.
650
+ As a particularly important consequence,
651
+ most captures cannot affect the way a pattern matches a subject.
652
+ The only exception to this rule is the
653
+ so-called <a href="#matchtime"><em>match-time capture</em></a>.
654
+ When a match-time capture matches,
655
+ it forces the immediate evaluation of all its nested captures
656
+ and then calls its corresponding function,
657
+ which defines whether the match succeeds and also
658
+ what values are produced.
659
+ </p>
660
+
661
+ <h3><a name="cap-c"></a><code>lpeg.C (patt)</code></h3>
662
+ <p>
663
+ Creates a <em>simple capture</em>,
664
+ which captures the substring of the subject that matches <code>patt</code>.
665
+ The captured value is a string.
666
+ If <code>patt</code> has other captures,
667
+ their values are returned after this one.
668
+ </p>
669
+
670
+
671
+ <h3><a name="cap-arg"></a><code>lpeg.Carg (n)</code></h3>
672
+ <p>
673
+ Creates an <em>argument capture</em>.
674
+ This pattern matches the empty string and
675
+ produces the value given as the n<sup>th</sup> extra
676
+ argument given in the call to <code>lpeg.match</code>.
677
+ </p>
678
+
679
+
680
+ <h3><a name="cap-b"></a><code>lpeg.Cb (name)</code></h3>
681
+ <p>
682
+ Creates a <em>back capture</em>.
683
+ This pattern matches the empty string and
684
+ produces the values produced by the <em>most recent</em>
685
+ <a href="#cap-g">group capture</a> named <code>name</code>.
686
+ </p>
687
+
688
+ <p>
689
+ <em>Most recent</em> means the last
690
+ <em>complete</em>
691
+ <em>outermost</em>
692
+ group capture with the given name.
693
+ A <em>Complete</em> capture means that the entire pattern
694
+ corresponding to the capture has matched.
695
+ An <em>Outermost</em> capture means that the capture is not inside
696
+ another complete capture.
697
+ </p>
698
+
699
+
700
+ <h3><a name="cap-cc"></a><code>lpeg.Cc ([value, ...])</code></h3>
701
+ <p>
702
+ Creates a <em>constant capture</em>.
703
+ This pattern matches the empty string and
704
+ produces all given values as its captured values.
705
+ </p>
706
+
707
+
708
+ <h3><a name="cap-f"></a><code>lpeg.Cf (patt, func)</code></h3>
709
+ <p>
710
+ Creates a <em>fold capture</em>.
711
+ If <code>patt</code> produces a list of captures
712
+ <em>C<sub>1</sub> C<sub>2</sub> ... C<sub>n</sub></em>,
713
+ this capture will produce the value
714
+ <em>func(...func(func(C<sub>1</sub>, C<sub>2</sub>), C<sub>3</sub>)...,
715
+ C<sub>n</sub>)</em>,
716
+ that is, it will <em>fold</em>
717
+ (or <em>accumulate</em>, or <em>reduce</em>)
718
+ the captures from <code>patt</code> using function <code>func</code>.
719
+ </p>
720
+
721
+ <p>
722
+ This capture assumes that <code>patt</code> should produce
723
+ at least one capture with at least one value (of any type),
724
+ which becomes the initial value of an <em>accumulator</em>.
725
+ (If you need a specific initial value,
726
+ you may prefix a <a href="#cap-cc">constant capture</a> to <code>patt</code>.)
727
+ For each subsequent capture,
728
+ LPeg calls <code>func</code>
729
+ with this accumulator as the first argument and all values produced
730
+ by the capture as extra arguments;
731
+ the first result from this call
732
+ becomes the new value for the accumulator.
733
+ The final value of the accumulator becomes the captured value.
734
+ </p>
735
+
736
+ <p>
737
+ As an example,
738
+ the following pattern matches a list of numbers separated
739
+ by commas and returns their addition:
740
+ </p>
741
+ <pre class="example">
742
+ -- matches a numeral and captures its numerical value
743
+ number = lpeg.R"09"^1 / tonumber
744
+
745
+ -- matches a list of numbers, capturing their values
746
+ list = number * ("," * number)^0
747
+
748
+ -- auxiliary function to add two numbers
749
+ function add (acc, newvalue) return acc + newvalue end
750
+
751
+ -- folds the list of numbers adding them
752
+ sum = lpeg.Cf(list, add)
753
+
754
+ -- example of use
755
+ print(sum:match("10,30,43")) --&gt; 83
756
+ </pre>
757
+
758
+
759
+ <h3><a name="cap-g"></a><code>lpeg.Cg (patt [, name])</code></h3>
760
+ <p>
761
+ Creates a <em>group capture</em>.
762
+ It groups all values returned by <code>patt</code>
763
+ into a single capture.
764
+ The group may be anonymous (if no name is given)
765
+ or named with the given name.
766
+ </p>
767
+
768
+ <p>
769
+ An anonymous group serves to join values from several captures into
770
+ a single capture.
771
+ A named group has a different behavior.
772
+ In most situations, a named group returns no values at all.
773
+ Its values are only relevant for a following
774
+ <a href="#cap-b">back capture</a> or when used
775
+ inside a <a href="#cap-t">table capture</a>.
776
+ </p>
777
+
778
+
779
+ <h3><a name="cap-p"></a><code>lpeg.Cp ()</code></h3>
780
+ <p>
781
+ Creates a <em>position capture</em>.
782
+ It matches the empty string and
783
+ captures the position in the subject where the match occurs.
784
+ The captured value is a number.
785
+ </p>
786
+
787
+
788
+ <h3><a name="cap-s"></a><code>lpeg.Cs (patt)</code></h3>
789
+ <p>
790
+ Creates a <em>substitution capture</em>,
791
+ which captures the substring of the subject that matches <code>patt</code>,
792
+ with <em>substitutions</em>.
793
+ For any capture inside <code>patt</code> with a value,
794
+ the substring that matched the capture is replaced by the capture value
795
+ (which should be a string).
796
+ The final captured value is the string resulting from
797
+ all replacements.
798
+ </p>
799
+
800
+
801
+ <h3><a name="cap-t"></a><code>lpeg.Ct (patt)</code></h3>
802
+ <p>
803
+ Creates a <em>table capture</em>.
804
+ This capture creates a table and puts all values from all anonymous captures
805
+ made by <code>patt</code> inside this table in successive integer keys,
806
+ starting at 1.
807
+ Moreover,
808
+ for each named capture group created by <code>patt</code>,
809
+ the first value of the group is put into the table
810
+ with the group name as its key.
811
+ The captured value is only the table.
812
+ </p>
813
+
814
+
815
+ <h3><a name="cap-string"></a><code>patt / string</code></h3>
816
+ <p>
817
+ Creates a <em>string capture</em>.
818
+ It creates a capture string based on <code>string</code>.
819
+ The captured value is a copy of <code>string</code>,
820
+ except that the character <code>%</code> works as an escape character:
821
+ any sequence in <code>string</code> of the form <code>%<em>n</em></code>,
822
+ with <em>n</em> between 1 and 9,
823
+ stands for the match of the <em>n</em>-th capture in <code>patt</code>.
824
+ The sequence <code>%0</code> stands for the whole match.
825
+ The sequence <code>%%</code> stands for a single&nbsp;<code>%</code>.
826
+ </p>
827
+
828
+
829
+ <h3><a name="cap-num"></a><code>patt / number</code></h3>
830
+ <p>
831
+ Creates a <em>numbered capture</em>.
832
+ For a non-zero number,
833
+ the captured value is the n-th value
834
+ captured by <code>patt</code>.
835
+ When <code>number</code> is zero,
836
+ there are no captured values.
837
+ </p>
838
+
839
+
840
+ <h3><a name="cap-query"></a><code>patt / table</code></h3>
841
+ <p>
842
+ Creates a <em>query capture</em>.
843
+ It indexes the given table using as key the first value captured by
844
+ <code>patt</code>,
845
+ or the whole match if <code>patt</code> produced no value.
846
+ The value at that index is the final value of the capture.
847
+ If the table does not have that key,
848
+ there is no captured value.
849
+ </p>
850
+
851
+
852
+ <h3><a name="cap-func"></a><code>patt / function</code></h3>
853
+ <p>
854
+ Creates a <em>function capture</em>.
855
+ It calls the given function passing all captures made by
856
+ <code>patt</code> as arguments,
857
+ or the whole match if <code>patt</code> made no capture.
858
+ The values returned by the function
859
+ are the final values of the capture.
860
+ In particular,
861
+ if <code>function</code> returns no value,
862
+ there is no captured value.
863
+ </p>
864
+
865
+
866
+ <h3><a name="matchtime"></a><code>lpeg.Cmt(patt, function)</code></h3>
867
+ <p>
868
+ Creates a <em>match-time capture</em>.
869
+ Unlike all other captures,
870
+ this one is evaluated immediately when a match occurs.
871
+ It forces the immediate evaluation of all its nested captures
872
+ and then calls <code>function</code>.
873
+ </p>
874
+
875
+ <p>
876
+ The given function gets as arguments the entire subject,
877
+ the current position (after the match of <code>patt</code>),
878
+ plus any capture values produced by <code>patt</code>.
879
+ </p>
880
+
881
+ <p>
882
+ The first value returned by <code>function</code>
883
+ defines how the match happens.
884
+ If the call returns a number,
885
+ the match succeeds
886
+ and the returned number becomes the new current position.
887
+ (Assuming a subject <em>s</em> and current position <em>i</em>,
888
+ the returned number must be in the range <em>[i, len(s) + 1]</em>.)
889
+ If the call returns <b>true</b>,
890
+ the match succeeds without consuming any input.
891
+ (So, returning <b>true</b> is equivalent to returning <em>i</em>.)
892
+ If the call returns <b>false</b>, <b>nil</b>, or no value,
893
+ the match fails.
894
+ </p>
895
+
896
+ <p>
897
+ Any extra values returned by the function become the
898
+ values produced by the capture.
899
+ </p>
900
+
901
+
902
+
903
+
904
+ <h2><a name="ex">Some Examples</a></h2>
905
+
906
+ <h3>Using a Pattern</h3>
907
+ <p>
908
+ This example shows a very simple but complete program
909
+ that builds and uses a pattern:
910
+ </p>
911
+ <pre class="example">
912
+ local lpeg = require "lpeg"
913
+
914
+ -- matches a word followed by end-of-string
915
+ p = lpeg.R"az"^1 * -1
916
+
917
+ print(p:match("hello")) --> 6
918
+ print(lpeg.match(p, "hello")) --> 6
919
+ print(p:match("1 hello")) --> nil
920
+ </pre>
921
+ <p>
922
+ The pattern is simply a sequence of one or more lower-case letters
923
+ followed by the end of string (-1).
924
+ The program calls <code>match</code> both as a method
925
+ and as a function.
926
+ In both successful cases,
927
+ the match returns
928
+ the index of the first character after the match,
929
+ which is the string length plus one.
930
+ </p>
931
+
932
+
933
+ <h3>Name-value lists</h3>
934
+ <p>
935
+ This example parses a list of name-value pairs and returns a table
936
+ with those pairs:
937
+ </p>
938
+ <pre class="example">
939
+ lpeg.locale(lpeg) -- adds locale entries into 'lpeg' table
940
+
941
+ local space = lpeg.space^0
942
+ local name = lpeg.C(lpeg.alpha^1) * space
943
+ local sep = lpeg.S(",;") * space
944
+ local pair = lpeg.Cg(name * "=" * space * name) * sep^-1
945
+ local list = lpeg.Cf(lpeg.Ct("") * pair^0, rawset)
946
+ t = list:match("a=b, c = hi; next = pi") --> { a = "b", c = "hi", next = "pi" }
947
+ </pre>
948
+ <p>
949
+ Each pair has the format <code>name = name</code> followed by
950
+ an optional separator (a comma or a semicolon).
951
+ The <code>pair</code> pattern encloses the pair in a group pattern,
952
+ so that the names become the values of a single capture.
953
+ The <code>list</code> pattern then folds these captures.
954
+ It starts with an empty table,
955
+ created by a table capture matching an empty string;
956
+ then for each capture (a pair of names) it applies <code>rawset</code>
957
+ over the accumulator (the table) and the capture values (the pair of names).
958
+ <code>rawset</code> returns the table itself,
959
+ so the accumulator is always the table.
960
+ </p>
961
+
962
+ <h3>Splitting a string</h3>
963
+ <p>
964
+ The following code builds a pattern that
965
+ splits a string using a given pattern
966
+ <code>sep</code> as a separator:
967
+ </p>
968
+ <pre class="example">
969
+ function split (s, sep)
970
+ sep = lpeg.P(sep)
971
+ local elem = lpeg.C((1 - sep)^0)
972
+ local p = elem * (sep * elem)^0
973
+ return lpeg.match(p, s)
974
+ end
975
+ </pre>
976
+ <p>
977
+ First the function ensures that <code>sep</code> is a proper pattern.
978
+ The pattern <code>elem</code> is a repetition of zero or more
979
+ arbitrary characters as long as there is not a match against
980
+ the separator.
981
+ It also captures its match.
982
+ The pattern <code>p</code> matches a list of elements separated
983
+ by <code>sep</code>.
984
+ </p>
985
+
986
+ <p>
987
+ If the split results in too many values,
988
+ it may overflow the maximum number of values
989
+ that can be returned by a Lua function.
990
+ In this case,
991
+ we can collect these values in a table:
992
+ </p>
993
+ <pre class="example">
994
+ function split (s, sep)
995
+ sep = lpeg.P(sep)
996
+ local elem = lpeg.C((1 - sep)^0)
997
+ local p = lpeg.Ct(elem * (sep * elem)^0) -- make a table capture
998
+ return lpeg.match(p, s)
999
+ end
1000
+ </pre>
1001
+
1002
+
1003
+ <h3>Searching for a pattern</h3>
1004
+ <p>
1005
+ The primitive <code>match</code> works only in anchored mode.
1006
+ If we want to find a pattern anywhere in a string,
1007
+ we must write a pattern that matches anywhere.
1008
+ </p>
1009
+
1010
+ <p>
1011
+ Because patterns are composable,
1012
+ we can write a function that,
1013
+ given any arbitrary pattern <code>p</code>,
1014
+ returns a new pattern that searches for <code>p</code>
1015
+ anywhere in a string.
1016
+ There are several ways to do the search.
1017
+ One way is like this:
1018
+ </p>
1019
+ <pre class="example">
1020
+ function anywhere (p)
1021
+ return lpeg.P{ p + 1 * lpeg.V(1) }
1022
+ end
1023
+ </pre>
1024
+ <p>
1025
+ This grammar has a straight reading:
1026
+ it matches <code>p</code> or skips one character and tries again.
1027
+ </p>
1028
+
1029
+ <p>
1030
+ If we want to know where the pattern is in the string
1031
+ (instead of knowing only that it is there somewhere),
1032
+ we can add position captures to the pattern:
1033
+ </p>
1034
+ <pre class="example">
1035
+ local I = lpeg.Cp()
1036
+ function anywhere (p)
1037
+ return lpeg.P{ I * p * I + 1 * lpeg.V(1) }
1038
+ end
1039
+
1040
+ print(anywhere("world"):match("hello world!")) --&gt; 7 12
1041
+ </pre>
1042
+
1043
+ <p>
1044
+ Another option for the search is like this:
1045
+ </p>
1046
+ <pre class="example">
1047
+ local I = lpeg.Cp()
1048
+ function anywhere (p)
1049
+ return (1 - lpeg.P(p))^0 * I * p * I
1050
+ end
1051
+ </pre>
1052
+ <p>
1053
+ Again the pattern has a straight reading:
1054
+ it skips as many characters as possible while not matching <code>p</code>,
1055
+ and then matches <code>p</code> (plus appropriate captures).
1056
+ </p>
1057
+
1058
+ <p>
1059
+ If we want to look for a pattern only at word boundaries,
1060
+ we can use the following transformer:
1061
+ </p>
1062
+
1063
+ <pre class="example">
1064
+ local t = lpeg.locale()
1065
+
1066
+ function atwordboundary (p)
1067
+ return lpeg.P{
1068
+ [1] = p + t.alpha^0 * (1 - t.alpha)^1 * lpeg.V(1)
1069
+ }
1070
+ end
1071
+ </pre>
1072
+
1073
+
1074
+ <h3><a name="balanced"></a>Balanced parentheses</h3>
1075
+ <p>
1076
+ The following pattern matches only strings with balanced parentheses:
1077
+ </p>
1078
+ <pre class="example">
1079
+ b = lpeg.P{ "(" * ((1 - lpeg.S"()") + lpeg.V(1))^0 * ")" }
1080
+ </pre>
1081
+ <p>
1082
+ Reading the first (and only) rule of the given grammar,
1083
+ we have that a balanced string is
1084
+ an open parenthesis,
1085
+ followed by zero or more repetitions of either
1086
+ a non-parenthesis character or
1087
+ a balanced string (<code>lpeg.V(1)</code>),
1088
+ followed by a closing parenthesis.
1089
+ </p>
1090
+
1091
+
1092
+ <h3>Global substitution</h3>
1093
+ <p>
1094
+ The next example does a job somewhat similar to <code>string.gsub</code>.
1095
+ It receives a pattern and a replacement value,
1096
+ and substitutes the replacement value for all occurrences of the pattern
1097
+ in a given string:
1098
+ </p>
1099
+ <pre class="example">
1100
+ function gsub (s, patt, repl)
1101
+ patt = lpeg.P(patt)
1102
+ patt = lpeg.Cs((patt / repl + 1)^0)
1103
+ return lpeg.match(patt, s)
1104
+ end
1105
+ </pre>
1106
+ <p>
1107
+ As in <code>string.gsub</code>,
1108
+ the replacement value can be a string,
1109
+ a function, or a table.
1110
+ </p>
1111
+
1112
+
1113
+ <h3><a name="CSV"></a>Comma-Separated Values (CSV)</h3>
1114
+ <p>
1115
+ This example breaks a string into comma-separated values,
1116
+ returning all fields:
1117
+ </p>
1118
+ <pre class="example">
1119
+ local field = '"' * lpeg.Cs(((lpeg.P(1) - '"') + lpeg.P'""' / '"')^0) * '"' +
1120
+ lpeg.C((1 - lpeg.S',\n"')^0)
1121
+
1122
+ local record = field * (',' * field)^0 * (lpeg.P'\n' + -1)
1123
+
1124
+ function csv (s)
1125
+ return lpeg.match(record, s)
1126
+ end
1127
+ </pre>
1128
+ <p>
1129
+ A field is either a quoted field
1130
+ (which may contain any character except an individual quote,
1131
+ which may be written as two quotes that are replaced by one)
1132
+ or an unquoted field
1133
+ (which cannot contain commas, newlines, or quotes).
1134
+ A record is a list of fields separated by commas,
1135
+ ending with a newline or the string end (-1).
1136
+ </p>
1137
+
1138
+ <p>
1139
+ As it is,
1140
+ the previous pattern returns each field as a separate result.
1141
+ If we add a table capture in the definition of <code>record</code>,
1142
+ the pattern will return instead a single table
1143
+ containing all fields:
1144
+ </p>
1145
+ <pre class="example">
1146
+ local record = lpeg.Ct(field * (',' * field)^0) * (lpeg.P'\n' + -1)
1147
+ </pre>
1148
+
1149
+
1150
+ <h3>UTF-8 and Latin 1</h3>
1151
+ <p>
1152
+ It is not difficult to use LPeg to convert a string from
1153
+ UTF-8 encoding to Latin 1 (ISO 8859-1):
1154
+ </p>
1155
+
1156
+ <pre class="example">
1157
+ -- convert a two-byte UTF-8 sequence to a Latin 1 character
1158
+ local function f2 (s)
1159
+ local c1, c2 = string.byte(s, 1, 2)
1160
+ return string.char(c1 * 64 + c2 - 12416)
1161
+ end
1162
+
1163
+ local utf8 = lpeg.R("\0\127")
1164
+ + lpeg.R("\194\195") * lpeg.R("\128\191") / f2
1165
+
1166
+ local decode_pattern = lpeg.Cs(utf8^0) * -1
1167
+ </pre>
1168
+ <p>
1169
+ In this code,
1170
+ the definition of UTF-8 is already restricted to the
1171
+ Latin 1 range (from 0 to 255).
1172
+ Any encoding outside this range (as well as any invalid encoding)
1173
+ will not match that pattern.
1174
+ </p>
1175
+
1176
+ <p>
1177
+ As the definition of <code>decode_pattern</code> demands that
1178
+ the pattern matches the whole input (because of the -1 at its end),
1179
+ any invalid string will simply fail to match,
1180
+ without any useful information about the problem.
1181
+ We can improve this situation by redefining <code>decode_pattern</code>
1182
+ as follows:
1183
+ </p>
1184
+ <pre class="example">
1185
+ local function er (_, i) error("invalid encoding at position " .. i) end
1186
+
1187
+ local decode_pattern = lpeg.Cs(utf8^0) * (-1 + lpeg.P(er))
1188
+ </pre>
1189
+ <p>
1190
+ Now, if the pattern <code>utf8^0</code> stops
1191
+ before the end of the string,
1192
+ an appropriate error function is called.
1193
+ </p>
1194
+
1195
+
1196
+ <h3>UTF-8 and Unicode</h3>
1197
+ <p>
1198
+ We can extend the previous patterns to handle all Unicode code points.
1199
+ Of course,
1200
+ we cannot translate them to Latin 1 or any other one-byte encoding.
1201
+ Instead, our translation results in an array with the code points
1202
+ represented as numbers.
1203
+ The full code is here:
1204
+ </p>
1205
+ <pre class="example">
1206
+ -- decode a two-byte UTF-8 sequence
1207
+ local function f2 (s)
1208
+ local c1, c2 = string.byte(s, 1, 2)
1209
+ return c1 * 64 + c2 - 12416
1210
+ end
1211
+
1212
+ -- decode a three-byte UTF-8 sequence
1213
+ local function f3 (s)
1214
+ local c1, c2, c3 = string.byte(s, 1, 3)
1215
+ return (c1 * 64 + c2) * 64 + c3 - 925824
1216
+ end
1217
+
1218
+ -- decode a four-byte UTF-8 sequence
1219
+ local function f4 (s)
1220
+ local c1, c2, c3, c4 = string.byte(s, 1, 4)
1221
+ return ((c1 * 64 + c2) * 64 + c3) * 64 + c4 - 63447168
1222
+ end
1223
+
1224
+ local cont = lpeg.R("\128\191") -- continuation byte
1225
+
1226
+ local utf8 = lpeg.R("\0\127") / string.byte
1227
+ + lpeg.R("\194\223") * cont / f2
1228
+ + lpeg.R("\224\239") * cont * cont / f3
1229
+ + lpeg.R("\240\244") * cont * cont * cont / f4
1230
+
1231
+ local decode_pattern = lpeg.Ct(utf8^0) * -1
1232
+ </pre>
1233
+
1234
+
1235
+ <h3>Lua's long strings</h3>
1236
+ <p>
1237
+ A long string in Lua starts with the pattern <code>[=*[</code>
1238
+ and ends at the first occurrence of <code>]=*]</code> with
1239
+ exactly the same number of equal signs.
1240
+ If the opening brackets are followed by a newline,
1241
+ this newline is discarded
1242
+ (that is, it is not part of the string).
1243
+ </p>
1244
+
1245
+ <p>
1246
+ To match a long string in Lua,
1247
+ the pattern must capture the first repetition of equal signs and then,
1248
+ whenever it finds a candidate for closing the string,
1249
+ check whether it has the same number of equal signs.
1250
+ </p>
1251
+
1252
+ <pre class="example">
1253
+ equals = lpeg.P"="^0
1254
+ open = "[" * lpeg.Cg(equals, "init") * "[" * lpeg.P"\n"^-1
1255
+ close = "]" * lpeg.C(equals) * "]"
1256
+ closeeq = lpeg.Cmt(close * lpeg.Cb("init"), function (s, i, a, b) return a == b end)
1257
+ string = open * lpeg.C((lpeg.P(1) - closeeq)^0) * close / 1
1258
+ </pre>
1259
+
1260
+ <p>
1261
+ The <code>open</code> pattern matches <code>[=*[</code>,
1262
+ capturing the repetitions of equal signs in a group named <code>init</code>;
1263
+ it also discards an optional newline, if present.
1264
+ The <code>close</code> pattern matches <code>]=*]</code>,
1265
+ also capturing the repetitions of equal signs.
1266
+ The <code>closeeq</code> pattern first matches <code>close</code>;
1267
+ then it uses a back capture to recover the capture made
1268
+ by the previous <code>open</code>,
1269
+ which is named <code>init</code>;
1270
+ finally it uses a match-time capture to check
1271
+ whether both captures are equal.
1272
+ The <code>string</code> pattern starts with an <code>open</code>,
1273
+ then it goes as far as possible until matching <code>closeeq</code>,
1274
+ and then matches the final <code>close</code>.
1275
+ The final numbered capture simply discards
1276
+ the capture made by <code>close</code>.
1277
+ </p>
1278
+
1279
+
1280
+ <h3>Arithmetic expressions</h3>
1281
+ <p>
1282
+ This example is a complete parser and evaluator for simple
1283
+ arithmetic expressions.
1284
+ We write it in two styles.
1285
+ The first approach first builds a syntax tree and then
1286
+ traverses this tree to compute the expression value:
1287
+ </p>
1288
+ <pre class="example">
1289
+ -- Lexical Elements
1290
+ local Space = lpeg.S(" \n\t")^0
1291
+ local Number = lpeg.C(lpeg.P"-"^-1 * lpeg.R("09")^1) * Space
1292
+ local TermOp = lpeg.C(lpeg.S("+-")) * Space
1293
+ local FactorOp = lpeg.C(lpeg.S("*/")) * Space
1294
+ local Open = "(" * Space
1295
+ local Close = ")" * Space
1296
+
1297
+ -- Grammar
1298
+ local Exp, Term, Factor = lpeg.V"Exp", lpeg.V"Term", lpeg.V"Factor"
1299
+ G = lpeg.P{ Exp,
1300
+ Exp = lpeg.Ct(Term * (TermOp * Term)^0);
1301
+ Term = lpeg.Ct(Factor * (FactorOp * Factor)^0);
1302
+ Factor = Number + Open * Exp * Close;
1303
+ }
1304
+
1305
+ G = Space * G * -1
1306
+
1307
+ -- Evaluator
1308
+ function eval (x)
1309
+ if type(x) == "string" then
1310
+ return tonumber(x)
1311
+ else
1312
+ local op1 = eval(x[1])
1313
+ for i = 2, #x, 2 do
1314
+ local op = x[i]
1315
+ local op2 = eval(x[i + 1])
1316
+ if (op == "+") then op1 = op1 + op2
1317
+ elseif (op == "-") then op1 = op1 - op2
1318
+ elseif (op == "*") then op1 = op1 * op2
1319
+ elseif (op == "/") then op1 = op1 / op2
1320
+ end
1321
+ end
1322
+ return op1
1323
+ end
1324
+ end
1325
+
1326
+ -- Parser/Evaluator
1327
+ function evalExp (s)
1328
+ local t = lpeg.match(G, s)
1329
+ if not t then error("syntax error", 2) end
1330
+ return eval(t)
1331
+ end
1332
+
1333
+ -- small example
1334
+ print(evalExp"3 + 5*9 / (1+1) - 12") --> 13.5
1335
+ </pre>
1336
+
1337
+ <p>
1338
+ The second style computes the expression value on the fly,
1339
+ without building the syntax tree.
1340
+ The following grammar takes this approach.
1341
+ (It assumes the same lexical elements as before.)
1342
+ </p>
1343
+ <pre class="example">
1344
+ -- Auxiliary function
1345
+ function eval (v1, op, v2)
1346
+ if (op == "+") then return v1 + v2
1347
+ elseif (op == "-") then return v1 - v2
1348
+ elseif (op == "*") then return v1 * v2
1349
+ elseif (op == "/") then return v1 / v2
1350
+ end
1351
+ end
1352
+
1353
+ -- Grammar
1354
+ local V = lpeg.V
1355
+ G = lpeg.P{ "Exp",
1356
+ Exp = lpeg.Cf(V"Term" * lpeg.Cg(TermOp * V"Term")^0, eval);
1357
+ Term = lpeg.Cf(V"Factor" * lpeg.Cg(FactorOp * V"Factor")^0, eval);
1358
+ Factor = Number / tonumber + Open * V"Exp" * Close;
1359
+ }
1360
+
1361
+ -- small example
1362
+ print(lpeg.match(G, "3 + 5*9 / (1+1) - 12")) --> 13.5
1363
+ </pre>
1364
+ <p>
1365
+ Note the use of the fold (accumulator) capture.
1366
+ To compute the value of an expression,
1367
+ the accumulator starts with the value of the first term,
1368
+ and then applies <code>eval</code> over
1369
+ the accumulator, the operator,
1370
+ and the new term for each repetition.
1371
+ </p>
1372
+
1373
+
1374
+
1375
+ <h2><a name="download"></a>Download</h2>
1376
+
1377
+ <p>LPeg
1378
+ <a href="http://www.inf.puc-rio.br/~roberto/lpeg/lpeg-0.12.1.tar.gz">source code</a>.</p>
1379
+
1380
+
1381
+ <h2><a name="license">License</a></h2>
1382
+
1383
+ <p>
1384
+ Copyright &copy; 2014 Lua.org, PUC-Rio.
1385
+ </p>
1386
+ <p>
1387
+ Permission is hereby granted, free of charge,
1388
+ to any person obtaining a copy of this software and
1389
+ associated documentation files (the "Software"),
1390
+ to deal in the Software without restriction,
1391
+ including without limitation the rights to use,
1392
+ copy, modify, merge, publish, distribute, sublicense,
1393
+ and/or sell copies of the Software,
1394
+ and to permit persons to whom the Software is
1395
+ furnished to do so,
1396
+ subject to the following conditions:
1397
+ </p>
1398
+
1399
+ <p>
1400
+ The above copyright notice and this permission notice
1401
+ shall be included in all copies or substantial portions of the Software.
1402
+ </p>
1403
+
1404
+ <p>
1405
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
1406
+ EXPRESS OR IMPLIED,
1407
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1408
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
1409
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
1410
+ DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
1411
+ TORT OR OTHERWISE, ARISING FROM,
1412
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
1413
+ THE SOFTWARE.
1414
+ </p>
1415
+
1416
+ </div> <!-- id="content" -->
1417
+
1418
+ </div> <!-- id="main" -->
1419
+
1420
+ <div id="about">
1421
+ <p><small>
1422
+ $Id: lpeg.html,v 1.72 2014/12/12 17:11:35 roberto Exp $
1423
+ </small></p>
1424
+ </div> <!-- id="about" -->
1425
+
1426
+ </div> <!-- id="container" -->
1427
+
1428
+ </body>
1429
+ </html>