immunio 0.15.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +234 -0
  3. data/README.md +147 -0
  4. data/bin/immunio +5 -0
  5. data/lib/immunio.rb +29 -0
  6. data/lib/immunio/agent.rb +260 -0
  7. data/lib/immunio/authentication.rb +96 -0
  8. data/lib/immunio/blocked_app.rb +38 -0
  9. data/lib/immunio/channel.rb +432 -0
  10. data/lib/immunio/cli.rb +39 -0
  11. data/lib/immunio/context.rb +114 -0
  12. data/lib/immunio/errors.rb +43 -0
  13. data/lib/immunio/immunio_ca.crt +45 -0
  14. data/lib/immunio/logger.rb +87 -0
  15. data/lib/immunio/plugins/action_dispatch.rb +45 -0
  16. data/lib/immunio/plugins/action_view.rb +431 -0
  17. data/lib/immunio/plugins/active_record.rb +707 -0
  18. data/lib/immunio/plugins/active_record_relation.rb +370 -0
  19. data/lib/immunio/plugins/authlogic.rb +80 -0
  20. data/lib/immunio/plugins/csrf.rb +24 -0
  21. data/lib/immunio/plugins/devise.rb +40 -0
  22. data/lib/immunio/plugins/environment_reporter.rb +69 -0
  23. data/lib/immunio/plugins/eval.rb +51 -0
  24. data/lib/immunio/plugins/exception_handler.rb +55 -0
  25. data/lib/immunio/plugins/gems_tracker.rb +5 -0
  26. data/lib/immunio/plugins/haml.rb +36 -0
  27. data/lib/immunio/plugins/http_finisher.rb +50 -0
  28. data/lib/immunio/plugins/http_tracker.rb +203 -0
  29. data/lib/immunio/plugins/io.rb +96 -0
  30. data/lib/immunio/plugins/redirect.rb +42 -0
  31. data/lib/immunio/plugins/warden.rb +66 -0
  32. data/lib/immunio/processor.rb +234 -0
  33. data/lib/immunio/rails.rb +26 -0
  34. data/lib/immunio/request.rb +139 -0
  35. data/lib/immunio/rufus_lua_ext/ref.rb +27 -0
  36. data/lib/immunio/rufus_lua_ext/state.rb +157 -0
  37. data/lib/immunio/rufus_lua_ext/table.rb +137 -0
  38. data/lib/immunio/rufus_lua_ext/utils.rb +13 -0
  39. data/lib/immunio/version.rb +5 -0
  40. data/lib/immunio/vm.rb +291 -0
  41. data/lua-hooks/ext/all.c +78 -0
  42. data/lua-hooks/ext/bitop/README +22 -0
  43. data/lua-hooks/ext/bitop/bit.c +189 -0
  44. data/lua-hooks/ext/extconf.rb +38 -0
  45. data/lua-hooks/ext/libinjection/COPYING +37 -0
  46. data/lua-hooks/ext/libinjection/libinjection.h +65 -0
  47. data/lua-hooks/ext/libinjection/libinjection_html5.c +847 -0
  48. data/lua-hooks/ext/libinjection/libinjection_html5.h +54 -0
  49. data/lua-hooks/ext/libinjection/libinjection_sqli.c +2301 -0
  50. data/lua-hooks/ext/libinjection/libinjection_sqli.h +295 -0
  51. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +9349 -0
  52. data/lua-hooks/ext/libinjection/libinjection_xss.c +531 -0
  53. data/lua-hooks/ext/libinjection/libinjection_xss.h +21 -0
  54. data/lua-hooks/ext/libinjection/lualib.c +109 -0
  55. data/lua-hooks/ext/lpeg/HISTORY +90 -0
  56. data/lua-hooks/ext/lpeg/lpcap.c +537 -0
  57. data/lua-hooks/ext/lpeg/lpcap.h +43 -0
  58. data/lua-hooks/ext/lpeg/lpcode.c +986 -0
  59. data/lua-hooks/ext/lpeg/lpcode.h +34 -0
  60. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  61. data/lua-hooks/ext/lpeg/lpeg.html +1429 -0
  62. data/lua-hooks/ext/lpeg/lpprint.c +244 -0
  63. data/lua-hooks/ext/lpeg/lpprint.h +35 -0
  64. data/lua-hooks/ext/lpeg/lptree.c +1238 -0
  65. data/lua-hooks/ext/lpeg/lptree.h +77 -0
  66. data/lua-hooks/ext/lpeg/lptypes.h +149 -0
  67. data/lua-hooks/ext/lpeg/lpvm.c +355 -0
  68. data/lua-hooks/ext/lpeg/lpvm.h +58 -0
  69. data/lua-hooks/ext/lpeg/makefile +55 -0
  70. data/lua-hooks/ext/lpeg/re.html +498 -0
  71. data/lua-hooks/ext/lpeg/test.lua +1409 -0
  72. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +45 -0
  73. data/lua-hooks/ext/lua-cmsgpack/README.md +115 -0
  74. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +957 -0
  75. data/lua-hooks/ext/lua-cmsgpack/test.lua +570 -0
  76. data/lua-hooks/ext/lua-snapshot/LICENSE +7 -0
  77. data/lua-hooks/ext/lua-snapshot/Makefile +12 -0
  78. data/lua-hooks/ext/lua-snapshot/README.md +18 -0
  79. data/lua-hooks/ext/lua-snapshot/dump.lua +15 -0
  80. data/lua-hooks/ext/lua-snapshot/snapshot.c +455 -0
  81. data/lua-hooks/ext/lua/COPYRIGHT +34 -0
  82. data/lua-hooks/ext/lua/lapi.c +1087 -0
  83. data/lua-hooks/ext/lua/lapi.h +16 -0
  84. data/lua-hooks/ext/lua/lauxlib.c +652 -0
  85. data/lua-hooks/ext/lua/lauxlib.h +174 -0
  86. data/lua-hooks/ext/lua/lbaselib.c +659 -0
  87. data/lua-hooks/ext/lua/lcode.c +831 -0
  88. data/lua-hooks/ext/lua/lcode.h +76 -0
  89. data/lua-hooks/ext/lua/ldblib.c +398 -0
  90. data/lua-hooks/ext/lua/ldebug.c +638 -0
  91. data/lua-hooks/ext/lua/ldebug.h +33 -0
  92. data/lua-hooks/ext/lua/ldo.c +519 -0
  93. data/lua-hooks/ext/lua/ldo.h +57 -0
  94. data/lua-hooks/ext/lua/ldump.c +164 -0
  95. data/lua-hooks/ext/lua/lfunc.c +174 -0
  96. data/lua-hooks/ext/lua/lfunc.h +34 -0
  97. data/lua-hooks/ext/lua/lgc.c +710 -0
  98. data/lua-hooks/ext/lua/lgc.h +110 -0
  99. data/lua-hooks/ext/lua/linit.c +38 -0
  100. data/lua-hooks/ext/lua/liolib.c +556 -0
  101. data/lua-hooks/ext/lua/llex.c +463 -0
  102. data/lua-hooks/ext/lua/llex.h +81 -0
  103. data/lua-hooks/ext/lua/llimits.h +128 -0
  104. data/lua-hooks/ext/lua/lmathlib.c +263 -0
  105. data/lua-hooks/ext/lua/lmem.c +86 -0
  106. data/lua-hooks/ext/lua/lmem.h +49 -0
  107. data/lua-hooks/ext/lua/loadlib.c +705 -0
  108. data/lua-hooks/ext/lua/loadlib_rel.c +760 -0
  109. data/lua-hooks/ext/lua/lobject.c +214 -0
  110. data/lua-hooks/ext/lua/lobject.h +381 -0
  111. data/lua-hooks/ext/lua/lopcodes.c +102 -0
  112. data/lua-hooks/ext/lua/lopcodes.h +268 -0
  113. data/lua-hooks/ext/lua/loslib.c +243 -0
  114. data/lua-hooks/ext/lua/lparser.c +1339 -0
  115. data/lua-hooks/ext/lua/lparser.h +82 -0
  116. data/lua-hooks/ext/lua/lstate.c +214 -0
  117. data/lua-hooks/ext/lua/lstate.h +169 -0
  118. data/lua-hooks/ext/lua/lstring.c +111 -0
  119. data/lua-hooks/ext/lua/lstring.h +31 -0
  120. data/lua-hooks/ext/lua/lstrlib.c +871 -0
  121. data/lua-hooks/ext/lua/ltable.c +588 -0
  122. data/lua-hooks/ext/lua/ltable.h +40 -0
  123. data/lua-hooks/ext/lua/ltablib.c +287 -0
  124. data/lua-hooks/ext/lua/ltm.c +75 -0
  125. data/lua-hooks/ext/lua/ltm.h +54 -0
  126. data/lua-hooks/ext/lua/lua.c +392 -0
  127. data/lua-hooks/ext/lua/lua.def +131 -0
  128. data/lua-hooks/ext/lua/lua.h +388 -0
  129. data/lua-hooks/ext/lua/lua.rc +28 -0
  130. data/lua-hooks/ext/lua/lua_dll.rc +26 -0
  131. data/lua-hooks/ext/lua/luac.c +200 -0
  132. data/lua-hooks/ext/lua/luac.rc +1 -0
  133. data/lua-hooks/ext/lua/luaconf.h +763 -0
  134. data/lua-hooks/ext/lua/luaconf.h.in +724 -0
  135. data/lua-hooks/ext/lua/luaconf.h.orig +763 -0
  136. data/lua-hooks/ext/lua/lualib.h +53 -0
  137. data/lua-hooks/ext/lua/lundump.c +227 -0
  138. data/lua-hooks/ext/lua/lundump.h +36 -0
  139. data/lua-hooks/ext/lua/lvm.c +767 -0
  140. data/lua-hooks/ext/lua/lvm.h +36 -0
  141. data/lua-hooks/ext/lua/lzio.c +82 -0
  142. data/lua-hooks/ext/lua/lzio.h +67 -0
  143. data/lua-hooks/ext/lua/print.c +227 -0
  144. data/lua-hooks/ext/luautf8/README.md +152 -0
  145. data/lua-hooks/ext/luautf8/lutf8lib.c +1274 -0
  146. data/lua-hooks/ext/luautf8/unidata.h +3064 -0
  147. data/lua-hooks/lib/boot.lua +254 -0
  148. data/lua-hooks/lib/encode.lua +4 -0
  149. data/lua-hooks/lib/lexers/LICENSE +21 -0
  150. data/lua-hooks/lib/lexers/bash.lua +134 -0
  151. data/lua-hooks/lib/lexers/bash_dqstr.lua +62 -0
  152. data/lua-hooks/lib/lexers/css.lua +216 -0
  153. data/lua-hooks/lib/lexers/html.lua +106 -0
  154. data/lua-hooks/lib/lexers/javascript.lua +68 -0
  155. data/lua-hooks/lib/lexers/lexer.lua +1575 -0
  156. data/lua-hooks/lib/lexers/markers.lua +33 -0
  157. metadata +308 -0
@@ -0,0 +1,1274 @@
1
+ /* Modified to allow bundling.
2
+ * Original source: https://github.com/starwing/luautf8 */
3
+ /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */
4
+ #define LUA_LIB
5
+ #include "lua/lua.h"
6
+ #include "lua/lauxlib.h"
7
+ #include "lua/lualib.h"
8
+
9
+
10
+ #include <assert.h>
11
+ #include <string.h>
12
+
13
+
14
+ /* UTF-8 string operations */
15
+
16
/* Size of the scratch buffer used when encoding a single code point
 * (utf8_encode writes at most 6 bytes). */
#define UTF_MAX 8
17
+
18
/* Encode code point `ch` as a UTF-8 style byte sequence into `s`.
 *
 * Sequences of 1 to 6 bytes are produced (values up to 0x7FFFFFFF).
 * Anything larger is replaced by U+FFFD and written as 3 bytes.
 * `s` must have room for at least 6 bytes; no terminator is written.
 * Returns the number of bytes stored. */
static size_t utf8_encode(char *s, unsigned int ch) {
  size_t len, i;
  unsigned int lead_mark;

  if (ch < 0x80) {
    s[0] = (char)ch;
    return 1;
  }

  /* out of range: substitute the replacement character */
  if (ch > 0x7FFFFFFF)
    ch = 0xFFFD;

  if (ch <= 0x7FF)          { len = 2; lead_mark = 0xC0; }
  else if (ch <= 0xFFFF)    { len = 3; lead_mark = 0xE0; }
  else if (ch <= 0x1FFFFF)  { len = 4; lead_mark = 0xF0; }
  else if (ch <= 0x3FFFFFF) { len = 5; lead_mark = 0xF8; }
  else                      { len = 6; lead_mark = 0xFC; }

  /* fill continuation bytes from the tail, 6 payload bits each */
  for (i = len - 1; i > 0; --i) {
    s[i] = (char)(0x80 | (ch & 0x3F));
    ch >>= 6;
  }
  /* whatever is left goes into the lead byte under its length marker */
  s[0] = (char)(ch | lead_mark);
  return len;
}
64
+
65
/* Decode one character from the byte range [s, e).
 *
 * On success stores the code point in *pch and returns the number of bytes
 * consumed (2..6 for multi-byte sequences).
 * If s >= e, stores 0 and returns 0.
 * For an ASCII byte, a stray continuation byte, or any malformed/truncated
 * sequence, the first byte is stored as-is and 1 is returned, so callers
 * always make progress.
 *
 * Fixes relative to the previous version of the 4..6 byte path:
 *  - the accumulator is initialised before use;
 *  - the decoded value is actually written to *pch;
 *  - continuation bytes are never read at or beyond `e`;
 *  - the fallback reports the original lead byte, not a shifted copy. */
static size_t utf8_decode(const char *s, const char *e, unsigned int *pch) {
  unsigned int ch;

  if (s >= e) {
    *pch = 0;
    return 0;
  }

  ch = (unsigned char)s[0];
  if (ch < 0xC0) goto fallback;   /* ASCII or stray continuation byte */
  if (ch < 0xE0) {                /* 2-byte sequence */
    if (s+1 >= e || (s[1] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch   & 0x1F) << 6) |
            (s[1] & 0x3F);
    return 2;
  }
  if (ch < 0xF0) {                /* 3-byte sequence */
    if (s+2 >= e || (s[1] & 0xC0) != 0x80
                 || (s[2] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch   & 0x0F) << 12) |
           ((s[1] & 0x3F) <<  6) |
            (s[2] & 0x3F);
    return 3;
  }
  {                               /* 4..6 byte sequence */
    int count = 0;                /* number of continuation bytes seen */
    unsigned int lead = ch;       /* shifted copy; `ch` stays intact for fallback */
    unsigned int res = 0;
    while ((lead & 0x40) != 0) {  /* lead byte announces another byte? */
      unsigned int cc;
      if (++count > 5 || s + count >= e)
        goto fallback;            /* over-long lead (0xFE/0xFF) or truncated input */
      cc = (unsigned char)s[count];
      if ((cc & 0xC0) != 0x80)    /* not a continuation byte? */
        goto fallback;
      res = (res << 6) | (cc & 0x3F);
      lead <<= 1;                 /* expose the next length bit */
    }
    res |= ((lead & 0x7F) << (count * 5)); /* add payload bits of the lead byte */
    *pch = res;
    return count+1;
  }

fallback:
  *pch = ch;
  return 1;
}
111
+
112
/* Return the position just past the character that starts at `s`.
 * The decoded code point is discarded; when s >= e, `s` itself is
 * returned because utf8_decode consumes nothing. */
static const char *utf8_next(const char *s, const char *e) {
  unsigned int ignored;
  size_t step = utf8_decode(s, e, &ignored);
  return s + step;
}
116
+
117
/* Scan backwards from `e` and return the address of the nearest byte that
 * is not a continuation byte (10xxxxxx), i.e. the start of the last
 * character in [s, e).  Returns `s` if no such byte is found. */
static const char *utf8_prev(const char *s, const char *e) {
  const char *p;

  for (p = e - 1; p >= s; --p) {
    unsigned int byte = (unsigned char)*p;
    if ((byte & 0xC0) != 0x80)   /* ASCII or lead byte */
      return p;
  }
  return s;
}
129
+
130
/* Count the characters in [s, e).  Bytes below 0xC0 (ASCII and stray
 * continuation bytes) count as one character each; everything else is
 * stepped over with utf8_next. */
static size_t utf8_length(const char *s, const char *e) {
  size_t n;

  for (n = 0; s < e; ++n) {
    unsigned int lead = (unsigned char)*s;
    s = (lead < 0xC0) ? s + 1 : utf8_next(s, e);
  }
  return n;
}
141
+
142
/* Translate a 1-based character index into a byte pointer within [s, e).
 *
 * idx >= 0: advance (idx - 1) characters from `s` (0 and 1 both yield `s`),
 *           stopping early at `e`.
 * idx <  0: step back -idx characters from `e`, stopping early at `s`. */
static const char *utf8_index(const char *s, const char *e, int idx) {
  if (idx < 0) {
    while (s < e && idx++ < 0)
      e = utf8_prev(s, e);
    return e;
  }

  while (s < e && --idx > 0)
    s = utf8_next(s, e);
  return s;
}
154
+
155
+
156
+ /* Unicode character categories */
157
+
158
+ #include "unidata.h"
159
+
160
+ static int find_in_range(range_table *t, size_t size, unsigned int ch) {
161
+ size_t first, last;
162
+
163
+ first = 0;
164
+ last = size;
165
+
166
+ while (first < last) {
167
+ int mid = (first + last) / 2;
168
+ if (t[mid].last < ch)
169
+ first = mid + 1;
170
+ else if (t[mid].first > ch)
171
+ last = mid;
172
+ else
173
+ return (ch - t[mid].first) % t[mid].step == 0;
174
+ }
175
+
176
+ return 0;
177
+ }
178
+
179
+ static int convert_char(conv_table *t, size_t size, unsigned int ch) {
180
+ size_t first, last;
181
+
182
+ first = 0;
183
+ last = size;
184
+
185
+ while (first < last) {
186
+ int mid = (first + last) / 2;
187
+ if (t[mid].last < ch)
188
+ first = mid + 1;
189
+ else if (t[mid].first > ch)
190
+ last = mid;
191
+ else if ((ch - t[mid].first) % t[mid].step == 0)
192
+ return ch + t[mid].offset;
193
+ else
194
+ return ch;
195
+ }
196
+
197
+ return ch;
198
+ }
199
+
200
+ #define table_size(t) (sizeof(t)/sizeof((t)[0]))
201
+
202
+ #define define_category(name) static int utf8_is##name(unsigned int ch) \
203
+ { return find_in_range(name##_table, table_size(name##_table), ch); }
204
+
205
+ #define define_converter(name) static unsigned int utf8_##name(unsigned int ch) \
206
+ { return convert_char(name##_table, table_size(name##_table), ch); }
207
+
208
+ define_category(alpha)
209
+ define_category(lower)
210
+ define_category(upper)
211
+ define_category(cntrl)
212
+ define_category(digit)
213
+ define_category(xdigit)
214
+ define_category(punct)
215
+ define_category(space)
216
+ define_converter(tolower)
217
+ define_converter(toupper)
218
+ define_converter(totitle)
219
+ define_converter(tofold)
220
+
221
+ #undef define_category
222
+ #undef define_converter
223
+
224
+ static int utf8_isgraph(unsigned int ch) {
225
+ if (find_in_range(space_table, table_size(space_table), ch))
226
+ return 0;
227
+ if (find_in_range(graph_table, table_size(graph_table), ch))
228
+ return 1;
229
+ if (find_in_range(compose_table, table_size(compose_table), ch))
230
+ return 1;
231
+ return 0;
232
+ }
233
+
234
+ static int utf8_isalnum(unsigned int ch) {
235
+ if (find_in_range(alpha_table, table_size(alpha_table), ch))
236
+ return 1;
237
+ if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
238
+ return 1;
239
+ return 0;
240
+ }
241
+
242
+ static int utf8_width(unsigned int ch, int ambi_is_single) {
243
+ if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
244
+ return 2;
245
+ if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
246
+ return ambi_is_single ? 1 : 2;
247
+ if (find_in_range(compose_table, table_size(compose_table), ch))
248
+ return 0;
249
+ if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
250
+ return 0;
251
+ return 1;
252
+ }
253
+
254
+
255
+ /* string module compatible interface */
256
+
257
+ static const char *check_utf8(lua_State *L, int idx, const char **end) {
258
+ size_t len;
259
+ const char *s = luaL_checklstring(L, idx, &len);
260
+ if (end) *end = s+len;
261
+ return s;
262
+ }
263
+
264
+ static const char *to_utf8(lua_State *L, int idx, const char **end) {
265
+ size_t len;
266
+ const char *s = lua_tolstring(L, idx, &len);
267
+ if (end) *end = s+len;
268
+ return s;
269
+ }
270
+
271
+ static void add_utf8char(luaL_Buffer *b, unsigned int ch) {
272
+ char buff[UTF_MAX];
273
+ size_t n = utf8_encode(buff, ch);
274
+ luaL_addlstring(b, buff, n);
275
+ }
276
+
277
+ static lua_Integer byterelat(lua_Integer pos, size_t len) {
278
+ if (pos >= 0) return pos;
279
+ else if (0u - (size_t)pos > len) return 0;
280
+ else return (lua_Integer)len + pos + 1;
281
+ }
282
+
283
+ static int u_posrange(const char **ps, const char **pe,
284
+ lua_Integer posi, lua_Integer posj) {
285
+ const char *s = *ps, *e = *pe;
286
+ *ps = utf8_index(s, e, posi);
287
+ if (posj >= 0) {
288
+ while (s < e && posj-- > 0)
289
+ s = utf8_next(s, e);
290
+ *pe = s;
291
+ }
292
+ else {
293
+ while (s < e && ++posj < 0)
294
+ e = utf8_prev(s, e);
295
+ *pe = e;
296
+ }
297
+ return *ps < *pe;
298
+ }
299
+
300
+ static int Lutf8_len(lua_State *L) {
301
+ size_t len;
302
+ const char *s = luaL_checklstring(L, 1, &len);
303
+ lua_Integer posi = byterelat(luaL_optinteger(L, 2, 1), len);
304
+ lua_Integer posj = byterelat(luaL_optinteger(L, 3, -1), len);
305
+ if (posi < 1 || --posi > (lua_Integer)len
306
+ || --posj > (lua_Integer)len)
307
+ return 0;
308
+ lua_pushinteger(L, (lua_Integer)utf8_length(s+posi, s+posj+1));
309
+ return 1;
310
+ }
311
+
312
+ static int Lutf8_sub(lua_State *L) {
313
+ const char *e, *s = check_utf8(L, 1, &e);
314
+ if (u_posrange(&s, &e,
315
+ luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
316
+ lua_pushlstring(L, s, e-s);
317
+ else
318
+ lua_pushliteral(L, "");
319
+ return 1;
320
+ }
321
+
322
+ static int Lutf8_reverse(lua_State *L) {
323
+ luaL_Buffer b;
324
+ /* XXX should handle compose unicode? */
325
+ const char *e, *s = check_utf8(L, 1, &e);
326
+ luaL_buffinit(L, &b);
327
+ while (s < e) {
328
+ const char *prev = utf8_prev(s, e);
329
+ luaL_addlstring(&b, prev, e-prev);
330
+ e = prev;
331
+ }
332
+ luaL_pushresult(&b);
333
+ return 1;
334
+ }
335
+
336
+ static int convert(lua_State *L, unsigned int (*conv)(unsigned int)) {
337
+ int t = lua_type(L, 1);
338
+ if (t == LUA_TNUMBER)
339
+ lua_pushinteger(L, conv(lua_tointeger(L, 1)));
340
+ else if (t != LUA_TSTRING)
341
+ return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
342
+ else {
343
+ luaL_Buffer b;
344
+ const char *e, *s = to_utf8(L, 1, &e);
345
+ luaL_buffinit(L, &b);
346
+ while (s < e) {
347
+ unsigned int ch;
348
+ s += utf8_decode(s, e, &ch);
349
+ ch = conv(ch);
350
+ add_utf8char(&b, ch);
351
+ }
352
+ luaL_pushresult(&b);
353
+ }
354
+ return 1;
355
+ }
356
+
357
+ static int Lutf8_lower(lua_State *L)
358
+ { return convert(L, utf8_tolower); }
359
+
360
+ static int Lutf8_upper(lua_State *L)
361
+ { return convert(L, utf8_toupper); }
362
+
363
+ static int Lutf8_title(lua_State *L)
364
+ { return convert(L, utf8_totitle); }
365
+
366
+ static int Lutf8_fold(lua_State *L)
367
+ { return convert(L, utf8_tofold); }
368
+
369
+ static int Lutf8_byte(lua_State *L) {
370
+ size_t n = 0;
371
+ const char *e, *s = check_utf8(L, 1, &e);
372
+ lua_Integer posi = luaL_optinteger(L, 2, 1);
373
+ lua_Integer posj = luaL_optinteger(L, 3, posi);
374
+ if (u_posrange(&s, &e, posi, posj)) {
375
+ luaL_checkstack(L, e-s, "string slice too long");
376
+ while (s < e) {
377
+ unsigned int ch;
378
+ s += utf8_decode(s, e, &ch);
379
+ lua_pushinteger(L, ch);
380
+ ++n;
381
+ }
382
+ }
383
+ return n;
384
+ }
385
+
386
+ static int Lutf8_char(lua_State *L) {
387
+ int i, n = lua_gettop(L); /* number of arguments */
388
+ luaL_Buffer b;
389
+ luaL_buffinit(L, &b);
390
+ for (i = 1; i <= n; ++i) {
391
+ unsigned int ch = luaL_checkint(L, i);
392
+ add_utf8char(&b, ch);
393
+ }
394
+ luaL_pushresult(&b);
395
+ return 1;
396
+ }
397
+
398
+
399
+ /* unicode extra interface */
400
+
401
+ static const char *parse_escape(lua_State *L,
402
+ const char *s, const char *e,
403
+ int is_hex, unsigned int *pch) {
404
+ unsigned int escape = 0, ch;
405
+ int in_bracket = 0;
406
+ if (*s == '{') ++s, in_bracket = 1;
407
+ while (s < e) {
408
+ ch = (unsigned char)*s;
409
+ if (in_bracket && ch == '}') {
410
+ ++s;
411
+ break;
412
+ }
413
+ if (ch >= '0' && ch <= '9')
414
+ ch = ch - '0';
415
+ else if (is_hex && ch >= 'A' && ch <= 'F')
416
+ ch = 10 + (ch - 'A');
417
+ else if (is_hex && ch >= 'a' && ch <= 'f')
418
+ ch = 10 + (ch - 'a');
419
+ else {
420
+ if (in_bracket)
421
+ luaL_error(L, "invalid escape '%c'", ch);
422
+ break;
423
+ }
424
+ escape *= is_hex ? 16 : 10;
425
+ escape += ch;
426
+ ++s;
427
+ }
428
+ *pch = escape;
429
+ return s;
430
+ }
431
+
432
+ static int Lutf8_escape(lua_State *L) {
433
+ const char *e, *s = check_utf8(L, 1, &e);
434
+ luaL_Buffer b;
435
+ luaL_buffinit(L, &b);
436
+ while (s < e) {
437
+ unsigned int ch;
438
+ s += utf8_decode(s, e, &ch);
439
+ if (ch == '%') {
440
+ int is_hex = 0;
441
+ switch (*s) {
442
+ case '0': case '1': case '2': case '3':
443
+ case '4': case '5': case '6': case '7':
444
+ case '8': case '9': case '{':
445
+ break;
446
+ case 'u': case 'U': ++s; break;
447
+ case 'x': case 'X': ++s; is_hex = 1; break;
448
+ default:
449
+ s += utf8_decode(s, e, &ch);
450
+ goto next;
451
+ }
452
+ if (s >= e)
453
+ luaL_error(L, "invalid escape sequence");
454
+ s = parse_escape(L, s, e, is_hex, &ch);
455
+ }
456
+ next:
457
+ add_utf8char(&b, ch);
458
+ }
459
+ luaL_pushresult(&b);
460
+ return 1;
461
+ }
462
+
463
+ static int Lutf8_insert(lua_State *L) {
464
+ const char *e, *s = check_utf8(L, 1, &e);
465
+ size_t sublen;
466
+ const char *subs;
467
+ luaL_Buffer b;
468
+ int nargs = 2;
469
+ const char *first = e;
470
+ if (lua_type(L, 2) == LUA_TNUMBER) {
471
+ int idx = (int)lua_tointeger(L, 2);
472
+ if (idx != 0) first = utf8_index(s, e, idx);
473
+ ++nargs;
474
+ }
475
+ subs = luaL_checklstring(L, nargs, &sublen);
476
+ luaL_buffinit(L, &b);
477
+ luaL_addlstring(&b, s, first-s);
478
+ luaL_addlstring(&b, subs, sublen);
479
+ luaL_addlstring(&b, first, e-first);
480
+ luaL_pushresult(&b);
481
+ return 1;
482
+ }
483
+
484
+ static int Lutf8_remove(lua_State *L) {
485
+ const char *e, *s = check_utf8(L, 1, &e);
486
+ const char *start = s, *end = e;
487
+ if (!u_posrange(&start, &end,
488
+ luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
489
+ lua_settop(L, 1);
490
+ else {
491
+ luaL_Buffer b;
492
+ luaL_buffinit(L, &b);
493
+ luaL_addlstring(&b, s, start-s);
494
+ luaL_addlstring(&b, end, e-end);
495
+ luaL_pushresult(&b);
496
+ }
497
+ return 1;
498
+ }
499
+
500
+ static int push_offset(lua_State *L, const char *s, const char *e,
501
+ const char *cur, lua_Integer offset) {
502
+ unsigned int ch;
503
+ if (offset >= 0) {
504
+ while (cur < e && offset-- > 0)
505
+ cur = utf8_next(cur, e);
506
+ if (offset >= 0) return 0;
507
+ }
508
+ else {
509
+ while (s < cur && offset++ < 0)
510
+ cur = utf8_prev(s, cur);
511
+ if (offset < 0) return 0;
512
+ }
513
+ utf8_decode(cur, e, &ch);
514
+ lua_pushinteger(L, cur-s+1);
515
+ lua_pushinteger(L, ch);
516
+ return 2;
517
+ }
518
+
519
+ static int Lutf8_charpos(lua_State *L) {
520
+ size_t len;
521
+ const char *s = luaL_checklstring(L, 1, &len);
522
+ const char *cur = s;
523
+ lua_Integer pos;
524
+ if (lua_isnoneornil(L, 3)) {
525
+ lua_Integer offset = luaL_optinteger(L, 2, 1);
526
+ if (offset > 0) --offset;
527
+ else if (offset < 0) cur = s+len;
528
+ return push_offset(L, s, s+len, cur, offset);
529
+ }
530
+ pos = byterelat(luaL_optinteger(L, 2, 1), len);
531
+ if (pos != 0) cur += pos-1;
532
+ return push_offset(L, s, s+len, cur, luaL_checkinteger(L, 3));
533
+ }
534
+
535
+ static int Lutf8_next(lua_State *L) {
536
+ size_t len;
537
+ const char *s = luaL_checklstring(L, 1, &len);
538
+ const char *cur = s;
539
+ lua_Integer offset = 0;
540
+ if (!lua_isnoneornil(L, 2)) {
541
+ lua_Integer pos = byterelat(luaL_checkinteger(L, 2), len);
542
+ if (pos != 0) cur += pos-1;
543
+ offset = 1;
544
+ }
545
+ offset = luaL_optinteger(L, 3, offset);
546
+ return push_offset(L, s, s+len, cur, offset);
547
+ }
548
+
549
+ static int Lutf8_width(lua_State *L) {
550
+ int t = lua_type(L, 1);
551
+ int ambi_is_single = !lua_toboolean(L, 2);
552
+ int default_width = luaL_optinteger(L, 3, 0);
553
+ if (t == LUA_TNUMBER) {
554
+ size_t chwidth = utf8_width(lua_tointeger(L, 1), ambi_is_single);
555
+ if (chwidth == 0) chwidth = default_width;
556
+ lua_pushinteger(L, (lua_Integer)chwidth);
557
+ }
558
+ else if (t != LUA_TSTRING)
559
+ return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
560
+ else {
561
+ const char *e, *s = to_utf8(L, 1, &e);
562
+ size_t width = 0;
563
+ while (s < e) {
564
+ unsigned int ch;
565
+ size_t chwidth;
566
+ s += utf8_decode(s, e, &ch);
567
+ chwidth = utf8_width(ch, ambi_is_single);
568
+ width += chwidth == 0 ? default_width : chwidth;
569
+ }
570
+ lua_pushinteger(L, (lua_Integer)width);
571
+ }
572
+ return 1;
573
+ }
574
+
575
+ static int Lutf8_widthindex(lua_State *L) {
576
+ const char *e, *s = check_utf8(L, 1, &e);
577
+ int width = luaL_checkinteger(L, 2);
578
+ int ambi_is_single = !lua_toboolean(L, 3);
579
+ int default_width = luaL_optinteger(L, 4, 0);
580
+ size_t idx = 1;
581
+ while (s < e) {
582
+ unsigned int ch;
583
+ size_t chwidth;
584
+ s += utf8_decode(s, e, &ch);
585
+ chwidth = utf8_width(ch, ambi_is_single);
586
+ if (chwidth == 0) chwidth = default_width;
587
+ width -= chwidth;
588
+ if (width <= 0) {
589
+ lua_pushinteger(L, idx);
590
+ lua_pushinteger(L, width + chwidth);
591
+ lua_pushinteger(L, chwidth);
592
+ return 3;
593
+ }
594
+ ++idx;
595
+ }
596
+ lua_pushinteger(L, (lua_Integer)idx);
597
+ return 1;
598
+ }
599
+
600
+ static int Lutf8_ncasecmp(lua_State *L) {
601
+ const char *e1, *s1 = check_utf8(L, 1, &e1);
602
+ const char *e2, *s2 = check_utf8(L, 2, &e2);
603
+ while (s1 < e1 || s2 < e2) {
604
+ unsigned int ch1 = 0, ch2 = 0;
605
+ if (s1 == e1)
606
+ ch2 = 1;
607
+ else if (s2 == e2)
608
+ ch1 = 1;
609
+ else {
610
+ s1 += utf8_decode(s1, e1, &ch1);
611
+ s2 += utf8_decode(s2, e2, &ch2);
612
+ ch1 = utf8_tofold(ch1);
613
+ ch2 = utf8_tofold(ch2);
614
+ }
615
+ if (ch1 != ch2) {
616
+ lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
617
+ return 1;
618
+ }
619
+ }
620
+ lua_pushinteger(L, 0);
621
+ return 1;
622
+ }
623
+
624
+
625
+ /* utf8 pattern matching implement */
626
+
627
+ #ifndef LUA_MAXCAPTURES
628
+ # define LUA_MAXCAPTURES 32
629
+ #endif /* LUA_MAXCAPTURES */
630
+
631
+ #define CAP_UNFINISHED (-1)
632
+ #define CAP_POSITION (-2)
633
+
634
+
635
+ typedef struct utf8MatchState {
636
+ int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
637
+ const char *src_init; /* init of source string */
638
+ const char *src_end; /* end ('\0') of source string */
639
+ const char *p_end; /* end ('\0') of pattern */
640
+ lua_State *L;
641
+ int level; /* total number of captures (finished or unfinished) */
642
+ struct {
643
+ const char *init;
644
+ ptrdiff_t len;
645
+ } capture[LUA_MAXCAPTURES];
646
+ } utf8MatchState;
647
+
648
+ /* recursive function */
649
+ static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p);
650
+
651
+ /* maximum recursion depth for 'match' */
652
+ #if !defined(MAXCCALLS)
653
+ #define MAXCCALLS 200
654
+ #endif
655
+
656
+ #define L_ESC '%'
657
+ #define SPECIALS "^$*+?.([%-"
658
+
659
+ static int utf8_check_capture (utf8MatchState *ms, int l) {
660
+ l -= '1';
661
+ if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
662
+ return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
663
+ return l;
664
+ }
665
+
666
+ static int utf8_capture_to_close (utf8MatchState *ms) {
667
+ int level = ms->level;
668
+ for (level--; level>=0; level--)
669
+ if (ms->capture[level].len == CAP_UNFINISHED) return level;
670
+ return luaL_error(ms->L, "invalid pattern capture");
671
+ }
672
+
673
+ static const char *utf8_classend (utf8MatchState *ms, const char *p) {
674
+ unsigned int ch;
675
+ p += utf8_decode(p, ms->p_end, &ch);
676
+ switch (ch) {
677
+ case L_ESC: {
678
+ if (p == ms->p_end)
679
+ luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
680
+ return utf8_next(p, ms->p_end);
681
+ }
682
+ case '[': {
683
+ if (*p == '^') p++;
684
+ do { /* look for a `]' */
685
+ if (p == ms->p_end)
686
+ luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
687
+ if (*(p++) == L_ESC && p < ms->p_end)
688
+ p++; /* skip escapes (e.g. `%]') */
689
+ } while (*p != ']');
690
+ return p+1;
691
+ }
692
+ default: {
693
+ return p;
694
+ }
695
+ }
696
+ }
697
+
698
/* Test code point `c` against the class letter `cl` that followed '%'.
 *
 * Known letters (a c d g l p s u w x z) select a character class; when the
 * letter is not lower case the result is negated.  Any other `cl` is
 * compared with `c` literally. */
static int utf8_match_class (unsigned int c, unsigned int cl) {
  int matched;

  switch (utf8_tolower(cl)) {
    case 'a': matched = utf8_isalpha(c);  break;
    case 'c': matched = utf8_iscntrl(c);  break;
    case 'd': matched = utf8_isdigit(c);  break;
    case 'g': matched = utf8_isgraph(c);  break;
    case 'l': matched = utf8_islower(c);  break;
    case 'p': matched = utf8_ispunct(c);  break;
    case 's': matched = utf8_isspace(c);  break;
    case 'u': matched = utf8_isupper(c);  break;
    case 'w': matched = utf8_isalnum(c);  break;
    case 'x': matched = utf8_isxdigit(c); break;
    case 'z': matched = (c == 0);         break;  /* deprecated option */
    default:
      return (cl == c);
  }

  if (!utf8_islower(cl))
    matched = !matched;   /* non-lower-case letter: complement of the class */
  return matched;
}
716
+
717
+ static int utf8_matchbracketclass (unsigned int c, const char *p, const char *ec) {
718
+ int sig = 1;
719
+ assert(*p == '[');
720
+ if (*++p == '^') {
721
+ sig = 0;
722
+ p++; /* skip the `^' */
723
+ }
724
+ while (p < ec) {
725
+ unsigned int ch;
726
+ p += utf8_decode(p, ec, &ch);
727
+ if (ch == L_ESC) {
728
+ p += utf8_decode(p, ec, &ch);
729
+ if (utf8_match_class(c, ch))
730
+ return sig;
731
+ }
732
+ else {
733
+ unsigned int next;
734
+ const char *np = p + utf8_decode(p, ec, &next);
735
+ if (next == '-' && np < ec) {
736
+ p = np + utf8_decode(np, ec, &next);
737
+ if (ch <= c && c <= next)
738
+ return sig;
739
+ }
740
+ else if (ch == c) return sig;
741
+ }
742
+ }
743
+ return !sig;
744
+ }
745
+
746
+ static int utf8_singlematch (utf8MatchState *ms, const char *s, const char *p,
747
+ const char *ep) {
748
+ if (s >= ms->src_end)
749
+ return 0;
750
+ else {
751
+ unsigned int ch, pch;
752
+ utf8_decode(s, ms->src_end, &ch);
753
+ p += utf8_decode(p, ms->p_end, &pch);
754
+ switch (pch) {
755
+ case '.': return 1; /* matches any char */
756
+ case L_ESC: utf8_decode(p, ms->p_end, &pch);
757
+ return utf8_match_class(ch, pch);
758
+ case '[': return utf8_matchbracketclass(ch, p-1, ep-1);
759
+ default: return pch == ch;
760
+ }
761
+ }
762
+ }
763
+
764
+ static const char *utf8_matchbalance (utf8MatchState *ms, const char *s,
765
+ const char **p) {
766
+ unsigned int ch, begin, end;
767
+ *p += utf8_decode(*p, ms->p_end, &begin);
768
+ if (*p >= ms->p_end)
769
+ luaL_error(ms->L, "malformed pattern "
770
+ "(missing arguments to " LUA_QL("%%b") ")");
771
+ *p += utf8_decode(*p, ms->p_end, &end);
772
+ s += utf8_decode(s, ms->src_end, &ch);
773
+ if (ch != begin) return NULL;
774
+ else {
775
+ int cont = 1;
776
+ while (s < ms->src_end) {
777
+ s += utf8_decode(s, ms->src_end, &ch);
778
+ if (ch == end) {
779
+ if (--cont == 0) return s;
780
+ }
781
+ else if (ch == begin) cont++;
782
+ }
783
+ }
784
+ return NULL; /* string ends out of balance */
785
+ }
786
+
787
+ static const char *utf8_max_expand (utf8MatchState *ms, const char *s,
788
+ const char *p, const char *ep) {
789
+ const char *m = s; /* matched end of single match p */
790
+ while (utf8_singlematch(ms, m, p, ep))
791
+ m = utf8_next(m, ms->src_end);
792
+ /* keeps trying to match with the maximum repetitions */
793
+ while (s <= m) {
794
+ const char *res = utf8_match(ms, m, ep+1);
795
+ if (res) return res;
796
+ /* else didn't match; reduce 1 repetition to try again */
797
+ if (s == m) break;
798
+ m = utf8_prev(s, m);
799
+ }
800
+ return NULL;
801
+ }
802
+
803
+ static const char *utf8_min_expand (utf8MatchState *ms, const char *s,
804
+ const char *p, const char *ep) {
805
+ for (;;) {
806
+ const char *res = utf8_match(ms, s, ep+1);
807
+ if (res != NULL)
808
+ return res;
809
+ else if (utf8_singlematch(ms, s, p, ep))
810
+ s = utf8_next(s, ms->src_end); /* try with one more repetition */
811
+ else return NULL;
812
+ }
813
+ }
814
+
815
+ static const char *utf8_start_capture (utf8MatchState *ms, const char *s,
816
+ const char *p, int what) {
817
+ const char *res;
818
+ int level = ms->level;
819
+ if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
820
+ ms->capture[level].init = s;
821
+ ms->capture[level].len = what;
822
+ ms->level = level+1;
823
+ if ((res=utf8_match(ms, s, p)) == NULL) /* match failed? */
824
+ ms->level--; /* undo capture */
825
+ return res;
826
+ }
827
+
828
+ static const char *utf8_end_capture (utf8MatchState *ms, const char *s,
829
+ const char *p) {
830
+ int l = utf8_capture_to_close(ms);
831
+ const char *res;
832
+ ms->capture[l].len = s - ms->capture[l].init; /* close capture */
833
+ if ((res = utf8_match(ms, s, p)) == NULL) /* match failed? */
834
+ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
835
+ return res;
836
+ }
837
+
838
+ static const char *utf8_match_capture (utf8MatchState *ms, const char *s, int l) {
839
+ size_t len;
840
+ l = utf8_check_capture(ms, l);
841
+ len = ms->capture[l].len;
842
+ if ((size_t)(ms->src_end-s) >= len &&
843
+ memcmp(ms->capture[l].init, s, len) == 0)
844
+ return s+len;
845
+ else return NULL;
846
+ }
847
+
848
+ static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p) {
849
+ if (ms->matchdepth-- == 0)
850
+ luaL_error(ms->L, "pattern too complex");
851
+ init: /* using goto's to optimize tail recursion */
852
+ if (p != ms->p_end) { /* end of pattern? */
853
+ unsigned int ch;
854
+ utf8_decode(p, ms->p_end, &ch);
855
+ switch (ch) {
856
+ case '(': { /* start capture */
857
+ if (*(p + 1) == ')') /* position capture? */
858
+ s = utf8_start_capture(ms, s, p + 2, CAP_POSITION);
859
+ else
860
+ s = utf8_start_capture(ms, s, p + 1, CAP_UNFINISHED);
861
+ break;
862
+ }
863
+ case ')': { /* end capture */
864
+ s = utf8_end_capture(ms, s, p + 1);
865
+ break;
866
+ }
867
+ case '$': {
868
+ if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
869
+ goto dflt; /* no; go to default */
870
+ s = (s == ms->src_end) ? s : NULL; /* check end of string */
871
+ break;
872
+ }
873
+ case L_ESC: { /* escaped sequence not in the format class[*+?-]? */
874
+ const char *prev_p = p;
875
+ p += utf8_decode(p+1, ms->p_end, &ch) + 1;
876
+ switch (ch) {
877
+ case 'b': { /* balanced string? */
878
+ s = utf8_matchbalance(ms, s, &p);
879
+ if (s != NULL)
880
+ goto init; /* return utf8_match(ms, s, p + 4); */
881
+ /* else fail (s == NULL) */
882
+ break;
883
+ }
884
+ case 'f': { /* frontier? */
885
+ const char *ep; unsigned int previous = 0, current = 0;
886
+ if (*p != '[')
887
+ luaL_error(ms->L, "missing " LUA_QL("[") " after "
888
+ LUA_QL("%%f") " in pattern");
889
+ ep = utf8_classend(ms, p); /* points to what is next */
890
+ if (s != ms->src_init)
891
+ utf8_decode(utf8_prev(ms->src_init, s), ms->src_end, &previous);
892
+ if (s != ms->src_end)
893
+ utf8_decode(s, ms->src_end, &current);
894
+ if (!utf8_matchbracketclass(previous, p, ep - 1) &&
895
+ utf8_matchbracketclass(current, p, ep - 1)) {
896
+ p = ep; goto init; /* return utf8_match(ms, s, ep); */
897
+ }
898
+ s = NULL; /* match failed */
899
+ break;
900
+ }
901
+ case '0': case '1': case '2': case '3':
902
+ case '4': case '5': case '6': case '7':
903
+ case '8': case '9': { /* capture results (%0-%9)? */
904
+ s = utf8_match_capture(ms, s, ch - '1');
905
+ if (s != NULL) goto init; /* return utf8_match(ms, s, p + 2) */
906
+ break;
907
+ }
908
+ default: p = prev_p; goto dflt;
909
+ }
910
+ break;
911
+ }
912
+ default: dflt: { /* pattern class plus optional suffix */
913
+ const char *ep = utf8_classend(ms, p); /* points to optional suffix */
914
+ /* does not match at least once? */
915
+ if (!utf8_singlematch(ms, s, p, ep)) {
916
+ if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
917
+ p = ep + 1; goto init; /* return utf8_match(ms, s, ep + 1); */
918
+ }
919
+ else /* '+' or no suffix */
920
+ s = NULL; /* fail */
921
+ }
922
+ else { /* matched once */
923
+ const char *next_s = utf8_next(s, ms->src_end);
924
+ switch (*ep) { /* handle optional suffix */
925
+ case '?': { /* optional */
926
+ const char *res;
927
+ const char *next_ep = utf8_next(ep, ms->p_end);
928
+ if ((res = utf8_match(ms, next_s, next_ep)) != NULL)
929
+ s = res;
930
+ else {
931
+ p = next_ep; goto init; /* else return utf8_match(ms, s, ep + 1); */
932
+ }
933
+ break;
934
+ }
935
+ case '+': /* 1 or more repetitions */
936
+ s = next_s; /* 1 match already done */
937
+ /* go through */
938
+ case '*': /* 0 or more repetitions */
939
+ s = utf8_max_expand(ms, s, p, ep);
940
+ break;
941
+ case '-': /* 0 or more repetitions (minimum) */
942
+ s = utf8_min_expand(ms, s, p, ep);
943
+ break;
944
+ default: /* no suffix */
945
+ s = next_s; p = ep; goto init; /* return utf8_match(ms, s + 1, ep); */
946
+ }
947
+ }
948
+ break;
949
+ }
950
+ }
951
+ }
952
+ ms->matchdepth++;
953
+ return s;
954
+ }
955
+
956
/*
** Byte-wise substring search (works with embedded zeros).
** Looks for the l2 bytes at s2 inside the l1 bytes at s1.
** Returns a pointer to the first occurrence, s1 itself when l2 == 0, or
** NULL when there is no occurrence.
*/
static const char *utf8_lmemfind (const char *s1, size_t l1,
                                  const char *s2, size_t l2) {
  size_t tail_len;  /* bytes of s2 after its first byte */
  size_t window;    /* bytes of s1 where a match may still start */
  if (l2 == 0)
    return s1;  /* the empty string is found everywhere */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */
  tail_len = l2 - 1;
  window = l1 - tail_len;
  while (window > 0) {
    /* locate the next candidate by its first byte */
    const char *hit = (const char *)memchr(s1, *s2, window);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, tail_len) == 0)
      return hit;
    /* mismatch: resume right after the candidate */
    window -= (hit + 1) - s1;
    s1 = hit + 1;
  }
  return NULL;
}
976
+
977
/*
** Walks the range [s, e) one character at a time (via utf8_next) looking
** for the byte position `p`.
** Stores in *pidx (when non-NULL) the number of characters stepped over
** before reaching `p`; if the walk jumps past `p` without landing on it,
** the count is reduced by one. Returns the position where the walk stopped.
*/
static const char *utf8_get_index(const char *p, const char *s, const char *e, int *pidx) {
  int count = 0;
  for (; s < e; s = utf8_next(s, e), ++count) {
    if (s == p)
      break;  /* landed exactly on p */
    if (s > p) {  /* stepped over p: it was inside the previous character */
      --count;
      break;
    }
  }
  if (pidx) *pidx = count;
  return s;
}
992
+
993
+ static void utf8_push_onecapture (utf8MatchState *ms, int i, const char *s,
994
+ const char *e) {
995
+ if (i >= ms->level) {
996
+ if (i == 0) /* ms->level == 0, too */
997
+ lua_pushlstring(ms->L, s, e - s); /* add whole match */
998
+ else
999
+ luaL_error(ms->L, "invalid capture index");
1000
+ }
1001
+ else {
1002
+ ptrdiff_t l = ms->capture[i].len;
1003
+ if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1004
+ if (l == CAP_POSITION) {
1005
+ int idx;
1006
+ utf8_get_index(ms->capture[i].init, ms->src_init, ms->src_end, &idx);
1007
+ lua_pushinteger(ms->L, idx+1);
1008
+ } else
1009
+ lua_pushlstring(ms->L, ms->capture[i].init, l);
1010
+ }
1011
+ }
1012
+
1013
+ static int utf8_push_captures (utf8MatchState *ms, const char *s, const char *e) {
1014
+ int i;
1015
+ int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1016
+ luaL_checkstack(ms->L, nlevels, "too many captures");
1017
+ for (i = 0; i < nlevels; i++)
1018
+ utf8_push_onecapture(ms, i, s, e);
1019
+ return nlevels; /* number of strings pushed */
1020
+ }
1021
+
1022
+ /* check whether pattern has no special characters */
1023
+ static int nospecials (const char *p, const char * ep) {
1024
+ while (p < ep) {
1025
+ if (strpbrk(p, SPECIALS))
1026
+ return 0; /* pattern has a special character */
1027
+ p += strlen(p) + 1; /* may have more after \0 */
1028
+ }
1029
+ return 1; /* no special chars found */
1030
+ }
1031
+
1032
+
1033
+ /* utf8 pattern matching interface */
1034
+
1035
+ static int find_aux (lua_State *L, int find) {
1036
+ const char *es, *s = check_utf8(L, 1, &es);
1037
+ const char *ep, *p = check_utf8(L, 2, &ep);
1038
+ lua_Integer idx = luaL_optinteger(L, 3, 1);
1039
+ const char *init;
1040
+ size_t slen = utf8_length(s, es);
1041
+ if (idx > 0 && idx > (lua_Integer)slen + 1) { /* start after string's end? */
1042
+ lua_pushnil(L); /* cannot find anything */
1043
+ return 1;
1044
+ }
1045
+ if (idx < 0) idx += utf8_length(s, es) + 1;
1046
+ init = utf8_index(s, es, idx);
1047
+ /* explicit request or no special characters? */
1048
+ if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1049
+ /* do a plain search */
1050
+ do {
1051
+ const char *s2 = utf8_lmemfind(init, es-init, p, ep-p);
1052
+ if (!s2) break;
1053
+ else {
1054
+ int relidx;
1055
+ const char *pch = utf8_get_index(s2, init, es, &relidx);
1056
+ if (pch == s2) {
1057
+ lua_pushinteger(L, idx + relidx);
1058
+ lua_pushinteger(L, idx + relidx + utf8_length(p, ep) - 1);
1059
+ return 2;
1060
+ }
1061
+ idx += relidx + 1;
1062
+ init = utf8_next(pch, es);
1063
+ }
1064
+ } while (init < es);
1065
+ }
1066
+ else {
1067
+ utf8MatchState ms;
1068
+ int anchor = (*p == '^');
1069
+ if (anchor) p++; /* skip anchor character */
1070
+ ms.L = L;
1071
+ ms.matchdepth = MAXCCALLS;
1072
+ ms.src_init = s;
1073
+ ms.src_end = es;
1074
+ ms.p_end = ep;
1075
+ do {
1076
+ const char *res;
1077
+ ms.level = 0;
1078
+ assert(ms.matchdepth == MAXCCALLS);
1079
+ if ((res=utf8_match(&ms, init, p)) != NULL) {
1080
+ if (find) {
1081
+ lua_pushinteger(L, idx); /* start */
1082
+ lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1083
+ return utf8_push_captures(&ms, NULL, 0) + 2;
1084
+ }
1085
+ else
1086
+ return utf8_push_captures(&ms, init, res);
1087
+ }
1088
+ if (init == es) break;
1089
+ idx += 1;
1090
+ init = utf8_next(init, es);
1091
+ } while (init <= es && !anchor);
1092
+ }
1093
+ lua_pushnil(L); /* not found */
1094
+ return 1;
1095
+ }
1096
+
1097
+ static int Lutf8_find(lua_State *L)
1098
+ { return find_aux(L, 1); }
1099
+
1100
+ static int Lutf8_match(lua_State *L)
1101
+ { return find_aux(L, 0); }
1102
+
1103
+ static int utf8_gmatch_aux (lua_State *L) {
1104
+ utf8MatchState ms;
1105
+ const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1106
+ const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1107
+ const char *src;
1108
+ ms.L = L;
1109
+ ms.matchdepth = MAXCCALLS;
1110
+ ms.src_init = s;
1111
+ ms.src_end = es;
1112
+ ms.p_end = ep;
1113
+ for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1114
+ src <= ms.src_end;
1115
+ src = utf8_next(src, ms.src_end)) {
1116
+ const char *e;
1117
+ ms.level = 0;
1118
+ assert(ms.matchdepth == MAXCCALLS);
1119
+ if ((e = utf8_match(&ms, src, p)) != NULL) {
1120
+ lua_Integer newstart = e-s;
1121
+ if (e == src) newstart++; /* empty match? go at least one position */
1122
+ lua_pushinteger(L, newstart);
1123
+ lua_replace(L, lua_upvalueindex(3));
1124
+ return utf8_push_captures(&ms, src, e);
1125
+ }
1126
+ if (src == ms.src_end) break;
1127
+ }
1128
+ return 0; /* not found */
1129
+ }
1130
+
1131
+ static int Lutf8_gmatch(lua_State *L) {
1132
+ luaL_checkstring(L, 1);
1133
+ luaL_checkstring(L, 2);
1134
+ lua_settop(L, 2);
1135
+ lua_pushinteger(L, 0);
1136
+ lua_pushcclosure(L, utf8_gmatch_aux, 3);
1137
+ return 1;
1138
+ }
1139
+
1140
+ static void utf8_add_s (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1141
+ const char *e) {
1142
+ const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1143
+ while (news < new_end) {
1144
+ unsigned int ch;
1145
+ news += utf8_decode(news, new_end, &ch);
1146
+ if (ch != L_ESC)
1147
+ add_utf8char(b, ch);
1148
+ else {
1149
+ news += utf8_decode(news, new_end, &ch); /* skip ESC */
1150
+ if (!utf8_isdigit(ch)) {
1151
+ if (ch != L_ESC)
1152
+ luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1153
+ " in replacement string", L_ESC);
1154
+ add_utf8char(b, ch);
1155
+ }
1156
+ else if (ch == '0')
1157
+ luaL_addlstring(b, s, e-s);
1158
+ else {
1159
+ utf8_push_onecapture(ms, ch-'1', s, e);
1160
+ luaL_addvalue(b); /* add capture to accumulated result */
1161
+ }
1162
+ }
1163
+ }
1164
+ }
1165
+
1166
+ static void utf8_add_value (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1167
+ const char *e, int tr) {
1168
+ lua_State *L = ms->L;
1169
+ switch (tr) {
1170
+ case LUA_TFUNCTION: {
1171
+ int n;
1172
+ lua_pushvalue(L, 3);
1173
+ n = utf8_push_captures(ms, s, e);
1174
+ lua_call(L, n, 1);
1175
+ break;
1176
+ }
1177
+ case LUA_TTABLE: {
1178
+ utf8_push_onecapture(ms, 0, s, e);
1179
+ lua_gettable(L, 3);
1180
+ break;
1181
+ }
1182
+ default: { /* LUA_TNUMBER or LUA_TSTRING */
1183
+ utf8_add_s(ms, b, s, e);
1184
+ return;
1185
+ }
1186
+ }
1187
+ if (!lua_toboolean(L, -1)) { /* nil or false? */
1188
+ lua_pop(L, 1);
1189
+ lua_pushlstring(L, s, e - s); /* keep original text */
1190
+ }
1191
+ else if (!lua_isstring(L, -1))
1192
+ luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1193
+ luaL_addvalue(b); /* add result to accumulator */
1194
+ }
1195
+
1196
+ static int Lutf8_gsub(lua_State *L) {
1197
+ const char *es, *s = check_utf8(L, 1, &es);
1198
+ const char *ep, *p = check_utf8(L, 2, &ep);
1199
+ int tr = lua_type(L, 3);
1200
+ lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1201
+ int anchor = (*p == '^');
1202
+ lua_Integer n = 0;
1203
+ utf8MatchState ms;
1204
+ luaL_Buffer b;
1205
+ luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1206
+ tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1207
+ "string/function/table expected");
1208
+ luaL_buffinit(L, &b);
1209
+ if (anchor) p++; /* skip anchor character */
1210
+ ms.L = L;
1211
+ ms.matchdepth = MAXCCALLS;
1212
+ ms.src_init = s;
1213
+ ms.src_end = es;
1214
+ ms.p_end = ep;
1215
+ while (n < max_s) {
1216
+ const char *e;
1217
+ ms.level = 0;
1218
+ assert(ms.matchdepth == MAXCCALLS);
1219
+ e = utf8_match(&ms, s, p);
1220
+ if (e) {
1221
+ n++;
1222
+ utf8_add_value(&ms, &b, s, e, tr);
1223
+ }
1224
+ if (e && e > s) /* non empty match? */
1225
+ s = e; /* skip it */
1226
+ else if (s < es) {
1227
+ unsigned int ch;
1228
+ s += utf8_decode(s, es, &ch);
1229
+ add_utf8char(&b, ch);
1230
+ }
1231
+ else break;
1232
+ if (anchor) break;
1233
+ }
1234
+ luaL_addlstring(&b, s, es-s);
1235
+ luaL_pushresult(&b);
1236
+ lua_pushinteger(L, n); /* number of substitutions */
1237
+ return 2;
1238
+ }
1239
+
1240
+
1241
+ /* lua module import interface */
1242
+
1243
+ LUALIB_API int luaopen_utf8(lua_State *L) {
1244
+ luaL_Reg libs[] = {
1245
+ #define ENTRY(name) { #name, Lutf8_##name }
1246
+ ENTRY(len),
1247
+ ENTRY(sub),
1248
+ ENTRY(reverse),
1249
+ ENTRY(lower),
1250
+ ENTRY(upper),
1251
+ ENTRY(title),
1252
+ ENTRY(fold),
1253
+ ENTRY(byte),
1254
+ ENTRY(char),
1255
+ ENTRY(escape),
1256
+ ENTRY(insert),
1257
+ ENTRY(remove),
1258
+ ENTRY(charpos),
1259
+ ENTRY(next),
1260
+ ENTRY(width),
1261
+ ENTRY(widthindex),
1262
+ ENTRY(ncasecmp),
1263
+ ENTRY(find),
1264
+ ENTRY(gmatch),
1265
+ ENTRY(gsub),
1266
+ ENTRY(match),
1267
+ #undef ENTRY
1268
+ { NULL, NULL }
1269
+ };
1270
+
1271
+ luaL_register(L, "utf8", libs);
1272
+
1273
+ return 1;
1274
+ }