immunio 0.15.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +234 -0
  3. data/README.md +147 -0
  4. data/bin/immunio +5 -0
  5. data/lib/immunio.rb +29 -0
  6. data/lib/immunio/agent.rb +260 -0
  7. data/lib/immunio/authentication.rb +96 -0
  8. data/lib/immunio/blocked_app.rb +38 -0
  9. data/lib/immunio/channel.rb +432 -0
  10. data/lib/immunio/cli.rb +39 -0
  11. data/lib/immunio/context.rb +114 -0
  12. data/lib/immunio/errors.rb +43 -0
  13. data/lib/immunio/immunio_ca.crt +45 -0
  14. data/lib/immunio/logger.rb +87 -0
  15. data/lib/immunio/plugins/action_dispatch.rb +45 -0
  16. data/lib/immunio/plugins/action_view.rb +431 -0
  17. data/lib/immunio/plugins/active_record.rb +707 -0
  18. data/lib/immunio/plugins/active_record_relation.rb +370 -0
  19. data/lib/immunio/plugins/authlogic.rb +80 -0
  20. data/lib/immunio/plugins/csrf.rb +24 -0
  21. data/lib/immunio/plugins/devise.rb +40 -0
  22. data/lib/immunio/plugins/environment_reporter.rb +69 -0
  23. data/lib/immunio/plugins/eval.rb +51 -0
  24. data/lib/immunio/plugins/exception_handler.rb +55 -0
  25. data/lib/immunio/plugins/gems_tracker.rb +5 -0
  26. data/lib/immunio/plugins/haml.rb +36 -0
  27. data/lib/immunio/plugins/http_finisher.rb +50 -0
  28. data/lib/immunio/plugins/http_tracker.rb +203 -0
  29. data/lib/immunio/plugins/io.rb +96 -0
  30. data/lib/immunio/plugins/redirect.rb +42 -0
  31. data/lib/immunio/plugins/warden.rb +66 -0
  32. data/lib/immunio/processor.rb +234 -0
  33. data/lib/immunio/rails.rb +26 -0
  34. data/lib/immunio/request.rb +139 -0
  35. data/lib/immunio/rufus_lua_ext/ref.rb +27 -0
  36. data/lib/immunio/rufus_lua_ext/state.rb +157 -0
  37. data/lib/immunio/rufus_lua_ext/table.rb +137 -0
  38. data/lib/immunio/rufus_lua_ext/utils.rb +13 -0
  39. data/lib/immunio/version.rb +5 -0
  40. data/lib/immunio/vm.rb +291 -0
  41. data/lua-hooks/ext/all.c +78 -0
  42. data/lua-hooks/ext/bitop/README +22 -0
  43. data/lua-hooks/ext/bitop/bit.c +189 -0
  44. data/lua-hooks/ext/extconf.rb +38 -0
  45. data/lua-hooks/ext/libinjection/COPYING +37 -0
  46. data/lua-hooks/ext/libinjection/libinjection.h +65 -0
  47. data/lua-hooks/ext/libinjection/libinjection_html5.c +847 -0
  48. data/lua-hooks/ext/libinjection/libinjection_html5.h +54 -0
  49. data/lua-hooks/ext/libinjection/libinjection_sqli.c +2301 -0
  50. data/lua-hooks/ext/libinjection/libinjection_sqli.h +295 -0
  51. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +9349 -0
  52. data/lua-hooks/ext/libinjection/libinjection_xss.c +531 -0
  53. data/lua-hooks/ext/libinjection/libinjection_xss.h +21 -0
  54. data/lua-hooks/ext/libinjection/lualib.c +109 -0
  55. data/lua-hooks/ext/lpeg/HISTORY +90 -0
  56. data/lua-hooks/ext/lpeg/lpcap.c +537 -0
  57. data/lua-hooks/ext/lpeg/lpcap.h +43 -0
  58. data/lua-hooks/ext/lpeg/lpcode.c +986 -0
  59. data/lua-hooks/ext/lpeg/lpcode.h +34 -0
  60. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  61. data/lua-hooks/ext/lpeg/lpeg.html +1429 -0
  62. data/lua-hooks/ext/lpeg/lpprint.c +244 -0
  63. data/lua-hooks/ext/lpeg/lpprint.h +35 -0
  64. data/lua-hooks/ext/lpeg/lptree.c +1238 -0
  65. data/lua-hooks/ext/lpeg/lptree.h +77 -0
  66. data/lua-hooks/ext/lpeg/lptypes.h +149 -0
  67. data/lua-hooks/ext/lpeg/lpvm.c +355 -0
  68. data/lua-hooks/ext/lpeg/lpvm.h +58 -0
  69. data/lua-hooks/ext/lpeg/makefile +55 -0
  70. data/lua-hooks/ext/lpeg/re.html +498 -0
  71. data/lua-hooks/ext/lpeg/test.lua +1409 -0
  72. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +45 -0
  73. data/lua-hooks/ext/lua-cmsgpack/README.md +115 -0
  74. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +957 -0
  75. data/lua-hooks/ext/lua-cmsgpack/test.lua +570 -0
  76. data/lua-hooks/ext/lua-snapshot/LICENSE +7 -0
  77. data/lua-hooks/ext/lua-snapshot/Makefile +12 -0
  78. data/lua-hooks/ext/lua-snapshot/README.md +18 -0
  79. data/lua-hooks/ext/lua-snapshot/dump.lua +15 -0
  80. data/lua-hooks/ext/lua-snapshot/snapshot.c +455 -0
  81. data/lua-hooks/ext/lua/COPYRIGHT +34 -0
  82. data/lua-hooks/ext/lua/lapi.c +1087 -0
  83. data/lua-hooks/ext/lua/lapi.h +16 -0
  84. data/lua-hooks/ext/lua/lauxlib.c +652 -0
  85. data/lua-hooks/ext/lua/lauxlib.h +174 -0
  86. data/lua-hooks/ext/lua/lbaselib.c +659 -0
  87. data/lua-hooks/ext/lua/lcode.c +831 -0
  88. data/lua-hooks/ext/lua/lcode.h +76 -0
  89. data/lua-hooks/ext/lua/ldblib.c +398 -0
  90. data/lua-hooks/ext/lua/ldebug.c +638 -0
  91. data/lua-hooks/ext/lua/ldebug.h +33 -0
  92. data/lua-hooks/ext/lua/ldo.c +519 -0
  93. data/lua-hooks/ext/lua/ldo.h +57 -0
  94. data/lua-hooks/ext/lua/ldump.c +164 -0
  95. data/lua-hooks/ext/lua/lfunc.c +174 -0
  96. data/lua-hooks/ext/lua/lfunc.h +34 -0
  97. data/lua-hooks/ext/lua/lgc.c +710 -0
  98. data/lua-hooks/ext/lua/lgc.h +110 -0
  99. data/lua-hooks/ext/lua/linit.c +38 -0
  100. data/lua-hooks/ext/lua/liolib.c +556 -0
  101. data/lua-hooks/ext/lua/llex.c +463 -0
  102. data/lua-hooks/ext/lua/llex.h +81 -0
  103. data/lua-hooks/ext/lua/llimits.h +128 -0
  104. data/lua-hooks/ext/lua/lmathlib.c +263 -0
  105. data/lua-hooks/ext/lua/lmem.c +86 -0
  106. data/lua-hooks/ext/lua/lmem.h +49 -0
  107. data/lua-hooks/ext/lua/loadlib.c +705 -0
  108. data/lua-hooks/ext/lua/loadlib_rel.c +760 -0
  109. data/lua-hooks/ext/lua/lobject.c +214 -0
  110. data/lua-hooks/ext/lua/lobject.h +381 -0
  111. data/lua-hooks/ext/lua/lopcodes.c +102 -0
  112. data/lua-hooks/ext/lua/lopcodes.h +268 -0
  113. data/lua-hooks/ext/lua/loslib.c +243 -0
  114. data/lua-hooks/ext/lua/lparser.c +1339 -0
  115. data/lua-hooks/ext/lua/lparser.h +82 -0
  116. data/lua-hooks/ext/lua/lstate.c +214 -0
  117. data/lua-hooks/ext/lua/lstate.h +169 -0
  118. data/lua-hooks/ext/lua/lstring.c +111 -0
  119. data/lua-hooks/ext/lua/lstring.h +31 -0
  120. data/lua-hooks/ext/lua/lstrlib.c +871 -0
  121. data/lua-hooks/ext/lua/ltable.c +588 -0
  122. data/lua-hooks/ext/lua/ltable.h +40 -0
  123. data/lua-hooks/ext/lua/ltablib.c +287 -0
  124. data/lua-hooks/ext/lua/ltm.c +75 -0
  125. data/lua-hooks/ext/lua/ltm.h +54 -0
  126. data/lua-hooks/ext/lua/lua.c +392 -0
  127. data/lua-hooks/ext/lua/lua.def +131 -0
  128. data/lua-hooks/ext/lua/lua.h +388 -0
  129. data/lua-hooks/ext/lua/lua.rc +28 -0
  130. data/lua-hooks/ext/lua/lua_dll.rc +26 -0
  131. data/lua-hooks/ext/lua/luac.c +200 -0
  132. data/lua-hooks/ext/lua/luac.rc +1 -0
  133. data/lua-hooks/ext/lua/luaconf.h +763 -0
  134. data/lua-hooks/ext/lua/luaconf.h.in +724 -0
  135. data/lua-hooks/ext/lua/luaconf.h.orig +763 -0
  136. data/lua-hooks/ext/lua/lualib.h +53 -0
  137. data/lua-hooks/ext/lua/lundump.c +227 -0
  138. data/lua-hooks/ext/lua/lundump.h +36 -0
  139. data/lua-hooks/ext/lua/lvm.c +767 -0
  140. data/lua-hooks/ext/lua/lvm.h +36 -0
  141. data/lua-hooks/ext/lua/lzio.c +82 -0
  142. data/lua-hooks/ext/lua/lzio.h +67 -0
  143. data/lua-hooks/ext/lua/print.c +227 -0
  144. data/lua-hooks/ext/luautf8/README.md +152 -0
  145. data/lua-hooks/ext/luautf8/lutf8lib.c +1274 -0
  146. data/lua-hooks/ext/luautf8/unidata.h +3064 -0
  147. data/lua-hooks/lib/boot.lua +254 -0
  148. data/lua-hooks/lib/encode.lua +4 -0
  149. data/lua-hooks/lib/lexers/LICENSE +21 -0
  150. data/lua-hooks/lib/lexers/bash.lua +134 -0
  151. data/lua-hooks/lib/lexers/bash_dqstr.lua +62 -0
  152. data/lua-hooks/lib/lexers/css.lua +216 -0
  153. data/lua-hooks/lib/lexers/html.lua +106 -0
  154. data/lua-hooks/lib/lexers/javascript.lua +68 -0
  155. data/lua-hooks/lib/lexers/lexer.lua +1575 -0
  156. data/lua-hooks/lib/lexers/markers.lua +33 -0
  157. metadata +308 -0
@@ -0,0 +1,1274 @@
1
+ /* Modified to allow bundling.
2
+ * Original source: https://github.com/starwing/luautf8 */
3
+ /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */
4
+ #define LUA_LIB
5
+ #include "lua/lua.h"
6
+ #include "lua/lauxlib.h"
7
+ #include "lua/lualib.h"
8
+
9
+
10
+ #include <assert.h>
11
+ #include <string.h>
12
+
13
+
14
+ /* UTF-8 string operations */
15
+
16
+ #define UTF_MAX 8
17
+
18
/* Encode code point `ch` into `s` and return the number of bytes written
 * (1..6).  Values above 0x7FFFFFFF cannot be represented and are replaced
 * by U+FFFD.  `s` must have room for at least 6 bytes. */
static size_t utf8_encode(char *s, unsigned int ch) {
  /* lead-byte marker indexed by sequence length */
  static const unsigned char lead_mark[] =
    { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  size_t len, i;

  if (ch < 0x80) {
    s[0] = (char)ch;
    return 1;
  }

  if (ch > 0x7FFFFFFF)
    ch = 0xFFFD; /* fallback: replacement character */

  if (ch <= 0x7FF)          len = 2;
  else if (ch <= 0xFFFF)    len = 3;
  else if (ch <= 0x1FFFFF)  len = 4;
  else if (ch <= 0x3FFFFFF) len = 5;
  else                      len = 6;

  /* continuation bytes, last one first, six payload bits each */
  for (i = len - 1; i > 0; --i) {
    s[i] = (char)(0x80 | (ch & 0x3F));
    ch >>= 6;
  }
  /* what is left of ch fits below the lead marker */
  s[0] = (char)(ch | lead_mark[len]);
  return len;
}
64
+
65
/* Decode one character from [s, e) into *pch.
 *
 * Returns the number of bytes consumed:
 *   - 0 when the range is empty (*pch is set to 0);
 *   - 2..6 for a well-formed multi-byte sequence;
 *   - 1 otherwise: bytes below 0xC0, and the lead byte of a malformed or
 *     truncated sequence, are reported as themselves.
 *
 * Fixes over the previous version: the accumulator for 4+ byte sequences is
 * initialised, the decoded value is actually stored in *pch, continuation
 * bytes are never read at or past `e`, and the fallback reports the
 * untouched lead byte rather than a partially shifted copy of it. */
static size_t utf8_decode(const char *s, const char *e, unsigned int *pch) {
  unsigned int ch;

  if (s >= e) {
    *pch = 0;
    return 0;
  }

  ch = (unsigned char)s[0];
  if (ch < 0xC0) goto fallback;
  if (ch < 0xE0) {
    if (s+1 >= e || (s[1] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch   & 0x1F) << 6) |
            (s[1] & 0x3F);
    return 2;
  }
  if (ch < 0xF0) {
    if (s+2 >= e || (s[1] & 0xC0) != 0x80
                 || (s[2] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch   & 0x0F) << 12) |
           ((s[1] & 0x3F) <<  6) |
            (s[2] & 0x3F);
    return 3;
  }
  {
    unsigned int lead = ch; /* shifted copy; ch stays intact for fallback */
    unsigned int res = 0;
    int count = 0; /* number of continuation bytes seen */
    while ((lead & 0x40) != 0) { /* still have continuation bytes? */
      int cc;
      if (++count >= e - s) /* sequence truncated by end of input */
        goto fallback;
      cc = (unsigned char)s[count];
      if ((cc & 0xC0) != 0x80) /* not a continuation byte? */
        goto fallback;
      res = (res << 6) | (cc & 0x3F); /* add lower 6 bits */
      lead <<= 1; /* to test next bit */
    }
    if (count > 5)
      goto fallback; /* lead byte 0xFE/0xFF: invalid */
    res |= ((lead & 0x7F) << (count * 5)); /* add payload of first byte */
    *pch = res;
    return count+1;
  }

fallback:
  *pch = ch;
  return 1;
}
111
+
112
/* Return the position just after the character that starts at `s`
 * (equal to `s` itself when utf8_decode consumes nothing). */
static const char *utf8_next(const char *s, const char *e) {
  unsigned int discarded;
  size_t step = utf8_decode(s, e, &discarded);
  return s + step;
}
116
+
117
/* Scan backwards from `e` and return the nearest byte in [s, e) that is not
 * a continuation byte (10xxxxxx).  Returns `s` if there is none or the
 * range is empty. */
static const char *utf8_prev(const char *s, const char *e) {
  const char *p = e;
  while (p > s) {
    --p;
    if (((unsigned char)*p & 0xC0) != 0x80)
      return p;
  }
  return s;
}
129
+
130
/* Count the characters in [s, e).  Any byte below 0xC0 counts as one
 * character; anything else is skipped with utf8_next. */
static size_t utf8_length(const char *s, const char *e) {
  size_t count;
  for (count = 0; s < e; ++count) {
    unsigned char lead = (unsigned char)*s;
    s = (lead >= 0xC0) ? utf8_next(s, e) : s + 1;
  }
  return count;
}
141
+
142
/* Translate a character index into a byte position inside [s, e).
 * For idx >= 0 the start is advanced idx-1 characters (not at all for 0
 * or 1); for idx < 0 the end is moved back -idx characters.  Both walks
 * stop early when the range is exhausted. */
static const char *utf8_index(const char *s, const char *e, int idx) {
  if (idx < 0) {
    for (; s < e && idx < 0; ++idx)
      e = utf8_prev(s, e);
    return e;
  }
  for (; s < e && idx > 1; --idx)
    s = utf8_next(s, e);
  return s;
}
154
+
155
+
156
+ /* Unicode character categories */
157
+
158
+ #include "unidata.h"
159
+
160
+ static int find_in_range(range_table *t, size_t size, unsigned int ch) {
161
+ size_t first, last;
162
+
163
+ first = 0;
164
+ last = size;
165
+
166
+ while (first < last) {
167
+ int mid = (first + last) / 2;
168
+ if (t[mid].last < ch)
169
+ first = mid + 1;
170
+ else if (t[mid].first > ch)
171
+ last = mid;
172
+ else
173
+ return (ch - t[mid].first) % t[mid].step == 0;
174
+ }
175
+
176
+ return 0;
177
+ }
178
+
179
+ static int convert_char(conv_table *t, size_t size, unsigned int ch) {
180
+ size_t first, last;
181
+
182
+ first = 0;
183
+ last = size;
184
+
185
+ while (first < last) {
186
+ int mid = (first + last) / 2;
187
+ if (t[mid].last < ch)
188
+ first = mid + 1;
189
+ else if (t[mid].first > ch)
190
+ last = mid;
191
+ else if ((ch - t[mid].first) % t[mid].step == 0)
192
+ return ch + t[mid].offset;
193
+ else
194
+ return ch;
195
+ }
196
+
197
+ return ch;
198
+ }
199
+
200
+ #define table_size(t) (sizeof(t)/sizeof((t)[0]))
201
+
202
+ #define define_category(name) static int utf8_is##name(unsigned int ch) \
203
+ { return find_in_range(name##_table, table_size(name##_table), ch); }
204
+
205
+ #define define_converter(name) static unsigned int utf8_##name(unsigned int ch) \
206
+ { return convert_char(name##_table, table_size(name##_table), ch); }
207
+
208
+ define_category(alpha)
209
+ define_category(lower)
210
+ define_category(upper)
211
+ define_category(cntrl)
212
+ define_category(digit)
213
+ define_category(xdigit)
214
+ define_category(punct)
215
+ define_category(space)
216
+ define_converter(tolower)
217
+ define_converter(toupper)
218
+ define_converter(totitle)
219
+ define_converter(tofold)
220
+
221
+ #undef define_category
222
+ #undef define_converter
223
+
224
+ static int utf8_isgraph(unsigned int ch) {
225
+ if (find_in_range(space_table, table_size(space_table), ch))
226
+ return 0;
227
+ if (find_in_range(graph_table, table_size(graph_table), ch))
228
+ return 1;
229
+ if (find_in_range(compose_table, table_size(compose_table), ch))
230
+ return 1;
231
+ return 0;
232
+ }
233
+
234
+ static int utf8_isalnum(unsigned int ch) {
235
+ if (find_in_range(alpha_table, table_size(alpha_table), ch))
236
+ return 1;
237
+ if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
238
+ return 1;
239
+ return 0;
240
+ }
241
+
242
+ static int utf8_width(unsigned int ch, int ambi_is_single) {
243
+ if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
244
+ return 2;
245
+ if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
246
+ return ambi_is_single ? 1 : 2;
247
+ if (find_in_range(compose_table, table_size(compose_table), ch))
248
+ return 0;
249
+ if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
250
+ return 0;
251
+ return 1;
252
+ }
253
+
254
+
255
+ /* string module compatible interface */
256
+
257
+ static const char *check_utf8(lua_State *L, int idx, const char **end) {
258
+ size_t len;
259
+ const char *s = luaL_checklstring(L, idx, &len);
260
+ if (end) *end = s+len;
261
+ return s;
262
+ }
263
+
264
+ static const char *to_utf8(lua_State *L, int idx, const char **end) {
265
+ size_t len;
266
+ const char *s = lua_tolstring(L, idx, &len);
267
+ if (end) *end = s+len;
268
+ return s;
269
+ }
270
+
271
+ static void add_utf8char(luaL_Buffer *b, unsigned int ch) {
272
+ char buff[UTF_MAX];
273
+ size_t n = utf8_encode(buff, ch);
274
+ luaL_addlstring(b, buff, n);
275
+ }
276
+
277
+ static lua_Integer byterelat(lua_Integer pos, size_t len) {
278
+ if (pos >= 0) return pos;
279
+ else if (0u - (size_t)pos > len) return 0;
280
+ else return (lua_Integer)len + pos + 1;
281
+ }
282
+
283
+ static int u_posrange(const char **ps, const char **pe,
284
+ lua_Integer posi, lua_Integer posj) {
285
+ const char *s = *ps, *e = *pe;
286
+ *ps = utf8_index(s, e, posi);
287
+ if (posj >= 0) {
288
+ while (s < e && posj-- > 0)
289
+ s = utf8_next(s, e);
290
+ *pe = s;
291
+ }
292
+ else {
293
+ while (s < e && ++posj < 0)
294
+ e = utf8_prev(s, e);
295
+ *pe = e;
296
+ }
297
+ return *ps < *pe;
298
+ }
299
+
300
+ static int Lutf8_len(lua_State *L) {
301
+ size_t len;
302
+ const char *s = luaL_checklstring(L, 1, &len);
303
+ lua_Integer posi = byterelat(luaL_optinteger(L, 2, 1), len);
304
+ lua_Integer posj = byterelat(luaL_optinteger(L, 3, -1), len);
305
+ if (posi < 1 || --posi > (lua_Integer)len
306
+ || --posj > (lua_Integer)len)
307
+ return 0;
308
+ lua_pushinteger(L, (lua_Integer)utf8_length(s+posi, s+posj+1));
309
+ return 1;
310
+ }
311
+
312
+ static int Lutf8_sub(lua_State *L) {
313
+ const char *e, *s = check_utf8(L, 1, &e);
314
+ if (u_posrange(&s, &e,
315
+ luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
316
+ lua_pushlstring(L, s, e-s);
317
+ else
318
+ lua_pushliteral(L, "");
319
+ return 1;
320
+ }
321
+
322
+ static int Lutf8_reverse(lua_State *L) {
323
+ luaL_Buffer b;
324
+ /* XXX should handle compose unicode? */
325
+ const char *e, *s = check_utf8(L, 1, &e);
326
+ luaL_buffinit(L, &b);
327
+ while (s < e) {
328
+ const char *prev = utf8_prev(s, e);
329
+ luaL_addlstring(&b, prev, e-prev);
330
+ e = prev;
331
+ }
332
+ luaL_pushresult(&b);
333
+ return 1;
334
+ }
335
+
336
+ static int convert(lua_State *L, unsigned int (*conv)(unsigned int)) {
337
+ int t = lua_type(L, 1);
338
+ if (t == LUA_TNUMBER)
339
+ lua_pushinteger(L, conv(lua_tointeger(L, 1)));
340
+ else if (t != LUA_TSTRING)
341
+ return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
342
+ else {
343
+ luaL_Buffer b;
344
+ const char *e, *s = to_utf8(L, 1, &e);
345
+ luaL_buffinit(L, &b);
346
+ while (s < e) {
347
+ unsigned int ch;
348
+ s += utf8_decode(s, e, &ch);
349
+ ch = conv(ch);
350
+ add_utf8char(&b, ch);
351
+ }
352
+ luaL_pushresult(&b);
353
+ }
354
+ return 1;
355
+ }
356
+
357
+ static int Lutf8_lower(lua_State *L)
358
+ { return convert(L, utf8_tolower); }
359
+
360
+ static int Lutf8_upper(lua_State *L)
361
+ { return convert(L, utf8_toupper); }
362
+
363
+ static int Lutf8_title(lua_State *L)
364
+ { return convert(L, utf8_totitle); }
365
+
366
+ static int Lutf8_fold(lua_State *L)
367
+ { return convert(L, utf8_tofold); }
368
+
369
+ static int Lutf8_byte(lua_State *L) {
370
+ size_t n = 0;
371
+ const char *e, *s = check_utf8(L, 1, &e);
372
+ lua_Integer posi = luaL_optinteger(L, 2, 1);
373
+ lua_Integer posj = luaL_optinteger(L, 3, posi);
374
+ if (u_posrange(&s, &e, posi, posj)) {
375
+ luaL_checkstack(L, e-s, "string slice too long");
376
+ while (s < e) {
377
+ unsigned int ch;
378
+ s += utf8_decode(s, e, &ch);
379
+ lua_pushinteger(L, ch);
380
+ ++n;
381
+ }
382
+ }
383
+ return n;
384
+ }
385
+
386
+ static int Lutf8_char(lua_State *L) {
387
+ int i, n = lua_gettop(L); /* number of arguments */
388
+ luaL_Buffer b;
389
+ luaL_buffinit(L, &b);
390
+ for (i = 1; i <= n; ++i) {
391
+ unsigned int ch = luaL_checkint(L, i);
392
+ add_utf8char(&b, ch);
393
+ }
394
+ luaL_pushresult(&b);
395
+ return 1;
396
+ }
397
+
398
+
399
+ /* unicode extra interface */
400
+
401
+ static const char *parse_escape(lua_State *L,
402
+ const char *s, const char *e,
403
+ int is_hex, unsigned int *pch) {
404
+ unsigned int escape = 0, ch;
405
+ int in_bracket = 0;
406
+ if (*s == '{') ++s, in_bracket = 1;
407
+ while (s < e) {
408
+ ch = (unsigned char)*s;
409
+ if (in_bracket && ch == '}') {
410
+ ++s;
411
+ break;
412
+ }
413
+ if (ch >= '0' && ch <= '9')
414
+ ch = ch - '0';
415
+ else if (is_hex && ch >= 'A' && ch <= 'F')
416
+ ch = 10 + (ch - 'A');
417
+ else if (is_hex && ch >= 'a' && ch <= 'f')
418
+ ch = 10 + (ch - 'a');
419
+ else {
420
+ if (in_bracket)
421
+ luaL_error(L, "invalid escape '%c'", ch);
422
+ break;
423
+ }
424
+ escape *= is_hex ? 16 : 10;
425
+ escape += ch;
426
+ ++s;
427
+ }
428
+ *pch = escape;
429
+ return s;
430
+ }
431
+
432
+ static int Lutf8_escape(lua_State *L) {
433
+ const char *e, *s = check_utf8(L, 1, &e);
434
+ luaL_Buffer b;
435
+ luaL_buffinit(L, &b);
436
+ while (s < e) {
437
+ unsigned int ch;
438
+ s += utf8_decode(s, e, &ch);
439
+ if (ch == '%') {
440
+ int is_hex = 0;
441
+ switch (*s) {
442
+ case '0': case '1': case '2': case '3':
443
+ case '4': case '5': case '6': case '7':
444
+ case '8': case '9': case '{':
445
+ break;
446
+ case 'u': case 'U': ++s; break;
447
+ case 'x': case 'X': ++s; is_hex = 1; break;
448
+ default:
449
+ s += utf8_decode(s, e, &ch);
450
+ goto next;
451
+ }
452
+ if (s >= e)
453
+ luaL_error(L, "invalid escape sequence");
454
+ s = parse_escape(L, s, e, is_hex, &ch);
455
+ }
456
+ next:
457
+ add_utf8char(&b, ch);
458
+ }
459
+ luaL_pushresult(&b);
460
+ return 1;
461
+ }
462
+
463
+ static int Lutf8_insert(lua_State *L) {
464
+ const char *e, *s = check_utf8(L, 1, &e);
465
+ size_t sublen;
466
+ const char *subs;
467
+ luaL_Buffer b;
468
+ int nargs = 2;
469
+ const char *first = e;
470
+ if (lua_type(L, 2) == LUA_TNUMBER) {
471
+ int idx = (int)lua_tointeger(L, 2);
472
+ if (idx != 0) first = utf8_index(s, e, idx);
473
+ ++nargs;
474
+ }
475
+ subs = luaL_checklstring(L, nargs, &sublen);
476
+ luaL_buffinit(L, &b);
477
+ luaL_addlstring(&b, s, first-s);
478
+ luaL_addlstring(&b, subs, sublen);
479
+ luaL_addlstring(&b, first, e-first);
480
+ luaL_pushresult(&b);
481
+ return 1;
482
+ }
483
+
484
+ static int Lutf8_remove(lua_State *L) {
485
+ const char *e, *s = check_utf8(L, 1, &e);
486
+ const char *start = s, *end = e;
487
+ if (!u_posrange(&start, &end,
488
+ luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
489
+ lua_settop(L, 1);
490
+ else {
491
+ luaL_Buffer b;
492
+ luaL_buffinit(L, &b);
493
+ luaL_addlstring(&b, s, start-s);
494
+ luaL_addlstring(&b, end, e-end);
495
+ luaL_pushresult(&b);
496
+ }
497
+ return 1;
498
+ }
499
+
500
+ static int push_offset(lua_State *L, const char *s, const char *e,
501
+ const char *cur, lua_Integer offset) {
502
+ unsigned int ch;
503
+ if (offset >= 0) {
504
+ while (cur < e && offset-- > 0)
505
+ cur = utf8_next(cur, e);
506
+ if (offset >= 0) return 0;
507
+ }
508
+ else {
509
+ while (s < cur && offset++ < 0)
510
+ cur = utf8_prev(s, cur);
511
+ if (offset < 0) return 0;
512
+ }
513
+ utf8_decode(cur, e, &ch);
514
+ lua_pushinteger(L, cur-s+1);
515
+ lua_pushinteger(L, ch);
516
+ return 2;
517
+ }
518
+
519
+ static int Lutf8_charpos(lua_State *L) {
520
+ size_t len;
521
+ const char *s = luaL_checklstring(L, 1, &len);
522
+ const char *cur = s;
523
+ lua_Integer pos;
524
+ if (lua_isnoneornil(L, 3)) {
525
+ lua_Integer offset = luaL_optinteger(L, 2, 1);
526
+ if (offset > 0) --offset;
527
+ else if (offset < 0) cur = s+len;
528
+ return push_offset(L, s, s+len, cur, offset);
529
+ }
530
+ pos = byterelat(luaL_optinteger(L, 2, 1), len);
531
+ if (pos != 0) cur += pos-1;
532
+ return push_offset(L, s, s+len, cur, luaL_checkinteger(L, 3));
533
+ }
534
+
535
+ static int Lutf8_next(lua_State *L) {
536
+ size_t len;
537
+ const char *s = luaL_checklstring(L, 1, &len);
538
+ const char *cur = s;
539
+ lua_Integer offset = 0;
540
+ if (!lua_isnoneornil(L, 2)) {
541
+ lua_Integer pos = byterelat(luaL_checkinteger(L, 2), len);
542
+ if (pos != 0) cur += pos-1;
543
+ offset = 1;
544
+ }
545
+ offset = luaL_optinteger(L, 3, offset);
546
+ return push_offset(L, s, s+len, cur, offset);
547
+ }
548
+
549
+ static int Lutf8_width(lua_State *L) {
550
+ int t = lua_type(L, 1);
551
+ int ambi_is_single = !lua_toboolean(L, 2);
552
+ int default_width = luaL_optinteger(L, 3, 0);
553
+ if (t == LUA_TNUMBER) {
554
+ size_t chwidth = utf8_width(lua_tointeger(L, 1), ambi_is_single);
555
+ if (chwidth == 0) chwidth = default_width;
556
+ lua_pushinteger(L, (lua_Integer)chwidth);
557
+ }
558
+ else if (t != LUA_TSTRING)
559
+ return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
560
+ else {
561
+ const char *e, *s = to_utf8(L, 1, &e);
562
+ size_t width = 0;
563
+ while (s < e) {
564
+ unsigned int ch;
565
+ size_t chwidth;
566
+ s += utf8_decode(s, e, &ch);
567
+ chwidth = utf8_width(ch, ambi_is_single);
568
+ width += chwidth == 0 ? default_width : chwidth;
569
+ }
570
+ lua_pushinteger(L, (lua_Integer)width);
571
+ }
572
+ return 1;
573
+ }
574
+
575
+ static int Lutf8_widthindex(lua_State *L) {
576
+ const char *e, *s = check_utf8(L, 1, &e);
577
+ int width = luaL_checkinteger(L, 2);
578
+ int ambi_is_single = !lua_toboolean(L, 3);
579
+ int default_width = luaL_optinteger(L, 4, 0);
580
+ size_t idx = 1;
581
+ while (s < e) {
582
+ unsigned int ch;
583
+ size_t chwidth;
584
+ s += utf8_decode(s, e, &ch);
585
+ chwidth = utf8_width(ch, ambi_is_single);
586
+ if (chwidth == 0) chwidth = default_width;
587
+ width -= chwidth;
588
+ if (width <= 0) {
589
+ lua_pushinteger(L, idx);
590
+ lua_pushinteger(L, width + chwidth);
591
+ lua_pushinteger(L, chwidth);
592
+ return 3;
593
+ }
594
+ ++idx;
595
+ }
596
+ lua_pushinteger(L, (lua_Integer)idx);
597
+ return 1;
598
+ }
599
+
600
+ static int Lutf8_ncasecmp(lua_State *L) {
601
+ const char *e1, *s1 = check_utf8(L, 1, &e1);
602
+ const char *e2, *s2 = check_utf8(L, 2, &e2);
603
+ while (s1 < e1 || s2 < e2) {
604
+ unsigned int ch1 = 0, ch2 = 0;
605
+ if (s1 == e1)
606
+ ch2 = 1;
607
+ else if (s2 == e2)
608
+ ch1 = 1;
609
+ else {
610
+ s1 += utf8_decode(s1, e1, &ch1);
611
+ s2 += utf8_decode(s2, e2, &ch2);
612
+ ch1 = utf8_tofold(ch1);
613
+ ch2 = utf8_tofold(ch2);
614
+ }
615
+ if (ch1 != ch2) {
616
+ lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
617
+ return 1;
618
+ }
619
+ }
620
+ lua_pushinteger(L, 0);
621
+ return 1;
622
+ }
623
+
624
+
625
+ /* utf8 pattern matching implement */
626
+
627
+ #ifndef LUA_MAXCAPTURES
628
+ # define LUA_MAXCAPTURES 32
629
+ #endif /* LUA_MAXCAPTURES */
630
+
631
+ #define CAP_UNFINISHED (-1)
632
+ #define CAP_POSITION (-2)
633
+
634
+
635
+ typedef struct utf8MatchState {
636
+ int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
637
+ const char *src_init; /* init of source string */
638
+ const char *src_end; /* end ('\0') of source string */
639
+ const char *p_end; /* end ('\0') of pattern */
640
+ lua_State *L;
641
+ int level; /* total number of captures (finished or unfinished) */
642
+ struct {
643
+ const char *init;
644
+ ptrdiff_t len;
645
+ } capture[LUA_MAXCAPTURES];
646
+ } utf8MatchState;
647
+
648
+ /* recursive function */
649
+ static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p);
650
+
651
+ /* maximum recursion depth for 'match' */
652
+ #if !defined(MAXCCALLS)
653
+ #define MAXCCALLS 200
654
+ #endif
655
+
656
+ #define L_ESC '%'
657
+ #define SPECIALS "^$*+?.([%-"
658
+
659
+ static int utf8_check_capture (utf8MatchState *ms, int l) {
660
+ l -= '1';
661
+ if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
662
+ return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
663
+ return l;
664
+ }
665
+
666
+ static int utf8_capture_to_close (utf8MatchState *ms) {
667
+ int level = ms->level;
668
+ for (level--; level>=0; level--)
669
+ if (ms->capture[level].len == CAP_UNFINISHED) return level;
670
+ return luaL_error(ms->L, "invalid pattern capture");
671
+ }
672
+
673
+ static const char *utf8_classend (utf8MatchState *ms, const char *p) {
674
+ unsigned int ch;
675
+ p += utf8_decode(p, ms->p_end, &ch);
676
+ switch (ch) {
677
+ case L_ESC: {
678
+ if (p == ms->p_end)
679
+ luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
680
+ return utf8_next(p, ms->p_end);
681
+ }
682
+ case '[': {
683
+ if (*p == '^') p++;
684
+ do { /* look for a `]' */
685
+ if (p == ms->p_end)
686
+ luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
687
+ if (*(p++) == L_ESC && p < ms->p_end)
688
+ p++; /* skip escapes (e.g. `%]') */
689
+ } while (*p != ']');
690
+ return p+1;
691
+ }
692
+ default: {
693
+ return p;
694
+ }
695
+ }
696
+ }
697
+
698
/* Test character `c` against the class letter `cl` of a "%x" item.  A
 * lower-case letter selects the class, any other case of the same letter
 * its complement.  When `cl` is not a class letter, `c` must simply equal
 * `cl`. */
static int utf8_match_class (unsigned int c, unsigned int cl) {
  int hit;
  switch (utf8_tolower(cl)) {
    case 'a' : hit = utf8_isalpha(c); break;
    case 'c' : hit = utf8_iscntrl(c); break;
    case 'd' : hit = utf8_isdigit(c); break;
    case 'g' : hit = utf8_isgraph(c); break;
    case 'l' : hit = utf8_islower(c); break;
    case 'p' : hit = utf8_ispunct(c); break;
    case 's' : hit = utf8_isspace(c); break;
    case 'u' : hit = utf8_isupper(c); break;
    case 'w' : hit = utf8_isalnum(c); break;
    case 'x' : hit = utf8_isxdigit(c); break;
    case 'z' : hit = (c == 0); break;  /* deprecated option */
    default: return (cl == c);
  }
  if (!utf8_islower(cl))
    hit = !hit;  /* class letter not lower-case: complement */
  return hit;
}
716
+
717
+ static int utf8_matchbracketclass (unsigned int c, const char *p, const char *ec) {
718
+ int sig = 1;
719
+ assert(*p == '[');
720
+ if (*++p == '^') {
721
+ sig = 0;
722
+ p++; /* skip the `^' */
723
+ }
724
+ while (p < ec) {
725
+ unsigned int ch;
726
+ p += utf8_decode(p, ec, &ch);
727
+ if (ch == L_ESC) {
728
+ p += utf8_decode(p, ec, &ch);
729
+ if (utf8_match_class(c, ch))
730
+ return sig;
731
+ }
732
+ else {
733
+ unsigned int next;
734
+ const char *np = p + utf8_decode(p, ec, &next);
735
+ if (next == '-' && np < ec) {
736
+ p = np + utf8_decode(np, ec, &next);
737
+ if (ch <= c && c <= next)
738
+ return sig;
739
+ }
740
+ else if (ch == c) return sig;
741
+ }
742
+ }
743
+ return !sig;
744
+ }
745
+
746
+ static int utf8_singlematch (utf8MatchState *ms, const char *s, const char *p,
747
+ const char *ep) {
748
+ if (s >= ms->src_end)
749
+ return 0;
750
+ else {
751
+ unsigned int ch, pch;
752
+ utf8_decode(s, ms->src_end, &ch);
753
+ p += utf8_decode(p, ms->p_end, &pch);
754
+ switch (pch) {
755
+ case '.': return 1; /* matches any char */
756
+ case L_ESC: utf8_decode(p, ms->p_end, &pch);
757
+ return utf8_match_class(ch, pch);
758
+ case '[': return utf8_matchbracketclass(ch, p-1, ep-1);
759
+ default: return pch == ch;
760
+ }
761
+ }
762
+ }
763
+
764
+ static const char *utf8_matchbalance (utf8MatchState *ms, const char *s,
765
+ const char **p) {
766
+ unsigned int ch, begin, end;
767
+ *p += utf8_decode(*p, ms->p_end, &begin);
768
+ if (*p >= ms->p_end)
769
+ luaL_error(ms->L, "malformed pattern "
770
+ "(missing arguments to " LUA_QL("%%b") ")");
771
+ *p += utf8_decode(*p, ms->p_end, &end);
772
+ s += utf8_decode(s, ms->src_end, &ch);
773
+ if (ch != begin) return NULL;
774
+ else {
775
+ int cont = 1;
776
+ while (s < ms->src_end) {
777
+ s += utf8_decode(s, ms->src_end, &ch);
778
+ if (ch == end) {
779
+ if (--cont == 0) return s;
780
+ }
781
+ else if (ch == begin) cont++;
782
+ }
783
+ }
784
+ return NULL; /* string ends out of balance */
785
+ }
786
+
787
+ static const char *utf8_max_expand (utf8MatchState *ms, const char *s,
788
+ const char *p, const char *ep) {
789
+ const char *m = s; /* matched end of single match p */
790
+ while (utf8_singlematch(ms, m, p, ep))
791
+ m = utf8_next(m, ms->src_end);
792
+ /* keeps trying to match with the maximum repetitions */
793
+ while (s <= m) {
794
+ const char *res = utf8_match(ms, m, ep+1);
795
+ if (res) return res;
796
+ /* else didn't match; reduce 1 repetition to try again */
797
+ if (s == m) break;
798
+ m = utf8_prev(s, m);
799
+ }
800
+ return NULL;
801
+ }
802
+
803
+ static const char *utf8_min_expand (utf8MatchState *ms, const char *s,
804
+ const char *p, const char *ep) {
805
+ for (;;) {
806
+ const char *res = utf8_match(ms, s, ep+1);
807
+ if (res != NULL)
808
+ return res;
809
+ else if (utf8_singlematch(ms, s, p, ep))
810
+ s = utf8_next(s, ms->src_end); /* try with one more repetition */
811
+ else return NULL;
812
+ }
813
+ }
814
+
815
+ static const char *utf8_start_capture (utf8MatchState *ms, const char *s,
816
+ const char *p, int what) {
817
+ const char *res;
818
+ int level = ms->level;
819
+ if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
820
+ ms->capture[level].init = s;
821
+ ms->capture[level].len = what;
822
+ ms->level = level+1;
823
+ if ((res=utf8_match(ms, s, p)) == NULL) /* match failed? */
824
+ ms->level--; /* undo capture */
825
+ return res;
826
+ }
827
+
828
+ static const char *utf8_end_capture (utf8MatchState *ms, const char *s,
829
+ const char *p) {
830
+ int l = utf8_capture_to_close(ms);
831
+ const char *res;
832
+ ms->capture[l].len = s - ms->capture[l].init; /* close capture */
833
+ if ((res = utf8_match(ms, s, p)) == NULL) /* match failed? */
834
+ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
835
+ return res;
836
+ }
837
+
838
+ static const char *utf8_match_capture (utf8MatchState *ms, const char *s, int l) {
839
+ size_t len;
840
+ l = utf8_check_capture(ms, l);
841
+ len = ms->capture[l].len;
842
+ if ((size_t)(ms->src_end-s) >= len &&
843
+ memcmp(ms->capture[l].init, s, len) == 0)
844
+ return s+len;
845
+ else return NULL;
846
+ }
847
+
848
/*
 * Core matcher: try to match the pattern starting at `p` (ending at
 * ms->p_end) against the subject starting at `s`.
 *
 * Returns the subject position just past the matched text, or NULL when
 * the pattern does not match at `s`.
 *
 * ms->matchdepth is decremented on entry and incremented again before the
 * final return; "pattern too complex" is raised when it is already zero.
 * The `goto init` jumps re-enter the function body without touching that
 * counter.
 *
 * Pattern characters are read with utf8_decode and the subject is stepped
 * with utf8_next, so a single pattern item consumes one decoded character
 * rather than one byte.
 */
+ static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p) {
849
+ if (ms->matchdepth-- == 0)
850
+ luaL_error(ms->L, "pattern too complex");
851
+ init: /* using goto's to optimize tail recursion */
852
+ if (p != ms->p_end) { /* end of pattern? */
853
+ unsigned int ch;
854
/* peek at the next pattern character; p itself is not advanced here */
+ utf8_decode(p, ms->p_end, &ch);
855
+ switch (ch) {
856
+ case '(': { /* start capture */
857
+ if (*(p + 1) == ')') /* position capture? */
858
+ s = utf8_start_capture(ms, s, p + 2, CAP_POSITION);
859
+ else
860
+ s = utf8_start_capture(ms, s, p + 1, CAP_UNFINISHED);
861
+ break;
862
+ }
863
+ case ')': { /* end capture */
864
+ s = utf8_end_capture(ms, s, p + 1);
865
+ break;
866
+ }
867
+ case '$': {
868
+ if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
869
+ goto dflt; /* no; go to default */
870
+ s = (s == ms->src_end) ? s : NULL; /* check end of string */
871
+ break;
872
+ }
873
+ case L_ESC: { /* escaped sequence not in the format class[*+?-]? */
874
/* keep the position of the escape so the default branch can re-parse the whole item */
+ const char *prev_p = p;
875
/* decode the character after the escape into ch and move p past it */
+ p += utf8_decode(p+1, ms->p_end, &ch) + 1;
876
+ switch (ch) {
877
+ case 'b': { /* balanced string? */
878
/* p is passed by address: utf8_matchbalance advances it itself */
+ s = utf8_matchbalance(ms, s, &p);
879
+ if (s != NULL)
880
+ goto init; /* return utf8_match(ms, s, p + 4); */
881
+ /* else fail (s == NULL) */
882
+ break;
883
+ }
884
+ case 'f': { /* frontier? */
885
/* previous/current stay 0 at the start/end of the subject */
+ const char *ep; unsigned int previous = 0, current = 0;
886
+ if (*p != '[')
887
+ luaL_error(ms->L, "missing " LUA_QL("[") " after "
888
+ LUA_QL("%%f") " in pattern");
889
+ ep = utf8_classend(ms, p); /* points to what is next */
890
+ if (s != ms->src_init)
891
+ utf8_decode(utf8_prev(ms->src_init, s), ms->src_end, &previous);
892
+ if (s != ms->src_end)
893
+ utf8_decode(s, ms->src_end, &current);
894
/* succeeds only where the set rejects the previous character and accepts the current one */
+ if (!utf8_matchbracketclass(previous, p, ep - 1) &&
895
+ utf8_matchbracketclass(current, p, ep - 1)) {
896
+ p = ep; goto init; /* return utf8_match(ms, s, ep); */
897
+ }
898
+ s = NULL; /* match failed */
899
+ break;
900
+ }
901
+ case '0': case '1': case '2': case '3':
902
+ case '4': case '5': case '6': case '7':
903
+ case '8': case '9': { /* capture results (%0-%9)? */
904
+ s = utf8_match_capture(ms, s, ch - '1');
905
+ if (s != NULL) goto init; /* return utf8_match(ms, s, p + 2) */
906
+ break;
907
+ }
908
/* any other escape is an ordinary class: rewind to the escape and handle it below */
+ default: p = prev_p; goto dflt;
909
+ }
910
+ break;
911
+ }
912
+ default: dflt: { /* pattern class plus optional suffix */
913
+ const char *ep = utf8_classend(ms, p); /* points to optional suffix */
914
+ /* does not match at least once? */
915
+ if (!utf8_singlematch(ms, s, p, ep)) {
916
+ if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
917
+ p = ep + 1; goto init; /* return utf8_match(ms, s, ep + 1); */
918
+ }
919
+ else /* '+' or no suffix */
920
+ s = NULL; /* fail */
921
+ }
922
+ else { /* matched once */
923
/* subject position after the one character just matched */
+ const char *next_s = utf8_next(s, ms->src_end);
924
+ switch (*ep) { /* handle optional suffix */
925
+ case '?': { /* optional */
926
+ const char *res;
927
+ const char *next_ep = utf8_next(ep, ms->p_end);
928
/* first try with the item consumed; otherwise retry the rest from the unconsumed position */
+ if ((res = utf8_match(ms, next_s, next_ep)) != NULL)
929
+ s = res;
930
+ else {
931
+ p = next_ep; goto init; /* else return utf8_match(ms, s, ep + 1); */
932
+ }
933
+ break;
934
+ }
935
+ case '+': /* 1 or more repetitions */
936
+ s = next_s; /* 1 match already done */
937
+ /* go through */
938
+ case '*': /* 0 or more repetitions */
939
+ s = utf8_max_expand(ms, s, p, ep);
940
+ break;
941
+ case '-': /* 0 or more repetitions (minimum) */
942
+ s = utf8_min_expand(ms, s, p, ep);
943
+ break;
944
+ default: /* no suffix */
945
+ s = next_s; p = ep; goto init; /* return utf8_match(ms, s + 1, ep); */
946
+ }
947
+ }
948
+ break;
949
+ }
950
+ }
951
+ }
952
/* undo the decrement made on entry */
+ ms->matchdepth++;
953
+ return s;
954
+ }
955
+
956
/*
 * Byte-wise substring search that tolerates embedded NUL bytes.
 * Looks for the l2 bytes at s2 inside the l1 bytes at s1.
 * Returns a pointer to the first occurrence, s1 itself when the needle is
 * empty, or NULL when there is no occurrence.
 */
static const char *utf8_lmemfind (const char *s1, size_t l1,
                                  const char *s2, size_t l2) {
  size_t tail;    /* needle bytes left to compare after the first one */
  size_t window;  /* bytes of s1 where a match may still begin */

  if (l2 == 0)
    return s1;    /* the empty needle matches immediately */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */

  tail = l2 - 1;
  window = l1 - tail;
  while (window > 0) {
    const char *hit = (const char *)memchr(s1, *s2, window);
    if (hit == NULL)
      return NULL;
    if (memcmp(hit + 1, s2 + 1, tail) == 0)
      return hit;
    /* resume right after the rejected candidate */
    window -= (size_t)(hit + 1 - s1);
    s1 = hit + 1;
  }
  return NULL;
}
976
+
977
/*
 * Walk character by character from `s` towards `e`, looking for byte
 * position `p`, and count the characters stepped over.
 *
 * If `p` is reached exactly, the walk stops there. If the walk steps past
 * `p` (so `p` was not on a character boundary), the count is reduced by
 * one and the walk stops at the character start just beyond `p`.
 * The count is stored in *pidx when pidx is non-NULL; the position where
 * the walk stopped is returned.
 */
static const char *utf8_get_index(const char *p, const char *s, const char *e, int *pidx) {
  int stepped = 0;
  for (; s < e && s != p; s = utf8_next(s, e), ++stepped) {
    if (s > p) {  /* overshot: p lies inside the previous character */
      --stepped;
      break;
    }
  }
  if (pidx != NULL)
    *pidx = stepped;
  return s;
}
992
+
993
+ static void utf8_push_onecapture (utf8MatchState *ms, int i, const char *s,
994
+ const char *e) {
995
+ if (i >= ms->level) {
996
+ if (i == 0) /* ms->level == 0, too */
997
+ lua_pushlstring(ms->L, s, e - s); /* add whole match */
998
+ else
999
+ luaL_error(ms->L, "invalid capture index");
1000
+ }
1001
+ else {
1002
+ ptrdiff_t l = ms->capture[i].len;
1003
+ if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1004
+ if (l == CAP_POSITION) {
1005
+ int idx;
1006
+ utf8_get_index(ms->capture[i].init, ms->src_init, ms->src_end, &idx);
1007
+ lua_pushinteger(ms->L, idx+1);
1008
+ } else
1009
+ lua_pushlstring(ms->L, ms->capture[i].init, l);
1010
+ }
1011
+ }
1012
+
1013
+ static int utf8_push_captures (utf8MatchState *ms, const char *s, const char *e) {
1014
+ int i;
1015
+ int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1016
+ luaL_checkstack(ms->L, nlevels, "too many captures");
1017
+ for (i = 0; i < nlevels; i++)
1018
+ utf8_push_onecapture(ms, i, s, e);
1019
+ return nlevels; /* number of strings pushed */
1020
+ }
1021
+
1022
+ /* check whether pattern has no special characters */
1023
+ static int nospecials (const char *p, const char * ep) {
1024
+ while (p < ep) {
1025
+ if (strpbrk(p, SPECIALS))
1026
+ return 0; /* pattern has a special character */
1027
+ p += strlen(p) + 1; /* may have more after \0 */
1028
+ }
1029
+ return 1; /* no special chars found */
1030
+ }
1031
+
1032
+
1033
+ /* utf8 pattern matching interface */
1034
+
1035
+ static int find_aux (lua_State *L, int find) {
1036
+ const char *es, *s = check_utf8(L, 1, &es);
1037
+ const char *ep, *p = check_utf8(L, 2, &ep);
1038
+ lua_Integer idx = luaL_optinteger(L, 3, 1);
1039
+ const char *init;
1040
+ size_t slen = utf8_length(s, es);
1041
+ if (idx > 0 && idx > (lua_Integer)slen + 1) { /* start after string's end? */
1042
+ lua_pushnil(L); /* cannot find anything */
1043
+ return 1;
1044
+ }
1045
+ if (idx < 0) idx += utf8_length(s, es) + 1;
1046
+ init = utf8_index(s, es, idx);
1047
+ /* explicit request or no special characters? */
1048
+ if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1049
+ /* do a plain search */
1050
+ do {
1051
+ const char *s2 = utf8_lmemfind(init, es-init, p, ep-p);
1052
+ if (!s2) break;
1053
+ else {
1054
+ int relidx;
1055
+ const char *pch = utf8_get_index(s2, init, es, &relidx);
1056
+ if (pch == s2) {
1057
+ lua_pushinteger(L, idx + relidx);
1058
+ lua_pushinteger(L, idx + relidx + utf8_length(p, ep) - 1);
1059
+ return 2;
1060
+ }
1061
+ idx += relidx + 1;
1062
+ init = utf8_next(pch, es);
1063
+ }
1064
+ } while (init < es);
1065
+ }
1066
+ else {
1067
+ utf8MatchState ms;
1068
+ int anchor = (*p == '^');
1069
+ if (anchor) p++; /* skip anchor character */
1070
+ ms.L = L;
1071
+ ms.matchdepth = MAXCCALLS;
1072
+ ms.src_init = s;
1073
+ ms.src_end = es;
1074
+ ms.p_end = ep;
1075
+ do {
1076
+ const char *res;
1077
+ ms.level = 0;
1078
+ assert(ms.matchdepth == MAXCCALLS);
1079
+ if ((res=utf8_match(&ms, init, p)) != NULL) {
1080
+ if (find) {
1081
+ lua_pushinteger(L, idx); /* start */
1082
+ lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1083
+ return utf8_push_captures(&ms, NULL, 0) + 2;
1084
+ }
1085
+ else
1086
+ return utf8_push_captures(&ms, init, res);
1087
+ }
1088
+ if (init == es) break;
1089
+ idx += 1;
1090
+ init = utf8_next(init, es);
1091
+ } while (init <= es && !anchor);
1092
+ }
1093
+ lua_pushnil(L); /* not found */
1094
+ return 1;
1095
+ }
1096
+
1097
+ static int Lutf8_find(lua_State *L)
1098
+ { return find_aux(L, 1); }
1099
+
1100
+ static int Lutf8_match(lua_State *L)
1101
+ { return find_aux(L, 0); }
1102
+
1103
+ static int utf8_gmatch_aux (lua_State *L) {
1104
+ utf8MatchState ms;
1105
+ const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1106
+ const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1107
+ const char *src;
1108
+ ms.L = L;
1109
+ ms.matchdepth = MAXCCALLS;
1110
+ ms.src_init = s;
1111
+ ms.src_end = es;
1112
+ ms.p_end = ep;
1113
+ for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1114
+ src <= ms.src_end;
1115
+ src = utf8_next(src, ms.src_end)) {
1116
+ const char *e;
1117
+ ms.level = 0;
1118
+ assert(ms.matchdepth == MAXCCALLS);
1119
+ if ((e = utf8_match(&ms, src, p)) != NULL) {
1120
+ lua_Integer newstart = e-s;
1121
+ if (e == src) newstart++; /* empty match? go at least one position */
1122
+ lua_pushinteger(L, newstart);
1123
+ lua_replace(L, lua_upvalueindex(3));
1124
+ return utf8_push_captures(&ms, src, e);
1125
+ }
1126
+ if (src == ms.src_end) break;
1127
+ }
1128
+ return 0; /* not found */
1129
+ }
1130
+
1131
+ static int Lutf8_gmatch(lua_State *L) {
1132
+ luaL_checkstring(L, 1);
1133
+ luaL_checkstring(L, 2);
1134
+ lua_settop(L, 2);
1135
+ lua_pushinteger(L, 0);
1136
+ lua_pushcclosure(L, utf8_gmatch_aux, 3);
1137
+ return 1;
1138
+ }
1139
+
1140
+ static void utf8_add_s (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1141
+ const char *e) {
1142
+ const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1143
+ while (news < new_end) {
1144
+ unsigned int ch;
1145
+ news += utf8_decode(news, new_end, &ch);
1146
+ if (ch != L_ESC)
1147
+ add_utf8char(b, ch);
1148
+ else {
1149
+ news += utf8_decode(news, new_end, &ch); /* skip ESC */
1150
+ if (!utf8_isdigit(ch)) {
1151
+ if (ch != L_ESC)
1152
+ luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1153
+ " in replacement string", L_ESC);
1154
+ add_utf8char(b, ch);
1155
+ }
1156
+ else if (ch == '0')
1157
+ luaL_addlstring(b, s, e-s);
1158
+ else {
1159
+ utf8_push_onecapture(ms, ch-'1', s, e);
1160
+ luaL_addvalue(b); /* add capture to accumulated result */
1161
+ }
1162
+ }
1163
+ }
1164
+ }
1165
+
1166
+ static void utf8_add_value (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1167
+ const char *e, int tr) {
1168
+ lua_State *L = ms->L;
1169
+ switch (tr) {
1170
+ case LUA_TFUNCTION: {
1171
+ int n;
1172
+ lua_pushvalue(L, 3);
1173
+ n = utf8_push_captures(ms, s, e);
1174
+ lua_call(L, n, 1);
1175
+ break;
1176
+ }
1177
+ case LUA_TTABLE: {
1178
+ utf8_push_onecapture(ms, 0, s, e);
1179
+ lua_gettable(L, 3);
1180
+ break;
1181
+ }
1182
+ default: { /* LUA_TNUMBER or LUA_TSTRING */
1183
+ utf8_add_s(ms, b, s, e);
1184
+ return;
1185
+ }
1186
+ }
1187
+ if (!lua_toboolean(L, -1)) { /* nil or false? */
1188
+ lua_pop(L, 1);
1189
+ lua_pushlstring(L, s, e - s); /* keep original text */
1190
+ }
1191
+ else if (!lua_isstring(L, -1))
1192
+ luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1193
+ luaL_addvalue(b); /* add result to accumulator */
1194
+ }
1195
+
1196
+ static int Lutf8_gsub(lua_State *L) {
1197
+ const char *es, *s = check_utf8(L, 1, &es);
1198
+ const char *ep, *p = check_utf8(L, 2, &ep);
1199
+ int tr = lua_type(L, 3);
1200
+ lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1201
+ int anchor = (*p == '^');
1202
+ lua_Integer n = 0;
1203
+ utf8MatchState ms;
1204
+ luaL_Buffer b;
1205
+ luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1206
+ tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1207
+ "string/function/table expected");
1208
+ luaL_buffinit(L, &b);
1209
+ if (anchor) p++; /* skip anchor character */
1210
+ ms.L = L;
1211
+ ms.matchdepth = MAXCCALLS;
1212
+ ms.src_init = s;
1213
+ ms.src_end = es;
1214
+ ms.p_end = ep;
1215
+ while (n < max_s) {
1216
+ const char *e;
1217
+ ms.level = 0;
1218
+ assert(ms.matchdepth == MAXCCALLS);
1219
+ e = utf8_match(&ms, s, p);
1220
+ if (e) {
1221
+ n++;
1222
+ utf8_add_value(&ms, &b, s, e, tr);
1223
+ }
1224
+ if (e && e > s) /* non empty match? */
1225
+ s = e; /* skip it */
1226
+ else if (s < es) {
1227
+ unsigned int ch;
1228
+ s += utf8_decode(s, es, &ch);
1229
+ add_utf8char(&b, ch);
1230
+ }
1231
+ else break;
1232
+ if (anchor) break;
1233
+ }
1234
+ luaL_addlstring(&b, s, es-s);
1235
+ luaL_pushresult(&b);
1236
+ lua_pushinteger(L, n); /* number of substitutions */
1237
+ return 2;
1238
+ }
1239
+
1240
+
1241
+ /* lua module import interface */
1242
+
1243
+ LUALIB_API int luaopen_utf8(lua_State *L) {
1244
+ luaL_Reg libs[] = {
1245
+ #define ENTRY(name) { #name, Lutf8_##name }
1246
+ ENTRY(len),
1247
+ ENTRY(sub),
1248
+ ENTRY(reverse),
1249
+ ENTRY(lower),
1250
+ ENTRY(upper),
1251
+ ENTRY(title),
1252
+ ENTRY(fold),
1253
+ ENTRY(byte),
1254
+ ENTRY(char),
1255
+ ENTRY(escape),
1256
+ ENTRY(insert),
1257
+ ENTRY(remove),
1258
+ ENTRY(charpos),
1259
+ ENTRY(next),
1260
+ ENTRY(width),
1261
+ ENTRY(widthindex),
1262
+ ENTRY(ncasecmp),
1263
+ ENTRY(find),
1264
+ ENTRY(gmatch),
1265
+ ENTRY(gsub),
1266
+ ENTRY(match),
1267
+ #undef ENTRY
1268
+ { NULL, NULL }
1269
+ };
1270
+
1271
+ luaL_register(L, "utf8", libs);
1272
+
1273
+ return 1;
1274
+ }