immunio 1.2.1 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (291) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +13 -5
  3. data/ext/immunio/Rakefile +14 -6
  4. data/lib/immunio/context.rb +2 -0
  5. data/lib/immunio/plugins/action_view.rb +7 -668
  6. data/lib/immunio/plugins/action_view/action_view.rb +22 -0
  7. data/lib/immunio/plugins/action_view/active_support_hash.rb +29 -0
  8. data/lib/immunio/plugins/action_view/cache_store.rb +24 -0
  9. data/lib/immunio/plugins/action_view/erubi.rb +38 -0
  10. data/lib/immunio/plugins/action_view/erubis.rb +39 -0
  11. data/lib/immunio/plugins/action_view/fragment_caching.rb +29 -0
  12. data/lib/immunio/plugins/action_view/haml.rb +46 -0
  13. data/lib/immunio/plugins/action_view/slim.rb +42 -0
  14. data/lib/immunio/plugins/action_view/template.rb +431 -0
  15. data/lib/immunio/plugins/action_view/template_rendering.rb +45 -0
  16. data/lib/immunio/plugins/http_tracker.rb +2 -0
  17. data/lib/immunio/plugins/io.rb +34 -0
  18. data/lib/immunio/version.rb +1 -1
  19. data/lua-hooks/Makefile +36 -9
  20. data/lua-hooks/ext/luajit/COPYRIGHT +1 -1
  21. data/lua-hooks/ext/luajit/Makefile +22 -15
  22. data/lua-hooks/ext/luajit/README +2 -2
  23. data/lua-hooks/ext/luajit/doc/bluequad-print.css +1 -1
  24. data/lua-hooks/ext/luajit/doc/bluequad.css +1 -1
  25. data/lua-hooks/ext/luajit/doc/changes.html +69 -3
  26. data/lua-hooks/ext/luajit/doc/contact.html +10 -3
  27. data/lua-hooks/ext/luajit/doc/ext_c_api.html +2 -2
  28. data/lua-hooks/ext/luajit/doc/ext_ffi.html +2 -2
  29. data/lua-hooks/ext/luajit/doc/ext_ffi_api.html +2 -2
  30. data/lua-hooks/ext/luajit/doc/ext_ffi_semantics.html +3 -4
  31. data/lua-hooks/ext/luajit/doc/ext_ffi_tutorial.html +2 -2
  32. data/lua-hooks/ext/luajit/doc/ext_jit.html +3 -3
  33. data/lua-hooks/ext/luajit/doc/ext_profiler.html +2 -2
  34. data/lua-hooks/ext/luajit/doc/extensions.html +47 -20
  35. data/lua-hooks/ext/luajit/doc/faq.html +2 -2
  36. data/lua-hooks/ext/luajit/doc/install.html +74 -45
  37. data/lua-hooks/ext/luajit/doc/luajit.html +5 -5
  38. data/lua-hooks/ext/luajit/doc/running.html +3 -3
  39. data/lua-hooks/ext/luajit/doc/status.html +13 -8
  40. data/lua-hooks/ext/luajit/dynasm/dasm_arm.h +1 -1
  41. data/lua-hooks/ext/luajit/dynasm/dasm_arm.lua +1 -1
  42. data/lua-hooks/ext/luajit/dynasm/dasm_arm64.h +1 -1
  43. data/lua-hooks/ext/luajit/dynasm/dasm_arm64.lua +1 -1
  44. data/lua-hooks/ext/luajit/dynasm/dasm_mips.h +8 -5
  45. data/lua-hooks/ext/luajit/dynasm/dasm_mips.lua +66 -11
  46. data/lua-hooks/ext/luajit/dynasm/dasm_mips64.lua +12 -0
  47. data/lua-hooks/ext/luajit/dynasm/dasm_ppc.h +1 -1
  48. data/lua-hooks/ext/luajit/dynasm/dasm_ppc.lua +1 -1
  49. data/lua-hooks/ext/luajit/dynasm/dasm_proto.h +1 -1
  50. data/lua-hooks/ext/luajit/dynasm/dasm_x64.lua +1 -1
  51. data/lua-hooks/ext/luajit/dynasm/dasm_x86.h +1 -1
  52. data/lua-hooks/ext/luajit/dynasm/dasm_x86.lua +5 -1
  53. data/lua-hooks/ext/luajit/dynasm/dynasm.lua +2 -2
  54. data/lua-hooks/ext/luajit/etc/luajit.1 +1 -1
  55. data/lua-hooks/ext/luajit/etc/luajit.pc +1 -1
  56. data/lua-hooks/ext/luajit/src/Makefile +15 -11
  57. data/lua-hooks/ext/luajit/src/Makefile.dep +16 -16
  58. data/lua-hooks/ext/luajit/src/host/buildvm.c +2 -2
  59. data/lua-hooks/ext/luajit/src/host/buildvm.h +1 -1
  60. data/lua-hooks/ext/luajit/src/host/buildvm_asm.c +9 -4
  61. data/lua-hooks/ext/luajit/src/host/buildvm_fold.c +2 -2
  62. data/lua-hooks/ext/luajit/src/host/buildvm_lib.c +1 -1
  63. data/lua-hooks/ext/luajit/src/host/buildvm_libbc.h +14 -3
  64. data/lua-hooks/ext/luajit/src/host/buildvm_peobj.c +27 -3
  65. data/lua-hooks/ext/luajit/src/host/genlibbc.lua +1 -1
  66. data/lua-hooks/ext/luajit/src/host/genminilua.lua +6 -5
  67. data/lua-hooks/ext/luajit/src/host/minilua.c +1 -1
  68. data/lua-hooks/ext/luajit/src/jit/bc.lua +1 -1
  69. data/lua-hooks/ext/luajit/src/jit/bcsave.lua +8 -8
  70. data/lua-hooks/ext/luajit/src/jit/dis_arm.lua +2 -2
  71. data/lua-hooks/ext/luajit/src/jit/dis_arm64.lua +1216 -0
  72. data/lua-hooks/ext/luajit/src/jit/dis_arm64be.lua +12 -0
  73. data/lua-hooks/ext/luajit/src/jit/dis_mips.lua +35 -20
  74. data/lua-hooks/ext/luajit/src/jit/dis_mips64.lua +17 -0
  75. data/lua-hooks/ext/luajit/src/jit/dis_mips64el.lua +17 -0
  76. data/lua-hooks/ext/luajit/src/jit/dis_mipsel.lua +1 -1
  77. data/lua-hooks/ext/luajit/src/jit/dis_ppc.lua +2 -2
  78. data/lua-hooks/ext/luajit/src/jit/dis_x64.lua +1 -1
  79. data/lua-hooks/ext/luajit/src/jit/dis_x86.lua +7 -4
  80. data/lua-hooks/ext/luajit/src/jit/dump.lua +17 -12
  81. data/lua-hooks/ext/luajit/src/jit/p.lua +3 -2
  82. data/lua-hooks/ext/luajit/src/jit/v.lua +2 -2
  83. data/lua-hooks/ext/luajit/src/jit/zone.lua +1 -1
  84. data/lua-hooks/ext/luajit/src/lauxlib.h +14 -20
  85. data/lua-hooks/ext/luajit/src/lib_aux.c +38 -27
  86. data/lua-hooks/ext/luajit/src/lib_base.c +12 -5
  87. data/lua-hooks/ext/luajit/src/lib_bit.c +1 -1
  88. data/lua-hooks/ext/luajit/src/lib_debug.c +5 -5
  89. data/lua-hooks/ext/luajit/src/lib_ffi.c +2 -2
  90. data/lua-hooks/ext/luajit/src/lib_init.c +16 -16
  91. data/lua-hooks/ext/luajit/src/lib_io.c +6 -7
  92. data/lua-hooks/ext/luajit/src/lib_jit.c +14 -4
  93. data/lua-hooks/ext/luajit/src/lib_math.c +1 -5
  94. data/lua-hooks/ext/luajit/src/lib_os.c +1 -1
  95. data/lua-hooks/ext/luajit/src/lib_package.c +14 -23
  96. data/lua-hooks/ext/luajit/src/lib_string.c +1 -5
  97. data/lua-hooks/ext/luajit/src/lib_table.c +21 -1
  98. data/lua-hooks/ext/luajit/src/lj.supp +3 -3
  99. data/lua-hooks/ext/luajit/src/lj_alloc.c +174 -83
  100. data/lua-hooks/ext/luajit/src/lj_api.c +97 -18
  101. data/lua-hooks/ext/luajit/src/lj_arch.h +54 -22
  102. data/lua-hooks/ext/luajit/src/lj_asm.c +172 -53
  103. data/lua-hooks/ext/luajit/src/lj_asm.h +1 -1
  104. data/lua-hooks/ext/luajit/src/lj_asm_arm.h +19 -16
  105. data/lua-hooks/ext/luajit/src/lj_asm_arm64.h +2022 -0
  106. data/lua-hooks/ext/luajit/src/lj_asm_mips.h +564 -158
  107. data/lua-hooks/ext/luajit/src/lj_asm_ppc.h +19 -18
  108. data/lua-hooks/ext/luajit/src/lj_asm_x86.h +578 -92
  109. data/lua-hooks/ext/luajit/src/lj_bc.c +1 -1
  110. data/lua-hooks/ext/luajit/src/lj_bc.h +1 -1
  111. data/lua-hooks/ext/luajit/src/lj_bcdump.h +1 -1
  112. data/lua-hooks/ext/luajit/src/lj_bcread.c +1 -1
  113. data/lua-hooks/ext/luajit/src/lj_bcwrite.c +1 -1
  114. data/lua-hooks/ext/luajit/src/lj_buf.c +1 -1
  115. data/lua-hooks/ext/luajit/src/lj_buf.h +1 -1
  116. data/lua-hooks/ext/luajit/src/lj_carith.c +1 -1
  117. data/lua-hooks/ext/luajit/src/lj_carith.h +1 -1
  118. data/lua-hooks/ext/luajit/src/lj_ccall.c +172 -7
  119. data/lua-hooks/ext/luajit/src/lj_ccall.h +21 -5
  120. data/lua-hooks/ext/luajit/src/lj_ccallback.c +71 -17
  121. data/lua-hooks/ext/luajit/src/lj_ccallback.h +1 -1
  122. data/lua-hooks/ext/luajit/src/lj_cconv.c +4 -2
  123. data/lua-hooks/ext/luajit/src/lj_cconv.h +1 -1
  124. data/lua-hooks/ext/luajit/src/lj_cdata.c +7 -5
  125. data/lua-hooks/ext/luajit/src/lj_cdata.h +1 -1
  126. data/lua-hooks/ext/luajit/src/lj_clib.c +5 -5
  127. data/lua-hooks/ext/luajit/src/lj_clib.h +1 -1
  128. data/lua-hooks/ext/luajit/src/lj_cparse.c +11 -6
  129. data/lua-hooks/ext/luajit/src/lj_cparse.h +1 -1
  130. data/lua-hooks/ext/luajit/src/lj_crecord.c +70 -14
  131. data/lua-hooks/ext/luajit/src/lj_crecord.h +1 -1
  132. data/lua-hooks/ext/luajit/src/lj_ctype.c +1 -1
  133. data/lua-hooks/ext/luajit/src/lj_ctype.h +8 -8
  134. data/lua-hooks/ext/luajit/src/lj_debug.c +1 -1
  135. data/lua-hooks/ext/luajit/src/lj_debug.h +1 -1
  136. data/lua-hooks/ext/luajit/src/lj_def.h +6 -9
  137. data/lua-hooks/ext/luajit/src/lj_dispatch.c +3 -3
  138. data/lua-hooks/ext/luajit/src/lj_dispatch.h +2 -1
  139. data/lua-hooks/ext/luajit/src/lj_emit_arm.h +5 -4
  140. data/lua-hooks/ext/luajit/src/lj_emit_arm64.h +419 -0
  141. data/lua-hooks/ext/luajit/src/lj_emit_mips.h +100 -20
  142. data/lua-hooks/ext/luajit/src/lj_emit_ppc.h +4 -4
  143. data/lua-hooks/ext/luajit/src/lj_emit_x86.h +116 -25
  144. data/lua-hooks/ext/luajit/src/lj_err.c +34 -13
  145. data/lua-hooks/ext/luajit/src/lj_err.h +1 -1
  146. data/lua-hooks/ext/luajit/src/lj_errmsg.h +1 -1
  147. data/lua-hooks/ext/luajit/src/lj_ff.h +1 -1
  148. data/lua-hooks/ext/luajit/src/lj_ffrecord.c +58 -49
  149. data/lua-hooks/ext/luajit/src/lj_ffrecord.h +1 -1
  150. data/lua-hooks/ext/luajit/src/lj_frame.h +33 -6
  151. data/lua-hooks/ext/luajit/src/lj_func.c +4 -2
  152. data/lua-hooks/ext/luajit/src/lj_func.h +1 -1
  153. data/lua-hooks/ext/luajit/src/lj_gc.c +16 -7
  154. data/lua-hooks/ext/luajit/src/lj_gc.h +1 -1
  155. data/lua-hooks/ext/luajit/src/lj_gdbjit.c +31 -1
  156. data/lua-hooks/ext/luajit/src/lj_gdbjit.h +1 -1
  157. data/lua-hooks/ext/luajit/src/lj_ir.c +69 -96
  158. data/lua-hooks/ext/luajit/src/lj_ir.h +29 -18
  159. data/lua-hooks/ext/luajit/src/lj_ircall.h +24 -30
  160. data/lua-hooks/ext/luajit/src/lj_iropt.h +9 -9
  161. data/lua-hooks/ext/luajit/src/lj_jit.h +67 -9
  162. data/lua-hooks/ext/luajit/src/lj_lex.c +1 -1
  163. data/lua-hooks/ext/luajit/src/lj_lex.h +1 -1
  164. data/lua-hooks/ext/luajit/src/lj_lib.c +1 -1
  165. data/lua-hooks/ext/luajit/src/lj_lib.h +1 -1
  166. data/lua-hooks/ext/luajit/src/lj_load.c +1 -1
  167. data/lua-hooks/ext/luajit/src/lj_mcode.c +11 -10
  168. data/lua-hooks/ext/luajit/src/lj_mcode.h +1 -1
  169. data/lua-hooks/ext/luajit/src/lj_meta.c +1 -1
  170. data/lua-hooks/ext/luajit/src/lj_meta.h +1 -1
  171. data/lua-hooks/ext/luajit/src/lj_obj.c +1 -1
  172. data/lua-hooks/ext/luajit/src/lj_obj.h +7 -3
  173. data/lua-hooks/ext/luajit/src/lj_opt_dce.c +1 -1
  174. data/lua-hooks/ext/luajit/src/lj_opt_fold.c +84 -17
  175. data/lua-hooks/ext/luajit/src/lj_opt_loop.c +1 -1
  176. data/lua-hooks/ext/luajit/src/lj_opt_mem.c +3 -3
  177. data/lua-hooks/ext/luajit/src/lj_opt_narrow.c +24 -22
  178. data/lua-hooks/ext/luajit/src/lj_opt_sink.c +11 -6
  179. data/lua-hooks/ext/luajit/src/lj_opt_split.c +11 -2
  180. data/lua-hooks/ext/luajit/src/lj_parse.c +9 -7
  181. data/lua-hooks/ext/luajit/src/lj_parse.h +1 -1
  182. data/lua-hooks/ext/luajit/src/lj_profile.c +1 -1
  183. data/lua-hooks/ext/luajit/src/lj_profile.h +1 -1
  184. data/lua-hooks/ext/luajit/src/lj_record.c +201 -117
  185. data/lua-hooks/ext/luajit/src/lj_record.h +1 -1
  186. data/lua-hooks/ext/luajit/src/lj_snap.c +72 -26
  187. data/lua-hooks/ext/luajit/src/lj_snap.h +1 -1
  188. data/lua-hooks/ext/luajit/src/lj_state.c +6 -6
  189. data/lua-hooks/ext/luajit/src/lj_state.h +2 -2
  190. data/lua-hooks/ext/luajit/src/lj_str.c +1 -1
  191. data/lua-hooks/ext/luajit/src/lj_str.h +1 -1
  192. data/lua-hooks/ext/luajit/src/lj_strfmt.c +7 -3
  193. data/lua-hooks/ext/luajit/src/lj_strfmt.h +1 -1
  194. data/lua-hooks/ext/luajit/src/lj_strfmt_num.c +4 -3
  195. data/lua-hooks/ext/luajit/src/lj_strscan.c +1 -1
  196. data/lua-hooks/ext/luajit/src/lj_strscan.h +1 -1
  197. data/lua-hooks/ext/luajit/src/lj_tab.c +1 -2
  198. data/lua-hooks/ext/luajit/src/lj_tab.h +1 -1
  199. data/lua-hooks/ext/luajit/src/lj_target.h +3 -3
  200. data/lua-hooks/ext/luajit/src/lj_target_arm.h +1 -1
  201. data/lua-hooks/ext/luajit/src/lj_target_arm64.h +239 -7
  202. data/lua-hooks/ext/luajit/src/lj_target_mips.h +111 -22
  203. data/lua-hooks/ext/luajit/src/lj_target_ppc.h +1 -1
  204. data/lua-hooks/ext/luajit/src/lj_target_x86.h +21 -4
  205. data/lua-hooks/ext/luajit/src/lj_trace.c +63 -18
  206. data/lua-hooks/ext/luajit/src/lj_trace.h +2 -1
  207. data/lua-hooks/ext/luajit/src/lj_traceerr.h +1 -1
  208. data/lua-hooks/ext/luajit/src/lj_udata.c +1 -1
  209. data/lua-hooks/ext/luajit/src/lj_udata.h +1 -1
  210. data/lua-hooks/ext/luajit/src/lj_vm.h +5 -1
  211. data/lua-hooks/ext/luajit/src/lj_vmevent.c +1 -1
  212. data/lua-hooks/ext/luajit/src/lj_vmevent.h +1 -1
  213. data/lua-hooks/ext/luajit/src/lj_vmmath.c +1 -1
  214. data/lua-hooks/ext/luajit/src/ljamalg.c +1 -1
  215. data/lua-hooks/ext/luajit/src/lua.h +9 -1
  216. data/lua-hooks/ext/luajit/src/luaconf.h +3 -7
  217. data/lua-hooks/ext/luajit/src/luajit.c +69 -54
  218. data/lua-hooks/ext/luajit/src/luajit.h +4 -4
  219. data/lua-hooks/ext/luajit/src/lualib.h +1 -1
  220. data/lua-hooks/ext/luajit/src/msvcbuild.bat +12 -4
  221. data/lua-hooks/ext/luajit/src/vm_arm.dasc +1 -1
  222. data/lua-hooks/ext/luajit/src/vm_arm64.dasc +255 -32
  223. data/lua-hooks/ext/luajit/src/vm_mips.dasc +26 -23
  224. data/lua-hooks/ext/luajit/src/vm_mips64.dasc +5062 -0
  225. data/lua-hooks/ext/luajit/src/vm_ppc.dasc +1 -1
  226. data/lua-hooks/ext/luajit/src/vm_x64.dasc +24 -25
  227. data/lua-hooks/ext/luajit/src/vm_x86.dasc +77 -4
  228. data/lua-hooks/libluahooks.darwin.a +0 -0
  229. data/lua-hooks/libluahooks.linux.a +0 -0
  230. data/lua-hooks/options.mk +1 -1
  231. metadata +37 -77
  232. data/lua-hooks/ext/all.c +0 -69
  233. data/lua-hooks/ext/libinjection/COPYING +0 -37
  234. data/lua-hooks/ext/libinjection/libinjection.h +0 -65
  235. data/lua-hooks/ext/libinjection/libinjection_html5.c +0 -847
  236. data/lua-hooks/ext/libinjection/libinjection_html5.h +0 -54
  237. data/lua-hooks/ext/libinjection/libinjection_sqli.c +0 -2301
  238. data/lua-hooks/ext/libinjection/libinjection_sqli.h +0 -295
  239. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +0 -9349
  240. data/lua-hooks/ext/libinjection/libinjection_xss.c +0 -531
  241. data/lua-hooks/ext/libinjection/libinjection_xss.h +0 -21
  242. data/lua-hooks/ext/libinjection/lualib.c +0 -145
  243. data/lua-hooks/ext/libinjection/module.mk +0 -5
  244. data/lua-hooks/ext/lpeg/HISTORY +0 -96
  245. data/lua-hooks/ext/lpeg/lpcap.c +0 -537
  246. data/lua-hooks/ext/lpeg/lpcap.h +0 -56
  247. data/lua-hooks/ext/lpeg/lpcode.c +0 -1014
  248. data/lua-hooks/ext/lpeg/lpcode.h +0 -40
  249. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  250. data/lua-hooks/ext/lpeg/lpeg.html +0 -1445
  251. data/lua-hooks/ext/lpeg/lpprint.c +0 -244
  252. data/lua-hooks/ext/lpeg/lpprint.h +0 -36
  253. data/lua-hooks/ext/lpeg/lptree.c +0 -1303
  254. data/lua-hooks/ext/lpeg/lptree.h +0 -82
  255. data/lua-hooks/ext/lpeg/lptypes.h +0 -149
  256. data/lua-hooks/ext/lpeg/lpvm.c +0 -364
  257. data/lua-hooks/ext/lpeg/lpvm.h +0 -58
  258. data/lua-hooks/ext/lpeg/makefile +0 -55
  259. data/lua-hooks/ext/lpeg/module.mk +0 -6
  260. data/lua-hooks/ext/lpeg/re.html +0 -498
  261. data/lua-hooks/ext/lua-cmsgpack/.gitignore +0 -13
  262. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +0 -45
  263. data/lua-hooks/ext/lua-cmsgpack/README.md +0 -115
  264. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +0 -970
  265. data/lua-hooks/ext/lua-cmsgpack/module.mk +0 -2
  266. data/lua-hooks/ext/lua-cmsgpack/test.lua +0 -570
  267. data/lua-hooks/ext/lua-snapshot/LICENSE +0 -7
  268. data/lua-hooks/ext/lua-snapshot/Makefile +0 -12
  269. data/lua-hooks/ext/lua-snapshot/README.md +0 -18
  270. data/lua-hooks/ext/lua-snapshot/dump.lua +0 -15
  271. data/lua-hooks/ext/lua-snapshot/module.mk +0 -2
  272. data/lua-hooks/ext/lua-snapshot/snapshot.c +0 -462
  273. data/lua-hooks/ext/luautf8/README.md +0 -152
  274. data/lua-hooks/ext/luautf8/lutf8lib.c +0 -1274
  275. data/lua-hooks/ext/luautf8/module.mk +0 -2
  276. data/lua-hooks/ext/luautf8/unidata.h +0 -3064
  277. data/lua-hooks/ext/module.mk +0 -15
  278. data/lua-hooks/ext/modules.h +0 -17
  279. data/lua-hooks/ext/perf/luacpu.c +0 -114
  280. data/lua-hooks/ext/perf/lualoadavg.c +0 -40
  281. data/lua-hooks/ext/perf/luameminfo.c +0 -38
  282. data/lua-hooks/ext/perf/luaoslib.c +0 -203
  283. data/lua-hooks/ext/perf/module.mk +0 -5
  284. data/lua-hooks/ext/sha1/luasha1.c +0 -74
  285. data/lua-hooks/ext/sha1/module.mk +0 -5
  286. data/lua-hooks/ext/sha1/sha1.c +0 -145
  287. data/lua-hooks/ext/sha2/luasha256.c +0 -77
  288. data/lua-hooks/ext/sha2/module.mk +0 -5
  289. data/lua-hooks/ext/sha2/sha256.c +0 -196
  290. data/lua-hooks/ext/sysutils/lua_utils.c +0 -56
  291. data/lua-hooks/ext/sysutils/module.mk +0 -2
@@ -1,152 +0,0 @@
1
- UTF-8 module for Lua 5.x
2
- ========================
3
-
4
- This module adds UTF-8 support to Lua.
5
-
6
- It uses data extracted from the [Unicode Character Database](http://www.unicode.org/reports/tr44/), and has been tested on Lua
7
- 5.2.3 and LuaJIT.
8
-
9
- parseucd.lua is a pure Lua script that generates unidata.h, which is used to convert
10
- characters and to check characters' categories.
11
-
12
- It is mainly meant to be compatible with Lua's own string module; it passes all
13
- string and pattern matching tests in the Lua test suite[2].
14
-
15
- It also add some useful routines against UTF-8 features, some like:
16
- - a convenient interface to escape Unicode sequence in string.
17
- - string insert/remove, since UTF-8 substring extract may expensive.
18
- - calculate Unicode width, useful when implement e.g. console emulator.
19
- - a useful interface to translate Unicode offset and byte offset.
20
-
21
- [2]: http://www.lua.org/tests/5.2/
22
-
23
-
24
- LuaRocks Installation
25
- ---------------------
26
- `luarocks install utf8`
27
-
28
- Usage
29
- -----
30
-
31
- Many routines are same as Lua's string module:
32
- - `utf8.byte`
33
- - `utf8.char`
34
- - `utf8.find`
35
- - `utf8.gmatch`
36
- - `utf8.gsub`
37
- - `utf8.len`
38
- - `utf8.lower`
39
- - `utf8.match`
40
- - `utf8.reverse`
41
- - `utf8.sub`
42
- - `utf8.upper`
43
-
44
- The documentation for these functions can be found in the Lua manual[3].
45
-
46
- [3]: http://www.lua.org/manual/5.2/manual.html#6.4
47
-
48
-
49
- Some routines in string module needn't support Unicode:
50
- - `string.dump`
51
- - `string.format`
52
- - `string.rep`
53
-
54
- They are NOT in utf8 module.
55
-
56
- Some routines are new, with some Unicode-spec functions:
57
-
58
- ###utf8.escape(str) -> utf8 string
59
- escape a str to UTF-8 format string. It support several escape format:
60
-
61
- %ddd - which ddd is a decimal number at any length:
62
- change Unicode code point to UTF-8 format.
63
- %{ddd} - same as %ddd but with braces around the number.
64
- %uddd - same as %ddd, u stands Unicode
65
- %u{ddd} - same as %{ddd}
66
- %xhhh - hexadigit version of %ddd
67
- %x{hhh} same as %xhhh.
68
- %? - '?' stands for any other character: escape this character.
69
-
70
- ####Examples:
71
- ```
72
- local u = utf8.escape
73
- print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
74
- print(u"%%123%?%d%%u")
75
- ```
76
-
77
- ###utf8.charpos(s[[, charpos], offset]) -> charpos, code point
78
- convert UTF-8 position to byte offset.
79
- if only offset is given, return byte offset of this UTF-8 char index.
80
- if charpos and offset is given, a new charpos will calculate, by
81
- add/subtract UTF-8 char offset to current charpos.
82
- in all case, it return a new char position, and code point (a number) at
83
- this position.
84
-
85
- ###utf8.next(s[, charpos[, offset]]) -> charpos, code point
86
- iterate through the UTF-8 string s.
87
- If only s is given, it can be used as an iterator:
88
- ```
89
- for pos, code in utf8.next, "utf8-string" do
90
- -- ...
91
- end
92
- ```
93
- if only charpos is given, return the next byte offset in the string.
94
- if charpos and offset is given, a new charpos will calculate, by
95
- add/subtract UTF-8 char offset to current charpos.
96
- in all case, it return a new char position, and code point (a number) at
97
- this position.
98
-
99
-
100
- ###utf8.insert(s[, idx], substring) -> new_string
101
- insert a substring to s. If idx is given, insert substring before char at
102
- this index, otherwise substring will concat to s. idx can be negative.
103
-
104
-
105
- ###utf8.remove(s[, start[, stop]]) -> new_string
106
- delete a substring in s. If neither start nor stop is given, delete the
107
- last UTF-8 char in s, otherwise delete char from start to end of s. if
108
- stop is given, delete char from start to stop (include start and stop).
109
- start and stop can be negative.
110
-
111
-
112
- ###utf8.width(s[, ambi_is_double[, default_width]]) -> width
113
- calculate the width of UTF-8 string s. if ambi_is_double is given, the
114
- ambiguous width character's width is 2, otherwise it's 1.
115
- fullwidth/doublewidth character's width is 2, and other character's width
116
- is 1.
117
- if default_width is given, it will be the width of unprintable character,
118
- used display a non-character mark for these characters.
119
- if s is a code point, return the width of this code point.
120
-
121
-
122
- ###utf8.widthindex(s, location[, ambi_is_double[, default_width]]) -> idx, offset, width
123
- return the character index at given location in string s. this is a
124
- reverse operation of utf8.width().
125
- this function returns the index of location, and an offset in the UTF-8
126
- encoding. e.g. if cursor is at the second column (middle) of the wide
127
- char, offset will be 2. the width of character at idx is returned, also.
128
-
129
-
130
- ###utf8.title(s) -> new_string
131
- ###utf8.fold(s) -> new_string
132
- convert UTF-8 string s to title-case, or folded case used to compare by
133
- ignore case.
134
- if s is a number, it's treat as a code point and return a convert code
135
- point (number). utf8.lower/utf8.upper has the same extension.
136
-
137
-
138
- ###utf8.ncasecmp(a, b) -> [-1,0,1]
139
- compare a and b without case, -1 means a < b, 0 means a == b and 1 means a > b.
140
-
141
-
142
- Improvement needed
143
- ------------------
144
-
145
- - more test case.
146
- - grapheme-compose support, and affect in utf8.reverse and utf8.width
147
- - Unicode normalize algorithm implement.
148
-
149
-
150
- License
151
- -------
152
- It use same license with Lua: http://www.lua.org/license.html
@@ -1,1274 +0,0 @@
1
- /* Modified to allow bundling.
2
- * Original source: https://github.com/starwing/luautf8 */
3
- /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */
4
- #define LUA_LIB
5
- #include "lua.h"
6
- #include "lauxlib.h"
7
- #include "lualib.h"
8
-
9
-
10
- #include <assert.h>
11
- #include <string.h>
12
-
13
-
14
- /* UTF-8 string operations */
15
-
16
- #define UTF_MAX 8
17
-
18
/*
 * Encode code point `ch` into `s` using the original (pre-RFC 3629) UTF-8
 * scheme, which allows sequences of up to 6 bytes for values up to
 * 0x7FFFFFFF.  Values above that range are replaced by U+FFFD.
 *
 * `s` must have room for at least 6 bytes.  No terminator is written.
 * Returns the number of bytes stored (1..6).
 */
static size_t utf8_encode(char *s, unsigned int ch) {
  size_t len, i;
  unsigned int lead;

  if (ch < 0x80) {
    s[0] = (char)ch;
    return 1;
  }

  /* Not representable even in 6 bytes: emit the replacement character. */
  if (ch > 0x7FFFFFFF)
    ch = 0xFFFD;

  /* Pick the sequence length and the matching lead-byte marker. */
  if (ch <= 0x7FF)          { len = 2; lead = 0xC0; }
  else if (ch <= 0xFFFF)    { len = 3; lead = 0xE0; }
  else if (ch <= 0x1FFFFF)  { len = 4; lead = 0xF0; }
  else if (ch <= 0x3FFFFFF) { len = 5; lead = 0xF8; }
  else                      { len = 6; lead = 0xFC; }

  /* Fill continuation bytes from the tail, 6 payload bits at a time. */
  for (i = len - 1; i > 0; --i) {
    s[i] = (char)(0x80 | (ch & 0x3F));
    ch >>= 6;
  }
  /* Whatever is left fits in the lead byte's payload bits. */
  s[0] = (char)(ch | lead);
  return len;
}
64
-
65
/*
 * Decode one code point from the byte range [s, e).
 *
 * On success the code point is stored in *pch and the sequence length
 * (2..6) is returned.  An empty range stores 0 and returns 0.  Anything
 * that is not a well-formed multi-byte sequence (ASCII, a stray
 * continuation byte, a truncated or malformed sequence, or a lead byte
 * announcing more than 6 bytes) falls back to "one byte, as-is": *pch
 * receives the first byte's value and 1 is returned, so callers always
 * make forward progress.
 *
 * Fixes relative to the previous version of the long-sequence path:
 *  - the accumulator is initialised before use,
 *  - the decoded value is actually stored in *pch,
 *  - continuation bytes are never read at or beyond `e`,
 *  - the fallback reports the original lead byte rather than a value
 *    that was shifted while scanning.
 */
static size_t utf8_decode(const char *s, const char *e, unsigned int *pch) {
  unsigned int lead;

  if (s >= e) {
    *pch = 0;
    return 0;
  }

  lead = (unsigned char)s[0];
  if (lead < 0xC0)
    goto fallback;  /* ASCII or a stray continuation byte */

  if (lead < 0xE0) {  /* 2-byte sequence */
    if (e - s < 2 || (s[1] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((lead & 0x1F) << 6) |
           (s[1] & 0x3F);
    return 2;
  }

  if (lead < 0xF0) {  /* 3-byte sequence */
    if (e - s < 3 || (s[1] & 0xC0) != 0x80
                  || (s[2] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((lead & 0x0F) << 12) |
           ((s[1] & 0x3F) << 6) |
           (s[2] & 0x3F);
    return 3;
  }

  {  /* 4..6-byte sequence */
    unsigned int bits = lead;  /* shifted copy; `lead` stays intact */
    unsigned int res = 0;
    int count = 0;             /* number of continuation bytes consumed */

    while ((bits & 0x40) != 0) {  /* lead byte announces another byte? */
      unsigned int cc;
      /* Reject over-long announcements and sequences cut off by `e`
       * before touching the next byte. */
      if (++count > 5 || e - s <= count)
        goto fallback;
      cc = (unsigned char)s[count];
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte */
        goto fallback;
      res = (res << 6) | (cc & 0x3F);
      bits <<= 1;  /* expose the next length bit at position 6 */
    }
    /* After `count` shifts the low 7 bits of `bits` hold the lead byte's
     * payload moved left by `count`; 5 more bits per continuation byte
     * put it just above the 6*count bits gathered so far. */
    res |= (bits & 0x7F) << (count * 5);
    *pch = res;
    return (size_t)count + 1;
  }

fallback:
  *pch = lead;
  return 1;
}
111
-
112
/*
 * Return a pointer just past the character that starts at `s`, never
 * reading beyond `e`.  The step size is whatever utf8_decode() reports
 * for [s, e); the decoded code point itself is discarded.
 */
static const char *utf8_next(const char *s, const char *e) {
  unsigned int ignored;
  size_t step = utf8_decode(s, e, &ignored);
  return s + step;
}
116
-
117
/*
 * Scan backwards from `e` and return the start of the last character in
 * [s, e): the nearest byte before `e` that is not a continuation byte
 * (10xxxxxx).  If every byte in the range is a continuation byte, or the
 * range is empty, `s` is returned.
 */
static const char *utf8_prev(const char *s, const char *e) {
  const char *p = e;

  while (p > s) {
    unsigned int byte = (unsigned char)*--p;
    if ((byte & 0xC0) != 0x80)  /* ASCII or a lead byte */
      return p;
  }
  return s;
}
129
-
130
/*
 * Count the characters in [s, e).  Bytes below 0xC0 (ASCII and stray
 * continuation bytes) each count as one character; anything else is
 * stepped over with utf8_next().
 */
static size_t utf8_length(const char *s, const char *e) {
  size_t count;

  for (count = 0; s < e; ++count) {
    unsigned int lead = (unsigned char)*s;
    s = (lead >= 0xC0) ? utf8_next(s, e) : s + 1;
  }
  return count;
}
141
-
142
/*
 * Translate a 1-based character index into a byte pointer within [s, e).
 *
 * For idx >= 0 the result is the start of character number idx, walking
 * forward from `s` (idx 0 and 1 both yield `s`); the walk stops early at
 * `e`.  For idx < 0 the result is the start of the |idx|-th character
 * counted from the end, walking backward from `e`.
 */
static const char *utf8_index(const char *s, const char *e, int idx) {
  if (idx < 0) {
    /* One backward step per unit of |idx|. */
    for (; s < e && idx < 0; ++idx)
      e = utf8_prev(s, e);
    return e;
  }

  /* idx - 1 forward steps (none when idx is 0 or 1). */
  for (; s < e && idx > 1; --idx)
    s = utf8_next(s, e);
  return s;
}
154
-
155
-
156
- /* Unicode character categories */
157
-
158
- #include "unidata.h"
159
-
160
- static int find_in_range(range_table *t, size_t size, unsigned int ch) {
161
- size_t first, last;
162
-
163
- first = 0;
164
- last = size;
165
-
166
- while (first < last) {
167
- int mid = (first + last) / 2;
168
- if (t[mid].last < ch)
169
- first = mid + 1;
170
- else if (t[mid].first > ch)
171
- last = mid;
172
- else
173
- return (ch - t[mid].first) % t[mid].step == 0;
174
- }
175
-
176
- return 0;
177
- }
178
-
179
- static int convert_char(conv_table *t, size_t size, unsigned int ch) {
180
- size_t first, last;
181
-
182
- first = 0;
183
- last = size;
184
-
185
- while (first < last) {
186
- int mid = (first + last) / 2;
187
- if (t[mid].last < ch)
188
- first = mid + 1;
189
- else if (t[mid].first > ch)
190
- last = mid;
191
- else if ((ch - t[mid].first) % t[mid].step == 0)
192
- return ch + t[mid].offset;
193
- else
194
- return ch;
195
- }
196
-
197
- return ch;
198
- }
199
-
200
- #define table_size(t) (sizeof(t)/sizeof((t)[0]))
201
-
202
- #define define_category(name) static int utf8_is##name(unsigned int ch) \
203
- { return find_in_range(name##_table, table_size(name##_table), ch); }
204
-
205
- #define define_converter(name) static unsigned int utf8_##name(unsigned int ch) \
206
- { return convert_char(name##_table, table_size(name##_table), ch); }
207
-
208
- define_category(alpha)
209
- define_category(lower)
210
- define_category(upper)
211
- define_category(cntrl)
212
- define_category(digit)
213
- define_category(xdigit)
214
- define_category(punct)
215
- define_category(space)
216
- define_converter(tolower)
217
- define_converter(toupper)
218
- define_converter(totitle)
219
- define_converter(tofold)
220
-
221
- #undef define_category
222
- #undef define_converter
223
-
224
- static int utf8_isgraph(unsigned int ch) {
225
- if (find_in_range(space_table, table_size(space_table), ch))
226
- return 0;
227
- if (find_in_range(graph_table, table_size(graph_table), ch))
228
- return 1;
229
- if (find_in_range(compose_table, table_size(compose_table), ch))
230
- return 1;
231
- return 0;
232
- }
233
-
234
- static int utf8_isalnum(unsigned int ch) {
235
- if (find_in_range(alpha_table, table_size(alpha_table), ch))
236
- return 1;
237
- if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
238
- return 1;
239
- return 0;
240
- }
241
-
242
- static int utf8_width(unsigned int ch, int ambi_is_single) {
243
- if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
244
- return 2;
245
- if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
246
- return ambi_is_single ? 1 : 2;
247
- if (find_in_range(compose_table, table_size(compose_table), ch))
248
- return 0;
249
- if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
250
- return 0;
251
- return 1;
252
- }
253
-
254
-
255
- /* string module compatible interface */
256
-
257
- static const char *check_utf8(lua_State *L, int idx, const char **end) {
258
- size_t len;
259
- const char *s = luaL_checklstring(L, idx, &len);
260
- if (end) *end = s+len;
261
- return s;
262
- }
263
-
264
- static const char *to_utf8(lua_State *L, int idx, const char **end) {
265
- size_t len;
266
- const char *s = lua_tolstring(L, idx, &len);
267
- if (end) *end = s+len;
268
- return s;
269
- }
270
-
271
- static void add_utf8char(luaL_Buffer *b, unsigned int ch) {
272
- char buff[UTF_MAX];
273
- size_t n = utf8_encode(buff, ch);
274
- luaL_addlstring(b, buff, n);
275
- }
276
-
277
- static lua_Integer byterelat(lua_Integer pos, size_t len) {
278
- if (pos >= 0) return pos;
279
- else if (0u - (size_t)pos > len) return 0;
280
- else return (lua_Integer)len + pos + 1;
281
- }
282
-
283
- static int u_posrange(const char **ps, const char **pe,
284
- lua_Integer posi, lua_Integer posj) {
285
- const char *s = *ps, *e = *pe;
286
- *ps = utf8_index(s, e, posi);
287
- if (posj >= 0) {
288
- while (s < e && posj-- > 0)
289
- s = utf8_next(s, e);
290
- *pe = s;
291
- }
292
- else {
293
- while (s < e && ++posj < 0)
294
- e = utf8_prev(s, e);
295
- *pe = e;
296
- }
297
- return *ps < *pe;
298
- }
299
-
300
- static int Lutf8_len(lua_State *L) {
301
- size_t len;
302
- const char *s = luaL_checklstring(L, 1, &len);
303
- lua_Integer posi = byterelat(luaL_optinteger(L, 2, 1), len);
304
- lua_Integer posj = byterelat(luaL_optinteger(L, 3, -1), len);
305
- if (posi < 1 || --posi > (lua_Integer)len
306
- || --posj > (lua_Integer)len)
307
- return 0;
308
- lua_pushinteger(L, (lua_Integer)utf8_length(s+posi, s+posj+1));
309
- return 1;
310
- }
311
-
312
- static int Lutf8_sub(lua_State *L) {
313
- const char *e, *s = check_utf8(L, 1, &e);
314
- if (u_posrange(&s, &e,
315
- luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
316
- lua_pushlstring(L, s, e-s);
317
- else
318
- lua_pushliteral(L, "");
319
- return 1;
320
- }
321
-
322
- static int Lutf8_reverse(lua_State *L) {
323
- luaL_Buffer b;
324
- /* XXX should handle compose unicode? */
325
- const char *e, *s = check_utf8(L, 1, &e);
326
- luaL_buffinit(L, &b);
327
- while (s < e) {
328
- const char *prev = utf8_prev(s, e);
329
- luaL_addlstring(&b, prev, e-prev);
330
- e = prev;
331
- }
332
- luaL_pushresult(&b);
333
- return 1;
334
- }
335
-
336
- static int convert(lua_State *L, unsigned int (*conv)(unsigned int)) {
337
- int t = lua_type(L, 1);
338
- if (t == LUA_TNUMBER)
339
- lua_pushinteger(L, conv(lua_tointeger(L, 1)));
340
- else if (t != LUA_TSTRING)
341
- return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
342
- else {
343
- luaL_Buffer b;
344
- const char *e, *s = to_utf8(L, 1, &e);
345
- luaL_buffinit(L, &b);
346
- while (s < e) {
347
- unsigned int ch;
348
- s += utf8_decode(s, e, &ch);
349
- ch = conv(ch);
350
- add_utf8char(&b, ch);
351
- }
352
- luaL_pushresult(&b);
353
- }
354
- return 1;
355
- }
356
-
357
- static int Lutf8_lower(lua_State *L)
358
- { return convert(L, utf8_tolower); }
359
-
360
- static int Lutf8_upper(lua_State *L)
361
- { return convert(L, utf8_toupper); }
362
-
363
- static int Lutf8_title(lua_State *L)
364
- { return convert(L, utf8_totitle); }
365
-
366
- static int Lutf8_fold(lua_State *L)
367
- { return convert(L, utf8_tofold); }
368
-
369
- static int Lutf8_byte(lua_State *L) {
370
- size_t n = 0;
371
- const char *e, *s = check_utf8(L, 1, &e);
372
- lua_Integer posi = luaL_optinteger(L, 2, 1);
373
- lua_Integer posj = luaL_optinteger(L, 3, posi);
374
- if (u_posrange(&s, &e, posi, posj)) {
375
- luaL_checkstack(L, e-s, "string slice too long");
376
- while (s < e) {
377
- unsigned int ch;
378
- s += utf8_decode(s, e, &ch);
379
- lua_pushinteger(L, ch);
380
- ++n;
381
- }
382
- }
383
- return n;
384
- }
385
-
386
- static int Lutf8_char(lua_State *L) {
387
- int i, n = lua_gettop(L); /* number of arguments */
388
- luaL_Buffer b;
389
- luaL_buffinit(L, &b);
390
- for (i = 1; i <= n; ++i) {
391
- unsigned int ch = luaL_checkint(L, i);
392
- add_utf8char(&b, ch);
393
- }
394
- luaL_pushresult(&b);
395
- return 1;
396
- }
397
-
398
-
399
- /* unicode extra interface */
400
-
401
- static const char *parse_escape(lua_State *L,
402
- const char *s, const char *e,
403
- int is_hex, unsigned int *pch) {
404
- unsigned int escape = 0, ch;
405
- int in_bracket = 0;
406
- if (*s == '{') ++s, in_bracket = 1;
407
- while (s < e) {
408
- ch = (unsigned char)*s;
409
- if (in_bracket && ch == '}') {
410
- ++s;
411
- break;
412
- }
413
- if (ch >= '0' && ch <= '9')
414
- ch = ch - '0';
415
- else if (is_hex && ch >= 'A' && ch <= 'F')
416
- ch = 10 + (ch - 'A');
417
- else if (is_hex && ch >= 'a' && ch <= 'f')
418
- ch = 10 + (ch - 'a');
419
- else {
420
- if (in_bracket)
421
- luaL_error(L, "invalid escape '%c'", ch);
422
- break;
423
- }
424
- escape *= is_hex ? 16 : 10;
425
- escape += ch;
426
- ++s;
427
- }
428
- *pch = escape;
429
- return s;
430
- }
431
-
432
- static int Lutf8_escape(lua_State *L) {
433
- const char *e, *s = check_utf8(L, 1, &e);
434
- luaL_Buffer b;
435
- luaL_buffinit(L, &b);
436
- while (s < e) {
437
- unsigned int ch;
438
- s += utf8_decode(s, e, &ch);
439
- if (ch == '%') {
440
- int is_hex = 0;
441
- switch (*s) {
442
- case '0': case '1': case '2': case '3':
443
- case '4': case '5': case '6': case '7':
444
- case '8': case '9': case '{':
445
- break;
446
- case 'u': case 'U': ++s; break;
447
- case 'x': case 'X': ++s; is_hex = 1; break;
448
- default:
449
- s += utf8_decode(s, e, &ch);
450
- goto next;
451
- }
452
- if (s >= e)
453
- luaL_error(L, "invalid escape sequence");
454
- s = parse_escape(L, s, e, is_hex, &ch);
455
- }
456
- next:
457
- add_utf8char(&b, ch);
458
- }
459
- luaL_pushresult(&b);
460
- return 1;
461
- }
462
-
463
- static int Lutf8_insert(lua_State *L) {
464
- const char *e, *s = check_utf8(L, 1, &e);
465
- size_t sublen;
466
- const char *subs;
467
- luaL_Buffer b;
468
- int nargs = 2;
469
- const char *first = e;
470
- if (lua_type(L, 2) == LUA_TNUMBER) {
471
- int idx = (int)lua_tointeger(L, 2);
472
- if (idx != 0) first = utf8_index(s, e, idx);
473
- ++nargs;
474
- }
475
- subs = luaL_checklstring(L, nargs, &sublen);
476
- luaL_buffinit(L, &b);
477
- luaL_addlstring(&b, s, first-s);
478
- luaL_addlstring(&b, subs, sublen);
479
- luaL_addlstring(&b, first, e-first);
480
- luaL_pushresult(&b);
481
- return 1;
482
- }
483
-
484
- static int Lutf8_remove(lua_State *L) {
485
- const char *e, *s = check_utf8(L, 1, &e);
486
- const char *start = s, *end = e;
487
- if (!u_posrange(&start, &end,
488
- luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
489
- lua_settop(L, 1);
490
- else {
491
- luaL_Buffer b;
492
- luaL_buffinit(L, &b);
493
- luaL_addlstring(&b, s, start-s);
494
- luaL_addlstring(&b, end, e-end);
495
- luaL_pushresult(&b);
496
- }
497
- return 1;
498
- }
499
-
500
- static int push_offset(lua_State *L, const char *s, const char *e,
501
- const char *cur, lua_Integer offset) {
502
- unsigned int ch;
503
- if (offset >= 0) {
504
- while (cur < e && offset-- > 0)
505
- cur = utf8_next(cur, e);
506
- if (offset >= 0) return 0;
507
- }
508
- else {
509
- while (s < cur && offset++ < 0)
510
- cur = utf8_prev(s, cur);
511
- if (offset < 0) return 0;
512
- }
513
- utf8_decode(cur, e, &ch);
514
- lua_pushinteger(L, cur-s+1);
515
- lua_pushinteger(L, ch);
516
- return 2;
517
- }
518
-
519
- static int Lutf8_charpos(lua_State *L) {
520
- size_t len;
521
- const char *s = luaL_checklstring(L, 1, &len);
522
- const char *cur = s;
523
- lua_Integer pos;
524
- if (lua_isnoneornil(L, 3)) {
525
- lua_Integer offset = luaL_optinteger(L, 2, 1);
526
- if (offset > 0) --offset;
527
- else if (offset < 0) cur = s+len;
528
- return push_offset(L, s, s+len, cur, offset);
529
- }
530
- pos = byterelat(luaL_optinteger(L, 2, 1), len);
531
- if (pos != 0) cur += pos-1;
532
- return push_offset(L, s, s+len, cur, luaL_checkinteger(L, 3));
533
- }
534
-
535
- static int Lutf8_next(lua_State *L) {
536
- size_t len;
537
- const char *s = luaL_checklstring(L, 1, &len);
538
- const char *cur = s;
539
- lua_Integer offset = 0;
540
- if (!lua_isnoneornil(L, 2)) {
541
- lua_Integer pos = byterelat(luaL_checkinteger(L, 2), len);
542
- if (pos != 0) cur += pos-1;
543
- offset = 1;
544
- }
545
- offset = luaL_optinteger(L, 3, offset);
546
- return push_offset(L, s, s+len, cur, offset);
547
- }
548
-
549
- static int Lutf8_width(lua_State *L) {
550
- int t = lua_type(L, 1);
551
- int ambi_is_single = !lua_toboolean(L, 2);
552
- int default_width = luaL_optinteger(L, 3, 0);
553
- if (t == LUA_TNUMBER) {
554
- size_t chwidth = utf8_width(lua_tointeger(L, 1), ambi_is_single);
555
- if (chwidth == 0) chwidth = default_width;
556
- lua_pushinteger(L, (lua_Integer)chwidth);
557
- }
558
- else if (t != LUA_TSTRING)
559
- return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
560
- else {
561
- const char *e, *s = to_utf8(L, 1, &e);
562
- size_t width = 0;
563
- while (s < e) {
564
- unsigned int ch;
565
- size_t chwidth;
566
- s += utf8_decode(s, e, &ch);
567
- chwidth = utf8_width(ch, ambi_is_single);
568
- width += chwidth == 0 ? default_width : chwidth;
569
- }
570
- lua_pushinteger(L, (lua_Integer)width);
571
- }
572
- return 1;
573
- }
574
-
575
- static int Lutf8_widthindex(lua_State *L) {
576
- const char *e, *s = check_utf8(L, 1, &e);
577
- int width = luaL_checkinteger(L, 2);
578
- int ambi_is_single = !lua_toboolean(L, 3);
579
- int default_width = luaL_optinteger(L, 4, 0);
580
- size_t idx = 1;
581
- while (s < e) {
582
- unsigned int ch;
583
- size_t chwidth;
584
- s += utf8_decode(s, e, &ch);
585
- chwidth = utf8_width(ch, ambi_is_single);
586
- if (chwidth == 0) chwidth = default_width;
587
- width -= chwidth;
588
- if (width <= 0) {
589
- lua_pushinteger(L, idx);
590
- lua_pushinteger(L, width + chwidth);
591
- lua_pushinteger(L, chwidth);
592
- return 3;
593
- }
594
- ++idx;
595
- }
596
- lua_pushinteger(L, (lua_Integer)idx);
597
- return 1;
598
- }
599
-
600
- static int Lutf8_ncasecmp(lua_State *L) {
601
- const char *e1, *s1 = check_utf8(L, 1, &e1);
602
- const char *e2, *s2 = check_utf8(L, 2, &e2);
603
- while (s1 < e1 || s2 < e2) {
604
- unsigned int ch1 = 0, ch2 = 0;
605
- if (s1 == e1)
606
- ch2 = 1;
607
- else if (s2 == e2)
608
- ch1 = 1;
609
- else {
610
- s1 += utf8_decode(s1, e1, &ch1);
611
- s2 += utf8_decode(s2, e2, &ch2);
612
- ch1 = utf8_tofold(ch1);
613
- ch2 = utf8_tofold(ch2);
614
- }
615
- if (ch1 != ch2) {
616
- lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
617
- return 1;
618
- }
619
- }
620
- lua_pushinteger(L, 0);
621
- return 1;
622
- }
623
-
624
-
625
- /* utf8 pattern matching implementation */
626
-
627
- #ifndef LUA_MAXCAPTURES /* keep a value supplied by the build, if any */
628
- # define LUA_MAXCAPTURES 32 /* size of utf8MatchState.capture[]; opening more captures raises "too many captures" */
629
- #endif /* LUA_MAXCAPTURES */
630
-
631
- #define CAP_UNFINISHED (-1) /* capture[].len marker: capture opened but not yet closed */
632
- #define CAP_POSITION (-2) /* capture[].len marker: position capture "()", pushed as an index instead of a substring */
633
-
634
-
635
- typedef struct utf8MatchState { /* state shared by all helpers during one pattern match */
636
- int matchdepth; /* remaining recursion budget of utf8_match (to avoid C stack overflow) */
637
- const char *src_init; /* start of the source string */
638
- const char *src_end; /* end ('\0') of source string */
639
- const char *p_end; /* end ('\0') of pattern */
640
- lua_State *L; /* Lua state used for raising errors and pushing captures */
641
- int level; /* total number of captures (finished or unfinished) */
642
- struct {
643
- const char *init; /* where the capture starts in the source string */
644
- ptrdiff_t len; /* byte length, or CAP_UNFINISHED / CAP_POSITION */
645
- } capture[LUA_MAXCAPTURES];
646
- } utf8MatchState;
647
-
648
- /* forward declaration: utf8_match and its helpers call each other recursively */
649
- static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p);
650
-
651
- /* maximum recursion depth for 'match'; initial value of utf8MatchState.matchdepth */
652
- #if !defined(MAXCCALLS)
653
- #define MAXCCALLS 200
654
- #endif
655
-
656
- #define L_ESC '%' /* escape character used in patterns and replacement strings */
657
- #define SPECIALS "^$*+?.([%-" /* characters that make a pattern non-plain; scanned for by nospecials() */
658
-
659
- static int utf8_check_capture (utf8MatchState *ms, int l) {
660
- l -= '1';
661
- if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
662
- return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
663
- return l;
664
- }
665
-
666
- static int utf8_capture_to_close (utf8MatchState *ms) {
667
- int level = ms->level;
668
- for (level--; level>=0; level--)
669
- if (ms->capture[level].len == CAP_UNFINISHED) return level;
670
- return luaL_error(ms->L, "invalid pattern capture");
671
- }
672
-
673
- static const char *utf8_classend (utf8MatchState *ms, const char *p) {
674
- unsigned int ch;
675
- p += utf8_decode(p, ms->p_end, &ch);
676
- switch (ch) {
677
- case L_ESC: {
678
- if (p == ms->p_end)
679
- luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
680
- return utf8_next(p, ms->p_end);
681
- }
682
- case '[': {
683
- if (*p == '^') p++;
684
- do { /* look for a `]' */
685
- if (p == ms->p_end)
686
- luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
687
- if (*(p++) == L_ESC && p < ms->p_end)
688
- p++; /* skip escapes (e.g. `%]') */
689
- } while (*p != ']');
690
- return p+1;
691
- }
692
- default: {
693
- return p;
694
- }
695
- }
696
- }
697
-
698
/*
 * Tests character `c` against the class letter `cl` that followed '%'.
 * A lower-case letter selects the class, an upper-case letter its
 * complement.  When `cl` is not a class letter, `c` has to equal `cl`.
 */
static int utf8_match_class (unsigned int c, unsigned int cl) {
  int member;
  switch (utf8_tolower(cl)) {
    case 'a': member = utf8_isalpha(c);  break;
    case 'c': member = utf8_iscntrl(c);  break;
    case 'd': member = utf8_isdigit(c);  break;
    case 'g': member = utf8_isgraph(c);  break;
    case 'l': member = utf8_islower(c);  break;
    case 'p': member = utf8_ispunct(c);  break;
    case 's': member = utf8_isspace(c);  break;
    case 'u': member = utf8_isupper(c);  break;
    case 'w': member = utf8_isalnum(c);  break;
    case 'x': member = utf8_isxdigit(c); break;
    case 'z': member = (c == 0);         break;  /* deprecated option */
    default:
      return (cl == c);
  }
  if (utf8_islower(cl))
    return member;
  return !member;
}
716
-
717
- static int utf8_matchbracketclass (unsigned int c, const char *p, const char *ec) {
718
- int sig = 1;
719
- assert(*p == '[');
720
- if (*++p == '^') {
721
- sig = 0;
722
- p++; /* skip the `^' */
723
- }
724
- while (p < ec) {
725
- unsigned int ch;
726
- p += utf8_decode(p, ec, &ch);
727
- if (ch == L_ESC) {
728
- p += utf8_decode(p, ec, &ch);
729
- if (utf8_match_class(c, ch))
730
- return sig;
731
- }
732
- else {
733
- unsigned int next;
734
- const char *np = p + utf8_decode(p, ec, &next);
735
- if (next == '-' && np < ec) {
736
- p = np + utf8_decode(np, ec, &next);
737
- if (ch <= c && c <= next)
738
- return sig;
739
- }
740
- else if (ch == c) return sig;
741
- }
742
- }
743
- return !sig;
744
- }
745
-
746
- static int utf8_singlematch (utf8MatchState *ms, const char *s, const char *p,
747
- const char *ep) {
748
- if (s >= ms->src_end)
749
- return 0;
750
- else {
751
- unsigned int ch, pch;
752
- utf8_decode(s, ms->src_end, &ch);
753
- p += utf8_decode(p, ms->p_end, &pch);
754
- switch (pch) {
755
- case '.': return 1; /* matches any char */
756
- case L_ESC: utf8_decode(p, ms->p_end, &pch);
757
- return utf8_match_class(ch, pch);
758
- case '[': return utf8_matchbracketclass(ch, p-1, ep-1);
759
- default: return pch == ch;
760
- }
761
- }
762
- }
763
-
764
- static const char *utf8_matchbalance (utf8MatchState *ms, const char *s,
765
- const char **p) {
766
- unsigned int ch, begin, end;
767
- *p += utf8_decode(*p, ms->p_end, &begin);
768
- if (*p >= ms->p_end)
769
- luaL_error(ms->L, "malformed pattern "
770
- "(missing arguments to " LUA_QL("%%b") ")");
771
- *p += utf8_decode(*p, ms->p_end, &end);
772
- s += utf8_decode(s, ms->src_end, &ch);
773
- if (ch != begin) return NULL;
774
- else {
775
- int cont = 1;
776
- while (s < ms->src_end) {
777
- s += utf8_decode(s, ms->src_end, &ch);
778
- if (ch == end) {
779
- if (--cont == 0) return s;
780
- }
781
- else if (ch == begin) cont++;
782
- }
783
- }
784
- return NULL; /* string ends out of balance */
785
- }
786
-
787
- static const char *utf8_max_expand (utf8MatchState *ms, const char *s,
788
- const char *p, const char *ep) {
789
- const char *m = s; /* matched end of single match p */
790
- while (utf8_singlematch(ms, m, p, ep))
791
- m = utf8_next(m, ms->src_end);
792
- /* keeps trying to match with the maximum repetitions */
793
- while (s <= m) {
794
- const char *res = utf8_match(ms, m, ep+1);
795
- if (res) return res;
796
- /* else didn't match; reduce 1 repetition to try again */
797
- if (s == m) break;
798
- m = utf8_prev(s, m);
799
- }
800
- return NULL;
801
- }
802
-
803
- static const char *utf8_min_expand (utf8MatchState *ms, const char *s,
804
- const char *p, const char *ep) {
805
- for (;;) {
806
- const char *res = utf8_match(ms, s, ep+1);
807
- if (res != NULL)
808
- return res;
809
- else if (utf8_singlematch(ms, s, p, ep))
810
- s = utf8_next(s, ms->src_end); /* try with one more repetition */
811
- else return NULL;
812
- }
813
- }
814
-
815
- static const char *utf8_start_capture (utf8MatchState *ms, const char *s,
816
- const char *p, int what) {
817
- const char *res;
818
- int level = ms->level;
819
- if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
820
- ms->capture[level].init = s;
821
- ms->capture[level].len = what;
822
- ms->level = level+1;
823
- if ((res=utf8_match(ms, s, p)) == NULL) /* match failed? */
824
- ms->level--; /* undo capture */
825
- return res;
826
- }
827
-
828
- static const char *utf8_end_capture (utf8MatchState *ms, const char *s,
829
- const char *p) {
830
- int l = utf8_capture_to_close(ms);
831
- const char *res;
832
- ms->capture[l].len = s - ms->capture[l].init; /* close capture */
833
- if ((res = utf8_match(ms, s, p)) == NULL) /* match failed? */
834
- ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
835
- return res;
836
- }
837
-
838
- static const char *utf8_match_capture (utf8MatchState *ms, const char *s, int l) {
839
- size_t len;
840
- l = utf8_check_capture(ms, l);
841
- len = ms->capture[l].len;
842
- if ((size_t)(ms->src_end-s) >= len &&
843
- memcmp(ms->capture[l].init, s, len) == 0)
844
- return s+len;
845
- else return NULL;
846
- }
847
-
848
- static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p) {
849
- if (ms->matchdepth-- == 0)
850
- luaL_error(ms->L, "pattern too complex");
851
- init: /* using goto's to optimize tail recursion */
852
- if (p != ms->p_end) { /* end of pattern? */
853
- unsigned int ch;
854
- utf8_decode(p, ms->p_end, &ch);
855
- switch (ch) {
856
- case '(': { /* start capture */
857
- if (*(p + 1) == ')') /* position capture? */
858
- s = utf8_start_capture(ms, s, p + 2, CAP_POSITION);
859
- else
860
- s = utf8_start_capture(ms, s, p + 1, CAP_UNFINISHED);
861
- break;
862
- }
863
- case ')': { /* end capture */
864
- s = utf8_end_capture(ms, s, p + 1);
865
- break;
866
- }
867
- case '$': {
868
- if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
869
- goto dflt; /* no; go to default */
870
- s = (s == ms->src_end) ? s : NULL; /* check end of string */
871
- break;
872
- }
873
- case L_ESC: { /* escaped sequence not in the format class[*+?-]? */
874
- const char *prev_p = p;
875
- p += utf8_decode(p+1, ms->p_end, &ch) + 1;
876
- switch (ch) {
877
- case 'b': { /* balanced string? */
878
- s = utf8_matchbalance(ms, s, &p);
879
- if (s != NULL)
880
- goto init; /* return utf8_match(ms, s, p + 4); */
881
- /* else fail (s == NULL) */
882
- break;
883
- }
884
- case 'f': { /* frontier? */
885
- const char *ep; unsigned int previous = 0, current = 0;
886
- if (*p != '[')
887
- luaL_error(ms->L, "missing " LUA_QL("[") " after "
888
- LUA_QL("%%f") " in pattern");
889
- ep = utf8_classend(ms, p); /* points to what is next */
890
- if (s != ms->src_init)
891
- utf8_decode(utf8_prev(ms->src_init, s), ms->src_end, &previous);
892
- if (s != ms->src_end)
893
- utf8_decode(s, ms->src_end, &current);
894
- if (!utf8_matchbracketclass(previous, p, ep - 1) &&
895
- utf8_matchbracketclass(current, p, ep - 1)) {
896
- p = ep; goto init; /* return utf8_match(ms, s, ep); */
897
- }
898
- s = NULL; /* match failed */
899
- break;
900
- }
901
- case '0': case '1': case '2': case '3':
902
- case '4': case '5': case '6': case '7':
903
- case '8': case '9': { /* capture results (%0-%9)? */
904
- s = utf8_match_capture(ms, s, ch - '1');
905
- if (s != NULL) goto init; /* return utf8_match(ms, s, p + 2) */
906
- break;
907
- }
908
- default: p = prev_p; goto dflt;
909
- }
910
- break;
911
- }
912
- default: dflt: { /* pattern class plus optional suffix */
913
- const char *ep = utf8_classend(ms, p); /* points to optional suffix */
914
- /* does not match at least once? */
915
- if (!utf8_singlematch(ms, s, p, ep)) {
916
- if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
917
- p = ep + 1; goto init; /* return utf8_match(ms, s, ep + 1); */
918
- }
919
- else /* '+' or no suffix */
920
- s = NULL; /* fail */
921
- }
922
- else { /* matched once */
923
- const char *next_s = utf8_next(s, ms->src_end);
924
- switch (*ep) { /* handle optional suffix */
925
- case '?': { /* optional */
926
- const char *res;
927
- const char *next_ep = utf8_next(ep, ms->p_end);
928
- if ((res = utf8_match(ms, next_s, next_ep)) != NULL)
929
- s = res;
930
- else {
931
- p = next_ep; goto init; /* else return utf8_match(ms, s, ep + 1); */
932
- }
933
- break;
934
- }
935
- case '+': /* 1 or more repetitions */
936
- s = next_s; /* 1 match already done */
937
- /* go through */
938
- case '*': /* 0 or more repetitions */
939
- s = utf8_max_expand(ms, s, p, ep);
940
- break;
941
- case '-': /* 0 or more repetitions (minimum) */
942
- s = utf8_min_expand(ms, s, p, ep);
943
- break;
944
- default: /* no suffix */
945
- s = next_s; p = ep; goto init; /* return utf8_match(ms, s + 1, ep); */
946
- }
947
- }
948
- break;
949
- }
950
- }
951
- }
952
- ms->matchdepth++;
953
- return s;
954
- }
955
-
956
/*
 * Byte-wise substring search that tolerates embedded zero bytes.
 * Looks for the l2 bytes at `s2` inside the l1 bytes at `s1` and returns a
 * pointer to the first occurrence, or NULL.  An empty needle is found at
 * `s1`.
 */
static const char *utf8_lmemfind (const char *s1, size_t l1,
                                  const char *s2, size_t l2) {
  size_t rest;   /* needle length without its first byte */
  size_t span;   /* number of positions where the needle can still start */
  if (l2 == 0)
    return s1;  /* empty strings are everywhere */
  if (l2 > l1)
    return NULL;  /* needle longer than haystack */
  rest = l2 - 1;
  span = l1 - rest;
  while (span > 0) {
    /* memchr locates the first byte, memcmp verifies the remainder */
    const char *hit = (const char *)memchr(s1, *s2, span);
    if (hit == NULL)
      break;
    if (memcmp(hit + 1, s2 + 1, rest) == 0)
      return hit;
    span -= (hit + 1) - s1;  /* continue right after the failed candidate */
    s1 = hit + 1;
  }
  return NULL;  /* not found */
}
976
-
977
/*
 * Walks the characters of [s, e) until the byte address `p` is reached or
 * passed.  Returns the character boundary where the walk stopped and, when
 * `pidx` is not NULL, stores how many characters were stepped over.  If
 * `p` lies inside a character the walk stops at the boundary after it and
 * the stored count refers to the character containing `p`.
 */
static const char *utf8_get_index(const char *p, const char *s, const char *e, int *pidx) {
  int steps = 0;
  for (; s < e; s = utf8_next(s, e), ++steps) {
    if (s == p)
      break;
    if (s > p) {  /* overshot: p was not on a character boundary */
      --steps;
      break;
    }
  }
  if (pidx)
    *pidx = steps;
  return s;
}
992
-
993
- static void utf8_push_onecapture (utf8MatchState *ms, int i, const char *s,
994
- const char *e) {
995
- if (i >= ms->level) {
996
- if (i == 0) /* ms->level == 0, too */
997
- lua_pushlstring(ms->L, s, e - s); /* add whole match */
998
- else
999
- luaL_error(ms->L, "invalid capture index");
1000
- }
1001
- else {
1002
- ptrdiff_t l = ms->capture[i].len;
1003
- if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1004
- if (l == CAP_POSITION) {
1005
- int idx;
1006
- utf8_get_index(ms->capture[i].init, ms->src_init, ms->src_end, &idx);
1007
- lua_pushinteger(ms->L, idx+1);
1008
- } else
1009
- lua_pushlstring(ms->L, ms->capture[i].init, l);
1010
- }
1011
- }
1012
-
1013
- static int utf8_push_captures (utf8MatchState *ms, const char *s, const char *e) {
1014
- int i;
1015
- int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1016
- luaL_checkstack(ms->L, nlevels, "too many captures");
1017
- for (i = 0; i < nlevels; i++)
1018
- utf8_push_onecapture(ms, i, s, e);
1019
- return nlevels; /* number of strings pushed */
1020
- }
1021
-
1022
- /* check whether pattern has no special characters */
1023
- static int nospecials (const char *p, const char * ep) {
1024
- while (p < ep) {
1025
- if (strpbrk(p, SPECIALS))
1026
- return 0; /* pattern has a special character */
1027
- p += strlen(p) + 1; /* may have more after \0 */
1028
- }
1029
- return 1; /* no special chars found */
1030
- }
1031
-
1032
-
1033
- /* utf8 pattern matching interface */
1034
-
1035
- static int find_aux (lua_State *L, int find) {
1036
- const char *es, *s = check_utf8(L, 1, &es);
1037
- const char *ep, *p = check_utf8(L, 2, &ep);
1038
- lua_Integer idx = luaL_optinteger(L, 3, 1);
1039
- const char *init;
1040
- size_t slen = utf8_length(s, es);
1041
- if (idx > 0 && idx > (lua_Integer)slen + 1) { /* start after string's end? */
1042
- lua_pushnil(L); /* cannot find anything */
1043
- return 1;
1044
- }
1045
- if (idx < 0) idx += utf8_length(s, es) + 1;
1046
- init = utf8_index(s, es, idx);
1047
- /* explicit request or no special characters? */
1048
- if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1049
- /* do a plain search */
1050
- do {
1051
- const char *s2 = utf8_lmemfind(init, es-init, p, ep-p);
1052
- if (!s2) break;
1053
- else {
1054
- int relidx;
1055
- const char *pch = utf8_get_index(s2, init, es, &relidx);
1056
- if (pch == s2) {
1057
- lua_pushinteger(L, idx + relidx);
1058
- lua_pushinteger(L, idx + relidx + utf8_length(p, ep) - 1);
1059
- return 2;
1060
- }
1061
- idx += relidx + 1;
1062
- init = utf8_next(pch, es);
1063
- }
1064
- } while (init < es);
1065
- }
1066
- else {
1067
- utf8MatchState ms;
1068
- int anchor = (*p == '^');
1069
- if (anchor) p++; /* skip anchor character */
1070
- ms.L = L;
1071
- ms.matchdepth = MAXCCALLS;
1072
- ms.src_init = s;
1073
- ms.src_end = es;
1074
- ms.p_end = ep;
1075
- do {
1076
- const char *res;
1077
- ms.level = 0;
1078
- assert(ms.matchdepth == MAXCCALLS);
1079
- if ((res=utf8_match(&ms, init, p)) != NULL) {
1080
- if (find) {
1081
- lua_pushinteger(L, idx); /* start */
1082
- lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1083
- return utf8_push_captures(&ms, NULL, 0) + 2;
1084
- }
1085
- else
1086
- return utf8_push_captures(&ms, init, res);
1087
- }
1088
- if (init == es) break;
1089
- idx += 1;
1090
- init = utf8_next(init, es);
1091
- } while (init <= es && !anchor);
1092
- }
1093
- lua_pushnil(L); /* not found */
1094
- return 1;
1095
- }
1096
-
1097
- static int Lutf8_find(lua_State *L)
1098
- { return find_aux(L, 1); }
1099
-
1100
- static int Lutf8_match(lua_State *L)
1101
- { return find_aux(L, 0); }
1102
-
1103
- static int utf8_gmatch_aux (lua_State *L) {
1104
- utf8MatchState ms;
1105
- const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1106
- const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1107
- const char *src;
1108
- ms.L = L;
1109
- ms.matchdepth = MAXCCALLS;
1110
- ms.src_init = s;
1111
- ms.src_end = es;
1112
- ms.p_end = ep;
1113
- for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1114
- src <= ms.src_end;
1115
- src = utf8_next(src, ms.src_end)) {
1116
- const char *e;
1117
- ms.level = 0;
1118
- assert(ms.matchdepth == MAXCCALLS);
1119
- if ((e = utf8_match(&ms, src, p)) != NULL) {
1120
- lua_Integer newstart = e-s;
1121
- if (e == src) newstart++; /* empty match? go at least one position */
1122
- lua_pushinteger(L, newstart);
1123
- lua_replace(L, lua_upvalueindex(3));
1124
- return utf8_push_captures(&ms, src, e);
1125
- }
1126
- if (src == ms.src_end) break;
1127
- }
1128
- return 0; /* not found */
1129
- }
1130
-
1131
- static int Lutf8_gmatch(lua_State *L) {
1132
- luaL_checkstring(L, 1);
1133
- luaL_checkstring(L, 2);
1134
- lua_settop(L, 2);
1135
- lua_pushinteger(L, 0);
1136
- lua_pushcclosure(L, utf8_gmatch_aux, 3);
1137
- return 1;
1138
- }
1139
-
1140
- static void utf8_add_s (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1141
- const char *e) {
1142
- const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1143
- while (news < new_end) {
1144
- unsigned int ch;
1145
- news += utf8_decode(news, new_end, &ch);
1146
- if (ch != L_ESC)
1147
- add_utf8char(b, ch);
1148
- else {
1149
- news += utf8_decode(news, new_end, &ch); /* skip ESC */
1150
- if (!utf8_isdigit(ch)) {
1151
- if (ch != L_ESC)
1152
- luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1153
- " in replacement string", L_ESC);
1154
- add_utf8char(b, ch);
1155
- }
1156
- else if (ch == '0')
1157
- luaL_addlstring(b, s, e-s);
1158
- else {
1159
- utf8_push_onecapture(ms, ch-'1', s, e);
1160
- luaL_addvalue(b); /* add capture to accumulated result */
1161
- }
1162
- }
1163
- }
1164
- }
1165
-
1166
- static void utf8_add_value (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1167
- const char *e, int tr) {
1168
- lua_State *L = ms->L;
1169
- switch (tr) {
1170
- case LUA_TFUNCTION: {
1171
- int n;
1172
- lua_pushvalue(L, 3);
1173
- n = utf8_push_captures(ms, s, e);
1174
- lua_call(L, n, 1);
1175
- break;
1176
- }
1177
- case LUA_TTABLE: {
1178
- utf8_push_onecapture(ms, 0, s, e);
1179
- lua_gettable(L, 3);
1180
- break;
1181
- }
1182
- default: { /* LUA_TNUMBER or LUA_TSTRING */
1183
- utf8_add_s(ms, b, s, e);
1184
- return;
1185
- }
1186
- }
1187
- if (!lua_toboolean(L, -1)) { /* nil or false? */
1188
- lua_pop(L, 1);
1189
- lua_pushlstring(L, s, e - s); /* keep original text */
1190
- }
1191
- else if (!lua_isstring(L, -1))
1192
- luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1193
- luaL_addvalue(b); /* add result to accumulator */
1194
- }
1195
-
1196
- static int Lutf8_gsub(lua_State *L) {
1197
- const char *es, *s = check_utf8(L, 1, &es);
1198
- const char *ep, *p = check_utf8(L, 2, &ep);
1199
- int tr = lua_type(L, 3);
1200
- lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1201
- int anchor = (*p == '^');
1202
- lua_Integer n = 0;
1203
- utf8MatchState ms;
1204
- luaL_Buffer b;
1205
- luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1206
- tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1207
- "string/function/table expected");
1208
- luaL_buffinit(L, &b);
1209
- if (anchor) p++; /* skip anchor character */
1210
- ms.L = L;
1211
- ms.matchdepth = MAXCCALLS;
1212
- ms.src_init = s;
1213
- ms.src_end = es;
1214
- ms.p_end = ep;
1215
- while (n < max_s) {
1216
- const char *e;
1217
- ms.level = 0;
1218
- assert(ms.matchdepth == MAXCCALLS);
1219
- e = utf8_match(&ms, s, p);
1220
- if (e) {
1221
- n++;
1222
- utf8_add_value(&ms, &b, s, e, tr);
1223
- }
1224
- if (e && e > s) /* non empty match? */
1225
- s = e; /* skip it */
1226
- else if (s < es) {
1227
- unsigned int ch;
1228
- s += utf8_decode(s, es, &ch);
1229
- add_utf8char(&b, ch);
1230
- }
1231
- else break;
1232
- if (anchor) break;
1233
- }
1234
- luaL_addlstring(&b, s, es-s);
1235
- luaL_pushresult(&b);
1236
- lua_pushinteger(L, n); /* number of substitutions */
1237
- return 2;
1238
- }
1239
-
1240
-
1241
- /* lua module import interface */
1242
-
1243
- LUALIB_API int luaopen_utf8(lua_State *L) {
1244
- luaL_Reg libs[] = {
1245
- #define ENTRY(name) { #name, Lutf8_##name }
1246
- ENTRY(len),
1247
- ENTRY(sub),
1248
- ENTRY(reverse),
1249
- ENTRY(lower),
1250
- ENTRY(upper),
1251
- ENTRY(title),
1252
- ENTRY(fold),
1253
- ENTRY(byte),
1254
- ENTRY(char),
1255
- ENTRY(escape),
1256
- ENTRY(insert),
1257
- ENTRY(remove),
1258
- ENTRY(charpos),
1259
- ENTRY(next),
1260
- ENTRY(width),
1261
- ENTRY(widthindex),
1262
- ENTRY(ncasecmp),
1263
- ENTRY(find),
1264
- ENTRY(gmatch),
1265
- ENTRY(gsub),
1266
- ENTRY(match),
1267
- #undef ENTRY
1268
- { NULL, NULL }
1269
- };
1270
-
1271
- luaL_register(L, "utf8", libs);
1272
-
1273
- return 1;
1274
- }