immunio 1.2.1 → 2.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (291) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +13 -5
  3. data/ext/immunio/Rakefile +14 -6
  4. data/lib/immunio/context.rb +2 -0
  5. data/lib/immunio/plugins/action_view.rb +7 -668
  6. data/lib/immunio/plugins/action_view/action_view.rb +22 -0
  7. data/lib/immunio/plugins/action_view/active_support_hash.rb +29 -0
  8. data/lib/immunio/plugins/action_view/cache_store.rb +24 -0
  9. data/lib/immunio/plugins/action_view/erubi.rb +38 -0
  10. data/lib/immunio/plugins/action_view/erubis.rb +39 -0
  11. data/lib/immunio/plugins/action_view/fragment_caching.rb +29 -0
  12. data/lib/immunio/plugins/action_view/haml.rb +46 -0
  13. data/lib/immunio/plugins/action_view/slim.rb +42 -0
  14. data/lib/immunio/plugins/action_view/template.rb +431 -0
  15. data/lib/immunio/plugins/action_view/template_rendering.rb +45 -0
  16. data/lib/immunio/plugins/http_tracker.rb +2 -0
  17. data/lib/immunio/plugins/io.rb +34 -0
  18. data/lib/immunio/version.rb +1 -1
  19. data/lua-hooks/Makefile +36 -9
  20. data/lua-hooks/ext/luajit/COPYRIGHT +1 -1
  21. data/lua-hooks/ext/luajit/Makefile +22 -15
  22. data/lua-hooks/ext/luajit/README +2 -2
  23. data/lua-hooks/ext/luajit/doc/bluequad-print.css +1 -1
  24. data/lua-hooks/ext/luajit/doc/bluequad.css +1 -1
  25. data/lua-hooks/ext/luajit/doc/changes.html +69 -3
  26. data/lua-hooks/ext/luajit/doc/contact.html +10 -3
  27. data/lua-hooks/ext/luajit/doc/ext_c_api.html +2 -2
  28. data/lua-hooks/ext/luajit/doc/ext_ffi.html +2 -2
  29. data/lua-hooks/ext/luajit/doc/ext_ffi_api.html +2 -2
  30. data/lua-hooks/ext/luajit/doc/ext_ffi_semantics.html +3 -4
  31. data/lua-hooks/ext/luajit/doc/ext_ffi_tutorial.html +2 -2
  32. data/lua-hooks/ext/luajit/doc/ext_jit.html +3 -3
  33. data/lua-hooks/ext/luajit/doc/ext_profiler.html +2 -2
  34. data/lua-hooks/ext/luajit/doc/extensions.html +47 -20
  35. data/lua-hooks/ext/luajit/doc/faq.html +2 -2
  36. data/lua-hooks/ext/luajit/doc/install.html +74 -45
  37. data/lua-hooks/ext/luajit/doc/luajit.html +5 -5
  38. data/lua-hooks/ext/luajit/doc/running.html +3 -3
  39. data/lua-hooks/ext/luajit/doc/status.html +13 -8
  40. data/lua-hooks/ext/luajit/dynasm/dasm_arm.h +1 -1
  41. data/lua-hooks/ext/luajit/dynasm/dasm_arm.lua +1 -1
  42. data/lua-hooks/ext/luajit/dynasm/dasm_arm64.h +1 -1
  43. data/lua-hooks/ext/luajit/dynasm/dasm_arm64.lua +1 -1
  44. data/lua-hooks/ext/luajit/dynasm/dasm_mips.h +8 -5
  45. data/lua-hooks/ext/luajit/dynasm/dasm_mips.lua +66 -11
  46. data/lua-hooks/ext/luajit/dynasm/dasm_mips64.lua +12 -0
  47. data/lua-hooks/ext/luajit/dynasm/dasm_ppc.h +1 -1
  48. data/lua-hooks/ext/luajit/dynasm/dasm_ppc.lua +1 -1
  49. data/lua-hooks/ext/luajit/dynasm/dasm_proto.h +1 -1
  50. data/lua-hooks/ext/luajit/dynasm/dasm_x64.lua +1 -1
  51. data/lua-hooks/ext/luajit/dynasm/dasm_x86.h +1 -1
  52. data/lua-hooks/ext/luajit/dynasm/dasm_x86.lua +5 -1
  53. data/lua-hooks/ext/luajit/dynasm/dynasm.lua +2 -2
  54. data/lua-hooks/ext/luajit/etc/luajit.1 +1 -1
  55. data/lua-hooks/ext/luajit/etc/luajit.pc +1 -1
  56. data/lua-hooks/ext/luajit/src/Makefile +15 -11
  57. data/lua-hooks/ext/luajit/src/Makefile.dep +16 -16
  58. data/lua-hooks/ext/luajit/src/host/buildvm.c +2 -2
  59. data/lua-hooks/ext/luajit/src/host/buildvm.h +1 -1
  60. data/lua-hooks/ext/luajit/src/host/buildvm_asm.c +9 -4
  61. data/lua-hooks/ext/luajit/src/host/buildvm_fold.c +2 -2
  62. data/lua-hooks/ext/luajit/src/host/buildvm_lib.c +1 -1
  63. data/lua-hooks/ext/luajit/src/host/buildvm_libbc.h +14 -3
  64. data/lua-hooks/ext/luajit/src/host/buildvm_peobj.c +27 -3
  65. data/lua-hooks/ext/luajit/src/host/genlibbc.lua +1 -1
  66. data/lua-hooks/ext/luajit/src/host/genminilua.lua +6 -5
  67. data/lua-hooks/ext/luajit/src/host/minilua.c +1 -1
  68. data/lua-hooks/ext/luajit/src/jit/bc.lua +1 -1
  69. data/lua-hooks/ext/luajit/src/jit/bcsave.lua +8 -8
  70. data/lua-hooks/ext/luajit/src/jit/dis_arm.lua +2 -2
  71. data/lua-hooks/ext/luajit/src/jit/dis_arm64.lua +1216 -0
  72. data/lua-hooks/ext/luajit/src/jit/dis_arm64be.lua +12 -0
  73. data/lua-hooks/ext/luajit/src/jit/dis_mips.lua +35 -20
  74. data/lua-hooks/ext/luajit/src/jit/dis_mips64.lua +17 -0
  75. data/lua-hooks/ext/luajit/src/jit/dis_mips64el.lua +17 -0
  76. data/lua-hooks/ext/luajit/src/jit/dis_mipsel.lua +1 -1
  77. data/lua-hooks/ext/luajit/src/jit/dis_ppc.lua +2 -2
  78. data/lua-hooks/ext/luajit/src/jit/dis_x64.lua +1 -1
  79. data/lua-hooks/ext/luajit/src/jit/dis_x86.lua +7 -4
  80. data/lua-hooks/ext/luajit/src/jit/dump.lua +17 -12
  81. data/lua-hooks/ext/luajit/src/jit/p.lua +3 -2
  82. data/lua-hooks/ext/luajit/src/jit/v.lua +2 -2
  83. data/lua-hooks/ext/luajit/src/jit/zone.lua +1 -1
  84. data/lua-hooks/ext/luajit/src/lauxlib.h +14 -20
  85. data/lua-hooks/ext/luajit/src/lib_aux.c +38 -27
  86. data/lua-hooks/ext/luajit/src/lib_base.c +12 -5
  87. data/lua-hooks/ext/luajit/src/lib_bit.c +1 -1
  88. data/lua-hooks/ext/luajit/src/lib_debug.c +5 -5
  89. data/lua-hooks/ext/luajit/src/lib_ffi.c +2 -2
  90. data/lua-hooks/ext/luajit/src/lib_init.c +16 -16
  91. data/lua-hooks/ext/luajit/src/lib_io.c +6 -7
  92. data/lua-hooks/ext/luajit/src/lib_jit.c +14 -4
  93. data/lua-hooks/ext/luajit/src/lib_math.c +1 -5
  94. data/lua-hooks/ext/luajit/src/lib_os.c +1 -1
  95. data/lua-hooks/ext/luajit/src/lib_package.c +14 -23
  96. data/lua-hooks/ext/luajit/src/lib_string.c +1 -5
  97. data/lua-hooks/ext/luajit/src/lib_table.c +21 -1
  98. data/lua-hooks/ext/luajit/src/lj.supp +3 -3
  99. data/lua-hooks/ext/luajit/src/lj_alloc.c +174 -83
  100. data/lua-hooks/ext/luajit/src/lj_api.c +97 -18
  101. data/lua-hooks/ext/luajit/src/lj_arch.h +54 -22
  102. data/lua-hooks/ext/luajit/src/lj_asm.c +172 -53
  103. data/lua-hooks/ext/luajit/src/lj_asm.h +1 -1
  104. data/lua-hooks/ext/luajit/src/lj_asm_arm.h +19 -16
  105. data/lua-hooks/ext/luajit/src/lj_asm_arm64.h +2022 -0
  106. data/lua-hooks/ext/luajit/src/lj_asm_mips.h +564 -158
  107. data/lua-hooks/ext/luajit/src/lj_asm_ppc.h +19 -18
  108. data/lua-hooks/ext/luajit/src/lj_asm_x86.h +578 -92
  109. data/lua-hooks/ext/luajit/src/lj_bc.c +1 -1
  110. data/lua-hooks/ext/luajit/src/lj_bc.h +1 -1
  111. data/lua-hooks/ext/luajit/src/lj_bcdump.h +1 -1
  112. data/lua-hooks/ext/luajit/src/lj_bcread.c +1 -1
  113. data/lua-hooks/ext/luajit/src/lj_bcwrite.c +1 -1
  114. data/lua-hooks/ext/luajit/src/lj_buf.c +1 -1
  115. data/lua-hooks/ext/luajit/src/lj_buf.h +1 -1
  116. data/lua-hooks/ext/luajit/src/lj_carith.c +1 -1
  117. data/lua-hooks/ext/luajit/src/lj_carith.h +1 -1
  118. data/lua-hooks/ext/luajit/src/lj_ccall.c +172 -7
  119. data/lua-hooks/ext/luajit/src/lj_ccall.h +21 -5
  120. data/lua-hooks/ext/luajit/src/lj_ccallback.c +71 -17
  121. data/lua-hooks/ext/luajit/src/lj_ccallback.h +1 -1
  122. data/lua-hooks/ext/luajit/src/lj_cconv.c +4 -2
  123. data/lua-hooks/ext/luajit/src/lj_cconv.h +1 -1
  124. data/lua-hooks/ext/luajit/src/lj_cdata.c +7 -5
  125. data/lua-hooks/ext/luajit/src/lj_cdata.h +1 -1
  126. data/lua-hooks/ext/luajit/src/lj_clib.c +5 -5
  127. data/lua-hooks/ext/luajit/src/lj_clib.h +1 -1
  128. data/lua-hooks/ext/luajit/src/lj_cparse.c +11 -6
  129. data/lua-hooks/ext/luajit/src/lj_cparse.h +1 -1
  130. data/lua-hooks/ext/luajit/src/lj_crecord.c +70 -14
  131. data/lua-hooks/ext/luajit/src/lj_crecord.h +1 -1
  132. data/lua-hooks/ext/luajit/src/lj_ctype.c +1 -1
  133. data/lua-hooks/ext/luajit/src/lj_ctype.h +8 -8
  134. data/lua-hooks/ext/luajit/src/lj_debug.c +1 -1
  135. data/lua-hooks/ext/luajit/src/lj_debug.h +1 -1
  136. data/lua-hooks/ext/luajit/src/lj_def.h +6 -9
  137. data/lua-hooks/ext/luajit/src/lj_dispatch.c +3 -3
  138. data/lua-hooks/ext/luajit/src/lj_dispatch.h +2 -1
  139. data/lua-hooks/ext/luajit/src/lj_emit_arm.h +5 -4
  140. data/lua-hooks/ext/luajit/src/lj_emit_arm64.h +419 -0
  141. data/lua-hooks/ext/luajit/src/lj_emit_mips.h +100 -20
  142. data/lua-hooks/ext/luajit/src/lj_emit_ppc.h +4 -4
  143. data/lua-hooks/ext/luajit/src/lj_emit_x86.h +116 -25
  144. data/lua-hooks/ext/luajit/src/lj_err.c +34 -13
  145. data/lua-hooks/ext/luajit/src/lj_err.h +1 -1
  146. data/lua-hooks/ext/luajit/src/lj_errmsg.h +1 -1
  147. data/lua-hooks/ext/luajit/src/lj_ff.h +1 -1
  148. data/lua-hooks/ext/luajit/src/lj_ffrecord.c +58 -49
  149. data/lua-hooks/ext/luajit/src/lj_ffrecord.h +1 -1
  150. data/lua-hooks/ext/luajit/src/lj_frame.h +33 -6
  151. data/lua-hooks/ext/luajit/src/lj_func.c +4 -2
  152. data/lua-hooks/ext/luajit/src/lj_func.h +1 -1
  153. data/lua-hooks/ext/luajit/src/lj_gc.c +16 -7
  154. data/lua-hooks/ext/luajit/src/lj_gc.h +1 -1
  155. data/lua-hooks/ext/luajit/src/lj_gdbjit.c +31 -1
  156. data/lua-hooks/ext/luajit/src/lj_gdbjit.h +1 -1
  157. data/lua-hooks/ext/luajit/src/lj_ir.c +69 -96
  158. data/lua-hooks/ext/luajit/src/lj_ir.h +29 -18
  159. data/lua-hooks/ext/luajit/src/lj_ircall.h +24 -30
  160. data/lua-hooks/ext/luajit/src/lj_iropt.h +9 -9
  161. data/lua-hooks/ext/luajit/src/lj_jit.h +67 -9
  162. data/lua-hooks/ext/luajit/src/lj_lex.c +1 -1
  163. data/lua-hooks/ext/luajit/src/lj_lex.h +1 -1
  164. data/lua-hooks/ext/luajit/src/lj_lib.c +1 -1
  165. data/lua-hooks/ext/luajit/src/lj_lib.h +1 -1
  166. data/lua-hooks/ext/luajit/src/lj_load.c +1 -1
  167. data/lua-hooks/ext/luajit/src/lj_mcode.c +11 -10
  168. data/lua-hooks/ext/luajit/src/lj_mcode.h +1 -1
  169. data/lua-hooks/ext/luajit/src/lj_meta.c +1 -1
  170. data/lua-hooks/ext/luajit/src/lj_meta.h +1 -1
  171. data/lua-hooks/ext/luajit/src/lj_obj.c +1 -1
  172. data/lua-hooks/ext/luajit/src/lj_obj.h +7 -3
  173. data/lua-hooks/ext/luajit/src/lj_opt_dce.c +1 -1
  174. data/lua-hooks/ext/luajit/src/lj_opt_fold.c +84 -17
  175. data/lua-hooks/ext/luajit/src/lj_opt_loop.c +1 -1
  176. data/lua-hooks/ext/luajit/src/lj_opt_mem.c +3 -3
  177. data/lua-hooks/ext/luajit/src/lj_opt_narrow.c +24 -22
  178. data/lua-hooks/ext/luajit/src/lj_opt_sink.c +11 -6
  179. data/lua-hooks/ext/luajit/src/lj_opt_split.c +11 -2
  180. data/lua-hooks/ext/luajit/src/lj_parse.c +9 -7
  181. data/lua-hooks/ext/luajit/src/lj_parse.h +1 -1
  182. data/lua-hooks/ext/luajit/src/lj_profile.c +1 -1
  183. data/lua-hooks/ext/luajit/src/lj_profile.h +1 -1
  184. data/lua-hooks/ext/luajit/src/lj_record.c +201 -117
  185. data/lua-hooks/ext/luajit/src/lj_record.h +1 -1
  186. data/lua-hooks/ext/luajit/src/lj_snap.c +72 -26
  187. data/lua-hooks/ext/luajit/src/lj_snap.h +1 -1
  188. data/lua-hooks/ext/luajit/src/lj_state.c +6 -6
  189. data/lua-hooks/ext/luajit/src/lj_state.h +2 -2
  190. data/lua-hooks/ext/luajit/src/lj_str.c +1 -1
  191. data/lua-hooks/ext/luajit/src/lj_str.h +1 -1
  192. data/lua-hooks/ext/luajit/src/lj_strfmt.c +7 -3
  193. data/lua-hooks/ext/luajit/src/lj_strfmt.h +1 -1
  194. data/lua-hooks/ext/luajit/src/lj_strfmt_num.c +4 -3
  195. data/lua-hooks/ext/luajit/src/lj_strscan.c +1 -1
  196. data/lua-hooks/ext/luajit/src/lj_strscan.h +1 -1
  197. data/lua-hooks/ext/luajit/src/lj_tab.c +1 -2
  198. data/lua-hooks/ext/luajit/src/lj_tab.h +1 -1
  199. data/lua-hooks/ext/luajit/src/lj_target.h +3 -3
  200. data/lua-hooks/ext/luajit/src/lj_target_arm.h +1 -1
  201. data/lua-hooks/ext/luajit/src/lj_target_arm64.h +239 -7
  202. data/lua-hooks/ext/luajit/src/lj_target_mips.h +111 -22
  203. data/lua-hooks/ext/luajit/src/lj_target_ppc.h +1 -1
  204. data/lua-hooks/ext/luajit/src/lj_target_x86.h +21 -4
  205. data/lua-hooks/ext/luajit/src/lj_trace.c +63 -18
  206. data/lua-hooks/ext/luajit/src/lj_trace.h +2 -1
  207. data/lua-hooks/ext/luajit/src/lj_traceerr.h +1 -1
  208. data/lua-hooks/ext/luajit/src/lj_udata.c +1 -1
  209. data/lua-hooks/ext/luajit/src/lj_udata.h +1 -1
  210. data/lua-hooks/ext/luajit/src/lj_vm.h +5 -1
  211. data/lua-hooks/ext/luajit/src/lj_vmevent.c +1 -1
  212. data/lua-hooks/ext/luajit/src/lj_vmevent.h +1 -1
  213. data/lua-hooks/ext/luajit/src/lj_vmmath.c +1 -1
  214. data/lua-hooks/ext/luajit/src/ljamalg.c +1 -1
  215. data/lua-hooks/ext/luajit/src/lua.h +9 -1
  216. data/lua-hooks/ext/luajit/src/luaconf.h +3 -7
  217. data/lua-hooks/ext/luajit/src/luajit.c +69 -54
  218. data/lua-hooks/ext/luajit/src/luajit.h +4 -4
  219. data/lua-hooks/ext/luajit/src/lualib.h +1 -1
  220. data/lua-hooks/ext/luajit/src/msvcbuild.bat +12 -4
  221. data/lua-hooks/ext/luajit/src/vm_arm.dasc +1 -1
  222. data/lua-hooks/ext/luajit/src/vm_arm64.dasc +255 -32
  223. data/lua-hooks/ext/luajit/src/vm_mips.dasc +26 -23
  224. data/lua-hooks/ext/luajit/src/vm_mips64.dasc +5062 -0
  225. data/lua-hooks/ext/luajit/src/vm_ppc.dasc +1 -1
  226. data/lua-hooks/ext/luajit/src/vm_x64.dasc +24 -25
  227. data/lua-hooks/ext/luajit/src/vm_x86.dasc +77 -4
  228. data/lua-hooks/libluahooks.darwin.a +0 -0
  229. data/lua-hooks/libluahooks.linux.a +0 -0
  230. data/lua-hooks/options.mk +1 -1
  231. metadata +37 -77
  232. data/lua-hooks/ext/all.c +0 -69
  233. data/lua-hooks/ext/libinjection/COPYING +0 -37
  234. data/lua-hooks/ext/libinjection/libinjection.h +0 -65
  235. data/lua-hooks/ext/libinjection/libinjection_html5.c +0 -847
  236. data/lua-hooks/ext/libinjection/libinjection_html5.h +0 -54
  237. data/lua-hooks/ext/libinjection/libinjection_sqli.c +0 -2301
  238. data/lua-hooks/ext/libinjection/libinjection_sqli.h +0 -295
  239. data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +0 -9349
  240. data/lua-hooks/ext/libinjection/libinjection_xss.c +0 -531
  241. data/lua-hooks/ext/libinjection/libinjection_xss.h +0 -21
  242. data/lua-hooks/ext/libinjection/lualib.c +0 -145
  243. data/lua-hooks/ext/libinjection/module.mk +0 -5
  244. data/lua-hooks/ext/lpeg/HISTORY +0 -96
  245. data/lua-hooks/ext/lpeg/lpcap.c +0 -537
  246. data/lua-hooks/ext/lpeg/lpcap.h +0 -56
  247. data/lua-hooks/ext/lpeg/lpcode.c +0 -1014
  248. data/lua-hooks/ext/lpeg/lpcode.h +0 -40
  249. data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
  250. data/lua-hooks/ext/lpeg/lpeg.html +0 -1445
  251. data/lua-hooks/ext/lpeg/lpprint.c +0 -244
  252. data/lua-hooks/ext/lpeg/lpprint.h +0 -36
  253. data/lua-hooks/ext/lpeg/lptree.c +0 -1303
  254. data/lua-hooks/ext/lpeg/lptree.h +0 -82
  255. data/lua-hooks/ext/lpeg/lptypes.h +0 -149
  256. data/lua-hooks/ext/lpeg/lpvm.c +0 -364
  257. data/lua-hooks/ext/lpeg/lpvm.h +0 -58
  258. data/lua-hooks/ext/lpeg/makefile +0 -55
  259. data/lua-hooks/ext/lpeg/module.mk +0 -6
  260. data/lua-hooks/ext/lpeg/re.html +0 -498
  261. data/lua-hooks/ext/lua-cmsgpack/.gitignore +0 -13
  262. data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +0 -45
  263. data/lua-hooks/ext/lua-cmsgpack/README.md +0 -115
  264. data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +0 -970
  265. data/lua-hooks/ext/lua-cmsgpack/module.mk +0 -2
  266. data/lua-hooks/ext/lua-cmsgpack/test.lua +0 -570
  267. data/lua-hooks/ext/lua-snapshot/LICENSE +0 -7
  268. data/lua-hooks/ext/lua-snapshot/Makefile +0 -12
  269. data/lua-hooks/ext/lua-snapshot/README.md +0 -18
  270. data/lua-hooks/ext/lua-snapshot/dump.lua +0 -15
  271. data/lua-hooks/ext/lua-snapshot/module.mk +0 -2
  272. data/lua-hooks/ext/lua-snapshot/snapshot.c +0 -462
  273. data/lua-hooks/ext/luautf8/README.md +0 -152
  274. data/lua-hooks/ext/luautf8/lutf8lib.c +0 -1274
  275. data/lua-hooks/ext/luautf8/module.mk +0 -2
  276. data/lua-hooks/ext/luautf8/unidata.h +0 -3064
  277. data/lua-hooks/ext/module.mk +0 -15
  278. data/lua-hooks/ext/modules.h +0 -17
  279. data/lua-hooks/ext/perf/luacpu.c +0 -114
  280. data/lua-hooks/ext/perf/lualoadavg.c +0 -40
  281. data/lua-hooks/ext/perf/luameminfo.c +0 -38
  282. data/lua-hooks/ext/perf/luaoslib.c +0 -203
  283. data/lua-hooks/ext/perf/module.mk +0 -5
  284. data/lua-hooks/ext/sha1/luasha1.c +0 -74
  285. data/lua-hooks/ext/sha1/module.mk +0 -5
  286. data/lua-hooks/ext/sha1/sha1.c +0 -145
  287. data/lua-hooks/ext/sha2/luasha256.c +0 -77
  288. data/lua-hooks/ext/sha2/module.mk +0 -5
  289. data/lua-hooks/ext/sha2/sha256.c +0 -196
  290. data/lua-hooks/ext/sysutils/lua_utils.c +0 -56
  291. data/lua-hooks/ext/sysutils/module.mk +0 -2
@@ -1,152 +0,0 @@
1
- UTF-8 module for Lua 5.x
2
- ========================
3
-
4
- This module adds UTF-8 support to Lua.
5
-
6
- It uses data extracted from the [Unicode Character Database](http://www.unicode.org/reports/tr44/), and has been tested on Lua
7
- 5.2.3 and LuaJIT.
8
-
9
- parseucd.lua is a pure Lua script that generates unidata.h, which is used to convert
10
- characters and check characters' category.
11
-
12
- It is mainly intended to be compatible with Lua's own string module; it passes all
13
- string and pattern matching test in lua test suite[2].
14
-
15
- It also add some useful routines against UTF-8 features, some like:
16
- - a convenient interface to escape Unicode sequence in string.
17
- - string insert/remove, since UTF-8 substring extract may expensive.
18
- - calculate Unicode width, useful when implement e.g. console emulator.
19
- - a useful interface to translate Unicode offset and byte offset.
20
-
21
- [2]: http://www.lua.org/tests/5.2/
22
-
23
-
24
- LuaRocks Installation
25
- ---------------------
26
- `luarocks install utf8`
27
-
28
- Usage
29
- -----
30
-
31
- Many routines are same as Lua's string module:
32
- - `utf8.byte`
33
- - `utf8.char`
34
- - `utf8.find`
35
- - `utf8.gmatch`
36
- - `utf8.gsub`
37
- - `utf8.len`
38
- - `utf8.lower`
39
- - `utf8.match`
40
- - `utf8.reverse`
41
- - `utf8.sub`
42
- - `utf8.upper`
43
-
44
- The documentation of these functions can be found in the Lua manual[3].
45
-
46
- [3]: http://www.lua.org/manual/5.2/manual.html#6.4
47
-
48
-
49
- Some routines in string module needn't support Unicode:
50
- - `string.dump`
51
- - `string.format`
52
- - `string.rep`
53
-
54
- They are NOT in utf8 module.
55
-
56
- Some routines are new, with some Unicode-spec functions:
57
-
58
- ###utf8.escape(str) -> utf8 string
59
- escape a str to UTF-8 format string. It support several escape format:
60
-
61
- %ddd - which ddd is a decimal number at any length:
62
- change Unicode code point to UTF-8 format.
63
- %{ddd} - same as %ddd but with braces around the number.
64
- %uddd - same as %ddd, u stands Unicode
65
- %u{ddd} - same as %{ddd}
66
- %xhhh - hexadigit version of %ddd
67
- %x{hhh} same as %xhhh.
68
- %? - '?' stands for any other character: escape this character.
69
-
70
- ####Examples:
71
- ```
72
- local u = utf8.escape
73
- print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
74
- print(u"%%123%?%d%%u")
75
- ```
76
-
77
- ###utf8.charpos(s[[, charpos], offset]) -> charpos, code point
78
- convert UTF-8 position to byte offset.
79
- if only offset is given, return byte offset of this UTF-8 char index.
80
- if charpos and offset is given, a new charpos will calculate, by
81
- add/subtract UTF-8 char offset to current charpos.
82
- in all case, it return a new char position, and code point (a number) at
83
- this position.
84
-
85
- ###utf8.next(s[, charpos[, offset]]) -> charpos, code point
86
- iterate through the UTF-8 string s.
87
- If only s is given, it can be used as an iterator:
88
- ```
89
- for pos, code in utf8.next, "utf8-string" do
90
- -- ...
91
- end
92
- ```
93
- if only charpos is given, return the next byte offset of in string.
94
- if charpos and offset is given, a new charpos will calculate, by
95
- add/subtract UTF-8 char offset to current charpos.
96
- in all case, it return a new char position, and code point (a number) at
97
- this position.
98
-
99
-
100
- ###utf8.insert(s[, idx], substring) -> new_string
101
- insert a substring to s. If idx is given, insert substring before char at
102
- this index, otherwise substring will concat to s. idx can be negative.
103
-
104
-
105
- ###utf8.remove(s[, start[, stop]]) -> new_string
106
- delete a substring in s. If neither start nor stop is given, delete the
107
- last UTF-8 char in s, otherwise delete char from start to end of s. if
108
- stop is given, delete char from start to stop (include start and stop).
109
- start and stop can be negative.
110
-
111
-
112
- ###utf8.width(s[, ambi_is_double[, default_width]]) -> width
113
- calculate the width of UTF-8 string s. if ambi_is_double is given, the
114
- ambiguous width character's width is 2, otherwise it's 1.
115
- fullwidth/doublewidth character's width is 2, and other character's width
116
- is 1.
117
- if default_width is given, it will be the width of unprintable character,
118
- used display a non-character mark for these characters.
119
- if s is a code point, return the width of this code point.
120
-
121
-
122
- ###utf8.widthindex(s, location[, ambi_is_double[, default_width]]) -> idx, offset, width
123
- return the character index at given location in string s. this is a
124
- reverse operation of utf8.width().
125
- this function returns the index of location, and an offset in UTF-8
126
- encoding. e.g. if cursor is at the second column (middle) of the wide
127
- char, offset will be 2. the width of character at idx is returned, also.
128
-
129
-
130
- ###utf8.title(s) -> new_string
131
- ###utf8.fold(s) -> new_string
132
- convert UTF-8 string s to title-case, or folded case used to compare by
133
- ignore case.
134
- if s is a number, it's treat as a code point and return a convert code
135
- point (number). utf8.lower/utf8.upper has the same extension.
136
-
137
-
138
- ###utf8.ncasecmp(a, b) -> [-1,0,1]
139
- compare a and b without case, -1 means a < b, 0 means a == b and 1 means a > b.
140
-
141
-
142
- Improvement needed
143
- ------------------
144
-
145
- - more test case.
146
- - grapheme-compose support, and affect in utf8.reverse and utf8.width
147
- - Unicode normalize algorithm implement.
148
-
149
-
150
- License
151
- -------
152
- It uses the same license as Lua: http://www.lua.org/license.html
@@ -1,1274 +0,0 @@
1
- /* Modified to allow bundling.
2
- * Original source: https://github.com/starwing/luautf8 */
3
- /* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */
4
- #define LUA_LIB
5
- #include "lua.h"
6
- #include "lauxlib.h"
7
- #include "lualib.h"
8
-
9
-
10
- #include <assert.h>
11
- #include <string.h>
12
-
13
-
14
- /* UTF-8 string operations */
15
-
16
- #define UTF_MAX 8
17
-
18
/* Encode code point `ch` into `s` using the extended (up to 6-byte) UTF-8
 * scheme and return the number of bytes written (1..6).
 * Values above 0x7FFFFFFF cannot be represented and are written as U+FFFD.
 * `s` must have room for at least 6 bytes; no terminator is appended. */
static size_t utf8_encode(char *s, unsigned int ch) {
  /* lead-byte marker bits, indexed by total sequence length */
  static const unsigned char lead_prefix[] =
    { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  size_t len, i;

  if (ch < 0x80) {  /* plain ASCII: stored verbatim */
    s[0] = (char)ch;
    return 1;
  }
  if (ch > 0x7FFFFFFF)
    ch = 0xFFFD;  /* not encodable: fall back to the replacement character */

  if (ch <= 0x7FF)
    len = 2;
  else if (ch <= 0xFFFF)
    len = 3;
  else if (ch <= 0x1FFFFF)
    len = 4;
  else if (ch <= 0x3FFFFFF)
    len = 5;
  else
    len = 6;

  /* fill continuation bytes from the tail, six payload bits at a time */
  for (i = len - 1; i > 0; --i) {
    s[i] = (char)(0x80 | (ch & 0x3F));
    ch >>= 6;
  }
  /* whatever is left of ch fits below the lead-byte marker */
  s[0] = (char)(lead_prefix[len] | ch);
  return len;
}
64
-
65
/* Decode one character from the byte range [s, e).
 *
 * On success stores the code point in *pch and returns the number of bytes
 * consumed (2..6 for multi-byte sequences).
 * Bytes below 0xC0 (ASCII and stray continuation bytes) and any malformed or
 * truncated sequence are returned as-is: *pch receives the first byte and
 * the function returns 1.
 * When s >= e, stores 0 in *pch and returns 0.
 *
 * No overlong-encoding or surrogate validation is performed. */
static size_t utf8_decode(const char *s, const char *e, unsigned int *pch) {
  unsigned int ch;

  if (s >= e) {
    *pch = 0;
    return 0;
  }

  ch = (unsigned char)s[0];
  if (ch < 0xC0) goto fallback;
  if (ch < 0xE0) {
    if (s+1 >= e || (s[1] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch & 0x1F) << 6) |
            (s[1] & 0x3F);
    return 2;
  }
  if (ch < 0xF0) {
    if (s+2 >= e || (s[1] & 0xC0) != 0x80
                 || (s[2] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch & 0x0F) << 12) |
           ((s[1] & 0x3F) <<  6) |
            (s[2] & 0x3F);
    return 3;
  }
  {
    int count = 0;            /* number of continuation bytes read so far */
    unsigned int lead = ch;   /* shifted copy; ch stays intact for fallback */
    unsigned int res = 0;     /* accumulated payload bits */
    while ((lead & 0x40) != 0) {  /* still have continuation bytes? */
      unsigned int cc;
      /* at most 5 continuation bytes, and never read past the end */
      if (++count > 5 || s + count >= e)
        goto fallback;
      cc = (unsigned char)s[count];
      if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
        goto fallback;
      res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
      lead <<= 1;  /* to test next bit */
    }
    /* after `count` shifts the payload bits of the lead byte sit at
     * bit `count` and above; move them on top of the continuation bits */
    res |= ((lead & 0x7F) << (count * 5));
    *pch = res;
    return (size_t)count + 1;
  }

fallback:
  *pch = ch;
  return 1;
}
111
-
112
/* Return a pointer just past the character starting at `s`, as measured by
 * utf8_decode(); the decoded code point itself is discarded. */
static const char *utf8_next(const char *s, const char *e) {
  unsigned int discarded;
  size_t advance = utf8_decode(s, e, &discarded);
  return s + advance;
}
116
-
117
/* Walk backwards from `e` and return a pointer to the nearest byte in
 * [s, e) that is not a continuation byte (10xxxxxx), i.e. the start of the
 * last character. Returns `s` when the range is empty or contains only
 * continuation bytes.
 *
 * The cursor is only decremented while it is strictly above `s`, so no
 * pointer before the start of the buffer is ever formed. */
static const char *utf8_prev(const char *s, const char *e) {
  while (e > s) {
    unsigned int byte = (unsigned char)*--e;
    if ((byte & 0xC0) != 0x80)  /* ASCII or lead byte: character start */
      return e;
  }
  return s;
}
129
-
130
/* Count the characters in [s, e). A byte below 0xC0 counts as one character
 * by itself; anything else is skipped with utf8_next(). */
static size_t utf8_length(const char *s, const char *e) {
  size_t count;
  for (count = 0; s < e; ++count) {
    unsigned int byte = (unsigned char)*s;
    s = (byte < 0xC0) ? s + 1 : utf8_next(s, e);
  }
  return count;
}
141
-
142
/* Translate a character index into a pointer inside [s, e).
 * idx >= 0: skip idx-1 characters forward from `s` (0 and 1 both yield `s`).
 * idx <  0: step back -idx characters from `e` (-1 yields the start of the
 *           last character).
 * Both walks stop once the remaining range is empty. */
static const char *utf8_index(const char *s, const char *e, int idx) {
  if (idx < 0) {
    while (s < e && idx++ < 0)
      e = utf8_prev(s, e);
    return e;
  }
  while (s < e && --idx > 0)
    s = utf8_next(s, e);
  return s;
}
154
-
155
-
156
- /* Unicode character categories */
157
-
158
- #include "unidata.h"
159
-
160
- static int find_in_range(range_table *t, size_t size, unsigned int ch) {
161
- size_t first, last;
162
-
163
- first = 0;
164
- last = size;
165
-
166
- while (first < last) {
167
- int mid = (first + last) / 2;
168
- if (t[mid].last < ch)
169
- first = mid + 1;
170
- else if (t[mid].first > ch)
171
- last = mid;
172
- else
173
- return (ch - t[mid].first) % t[mid].step == 0;
174
- }
175
-
176
- return 0;
177
- }
178
-
179
- static int convert_char(conv_table *t, size_t size, unsigned int ch) {
180
- size_t first, last;
181
-
182
- first = 0;
183
- last = size;
184
-
185
- while (first < last) {
186
- int mid = (first + last) / 2;
187
- if (t[mid].last < ch)
188
- first = mid + 1;
189
- else if (t[mid].first > ch)
190
- last = mid;
191
- else if ((ch - t[mid].first) % t[mid].step == 0)
192
- return ch + t[mid].offset;
193
- else
194
- return ch;
195
- }
196
-
197
- return ch;
198
- }
199
-
200
- #define table_size(t) (sizeof(t)/sizeof((t)[0]))
201
-
202
- #define define_category(name) static int utf8_is##name(unsigned int ch) \
203
- { return find_in_range(name##_table, table_size(name##_table), ch); }
204
-
205
- #define define_converter(name) static unsigned int utf8_##name(unsigned int ch) \
206
- { return convert_char(name##_table, table_size(name##_table), ch); }
207
-
208
- define_category(alpha)
209
- define_category(lower)
210
- define_category(upper)
211
- define_category(cntrl)
212
- define_category(digit)
213
- define_category(xdigit)
214
- define_category(punct)
215
- define_category(space)
216
- define_converter(tolower)
217
- define_converter(toupper)
218
- define_converter(totitle)
219
- define_converter(tofold)
220
-
221
- #undef define_category
222
- #undef define_converter
223
-
224
- static int utf8_isgraph(unsigned int ch) {
225
- if (find_in_range(space_table, table_size(space_table), ch))
226
- return 0;
227
- if (find_in_range(graph_table, table_size(graph_table), ch))
228
- return 1;
229
- if (find_in_range(compose_table, table_size(compose_table), ch))
230
- return 1;
231
- return 0;
232
- }
233
-
234
- static int utf8_isalnum(unsigned int ch) {
235
- if (find_in_range(alpha_table, table_size(alpha_table), ch))
236
- return 1;
237
- if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
238
- return 1;
239
- return 0;
240
- }
241
-
242
- static int utf8_width(unsigned int ch, int ambi_is_single) {
243
- if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
244
- return 2;
245
- if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
246
- return ambi_is_single ? 1 : 2;
247
- if (find_in_range(compose_table, table_size(compose_table), ch))
248
- return 0;
249
- if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
250
- return 0;
251
- return 1;
252
- }
253
-
254
-
255
- /* string module compatible interface */
256
-
257
- static const char *check_utf8(lua_State *L, int idx, const char **end) {
258
- size_t len;
259
- const char *s = luaL_checklstring(L, idx, &len);
260
- if (end) *end = s+len;
261
- return s;
262
- }
263
-
264
- static const char *to_utf8(lua_State *L, int idx, const char **end) {
265
- size_t len;
266
- const char *s = lua_tolstring(L, idx, &len);
267
- if (end) *end = s+len;
268
- return s;
269
- }
270
-
271
- static void add_utf8char(luaL_Buffer *b, unsigned int ch) {
272
- char buff[UTF_MAX];
273
- size_t n = utf8_encode(buff, ch);
274
- luaL_addlstring(b, buff, n);
275
- }
276
-
277
- static lua_Integer byterelat(lua_Integer pos, size_t len) {
278
- if (pos >= 0) return pos;
279
- else if (0u - (size_t)pos > len) return 0;
280
- else return (lua_Integer)len + pos + 1;
281
- }
282
-
283
- static int u_posrange(const char **ps, const char **pe,
284
- lua_Integer posi, lua_Integer posj) {
285
- const char *s = *ps, *e = *pe;
286
- *ps = utf8_index(s, e, posi);
287
- if (posj >= 0) {
288
- while (s < e && posj-- > 0)
289
- s = utf8_next(s, e);
290
- *pe = s;
291
- }
292
- else {
293
- while (s < e && ++posj < 0)
294
- e = utf8_prev(s, e);
295
- *pe = e;
296
- }
297
- return *ps < *pe;
298
- }
299
-
300
- static int Lutf8_len(lua_State *L) {
301
- size_t len;
302
- const char *s = luaL_checklstring(L, 1, &len);
303
- lua_Integer posi = byterelat(luaL_optinteger(L, 2, 1), len);
304
- lua_Integer posj = byterelat(luaL_optinteger(L, 3, -1), len);
305
- if (posi < 1 || --posi > (lua_Integer)len
306
- || --posj > (lua_Integer)len)
307
- return 0;
308
- lua_pushinteger(L, (lua_Integer)utf8_length(s+posi, s+posj+1));
309
- return 1;
310
- }
311
-
312
- static int Lutf8_sub(lua_State *L) {
313
- const char *e, *s = check_utf8(L, 1, &e);
314
- if (u_posrange(&s, &e,
315
- luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
316
- lua_pushlstring(L, s, e-s);
317
- else
318
- lua_pushliteral(L, "");
319
- return 1;
320
- }
321
-
322
- static int Lutf8_reverse(lua_State *L) {
323
- luaL_Buffer b;
324
- /* XXX should handle compose unicode? */
325
- const char *e, *s = check_utf8(L, 1, &e);
326
- luaL_buffinit(L, &b);
327
- while (s < e) {
328
- const char *prev = utf8_prev(s, e);
329
- luaL_addlstring(&b, prev, e-prev);
330
- e = prev;
331
- }
332
- luaL_pushresult(&b);
333
- return 1;
334
- }
335
-
336
- static int convert(lua_State *L, unsigned int (*conv)(unsigned int)) {
337
- int t = lua_type(L, 1);
338
- if (t == LUA_TNUMBER)
339
- lua_pushinteger(L, conv(lua_tointeger(L, 1)));
340
- else if (t != LUA_TSTRING)
341
- return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
342
- else {
343
- luaL_Buffer b;
344
- const char *e, *s = to_utf8(L, 1, &e);
345
- luaL_buffinit(L, &b);
346
- while (s < e) {
347
- unsigned int ch;
348
- s += utf8_decode(s, e, &ch);
349
- ch = conv(ch);
350
- add_utf8char(&b, ch);
351
- }
352
- luaL_pushresult(&b);
353
- }
354
- return 1;
355
- }
356
-
357
- static int Lutf8_lower(lua_State *L)
358
- { return convert(L, utf8_tolower); }
359
-
360
- static int Lutf8_upper(lua_State *L)
361
- { return convert(L, utf8_toupper); }
362
-
363
- static int Lutf8_title(lua_State *L)
364
- { return convert(L, utf8_totitle); }
365
-
366
- static int Lutf8_fold(lua_State *L)
367
- { return convert(L, utf8_tofold); }
368
-
369
- static int Lutf8_byte(lua_State *L) {
370
- size_t n = 0;
371
- const char *e, *s = check_utf8(L, 1, &e);
372
- lua_Integer posi = luaL_optinteger(L, 2, 1);
373
- lua_Integer posj = luaL_optinteger(L, 3, posi);
374
- if (u_posrange(&s, &e, posi, posj)) {
375
- luaL_checkstack(L, e-s, "string slice too long");
376
- while (s < e) {
377
- unsigned int ch;
378
- s += utf8_decode(s, e, &ch);
379
- lua_pushinteger(L, ch);
380
- ++n;
381
- }
382
- }
383
- return n;
384
- }
385
-
386
- static int Lutf8_char(lua_State *L) {
387
- int i, n = lua_gettop(L); /* number of arguments */
388
- luaL_Buffer b;
389
- luaL_buffinit(L, &b);
390
- for (i = 1; i <= n; ++i) {
391
- unsigned int ch = luaL_checkint(L, i);
392
- add_utf8char(&b, ch);
393
- }
394
- luaL_pushresult(&b);
395
- return 1;
396
- }
397
-
398
-
399
- /* unicode extra interface */
400
-
401
- static const char *parse_escape(lua_State *L,
402
- const char *s, const char *e,
403
- int is_hex, unsigned int *pch) {
404
- unsigned int escape = 0, ch;
405
- int in_bracket = 0;
406
- if (*s == '{') ++s, in_bracket = 1;
407
- while (s < e) {
408
- ch = (unsigned char)*s;
409
- if (in_bracket && ch == '}') {
410
- ++s;
411
- break;
412
- }
413
- if (ch >= '0' && ch <= '9')
414
- ch = ch - '0';
415
- else if (is_hex && ch >= 'A' && ch <= 'F')
416
- ch = 10 + (ch - 'A');
417
- else if (is_hex && ch >= 'a' && ch <= 'f')
418
- ch = 10 + (ch - 'a');
419
- else {
420
- if (in_bracket)
421
- luaL_error(L, "invalid escape '%c'", ch);
422
- break;
423
- }
424
- escape *= is_hex ? 16 : 10;
425
- escape += ch;
426
- ++s;
427
- }
428
- *pch = escape;
429
- return s;
430
- }
431
-
432
- static int Lutf8_escape(lua_State *L) {
433
- const char *e, *s = check_utf8(L, 1, &e);
434
- luaL_Buffer b;
435
- luaL_buffinit(L, &b);
436
- while (s < e) {
437
- unsigned int ch;
438
- s += utf8_decode(s, e, &ch);
439
- if (ch == '%') {
440
- int is_hex = 0;
441
- switch (*s) {
442
- case '0': case '1': case '2': case '3':
443
- case '4': case '5': case '6': case '7':
444
- case '8': case '9': case '{':
445
- break;
446
- case 'u': case 'U': ++s; break;
447
- case 'x': case 'X': ++s; is_hex = 1; break;
448
- default:
449
- s += utf8_decode(s, e, &ch);
450
- goto next;
451
- }
452
- if (s >= e)
453
- luaL_error(L, "invalid escape sequence");
454
- s = parse_escape(L, s, e, is_hex, &ch);
455
- }
456
- next:
457
- add_utf8char(&b, ch);
458
- }
459
- luaL_pushresult(&b);
460
- return 1;
461
- }
462
-
463
- static int Lutf8_insert(lua_State *L) {
464
- const char *e, *s = check_utf8(L, 1, &e);
465
- size_t sublen;
466
- const char *subs;
467
- luaL_Buffer b;
468
- int nargs = 2;
469
- const char *first = e;
470
- if (lua_type(L, 2) == LUA_TNUMBER) {
471
- int idx = (int)lua_tointeger(L, 2);
472
- if (idx != 0) first = utf8_index(s, e, idx);
473
- ++nargs;
474
- }
475
- subs = luaL_checklstring(L, nargs, &sublen);
476
- luaL_buffinit(L, &b);
477
- luaL_addlstring(&b, s, first-s);
478
- luaL_addlstring(&b, subs, sublen);
479
- luaL_addlstring(&b, first, e-first);
480
- luaL_pushresult(&b);
481
- return 1;
482
- }
483
-
484
- static int Lutf8_remove(lua_State *L) {
485
- const char *e, *s = check_utf8(L, 1, &e);
486
- const char *start = s, *end = e;
487
- if (!u_posrange(&start, &end,
488
- luaL_checkinteger(L, 2), luaL_optinteger(L, 3, -1)))
489
- lua_settop(L, 1);
490
- else {
491
- luaL_Buffer b;
492
- luaL_buffinit(L, &b);
493
- luaL_addlstring(&b, s, start-s);
494
- luaL_addlstring(&b, end, e-end);
495
- luaL_pushresult(&b);
496
- }
497
- return 1;
498
- }
499
-
500
- static int push_offset(lua_State *L, const char *s, const char *e,
501
- const char *cur, lua_Integer offset) {
502
- unsigned int ch;
503
- if (offset >= 0) {
504
- while (cur < e && offset-- > 0)
505
- cur = utf8_next(cur, e);
506
- if (offset >= 0) return 0;
507
- }
508
- else {
509
- while (s < cur && offset++ < 0)
510
- cur = utf8_prev(s, cur);
511
- if (offset < 0) return 0;
512
- }
513
- utf8_decode(cur, e, &ch);
514
- lua_pushinteger(L, cur-s+1);
515
- lua_pushinteger(L, ch);
516
- return 2;
517
- }
518
-
519
- static int Lutf8_charpos(lua_State *L) {
520
- size_t len;
521
- const char *s = luaL_checklstring(L, 1, &len);
522
- const char *cur = s;
523
- lua_Integer pos;
524
- if (lua_isnoneornil(L, 3)) {
525
- lua_Integer offset = luaL_optinteger(L, 2, 1);
526
- if (offset > 0) --offset;
527
- else if (offset < 0) cur = s+len;
528
- return push_offset(L, s, s+len, cur, offset);
529
- }
530
- pos = byterelat(luaL_optinteger(L, 2, 1), len);
531
- if (pos != 0) cur += pos-1;
532
- return push_offset(L, s, s+len, cur, luaL_checkinteger(L, 3));
533
- }
534
-
535
- static int Lutf8_next(lua_State *L) {
536
- size_t len;
537
- const char *s = luaL_checklstring(L, 1, &len);
538
- const char *cur = s;
539
- lua_Integer offset = 0;
540
- if (!lua_isnoneornil(L, 2)) {
541
- lua_Integer pos = byterelat(luaL_checkinteger(L, 2), len);
542
- if (pos != 0) cur += pos-1;
543
- offset = 1;
544
- }
545
- offset = luaL_optinteger(L, 3, offset);
546
- return push_offset(L, s, s+len, cur, offset);
547
- }
548
-
549
- static int Lutf8_width(lua_State *L) {
550
- int t = lua_type(L, 1);
551
- int ambi_is_single = !lua_toboolean(L, 2);
552
- int default_width = luaL_optinteger(L, 3, 0);
553
- if (t == LUA_TNUMBER) {
554
- size_t chwidth = utf8_width(lua_tointeger(L, 1), ambi_is_single);
555
- if (chwidth == 0) chwidth = default_width;
556
- lua_pushinteger(L, (lua_Integer)chwidth);
557
- }
558
- else if (t != LUA_TSTRING)
559
- return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
560
- else {
561
- const char *e, *s = to_utf8(L, 1, &e);
562
- size_t width = 0;
563
- while (s < e) {
564
- unsigned int ch;
565
- size_t chwidth;
566
- s += utf8_decode(s, e, &ch);
567
- chwidth = utf8_width(ch, ambi_is_single);
568
- width += chwidth == 0 ? default_width : chwidth;
569
- }
570
- lua_pushinteger(L, (lua_Integer)width);
571
- }
572
- return 1;
573
- }
574
-
575
- static int Lutf8_widthindex(lua_State *L) {
576
- const char *e, *s = check_utf8(L, 1, &e);
577
- int width = luaL_checkinteger(L, 2);
578
- int ambi_is_single = !lua_toboolean(L, 3);
579
- int default_width = luaL_optinteger(L, 4, 0);
580
- size_t idx = 1;
581
- while (s < e) {
582
- unsigned int ch;
583
- size_t chwidth;
584
- s += utf8_decode(s, e, &ch);
585
- chwidth = utf8_width(ch, ambi_is_single);
586
- if (chwidth == 0) chwidth = default_width;
587
- width -= chwidth;
588
- if (width <= 0) {
589
- lua_pushinteger(L, idx);
590
- lua_pushinteger(L, width + chwidth);
591
- lua_pushinteger(L, chwidth);
592
- return 3;
593
- }
594
- ++idx;
595
- }
596
- lua_pushinteger(L, (lua_Integer)idx);
597
- return 1;
598
- }
599
-
600
- static int Lutf8_ncasecmp(lua_State *L) {
601
- const char *e1, *s1 = check_utf8(L, 1, &e1);
602
- const char *e2, *s2 = check_utf8(L, 2, &e2);
603
- while (s1 < e1 || s2 < e2) {
604
- unsigned int ch1 = 0, ch2 = 0;
605
- if (s1 == e1)
606
- ch2 = 1;
607
- else if (s2 == e2)
608
- ch1 = 1;
609
- else {
610
- s1 += utf8_decode(s1, e1, &ch1);
611
- s2 += utf8_decode(s2, e2, &ch2);
612
- ch1 = utf8_tofold(ch1);
613
- ch2 = utf8_tofold(ch2);
614
- }
615
- if (ch1 != ch2) {
616
- lua_pushinteger(L, ch1 > ch2 ? 1 : -1);
617
- return 1;
618
- }
619
- }
620
- lua_pushinteger(L, 0);
621
- return 1;
622
- }
623
-
624
-
625
- /* utf8 pattern matching implement */
626
-
627
- #ifndef LUA_MAXCAPTURES
628
- # define LUA_MAXCAPTURES 32
629
- #endif /* LUA_MAXCAPTURES */
630
-
631
- #define CAP_UNFINISHED (-1)
632
- #define CAP_POSITION (-2)
633
-
634
-
635
- typedef struct utf8MatchState {
636
- int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
637
- const char *src_init; /* init of source string */
638
- const char *src_end; /* end ('\0') of source string */
639
- const char *p_end; /* end ('\0') of pattern */
640
- lua_State *L;
641
- int level; /* total number of captures (finished or unfinished) */
642
- struct {
643
- const char *init;
644
- ptrdiff_t len;
645
- } capture[LUA_MAXCAPTURES];
646
- } utf8MatchState;
647
-
648
- /* recursive function */
649
- static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p);
650
-
651
- /* maximum recursion depth for 'match' */
652
- #if !defined(MAXCCALLS)
653
- #define MAXCCALLS 200
654
- #endif
655
-
656
- #define L_ESC '%'
657
- #define SPECIALS "^$*+?.([%-"
658
-
659
- static int utf8_check_capture (utf8MatchState *ms, int l) {
660
- l -= '1';
661
- if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
662
- return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
663
- return l;
664
- }
665
-
666
- static int utf8_capture_to_close (utf8MatchState *ms) {
667
- int level = ms->level;
668
- for (level--; level>=0; level--)
669
- if (ms->capture[level].len == CAP_UNFINISHED) return level;
670
- return luaL_error(ms->L, "invalid pattern capture");
671
- }
672
-
673
- static const char *utf8_classend (utf8MatchState *ms, const char *p) {
674
- unsigned int ch;
675
- p += utf8_decode(p, ms->p_end, &ch);
676
- switch (ch) {
677
- case L_ESC: {
678
- if (p == ms->p_end)
679
- luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
680
- return utf8_next(p, ms->p_end);
681
- }
682
- case '[': {
683
- if (*p == '^') p++;
684
- do { /* look for a `]' */
685
- if (p == ms->p_end)
686
- luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
687
- if (*(p++) == L_ESC && p < ms->p_end)
688
- p++; /* skip escapes (e.g. `%]') */
689
- } while (*p != ']');
690
- return p+1;
691
- }
692
- default: {
693
- return p;
694
- }
695
- }
696
- }
697
-
698
- static int utf8_match_class (unsigned int c, unsigned int cl) {
699
- int res;
700
- switch (utf8_tolower(cl)) {
701
- case 'a' : res = utf8_isalpha(c); break;
702
- case 'c' : res = utf8_iscntrl(c); break;
703
- case 'd' : res = utf8_isdigit(c); break;
704
- case 'g' : res = utf8_isgraph(c); break;
705
- case 'l' : res = utf8_islower(c); break;
706
- case 'p' : res = utf8_ispunct(c); break;
707
- case 's' : res = utf8_isspace(c); break;
708
- case 'u' : res = utf8_isupper(c); break;
709
- case 'w' : res = utf8_isalnum(c); break;
710
- case 'x' : res = utf8_isxdigit(c); break;
711
- case 'z' : res = (c == 0); break; /* deprecated option */
712
- default: return (cl == c);
713
- }
714
- return (utf8_islower(cl) ? res : !res);
715
- }
716
-
717
- static int utf8_matchbracketclass (unsigned int c, const char *p, const char *ec) {
718
- int sig = 1;
719
- assert(*p == '[');
720
- if (*++p == '^') {
721
- sig = 0;
722
- p++; /* skip the `^' */
723
- }
724
- while (p < ec) {
725
- unsigned int ch;
726
- p += utf8_decode(p, ec, &ch);
727
- if (ch == L_ESC) {
728
- p += utf8_decode(p, ec, &ch);
729
- if (utf8_match_class(c, ch))
730
- return sig;
731
- }
732
- else {
733
- unsigned int next;
734
- const char *np = p + utf8_decode(p, ec, &next);
735
- if (next == '-' && np < ec) {
736
- p = np + utf8_decode(np, ec, &next);
737
- if (ch <= c && c <= next)
738
- return sig;
739
- }
740
- else if (ch == c) return sig;
741
- }
742
- }
743
- return !sig;
744
- }
745
-
746
- static int utf8_singlematch (utf8MatchState *ms, const char *s, const char *p,
747
- const char *ep) {
748
- if (s >= ms->src_end)
749
- return 0;
750
- else {
751
- unsigned int ch, pch;
752
- utf8_decode(s, ms->src_end, &ch);
753
- p += utf8_decode(p, ms->p_end, &pch);
754
- switch (pch) {
755
- case '.': return 1; /* matches any char */
756
- case L_ESC: utf8_decode(p, ms->p_end, &pch);
757
- return utf8_match_class(ch, pch);
758
- case '[': return utf8_matchbracketclass(ch, p-1, ep-1);
759
- default: return pch == ch;
760
- }
761
- }
762
- }
763
-
764
- static const char *utf8_matchbalance (utf8MatchState *ms, const char *s,
765
- const char **p) {
766
- unsigned int ch, begin, end;
767
- *p += utf8_decode(*p, ms->p_end, &begin);
768
- if (*p >= ms->p_end)
769
- luaL_error(ms->L, "malformed pattern "
770
- "(missing arguments to " LUA_QL("%%b") ")");
771
- *p += utf8_decode(*p, ms->p_end, &end);
772
- s += utf8_decode(s, ms->src_end, &ch);
773
- if (ch != begin) return NULL;
774
- else {
775
- int cont = 1;
776
- while (s < ms->src_end) {
777
- s += utf8_decode(s, ms->src_end, &ch);
778
- if (ch == end) {
779
- if (--cont == 0) return s;
780
- }
781
- else if (ch == begin) cont++;
782
- }
783
- }
784
- return NULL; /* string ends out of balance */
785
- }
786
-
787
- static const char *utf8_max_expand (utf8MatchState *ms, const char *s,
788
- const char *p, const char *ep) {
789
- const char *m = s; /* matched end of single match p */
790
- while (utf8_singlematch(ms, m, p, ep))
791
- m = utf8_next(m, ms->src_end);
792
- /* keeps trying to match with the maximum repetitions */
793
- while (s <= m) {
794
- const char *res = utf8_match(ms, m, ep+1);
795
- if (res) return res;
796
- /* else didn't match; reduce 1 repetition to try again */
797
- if (s == m) break;
798
- m = utf8_prev(s, m);
799
- }
800
- return NULL;
801
- }
802
-
803
- static const char *utf8_min_expand (utf8MatchState *ms, const char *s,
804
- const char *p, const char *ep) {
805
- for (;;) {
806
- const char *res = utf8_match(ms, s, ep+1);
807
- if (res != NULL)
808
- return res;
809
- else if (utf8_singlematch(ms, s, p, ep))
810
- s = utf8_next(s, ms->src_end); /* try with one more repetition */
811
- else return NULL;
812
- }
813
- }
814
-
815
- static const char *utf8_start_capture (utf8MatchState *ms, const char *s,
816
- const char *p, int what) {
817
- const char *res;
818
- int level = ms->level;
819
- if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
820
- ms->capture[level].init = s;
821
- ms->capture[level].len = what;
822
- ms->level = level+1;
823
- if ((res=utf8_match(ms, s, p)) == NULL) /* match failed? */
824
- ms->level--; /* undo capture */
825
- return res;
826
- }
827
-
828
- static const char *utf8_end_capture (utf8MatchState *ms, const char *s,
829
- const char *p) {
830
- int l = utf8_capture_to_close(ms);
831
- const char *res;
832
- ms->capture[l].len = s - ms->capture[l].init; /* close capture */
833
- if ((res = utf8_match(ms, s, p)) == NULL) /* match failed? */
834
- ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
835
- return res;
836
- }
837
-
838
- static const char *utf8_match_capture (utf8MatchState *ms, const char *s, int l) {
839
- size_t len;
840
- l = utf8_check_capture(ms, l);
841
- len = ms->capture[l].len;
842
- if ((size_t)(ms->src_end-s) >= len &&
843
- memcmp(ms->capture[l].init, s, len) == 0)
844
- return s+len;
845
- else return NULL;
846
- }
847
-
848
- static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p) {
849
- if (ms->matchdepth-- == 0)
850
- luaL_error(ms->L, "pattern too complex");
851
- init: /* using goto's to optimize tail recursion */
852
- if (p != ms->p_end) { /* end of pattern? */
853
- unsigned int ch;
854
- utf8_decode(p, ms->p_end, &ch);
855
- switch (ch) {
856
- case '(': { /* start capture */
857
- if (*(p + 1) == ')') /* position capture? */
858
- s = utf8_start_capture(ms, s, p + 2, CAP_POSITION);
859
- else
860
- s = utf8_start_capture(ms, s, p + 1, CAP_UNFINISHED);
861
- break;
862
- }
863
- case ')': { /* end capture */
864
- s = utf8_end_capture(ms, s, p + 1);
865
- break;
866
- }
867
- case '$': {
868
- if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
869
- goto dflt; /* no; go to default */
870
- s = (s == ms->src_end) ? s : NULL; /* check end of string */
871
- break;
872
- }
873
- case L_ESC: { /* escaped sequence not in the format class[*+?-]? */
874
- const char *prev_p = p;
875
- p += utf8_decode(p+1, ms->p_end, &ch) + 1;
876
- switch (ch) {
877
- case 'b': { /* balanced string? */
878
- s = utf8_matchbalance(ms, s, &p);
879
- if (s != NULL)
880
- goto init; /* return utf8_match(ms, s, p + 4); */
881
- /* else fail (s == NULL) */
882
- break;
883
- }
884
- case 'f': { /* frontier? */
885
- const char *ep; unsigned int previous = 0, current = 0;
886
- if (*p != '[')
887
- luaL_error(ms->L, "missing " LUA_QL("[") " after "
888
- LUA_QL("%%f") " in pattern");
889
- ep = utf8_classend(ms, p); /* points to what is next */
890
- if (s != ms->src_init)
891
- utf8_decode(utf8_prev(ms->src_init, s), ms->src_end, &previous);
892
- if (s != ms->src_end)
893
- utf8_decode(s, ms->src_end, &current);
894
- if (!utf8_matchbracketclass(previous, p, ep - 1) &&
895
- utf8_matchbracketclass(current, p, ep - 1)) {
896
- p = ep; goto init; /* return utf8_match(ms, s, ep); */
897
- }
898
- s = NULL; /* match failed */
899
- break;
900
- }
901
- case '0': case '1': case '2': case '3':
902
- case '4': case '5': case '6': case '7':
903
- case '8': case '9': { /* capture results (%0-%9)? */
904
- s = utf8_match_capture(ms, s, ch - '1');
905
- if (s != NULL) goto init; /* return utf8_match(ms, s, p + 2) */
906
- break;
907
- }
908
- default: p = prev_p; goto dflt;
909
- }
910
- break;
911
- }
912
- default: dflt: { /* pattern class plus optional suffix */
913
- const char *ep = utf8_classend(ms, p); /* points to optional suffix */
914
- /* does not match at least once? */
915
- if (!utf8_singlematch(ms, s, p, ep)) {
916
- if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
917
- p = ep + 1; goto init; /* return utf8_match(ms, s, ep + 1); */
918
- }
919
- else /* '+' or no suffix */
920
- s = NULL; /* fail */
921
- }
922
- else { /* matched once */
923
- const char *next_s = utf8_next(s, ms->src_end);
924
- switch (*ep) { /* handle optional suffix */
925
- case '?': { /* optional */
926
- const char *res;
927
- const char *next_ep = utf8_next(ep, ms->p_end);
928
- if ((res = utf8_match(ms, next_s, next_ep)) != NULL)
929
- s = res;
930
- else {
931
- p = next_ep; goto init; /* else return utf8_match(ms, s, ep + 1); */
932
- }
933
- break;
934
- }
935
- case '+': /* 1 or more repetitions */
936
- s = next_s; /* 1 match already done */
937
- /* go through */
938
- case '*': /* 0 or more repetitions */
939
- s = utf8_max_expand(ms, s, p, ep);
940
- break;
941
- case '-': /* 0 or more repetitions (minimum) */
942
- s = utf8_min_expand(ms, s, p, ep);
943
- break;
944
- default: /* no suffix */
945
- s = next_s; p = ep; goto init; /* return utf8_match(ms, s + 1, ep); */
946
- }
947
- }
948
- break;
949
- }
950
- }
951
- }
952
- ms->matchdepth++;
953
- return s;
954
- }
955
-
956
- static const char *utf8_lmemfind (const char *s1, size_t l1,
957
- const char *s2, size_t l2) {
958
- if (l2 == 0) return s1; /* empty strings are everywhere */
959
- else if (l2 > l1) return NULL; /* avoids a negative `l1' */
960
- else {
961
- const char *init; /* to search for a `*s2' inside `s1' */
962
- l2--; /* 1st char will be checked by `memchr' */
963
- l1 = l1-l2; /* `s2' cannot be found after that */
964
- while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
965
- init++; /* 1st char is already checked */
966
- if (memcmp(init, s2+1, l2) == 0)
967
- return init-1;
968
- else { /* correct `l1' and `s1' to try again */
969
- l1 -= init-s1;
970
- s1 = init;
971
- }
972
- }
973
- return NULL; /* not found */
974
- }
975
- }
976
-
977
- static const char *utf8_get_index(const char *p, const char *s, const char *e, int *pidx) {
978
- int idx = 0;
979
- while (s < e) {
980
- if (s == p)
981
- break;
982
- else if (s > p) {
983
- --idx;
984
- break;
985
- }
986
- s = utf8_next(s, e);
987
- ++idx;
988
- }
989
- if (pidx) *pidx = idx;
990
- return s;
991
- }
992
-
993
- static void utf8_push_onecapture (utf8MatchState *ms, int i, const char *s,
994
- const char *e) {
995
- if (i >= ms->level) {
996
- if (i == 0) /* ms->level == 0, too */
997
- lua_pushlstring(ms->L, s, e - s); /* add whole match */
998
- else
999
- luaL_error(ms->L, "invalid capture index");
1000
- }
1001
- else {
1002
- ptrdiff_t l = ms->capture[i].len;
1003
- if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
1004
- if (l == CAP_POSITION) {
1005
- int idx;
1006
- utf8_get_index(ms->capture[i].init, ms->src_init, ms->src_end, &idx);
1007
- lua_pushinteger(ms->L, idx+1);
1008
- } else
1009
- lua_pushlstring(ms->L, ms->capture[i].init, l);
1010
- }
1011
- }
1012
-
1013
- static int utf8_push_captures (utf8MatchState *ms, const char *s, const char *e) {
1014
- int i;
1015
- int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
1016
- luaL_checkstack(ms->L, nlevels, "too many captures");
1017
- for (i = 0; i < nlevels; i++)
1018
- utf8_push_onecapture(ms, i, s, e);
1019
- return nlevels; /* number of strings pushed */
1020
- }
1021
-
1022
- /* check whether pattern has no special characters */
1023
- static int nospecials (const char *p, const char * ep) {
1024
- while (p < ep) {
1025
- if (strpbrk(p, SPECIALS))
1026
- return 0; /* pattern has a special character */
1027
- p += strlen(p) + 1; /* may have more after \0 */
1028
- }
1029
- return 1; /* no special chars found */
1030
- }
1031
-
1032
-
1033
- /* utf8 pattern matching interface */
1034
-
1035
- static int find_aux (lua_State *L, int find) {
1036
- const char *es, *s = check_utf8(L, 1, &es);
1037
- const char *ep, *p = check_utf8(L, 2, &ep);
1038
- lua_Integer idx = luaL_optinteger(L, 3, 1);
1039
- const char *init;
1040
- size_t slen = utf8_length(s, es);
1041
- if (idx > 0 && idx > (lua_Integer)slen + 1) { /* start after string's end? */
1042
- lua_pushnil(L); /* cannot find anything */
1043
- return 1;
1044
- }
1045
- if (idx < 0) idx += utf8_length(s, es) + 1;
1046
- init = utf8_index(s, es, idx);
1047
- /* explicit request or no special characters? */
1048
- if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
1049
- /* do a plain search */
1050
- do {
1051
- const char *s2 = utf8_lmemfind(init, es-init, p, ep-p);
1052
- if (!s2) break;
1053
- else {
1054
- int relidx;
1055
- const char *pch = utf8_get_index(s2, init, es, &relidx);
1056
- if (pch == s2) {
1057
- lua_pushinteger(L, idx + relidx);
1058
- lua_pushinteger(L, idx + relidx + utf8_length(p, ep) - 1);
1059
- return 2;
1060
- }
1061
- idx += relidx + 1;
1062
- init = utf8_next(pch, es);
1063
- }
1064
- } while (init < es);
1065
- }
1066
- else {
1067
- utf8MatchState ms;
1068
- int anchor = (*p == '^');
1069
- if (anchor) p++; /* skip anchor character */
1070
- ms.L = L;
1071
- ms.matchdepth = MAXCCALLS;
1072
- ms.src_init = s;
1073
- ms.src_end = es;
1074
- ms.p_end = ep;
1075
- do {
1076
- const char *res;
1077
- ms.level = 0;
1078
- assert(ms.matchdepth == MAXCCALLS);
1079
- if ((res=utf8_match(&ms, init, p)) != NULL) {
1080
- if (find) {
1081
- lua_pushinteger(L, idx); /* start */
1082
- lua_pushinteger(L, idx + utf8_length(init, res) - 1); /* end */
1083
- return utf8_push_captures(&ms, NULL, 0) + 2;
1084
- }
1085
- else
1086
- return utf8_push_captures(&ms, init, res);
1087
- }
1088
- if (init == es) break;
1089
- idx += 1;
1090
- init = utf8_next(init, es);
1091
- } while (init <= es && !anchor);
1092
- }
1093
- lua_pushnil(L); /* not found */
1094
- return 1;
1095
- }
1096
-
1097
- static int Lutf8_find(lua_State *L)
1098
- { return find_aux(L, 1); }
1099
-
1100
- static int Lutf8_match(lua_State *L)
1101
- { return find_aux(L, 0); }
1102
-
1103
- static int utf8_gmatch_aux (lua_State *L) {
1104
- utf8MatchState ms;
1105
- const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
1106
- const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
1107
- const char *src;
1108
- ms.L = L;
1109
- ms.matchdepth = MAXCCALLS;
1110
- ms.src_init = s;
1111
- ms.src_end = es;
1112
- ms.p_end = ep;
1113
- for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
1114
- src <= ms.src_end;
1115
- src = utf8_next(src, ms.src_end)) {
1116
- const char *e;
1117
- ms.level = 0;
1118
- assert(ms.matchdepth == MAXCCALLS);
1119
- if ((e = utf8_match(&ms, src, p)) != NULL) {
1120
- lua_Integer newstart = e-s;
1121
- if (e == src) newstart++; /* empty match? go at least one position */
1122
- lua_pushinteger(L, newstart);
1123
- lua_replace(L, lua_upvalueindex(3));
1124
- return utf8_push_captures(&ms, src, e);
1125
- }
1126
- if (src == ms.src_end) break;
1127
- }
1128
- return 0; /* not found */
1129
- }
1130
-
1131
- static int Lutf8_gmatch(lua_State *L) {
1132
- luaL_checkstring(L, 1);
1133
- luaL_checkstring(L, 2);
1134
- lua_settop(L, 2);
1135
- lua_pushinteger(L, 0);
1136
- lua_pushcclosure(L, utf8_gmatch_aux, 3);
1137
- return 1;
1138
- }
1139
-
1140
- static void utf8_add_s (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1141
- const char *e) {
1142
- const char *new_end, *news = to_utf8(ms->L, 3, &new_end);
1143
- while (news < new_end) {
1144
- unsigned int ch;
1145
- news += utf8_decode(news, new_end, &ch);
1146
- if (ch != L_ESC)
1147
- add_utf8char(b, ch);
1148
- else {
1149
- news += utf8_decode(news, new_end, &ch); /* skip ESC */
1150
- if (!utf8_isdigit(ch)) {
1151
- if (ch != L_ESC)
1152
- luaL_error(ms->L, "invalid use of " LUA_QL("%c")
1153
- " in replacement string", L_ESC);
1154
- add_utf8char(b, ch);
1155
- }
1156
- else if (ch == '0')
1157
- luaL_addlstring(b, s, e-s);
1158
- else {
1159
- utf8_push_onecapture(ms, ch-'1', s, e);
1160
- luaL_addvalue(b); /* add capture to accumulated result */
1161
- }
1162
- }
1163
- }
1164
- }
1165
-
1166
- static void utf8_add_value (utf8MatchState *ms, luaL_Buffer *b, const char *s,
1167
- const char *e, int tr) {
1168
- lua_State *L = ms->L;
1169
- switch (tr) {
1170
- case LUA_TFUNCTION: {
1171
- int n;
1172
- lua_pushvalue(L, 3);
1173
- n = utf8_push_captures(ms, s, e);
1174
- lua_call(L, n, 1);
1175
- break;
1176
- }
1177
- case LUA_TTABLE: {
1178
- utf8_push_onecapture(ms, 0, s, e);
1179
- lua_gettable(L, 3);
1180
- break;
1181
- }
1182
- default: { /* LUA_TNUMBER or LUA_TSTRING */
1183
- utf8_add_s(ms, b, s, e);
1184
- return;
1185
- }
1186
- }
1187
- if (!lua_toboolean(L, -1)) { /* nil or false? */
1188
- lua_pop(L, 1);
1189
- lua_pushlstring(L, s, e - s); /* keep original text */
1190
- }
1191
- else if (!lua_isstring(L, -1))
1192
- luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
1193
- luaL_addvalue(b); /* add result to accumulator */
1194
- }
1195
-
1196
- static int Lutf8_gsub(lua_State *L) {
1197
- const char *es, *s = check_utf8(L, 1, &es);
1198
- const char *ep, *p = check_utf8(L, 2, &ep);
1199
- int tr = lua_type(L, 3);
1200
- lua_Integer max_s = luaL_optinteger(L, 4, (es-s)+1);
1201
- int anchor = (*p == '^');
1202
- lua_Integer n = 0;
1203
- utf8MatchState ms;
1204
- luaL_Buffer b;
1205
- luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
1206
- tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
1207
- "string/function/table expected");
1208
- luaL_buffinit(L, &b);
1209
- if (anchor) p++; /* skip anchor character */
1210
- ms.L = L;
1211
- ms.matchdepth = MAXCCALLS;
1212
- ms.src_init = s;
1213
- ms.src_end = es;
1214
- ms.p_end = ep;
1215
- while (n < max_s) {
1216
- const char *e;
1217
- ms.level = 0;
1218
- assert(ms.matchdepth == MAXCCALLS);
1219
- e = utf8_match(&ms, s, p);
1220
- if (e) {
1221
- n++;
1222
- utf8_add_value(&ms, &b, s, e, tr);
1223
- }
1224
- if (e && e > s) /* non empty match? */
1225
- s = e; /* skip it */
1226
- else if (s < es) {
1227
- unsigned int ch;
1228
- s += utf8_decode(s, es, &ch);
1229
- add_utf8char(&b, ch);
1230
- }
1231
- else break;
1232
- if (anchor) break;
1233
- }
1234
- luaL_addlstring(&b, s, es-s);
1235
- luaL_pushresult(&b);
1236
- lua_pushinteger(L, n); /* number of substitutions */
1237
- return 2;
1238
- }
1239
-
1240
-
1241
- /* lua module import interface */
1242
-
1243
- LUALIB_API int luaopen_utf8(lua_State *L) {
1244
- luaL_Reg libs[] = {
1245
- #define ENTRY(name) { #name, Lutf8_##name }
1246
- ENTRY(len),
1247
- ENTRY(sub),
1248
- ENTRY(reverse),
1249
- ENTRY(lower),
1250
- ENTRY(upper),
1251
- ENTRY(title),
1252
- ENTRY(fold),
1253
- ENTRY(byte),
1254
- ENTRY(char),
1255
- ENTRY(escape),
1256
- ENTRY(insert),
1257
- ENTRY(remove),
1258
- ENTRY(charpos),
1259
- ENTRY(next),
1260
- ENTRY(width),
1261
- ENTRY(widthindex),
1262
- ENTRY(ncasecmp),
1263
- ENTRY(find),
1264
- ENTRY(gmatch),
1265
- ENTRY(gsub),
1266
- ENTRY(match),
1267
- #undef ENTRY
1268
- { NULL, NULL }
1269
- };
1270
-
1271
- luaL_register(L, "utf8", libs);
1272
-
1273
- return 1;
1274
- }