immunio 1.2.1 → 2.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +13 -5
- data/ext/immunio/Rakefile +14 -6
- data/lib/immunio/context.rb +2 -0
- data/lib/immunio/plugins/action_view.rb +7 -668
- data/lib/immunio/plugins/action_view/action_view.rb +22 -0
- data/lib/immunio/plugins/action_view/active_support_hash.rb +29 -0
- data/lib/immunio/plugins/action_view/cache_store.rb +24 -0
- data/lib/immunio/plugins/action_view/erubi.rb +38 -0
- data/lib/immunio/plugins/action_view/erubis.rb +39 -0
- data/lib/immunio/plugins/action_view/fragment_caching.rb +29 -0
- data/lib/immunio/plugins/action_view/haml.rb +46 -0
- data/lib/immunio/plugins/action_view/slim.rb +42 -0
- data/lib/immunio/plugins/action_view/template.rb +431 -0
- data/lib/immunio/plugins/action_view/template_rendering.rb +45 -0
- data/lib/immunio/plugins/http_tracker.rb +2 -0
- data/lib/immunio/plugins/io.rb +34 -0
- data/lib/immunio/version.rb +1 -1
- data/lua-hooks/Makefile +36 -9
- data/lua-hooks/ext/luajit/COPYRIGHT +1 -1
- data/lua-hooks/ext/luajit/Makefile +22 -15
- data/lua-hooks/ext/luajit/README +2 -2
- data/lua-hooks/ext/luajit/doc/bluequad-print.css +1 -1
- data/lua-hooks/ext/luajit/doc/bluequad.css +1 -1
- data/lua-hooks/ext/luajit/doc/changes.html +69 -3
- data/lua-hooks/ext/luajit/doc/contact.html +10 -3
- data/lua-hooks/ext/luajit/doc/ext_c_api.html +2 -2
- data/lua-hooks/ext/luajit/doc/ext_ffi.html +2 -2
- data/lua-hooks/ext/luajit/doc/ext_ffi_api.html +2 -2
- data/lua-hooks/ext/luajit/doc/ext_ffi_semantics.html +3 -4
- data/lua-hooks/ext/luajit/doc/ext_ffi_tutorial.html +2 -2
- data/lua-hooks/ext/luajit/doc/ext_jit.html +3 -3
- data/lua-hooks/ext/luajit/doc/ext_profiler.html +2 -2
- data/lua-hooks/ext/luajit/doc/extensions.html +47 -20
- data/lua-hooks/ext/luajit/doc/faq.html +2 -2
- data/lua-hooks/ext/luajit/doc/install.html +74 -45
- data/lua-hooks/ext/luajit/doc/luajit.html +5 -5
- data/lua-hooks/ext/luajit/doc/running.html +3 -3
- data/lua-hooks/ext/luajit/doc/status.html +13 -8
- data/lua-hooks/ext/luajit/dynasm/dasm_arm.h +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_arm.lua +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_arm64.h +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_arm64.lua +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_mips.h +8 -5
- data/lua-hooks/ext/luajit/dynasm/dasm_mips.lua +66 -11
- data/lua-hooks/ext/luajit/dynasm/dasm_mips64.lua +12 -0
- data/lua-hooks/ext/luajit/dynasm/dasm_ppc.h +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_ppc.lua +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_proto.h +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_x64.lua +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_x86.h +1 -1
- data/lua-hooks/ext/luajit/dynasm/dasm_x86.lua +5 -1
- data/lua-hooks/ext/luajit/dynasm/dynasm.lua +2 -2
- data/lua-hooks/ext/luajit/etc/luajit.1 +1 -1
- data/lua-hooks/ext/luajit/etc/luajit.pc +1 -1
- data/lua-hooks/ext/luajit/src/Makefile +15 -11
- data/lua-hooks/ext/luajit/src/Makefile.dep +16 -16
- data/lua-hooks/ext/luajit/src/host/buildvm.c +2 -2
- data/lua-hooks/ext/luajit/src/host/buildvm.h +1 -1
- data/lua-hooks/ext/luajit/src/host/buildvm_asm.c +9 -4
- data/lua-hooks/ext/luajit/src/host/buildvm_fold.c +2 -2
- data/lua-hooks/ext/luajit/src/host/buildvm_lib.c +1 -1
- data/lua-hooks/ext/luajit/src/host/buildvm_libbc.h +14 -3
- data/lua-hooks/ext/luajit/src/host/buildvm_peobj.c +27 -3
- data/lua-hooks/ext/luajit/src/host/genlibbc.lua +1 -1
- data/lua-hooks/ext/luajit/src/host/genminilua.lua +6 -5
- data/lua-hooks/ext/luajit/src/host/minilua.c +1 -1
- data/lua-hooks/ext/luajit/src/jit/bc.lua +1 -1
- data/lua-hooks/ext/luajit/src/jit/bcsave.lua +8 -8
- data/lua-hooks/ext/luajit/src/jit/dis_arm.lua +2 -2
- data/lua-hooks/ext/luajit/src/jit/dis_arm64.lua +1216 -0
- data/lua-hooks/ext/luajit/src/jit/dis_arm64be.lua +12 -0
- data/lua-hooks/ext/luajit/src/jit/dis_mips.lua +35 -20
- data/lua-hooks/ext/luajit/src/jit/dis_mips64.lua +17 -0
- data/lua-hooks/ext/luajit/src/jit/dis_mips64el.lua +17 -0
- data/lua-hooks/ext/luajit/src/jit/dis_mipsel.lua +1 -1
- data/lua-hooks/ext/luajit/src/jit/dis_ppc.lua +2 -2
- data/lua-hooks/ext/luajit/src/jit/dis_x64.lua +1 -1
- data/lua-hooks/ext/luajit/src/jit/dis_x86.lua +7 -4
- data/lua-hooks/ext/luajit/src/jit/dump.lua +17 -12
- data/lua-hooks/ext/luajit/src/jit/p.lua +3 -2
- data/lua-hooks/ext/luajit/src/jit/v.lua +2 -2
- data/lua-hooks/ext/luajit/src/jit/zone.lua +1 -1
- data/lua-hooks/ext/luajit/src/lauxlib.h +14 -20
- data/lua-hooks/ext/luajit/src/lib_aux.c +38 -27
- data/lua-hooks/ext/luajit/src/lib_base.c +12 -5
- data/lua-hooks/ext/luajit/src/lib_bit.c +1 -1
- data/lua-hooks/ext/luajit/src/lib_debug.c +5 -5
- data/lua-hooks/ext/luajit/src/lib_ffi.c +2 -2
- data/lua-hooks/ext/luajit/src/lib_init.c +16 -16
- data/lua-hooks/ext/luajit/src/lib_io.c +6 -7
- data/lua-hooks/ext/luajit/src/lib_jit.c +14 -4
- data/lua-hooks/ext/luajit/src/lib_math.c +1 -5
- data/lua-hooks/ext/luajit/src/lib_os.c +1 -1
- data/lua-hooks/ext/luajit/src/lib_package.c +14 -23
- data/lua-hooks/ext/luajit/src/lib_string.c +1 -5
- data/lua-hooks/ext/luajit/src/lib_table.c +21 -1
- data/lua-hooks/ext/luajit/src/lj.supp +3 -3
- data/lua-hooks/ext/luajit/src/lj_alloc.c +174 -83
- data/lua-hooks/ext/luajit/src/lj_api.c +97 -18
- data/lua-hooks/ext/luajit/src/lj_arch.h +54 -22
- data/lua-hooks/ext/luajit/src/lj_asm.c +172 -53
- data/lua-hooks/ext/luajit/src/lj_asm.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_asm_arm.h +19 -16
- data/lua-hooks/ext/luajit/src/lj_asm_arm64.h +2022 -0
- data/lua-hooks/ext/luajit/src/lj_asm_mips.h +564 -158
- data/lua-hooks/ext/luajit/src/lj_asm_ppc.h +19 -18
- data/lua-hooks/ext/luajit/src/lj_asm_x86.h +578 -92
- data/lua-hooks/ext/luajit/src/lj_bc.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_bc.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_bcdump.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_bcread.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_bcwrite.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_buf.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_buf.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_carith.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_carith.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_ccall.c +172 -7
- data/lua-hooks/ext/luajit/src/lj_ccall.h +21 -5
- data/lua-hooks/ext/luajit/src/lj_ccallback.c +71 -17
- data/lua-hooks/ext/luajit/src/lj_ccallback.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_cconv.c +4 -2
- data/lua-hooks/ext/luajit/src/lj_cconv.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_cdata.c +7 -5
- data/lua-hooks/ext/luajit/src/lj_cdata.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_clib.c +5 -5
- data/lua-hooks/ext/luajit/src/lj_clib.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_cparse.c +11 -6
- data/lua-hooks/ext/luajit/src/lj_cparse.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_crecord.c +70 -14
- data/lua-hooks/ext/luajit/src/lj_crecord.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_ctype.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_ctype.h +8 -8
- data/lua-hooks/ext/luajit/src/lj_debug.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_debug.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_def.h +6 -9
- data/lua-hooks/ext/luajit/src/lj_dispatch.c +3 -3
- data/lua-hooks/ext/luajit/src/lj_dispatch.h +2 -1
- data/lua-hooks/ext/luajit/src/lj_emit_arm.h +5 -4
- data/lua-hooks/ext/luajit/src/lj_emit_arm64.h +419 -0
- data/lua-hooks/ext/luajit/src/lj_emit_mips.h +100 -20
- data/lua-hooks/ext/luajit/src/lj_emit_ppc.h +4 -4
- data/lua-hooks/ext/luajit/src/lj_emit_x86.h +116 -25
- data/lua-hooks/ext/luajit/src/lj_err.c +34 -13
- data/lua-hooks/ext/luajit/src/lj_err.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_errmsg.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_ff.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_ffrecord.c +58 -49
- data/lua-hooks/ext/luajit/src/lj_ffrecord.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_frame.h +33 -6
- data/lua-hooks/ext/luajit/src/lj_func.c +4 -2
- data/lua-hooks/ext/luajit/src/lj_func.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_gc.c +16 -7
- data/lua-hooks/ext/luajit/src/lj_gc.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_gdbjit.c +31 -1
- data/lua-hooks/ext/luajit/src/lj_gdbjit.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_ir.c +69 -96
- data/lua-hooks/ext/luajit/src/lj_ir.h +29 -18
- data/lua-hooks/ext/luajit/src/lj_ircall.h +24 -30
- data/lua-hooks/ext/luajit/src/lj_iropt.h +9 -9
- data/lua-hooks/ext/luajit/src/lj_jit.h +67 -9
- data/lua-hooks/ext/luajit/src/lj_lex.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_lex.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_lib.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_lib.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_load.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_mcode.c +11 -10
- data/lua-hooks/ext/luajit/src/lj_mcode.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_meta.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_meta.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_obj.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_obj.h +7 -3
- data/lua-hooks/ext/luajit/src/lj_opt_dce.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_opt_fold.c +84 -17
- data/lua-hooks/ext/luajit/src/lj_opt_loop.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_opt_mem.c +3 -3
- data/lua-hooks/ext/luajit/src/lj_opt_narrow.c +24 -22
- data/lua-hooks/ext/luajit/src/lj_opt_sink.c +11 -6
- data/lua-hooks/ext/luajit/src/lj_opt_split.c +11 -2
- data/lua-hooks/ext/luajit/src/lj_parse.c +9 -7
- data/lua-hooks/ext/luajit/src/lj_parse.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_profile.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_profile.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_record.c +201 -117
- data/lua-hooks/ext/luajit/src/lj_record.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_snap.c +72 -26
- data/lua-hooks/ext/luajit/src/lj_snap.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_state.c +6 -6
- data/lua-hooks/ext/luajit/src/lj_state.h +2 -2
- data/lua-hooks/ext/luajit/src/lj_str.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_str.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_strfmt.c +7 -3
- data/lua-hooks/ext/luajit/src/lj_strfmt.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_strfmt_num.c +4 -3
- data/lua-hooks/ext/luajit/src/lj_strscan.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_strscan.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_tab.c +1 -2
- data/lua-hooks/ext/luajit/src/lj_tab.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_target.h +3 -3
- data/lua-hooks/ext/luajit/src/lj_target_arm.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_target_arm64.h +239 -7
- data/lua-hooks/ext/luajit/src/lj_target_mips.h +111 -22
- data/lua-hooks/ext/luajit/src/lj_target_ppc.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_target_x86.h +21 -4
- data/lua-hooks/ext/luajit/src/lj_trace.c +63 -18
- data/lua-hooks/ext/luajit/src/lj_trace.h +2 -1
- data/lua-hooks/ext/luajit/src/lj_traceerr.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_udata.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_udata.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_vm.h +5 -1
- data/lua-hooks/ext/luajit/src/lj_vmevent.c +1 -1
- data/lua-hooks/ext/luajit/src/lj_vmevent.h +1 -1
- data/lua-hooks/ext/luajit/src/lj_vmmath.c +1 -1
- data/lua-hooks/ext/luajit/src/ljamalg.c +1 -1
- data/lua-hooks/ext/luajit/src/lua.h +9 -1
- data/lua-hooks/ext/luajit/src/luaconf.h +3 -7
- data/lua-hooks/ext/luajit/src/luajit.c +69 -54
- data/lua-hooks/ext/luajit/src/luajit.h +4 -4
- data/lua-hooks/ext/luajit/src/lualib.h +1 -1
- data/lua-hooks/ext/luajit/src/msvcbuild.bat +12 -4
- data/lua-hooks/ext/luajit/src/vm_arm.dasc +1 -1
- data/lua-hooks/ext/luajit/src/vm_arm64.dasc +255 -32
- data/lua-hooks/ext/luajit/src/vm_mips.dasc +26 -23
- data/lua-hooks/ext/luajit/src/vm_mips64.dasc +5062 -0
- data/lua-hooks/ext/luajit/src/vm_ppc.dasc +1 -1
- data/lua-hooks/ext/luajit/src/vm_x64.dasc +24 -25
- data/lua-hooks/ext/luajit/src/vm_x86.dasc +77 -4
- data/lua-hooks/libluahooks.darwin.a +0 -0
- data/lua-hooks/libluahooks.linux.a +0 -0
- data/lua-hooks/options.mk +1 -1
- metadata +37 -77
- data/lua-hooks/ext/all.c +0 -69
- data/lua-hooks/ext/libinjection/COPYING +0 -37
- data/lua-hooks/ext/libinjection/libinjection.h +0 -65
- data/lua-hooks/ext/libinjection/libinjection_html5.c +0 -847
- data/lua-hooks/ext/libinjection/libinjection_html5.h +0 -54
- data/lua-hooks/ext/libinjection/libinjection_sqli.c +0 -2301
- data/lua-hooks/ext/libinjection/libinjection_sqli.h +0 -295
- data/lua-hooks/ext/libinjection/libinjection_sqli_data.h +0 -9349
- data/lua-hooks/ext/libinjection/libinjection_xss.c +0 -531
- data/lua-hooks/ext/libinjection/libinjection_xss.h +0 -21
- data/lua-hooks/ext/libinjection/lualib.c +0 -145
- data/lua-hooks/ext/libinjection/module.mk +0 -5
- data/lua-hooks/ext/lpeg/HISTORY +0 -96
- data/lua-hooks/ext/lpeg/lpcap.c +0 -537
- data/lua-hooks/ext/lpeg/lpcap.h +0 -56
- data/lua-hooks/ext/lpeg/lpcode.c +0 -1014
- data/lua-hooks/ext/lpeg/lpcode.h +0 -40
- data/lua-hooks/ext/lpeg/lpeg-128.gif +0 -0
- data/lua-hooks/ext/lpeg/lpeg.html +0 -1445
- data/lua-hooks/ext/lpeg/lpprint.c +0 -244
- data/lua-hooks/ext/lpeg/lpprint.h +0 -36
- data/lua-hooks/ext/lpeg/lptree.c +0 -1303
- data/lua-hooks/ext/lpeg/lptree.h +0 -82
- data/lua-hooks/ext/lpeg/lptypes.h +0 -149
- data/lua-hooks/ext/lpeg/lpvm.c +0 -364
- data/lua-hooks/ext/lpeg/lpvm.h +0 -58
- data/lua-hooks/ext/lpeg/makefile +0 -55
- data/lua-hooks/ext/lpeg/module.mk +0 -6
- data/lua-hooks/ext/lpeg/re.html +0 -498
- data/lua-hooks/ext/lua-cmsgpack/.gitignore +0 -13
- data/lua-hooks/ext/lua-cmsgpack/CMakeLists.txt +0 -45
- data/lua-hooks/ext/lua-cmsgpack/README.md +0 -115
- data/lua-hooks/ext/lua-cmsgpack/lua_cmsgpack.c +0 -970
- data/lua-hooks/ext/lua-cmsgpack/module.mk +0 -2
- data/lua-hooks/ext/lua-cmsgpack/test.lua +0 -570
- data/lua-hooks/ext/lua-snapshot/LICENSE +0 -7
- data/lua-hooks/ext/lua-snapshot/Makefile +0 -12
- data/lua-hooks/ext/lua-snapshot/README.md +0 -18
- data/lua-hooks/ext/lua-snapshot/dump.lua +0 -15
- data/lua-hooks/ext/lua-snapshot/module.mk +0 -2
- data/lua-hooks/ext/lua-snapshot/snapshot.c +0 -462
- data/lua-hooks/ext/luautf8/README.md +0 -152
- data/lua-hooks/ext/luautf8/lutf8lib.c +0 -1274
- data/lua-hooks/ext/luautf8/module.mk +0 -2
- data/lua-hooks/ext/luautf8/unidata.h +0 -3064
- data/lua-hooks/ext/module.mk +0 -15
- data/lua-hooks/ext/modules.h +0 -17
- data/lua-hooks/ext/perf/luacpu.c +0 -114
- data/lua-hooks/ext/perf/lualoadavg.c +0 -40
- data/lua-hooks/ext/perf/luameminfo.c +0 -38
- data/lua-hooks/ext/perf/luaoslib.c +0 -203
- data/lua-hooks/ext/perf/module.mk +0 -5
- data/lua-hooks/ext/sha1/luasha1.c +0 -74
- data/lua-hooks/ext/sha1/module.mk +0 -5
- data/lua-hooks/ext/sha1/sha1.c +0 -145
- data/lua-hooks/ext/sha2/luasha256.c +0 -77
- data/lua-hooks/ext/sha2/module.mk +0 -5
- data/lua-hooks/ext/sha2/sha256.c +0 -196
- data/lua-hooks/ext/sysutils/lua_utils.c +0 -56
- data/lua-hooks/ext/sysutils/module.mk +0 -2
@@ -1,152 +0,0 @@
|
|
1
|
-
UTF-8 module for Lua 5.x
|
2
|
-
========================
|
3
|
-
|
4
|
-
This module adds UTF-8 support to Lua.
|
5
|
-
|
6
|
-
It uses data extracted from the [Unicode Character Database](http://www.unicode.org/reports/tr44/), and is tested on Lua
|
7
|
-
5.2.3 and LuaJIT.
|
8
|
-
|
9
|
-
parseucd.lua is a pure Lua script that generates unidata.h, which is used to convert
|
10
|
-
characters and check characters' category.
|
11
|
-
|
12
|
-
It is mainly intended to be compatible with Lua's own string module; it passes all
|
13
|
-
string and pattern matching tests in the Lua test suite[2].
|
14
|
-
|
15
|
-
It also add some useful routines against UTF-8 features, some like:
|
16
|
-
- a convenient interface to escape Unicode sequence in string.
|
17
|
-
- string insert/remove, since UTF-8 substring extract may expensive.
|
18
|
-
- calculate Unicode width, useful when implement e.g. console emulator.
|
19
|
-
- a useful interface to translate Unicode offset and byte offset.
|
20
|
-
|
21
|
-
[2]: http://www.lua.org/tests/5.2/
|
22
|
-
|
23
|
-
|
24
|
-
LuaRocks Installation
|
25
|
-
---------------------
|
26
|
-
`luarocks install utf8`
|
27
|
-
|
28
|
-
Usage
|
29
|
-
-----
|
30
|
-
|
31
|
-
Many routines are same as Lua's string module:
|
32
|
-
- `utf8.byte`
|
33
|
-
- `utf8.char`
|
34
|
-
- `utf8.find`
|
35
|
-
- `utf8.gmatch`
|
36
|
-
- `utf8.gsub`
|
37
|
-
- `utf8.len`
|
38
|
-
- `utf8.lower`
|
39
|
-
- `utf8.match`
|
40
|
-
- `utf8.reverse`
|
41
|
-
- `utf8.sub`
|
42
|
-
- `utf8.upper`
|
43
|
-
|
44
|
-
The documentation for these functions can be found in the Lua manual[3].
|
45
|
-
|
46
|
-
[3]: http://www.lua.org/manual/5.2/manual.html#6.4
|
47
|
-
|
48
|
-
|
49
|
-
Some routines in string module needn't support Unicode:
|
50
|
-
- `string.dump`
|
51
|
-
- `string.format`
|
52
|
-
- `string.rep`
|
53
|
-
|
54
|
-
They are NOT in utf8 module.
|
55
|
-
|
56
|
-
Some routines are new, with some Unicode-spec functions:
|
57
|
-
|
58
|
-
###utf8.escape(str) -> utf8 string
|
59
|
-
escape a str to UTF-8 format string. It support several escape format:
|
60
|
-
|
61
|
-
%ddd - which ddd is a decimal number at any length:
|
62
|
-
change Unicode code point to UTF-8 format.
|
63
|
-
%{ddd} - same as %ddd, but with braces around the number.
|
64
|
-
%uddd - same as %ddd, u stands Unicode
|
65
|
-
%u{ddd} - same as %{ddd}
|
66
|
-
%xhhh - hexadigit version of %ddd
|
67
|
-
%x{hhh} same as %xhhh.
|
68
|
-
%? - '?' stands for any other character: escape this character.
|
69
|
-
|
70
|
-
####Examples:
|
71
|
-
```
|
72
|
-
local u = utf8.escape
|
73
|
-
print(u"%123%u123%{123}%u{123}%xABC%x{ABC}")
|
74
|
-
print(u"%%123%?%d%%u")
|
75
|
-
```
|
76
|
-
|
77
|
-
###utf8.charpos(s[[, charpos], offset]) -> charpos, code point
|
78
|
-
convert UTF-8 position to byte offset.
|
79
|
-
if only offset is given, return byte offset of this UTF-8 char index.
|
80
|
-
if charpos and offset is given, a new charpos will calculate, by
|
81
|
-
add/subtract UTF-8 char offset to current charpos.
|
82
|
-
in all case, it return a new char position, and code point (a number) at
|
83
|
-
this position.
|
84
|
-
|
85
|
-
###utf8.next(s[, charpos[, offset]]) -> charpos, code point
|
86
|
-
iterate through the UTF-8 string s.
|
87
|
-
If only s is given, it can be used as an iterator:
|
88
|
-
```
|
89
|
-
for pos, code in utf8.next, "utf8-string" do
|
90
|
-
-- ...
|
91
|
-
end
|
92
|
-
```
|
93
|
-
if only charpos is given, return the next byte offset in the string.
|
94
|
-
if charpos and offset is given, a new charpos will calculate, by
|
95
|
-
add/subtract UTF-8 char offset to current charpos.
|
96
|
-
in all case, it return a new char position, and code point (a number) at
|
97
|
-
this position.
|
98
|
-
|
99
|
-
|
100
|
-
###utf8.insert(s[, idx], substring) -> new_string
|
101
|
-
insert a substring to s. If idx is given, insert substring before char at
|
102
|
-
this index, otherwise substring will concat to s. idx can be negative.
|
103
|
-
|
104
|
-
|
105
|
-
###utf8.remove(s[, start[, stop]]) -> new_string
|
106
|
-
delete a substring in s. If neither start nor stop is given, delete the
|
107
|
-
last UTF-8 char in s, otherwise delete char from start to end of s. if
|
108
|
-
stop is given, delete char from start to stop (include start and stop).
|
109
|
-
start and stop can be negative.
|
110
|
-
|
111
|
-
|
112
|
-
###utf8.width(s[, ambi_is_double[, default_width]]) -> width
|
113
|
-
calculate the width of UTF-8 string s. if ambi_is_double is given, the
|
114
|
-
ambiguous width character's width is 2, otherwise it's 1.
|
115
|
-
fullwidth/doublewidth character's width is 2, and other character's width
|
116
|
-
is 1.
|
117
|
-
if default_width is given, it will be the width of unprintable character,
|
118
|
-
used display a non-character mark for these characters.
|
119
|
-
if s is a code point, return the width of this code point.
|
120
|
-
|
121
|
-
|
122
|
-
###utf8.widthindex(s, location[, ambi_is_double[, default_width]]) -> idx, offset, width
|
123
|
-
return the character index at given location in string s. this is a
|
124
|
-
reverse operation of utf8.width().
|
125
|
-
this function returns the index of the location, and an offset in UTF-8
|
126
|
-
encoding. e.g. if cursor is at the second column (middle) of the wide
|
127
|
-
char, offset will be 2. the width of character at idx is returned, also.
|
128
|
-
|
129
|
-
|
130
|
-
###utf8.title(s) -> new_string
|
131
|
-
###utf8.fold(s) -> new_string
|
132
|
-
convert UTF-8 string s to title-case, or folded case used to compare by
|
133
|
-
ignore case.
|
134
|
-
if s is a number, it's treat as a code point and return a convert code
|
135
|
-
point (number). utf8.lower/utf8.upper has the same extension.
|
136
|
-
|
137
|
-
|
138
|
-
###utf8.ncasecmp(a, b) -> [-1,0,1]
|
139
|
-
compare a and b without case, -1 means a < b, 0 means a == b and 1 means a > b.
|
140
|
-
|
141
|
-
|
142
|
-
Improvement needed
|
143
|
-
------------------
|
144
|
-
|
145
|
-
- more test case.
|
146
|
-
- grapheme-compose support, and affect in utf8.reverse and utf8.width
|
147
|
-
- Unicode normalize algorithm implement.
|
148
|
-
|
149
|
-
|
150
|
-
License
|
151
|
-
-------
|
152
|
-
It uses the same license as Lua: http://www.lua.org/license.html
|
@@ -1,1274 +0,0 @@
|
|
1
|
-
/* Modified to allow bundling.
|
2
|
-
* Original source: https://github.com/starwing/luautf8 */
|
3
|
-
/* vim: set ft=c nu et sw=2 fdc=2 fdm=syntax : */
|
4
|
-
#define LUA_LIB
|
5
|
-
#include "lua.h"
|
6
|
-
#include "lauxlib.h"
|
7
|
-
#include "lualib.h"
|
8
|
-
|
9
|
-
|
10
|
-
#include <assert.h>
|
11
|
-
#include <string.h>
|
12
|
-
|
13
|
-
|
14
|
-
/* UTF-8 string operations */
|
15
|
-
|
16
|
-
#define UTF_MAX 8
|
17
|
-
|
18
|
-
/*
 * Encode the code point `ch` as (extended, up to 6-byte) UTF-8 into `s`.
 *
 * `s` must have room for at least 6 bytes; no terminator is written.
 * Values above 0x7FFFFFFF cannot be represented and are replaced by
 * U+FFFD (three bytes).  Returns the number of bytes written.
 */
static size_t utf8_encode(char *s, unsigned int ch) {
  size_t len, i;
  unsigned int lead;

  if (ch < 0x80) {  /* plain ASCII: single byte, no marker bits */
    s[0] = (char)ch;
    return 1;
  }

  if (ch > 0x7FFFFFFF)
    ch = 0xFFFD;  /* not encodable: substitute the replacement character */

  /* Pick the sequence length and the marker bits of the lead byte. */
  if (ch <= 0x7FF)          { len = 2; lead = 0xC0; }
  else if (ch <= 0xFFFF)    { len = 3; lead = 0xE0; }
  else if (ch <= 0x1FFFFF)  { len = 4; lead = 0xF0; }
  else if (ch <= 0x3FFFFFF) { len = 5; lead = 0xF8; }
  else                      { len = 6; lead = 0xFC; }

  /* Fill continuation bytes from the tail, 6 payload bits at a time. */
  for (i = len - 1; i > 0; --i) {
    s[i] = (char)(0x80 | (ch & 0x3F));
    ch >>= 6;
  }
  s[0] = (char)(ch | lead);  /* whatever is left goes into the lead byte */
  return len;
}
|
64
|
-
|
65
|
-
/*
 * Decode one (extended, up to 6-byte) UTF-8 sequence from [s, e).
 *
 * On success stores the code point in *pch and returns the number of bytes
 * consumed.  If the range is empty, stores 0 and returns 0.  If the bytes at
 * `s` do not form a complete, well-formed sequence inside [s, e) -- or the
 * first byte is ASCII or a stray continuation byte -- the first byte itself
 * is stored in *pch and 1 is returned, so callers always make progress.
 *
 * Overlong forms and surrogates are not rejected.
 */
static size_t utf8_decode(const char *s, const char *e, unsigned int *pch) {
  unsigned int lead, ch;

  if (s >= e) {
    *pch = 0;
    return 0;
  }

  /* `lead` keeps the untouched first byte for the fallback path; `ch` is
   * shifted while counting continuation bytes below. */
  lead = ch = (unsigned char)s[0];
  if (ch < 0xC0) goto fallback;  /* ASCII or stray continuation byte */
  if (ch < 0xE0) {               /* 2-byte sequence */
    if (e - s < 2 || (s[1] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch & 0x1F) << 6) |
           (unsigned int)(s[1] & 0x3F);
    return 2;
  }
  if (ch < 0xF0) {               /* 3-byte sequence */
    if (e - s < 3 || (s[1] & 0xC0) != 0x80
                  || (s[2] & 0xC0) != 0x80)
      goto fallback;
    *pch = ((ch & 0x0F) << 12) |
           ((unsigned int)(s[1] & 0x3F) << 6) |
           (unsigned int)(s[2] & 0x3F);
    return 3;
  }
  {                              /* 4- to 6-byte sequence */
    int count = 0;               /* number of continuation bytes consumed */
    unsigned int res = 0;        /* payload bits of the continuation bytes */
    while ((ch & 0x40) != 0) {   /* lead byte announces another byte? */
      unsigned int cc;
      /* More than 5 continuation bytes (lead 0xFE/0xFF) is invalid, and the
       * next byte must still be inside [s, e). */
      if (count >= 5 || e - s <= count + 1)
        goto fallback;
      cc = (unsigned char)s[++count];
      if ((cc & 0xC0) != 0x80)   /* not a continuation byte */
        goto fallback;
      res = (res << 6) | (cc & 0x3F);
      ch <<= 1;                  /* expose the next length bit at 0x40 */
    }
    /* After `count` shifts the lead byte's payload sits in the low 7 bits of
     * `ch`, already moved up by `count`; count*5 more completes count*6. */
    res |= (ch & 0x7F) << (count * 5);
    *pch = res;
    return (size_t)count + 1;
  }

fallback:
  *pch = lead;
  return 1;
}
|
111
|
-
|
112
|
-
/*
 * Return a pointer just past the sequence that starts at `s`, as measured by
 * utf8_decode() on [s, e).  The decoded code point is discarded.
 */
static const char *utf8_next(const char *s, const char *e) {
  unsigned int ignored;
  size_t consumed = utf8_decode(s, e, &ignored);
  return s + consumed;
}
|
116
|
-
|
117
|
-
/*
 * Step backwards from `e` to the start of the preceding character in [s, e).
 *
 * Scans back over continuation bytes (0x80..0xBF) and returns the address of
 * the first ASCII or lead byte found.  Returns `s` if the range is empty or
 * holds only continuation bytes.
 *
 * The cursor is decremented only while it is above `s`, so no pointer before
 * the start of the buffer is ever formed.
 */
static const char *utf8_prev(const char *s, const char *e) {
  const char *look = e;

  while (look > s) {
    unsigned int ch = (unsigned char)*--look;
    if (ch < 0x80 || ch >= 0xC0)  /* ASCII or lead byte: character start */
      return look;
  }

  return s;
}
|
129
|
-
|
130
|
-
/*
 * Count the characters in [s, e).
 *
 * Bytes below 0xC0 (ASCII and stray continuation bytes) count as one
 * character each; anything else is measured with utf8_next().
 */
static size_t utf8_length(const char *s, const char *e) {
  size_t count;

  for (count = 0; s < e; ++count) {
    unsigned char lead = (unsigned char)*s;
    s = (lead >= 0xC0) ? utf8_next(s, e) : s + 1;
  }
  return count;
}
|
141
|
-
|
142
|
-
/*
 * Translate a character index into a pointer inside [s, e).
 *
 * For idx >= 0 the index is 1-based from the front: 0 and 1 both yield `s`,
 * 2 yields the second character, and so on, stopping at `e`.
 * For idx < 0 the index counts back from the end: -1 yields the start of the
 * last character, stopping at `s`.
 */
static const char *utf8_index(const char *s, const char *e, int idx) {
  if (idx < 0) {
    int back;
    for (back = idx; s < e && back < 0; ++back)
      e = utf8_prev(s, e);
    return e;
  } else {
    int forward;  /* characters to skip; the index is 1-based */
    for (forward = idx - 1; s < e && forward > 0; --forward)
      s = utf8_next(s, e);
    return s;
  }
}
|
154
|
-
|
155
|
-
|
156
|
-
/* Unicode character categories */
|
157
|
-
|
158
|
-
#include "unidata.h"
|
159
|
-
|
160
|
-
/*
 * Binary-search `t` (with `size` entries) for a range containing `ch`.
 *
 * Each entry covers [first, last]; within a matching entry only code points
 * at a multiple of `step` from `first` are members.  Returns 1 if `ch` is a
 * member, 0 otherwise.
 *
 * Assumes the table is sorted by range and the ranges do not overlap
 * (required by the binary search; the tables come from unidata.h).
 *
 * The midpoint is kept in a size_t and computed as lo + (hi - lo) / 2, so it
 * can neither be narrowed to int nor wrap for large tables.
 */
static int find_in_range(range_table *t, size_t size, unsigned int ch) {
  size_t lo = 0;
  size_t hi = size;  /* search window is [lo, hi) */

  while (lo < hi) {
    size_t mid = lo + (hi - lo) / 2;
    if (t[mid].last < ch)
      lo = mid + 1;
    else if (t[mid].first > ch)
      hi = mid;
    else
      return (ch - t[mid].first) % t[mid].step == 0;
  }

  return 0;
}
|
178
|
-
|
179
|
-
/*
 * Binary-search the conversion table `t` (with `size` entries) for `ch` and
 * return the converted code point.
 *
 * If `ch` falls in an entry's [first, last] range at a multiple of `step`
 * from `first`, the entry's `offset` is added to it; otherwise `ch` is
 * returned unchanged.
 *
 * Assumes the table is sorted by range and the ranges do not overlap
 * (required by the binary search; the tables come from unidata.h).
 *
 * The midpoint is kept in a size_t and computed as lo + (hi - lo) / 2, so it
 * can neither be narrowed to int nor wrap for large tables.
 */
static int convert_char(conv_table *t, size_t size, unsigned int ch) {
  size_t lo = 0;
  size_t hi = size;  /* search window is [lo, hi) */

  while (lo < hi) {
    size_t mid = lo + (hi - lo) / 2;
    if (t[mid].last < ch)
      lo = mid + 1;
    else if (t[mid].first > ch)
      hi = mid;
    else if ((ch - t[mid].first) % t[mid].step == 0)
      return ch + t[mid].offset;
    else
      return ch;  /* inside the range but not on a step boundary */
  }

  return ch;
}
|
199
|
-
|
200
|
-
#define table_size(t) (sizeof(t)/sizeof((t)[0]))
|
201
|
-
|
202
|
-
#define define_category(name) static int utf8_is##name(unsigned int ch) \
|
203
|
-
{ return find_in_range(name##_table, table_size(name##_table), ch); }
|
204
|
-
|
205
|
-
#define define_converter(name) static unsigned int utf8_##name(unsigned int ch) \
|
206
|
-
{ return convert_char(name##_table, table_size(name##_table), ch); }
|
207
|
-
|
208
|
-
define_category(alpha)
|
209
|
-
define_category(lower)
|
210
|
-
define_category(upper)
|
211
|
-
define_category(cntrl)
|
212
|
-
define_category(digit)
|
213
|
-
define_category(xdigit)
|
214
|
-
define_category(punct)
|
215
|
-
define_category(space)
|
216
|
-
define_converter(tolower)
|
217
|
-
define_converter(toupper)
|
218
|
-
define_converter(totitle)
|
219
|
-
define_converter(tofold)
|
220
|
-
|
221
|
-
#undef define_category
|
222
|
-
#undef define_converter
|
223
|
-
|
224
|
-
static int utf8_isgraph(unsigned int ch) {
|
225
|
-
if (find_in_range(space_table, table_size(space_table), ch))
|
226
|
-
return 0;
|
227
|
-
if (find_in_range(graph_table, table_size(graph_table), ch))
|
228
|
-
return 1;
|
229
|
-
if (find_in_range(compose_table, table_size(compose_table), ch))
|
230
|
-
return 1;
|
231
|
-
return 0;
|
232
|
-
}
|
233
|
-
|
234
|
-
static int utf8_isalnum(unsigned int ch) {
|
235
|
-
if (find_in_range(alpha_table, table_size(alpha_table), ch))
|
236
|
-
return 1;
|
237
|
-
if (find_in_range(alnum_extend_table, table_size(alnum_extend_table), ch))
|
238
|
-
return 1;
|
239
|
-
return 0;
|
240
|
-
}
|
241
|
-
|
242
|
-
static int utf8_width(unsigned int ch, int ambi_is_single) {
|
243
|
-
if (find_in_range(doublewidth_table, table_size(doublewidth_table), ch))
|
244
|
-
return 2;
|
245
|
-
if (find_in_range(ambiwidth_table, table_size(ambiwidth_table), ch))
|
246
|
-
return ambi_is_single ? 1 : 2;
|
247
|
-
if (find_in_range(compose_table, table_size(compose_table), ch))
|
248
|
-
return 0;
|
249
|
-
if (find_in_range(unprintable_table, table_size(unprintable_table), ch))
|
250
|
-
return 0;
|
251
|
-
return 1;
|
252
|
-
}
|
253
|
-
|
254
|
-
|
255
|
-
/* string module compatible interface */
|
256
|
-
|
257
|
-
/*
 * Fetch stack slot `idx` with luaL_checklstring() and return its first byte.
 * If `end` is non-NULL, *end is set to one past the last byte.
 */
static const char *check_utf8(lua_State *L, int idx, const char **end) {
  size_t nbytes;
  const char *begin = luaL_checklstring(L, idx, &nbytes);

  if (end != NULL)
    *end = begin + nbytes;
  return begin;
}
|
263
|
-
|
264
|
-
/*
 * Like check_utf8(), but uses lua_tolstring(), which performs no argument
 * check.  The caller is expected to have verified the slot's type already.
 * If `end` is non-NULL, *end is set to one past the last byte.
 */
static const char *to_utf8(lua_State *L, int idx, const char **end) {
  size_t nbytes;
  const char *begin = lua_tolstring(L, idx, &nbytes);

  if (end != NULL)
    *end = begin + nbytes;
  return begin;
}
|
270
|
-
|
271
|
-
/* Append the UTF-8 encoding of code point `ch` to the Lua buffer `b`. */
static void add_utf8char(luaL_Buffer *b, unsigned int ch) {
  char encoded[UTF_MAX];
  size_t nbytes = utf8_encode(encoded, ch);

  luaL_addlstring(b, encoded, nbytes);
}
|
276
|
-
|
277
|
-
/*
 * Resolve a possibly negative byte position against a string of `len` bytes.
 *
 * Non-negative positions are returned as-is.  Negative positions count from
 * the end (-1 is the last byte); one that reaches before the start yields 0.
 */
static lua_Integer byterelat(lua_Integer pos, size_t len) {
  lua_Integer resolved;

  if (pos >= 0)
    resolved = pos;
  else if (0u - (size_t)pos > len)  /* |pos| > len: before the start */
    resolved = 0;
  else
    resolved = (lua_Integer)len + pos + 1;

  return resolved;
}
|
282
|
-
|
283
|
-
/*
 * Narrow the byte range [*ps, *pe) to the characters posi..posj.
 *
 * `posi` is resolved with utf8_index().  A non-negative `posj` is the number
 * of characters, counted from the original start, that the range may cover;
 * a negative `posj` counts back from the end (-1 keeps the original end).
 * On return *ps and *pe delimit the selection.  The result is non-zero when
 * the selection is non-empty.
 */
static int u_posrange(const char **ps, const char **pe,
    lua_Integer posi, lua_Integer posj) {
  const char *begin = *ps;
  const char *stop = *pe;

  *ps = utf8_index(begin, stop, posi);

  if (posj < 0) {
    lua_Integer back;  /* -1 means "up to the end": zero steps back */
    for (back = posj + 1; begin < stop && back < 0; ++back)
      stop = utf8_prev(begin, stop);
    *pe = stop;
  } else {
    const char *cur = begin;
    for (; cur < stop && posj > 0; --posj)
      cur = utf8_next(cur, stop);
    *pe = cur;
  }

  return *ps < *pe;
}
|
299
|
-
|
300
|
-
/*
 * utf8.len(s [, i [, j]]) -> count
 *
 * Counts the characters between byte positions i and j (1-based, inclusive;
 * negative values count from the end; defaults 1 and -1).  Returns nothing
 * when a position is out of range.
 *
 * The final position must lie inside the string.  The previous check let
 * j == len + 1 through, which made the scan include the byte after the last
 * byte of the string.
 */
static int Lutf8_len(lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer posi = byterelat(luaL_optinteger(L, 2, 1), len);
  lua_Integer posj = byterelat(luaL_optinteger(L, 3, -1), len);

  /* i may be len + 1 (an empty range at the end); j may not exceed len. */
  if (posi < 1 || posi > (lua_Integer)len + 1
               || posj > (lua_Integer)len)
    return 0;

  /* 1-based inclusive [posi, posj] is the half-open [posi - 1, posj). */
  lua_pushinteger(L, (lua_Integer)utf8_length(s + (posi - 1), s + posj));
  return 1;
}
|
311
|
-
|
312
|
-
/*
 * utf8.sub(s, i [, j]) -> string
 *
 * Returns the characters i..j of s (j defaults to -1), or the empty string
 * when the selection is empty.
 */
static int Lutf8_sub(lua_State *L) {
  const char *stop;
  const char *start = check_utf8(L, 1, &stop);
  lua_Integer from = luaL_checkinteger(L, 2);
  lua_Integer to = luaL_optinteger(L, 3, -1);

  if (!u_posrange(&start, &stop, from, to)) {
    lua_pushliteral(L, "");
    return 1;
  }
  lua_pushlstring(L, start, stop - start);
  return 1;
}
|
321
|
-
|
322
|
-
/*
 * utf8.reverse(s) -> string
 *
 * Returns s with its characters in reverse order.  Each character's bytes
 * keep their order.  Combining sequences are not treated as a unit.
 */
static int Lutf8_reverse(lua_State *L) {
  luaL_Buffer out;
  const char *tail;
  const char *head = check_utf8(L, 1, &tail);

  luaL_buffinit(L, &out);
  /* Peel characters off the end and append them in that order. */
  while (head < tail) {
    const char *cp = utf8_prev(head, tail);
    luaL_addlstring(&out, cp, tail - cp);
    tail = cp;
  }
  luaL_pushresult(&out);
  return 1;
}
|
335
|
-
|
336
|
-
/*
 * Shared body of the case-conversion functions.
 *
 * If argument 1 is a number it is treated as a code point and the converted
 * code point is pushed.  If it is a string, every decoded character is passed
 * through `conv` and the re-encoded string is pushed.  Any other type raises
 * a Lua error.
 */
static int convert(lua_State *L, unsigned int (*conv)(unsigned int)) {
  luaL_Buffer out;
  const char *cur, *stop;
  int t = lua_type(L, 1);

  if (t == LUA_TNUMBER) {
    lua_pushinteger(L, conv(lua_tointeger(L, 1)));
    return 1;
  }
  if (t != LUA_TSTRING)
    return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));

  cur = to_utf8(L, 1, &stop);
  luaL_buffinit(L, &out);
  while (cur < stop) {
    unsigned int cp;
    cur += utf8_decode(cur, stop, &cp);
    add_utf8char(&out, conv(cp));
  }
  luaL_pushresult(&out);
  return 1;
}
|
356
|
-
|
357
|
-
/* utf8.lower: convert() with utf8_tolower as the mapping function. */
static int Lutf8_lower(lua_State *L) {
  return convert(L, utf8_tolower);
}
|
359
|
-
|
360
|
-
/* utf8.upper: convert() with utf8_toupper as the mapping function. */
static int Lutf8_upper(lua_State *L) {
  return convert(L, utf8_toupper);
}
|
362
|
-
|
363
|
-
/* utf8.title: convert() with utf8_totitle as the mapping function. */
static int Lutf8_title(lua_State *L) {
  return convert(L, utf8_totitle);
}
|
365
|
-
|
366
|
-
/* utf8.fold: convert() with utf8_tofold as the mapping function. */
static int Lutf8_fold(lua_State *L) {
  return convert(L, utf8_tofold);
}
|
368
|
-
|
369
|
-
/*
 * utf8.byte(s [, i [, j]])
 *
 * Pushes one integer per value decoded from the range u_posrange() selects
 * for (i, j); `i` defaults to 1 and `j` defaults to `i`.  Returns the number
 * of integers pushed, which is 0 when u_posrange() reports failure.
 */
static int Lutf8_byte(lua_State *L) {
  size_t pushed = 0;
  const char *stop, *cur = check_utf8(L, 1, &stop);
  lua_Integer from = luaL_optinteger(L, 2, 1);
  lua_Integer to = luaL_optinteger(L, 3, from);
  if (!u_posrange(&cur, &stop, from, to))
    return pushed;
  /* the byte length of the slice is used as the stack-slot request */
  luaL_checkstack(L, stop - cur, "string slice too long");
  for (; cur < stop; ++pushed) {
    unsigned int code;
    cur += utf8_decode(cur, stop, &code);
    lua_pushinteger(L, code);
  }
  return pushed;
}
|
385
|
-
|
386
|
-
/*
 * utf8.char(...)
 *
 * Reads every argument as an integer and appends it to a buffer through
 * add_utf8char().  Returns one value: the resulting string.
 */
static int Lutf8_char(lua_State *L) {
  int argc = lua_gettop(L);  /* number of arguments */
  int arg = 1;
  luaL_Buffer out;
  luaL_buffinit(L, &out);
  while (arg <= argc) {
    unsigned int code = luaL_checkint(L, arg);
    add_utf8char(&out, code);
    ++arg;
  }
  luaL_pushresult(&out);
  return 1;
}
|
397
|
-
|
398
|
-
|
399
|
-
/* unicode extra interface */
|
400
|
-
|
401
|
-
/*
 * Parses the numeric part of an escape sequence starting at `s`.
 *
 * An optional leading '{' switches to bracketed mode, in which parsing stops
 * after the matching '}' and any non-digit raises a Lua error.  Without the
 * bracket, parsing stops silently at the first non-digit.  Digits are
 * decimal, or hexadecimal (either case) when `is_hex` is non-zero.
 *
 * The accumulated value is stored in *pch; the return value is the position
 * just past the consumed input.  The caller must ensure s < e on entry,
 * because the first byte is inspected unconditionally.
 */
static const char *parse_escape(lua_State *L,
                                const char *s, const char *e,
                                int is_hex, unsigned int *pch) {
  const unsigned int radix = is_hex ? 16 : 10;
  unsigned int value = 0;
  int braced = (*s == '{');
  if (braced) ++s;
  for (; s < e; ++s) {
    unsigned int ch = (unsigned char)*s;
    unsigned int digit;
    if (braced && ch == '}') {
      ++s;  /* consume the closing bracket */
      break;
    }
    if (ch >= '0' && ch <= '9')
      digit = ch - '0';
    else if (is_hex && ch >= 'A' && ch <= 'F')
      digit = 10 + (ch - 'A');
    else if (is_hex && ch >= 'a' && ch <= 'f')
      digit = 10 + (ch - 'a');
    else {
      if (braced)
        luaL_error(L, "invalid escape '%c'", ch);
      break;
    }
    value = value * radix + digit;
  }
  *pch = value;
  return s;
}
|
431
|
-
|
432
|
-
/*
 * utf8.escape(s)
 *
 * Copies `s` to the result while expanding '%' escapes:
 *   %<digits> / %{<digits>}      decimal value
 *   %u... / %U...                decimal value (prefix skipped)
 *   %x... / %X...                hexadecimal value (prefix skipped)
 *   %<anything else>             the following decoded value, taken literally
 * A numeric escape with nothing after its prefix raises
 * "invalid escape sequence".  Returns one value.
 */
static int Lutf8_escape(lua_State *L) {
  const char *e, *s = check_utf8(L, 1, &e);
  luaL_Buffer out;
  luaL_buffinit(L, &out);
  while (s < e) {
    unsigned int ch;
    s += utf8_decode(s, e, &ch);
    if (ch == '%') {
      int is_hex = 0;
      int numeric = 1;  /* cleared when the escape is a literal one */
      switch (*s) {
        case 'u': case 'U':
          ++s;
          break;
        case 'x': case 'X':
          ++s;
          is_hex = 1;
          break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case '8': case '9': case '{':
          break;
        default:
          /* not a numeric escape: emit whatever follows the '%' */
          s += utf8_decode(s, e, &ch);
          numeric = 0;
          break;
      }
      if (numeric) {
        if (s >= e)
          luaL_error(L, "invalid escape sequence");
        s = parse_escape(L, s, e, is_hex, &ch);
      }
    }
    add_utf8char(&out, ch);
  }
  luaL_pushresult(&out);
  return 1;
}
|
462
|
-
|
463
|
-
/*
 * utf8.insert(s [, idx], substring)
 *
 * Inserts `substring` into `s`.  When argument 2 is a number it is taken as
 * the insertion index (resolved with utf8_index(); an index of 0 leaves the
 * insertion point at the end) and the substring is read from argument 3;
 * otherwise the substring is argument 2 and is appended at the end.
 * Returns one value.
 */
static int Lutf8_insert(lua_State *L) {
  const char *e, *s = check_utf8(L, 1, &e);
  const char *at = e;   /* insertion point, defaults to end of string */
  int sub_arg = 2;      /* stack index of the substring argument */
  size_t sublen;
  const char *subs;
  luaL_Buffer out;
  if (lua_type(L, 2) == LUA_TNUMBER) {
    int idx = (int)lua_tointeger(L, 2);
    sub_arg = 3;
    if (idx != 0)
      at = utf8_index(s, e, idx);
  }
  subs = luaL_checklstring(L, sub_arg, &sublen);
  luaL_buffinit(L, &out);
  luaL_addlstring(&out, s, at - s);
  luaL_addlstring(&out, subs, sublen);
  luaL_addlstring(&out, at, e - at);
  luaL_pushresult(&out);
  return 1;
}
|
483
|
-
|
484
|
-
/*
 * utf8.remove(s, i [, j])
 *
 * Pushes `s` with the range u_posrange() selects for (i, j) cut out; `j`
 * defaults to -1.  When u_posrange() reports failure the stack is truncated
 * to the original string, which is then returned unchanged.
 * Returns one value.
 */
static int Lutf8_remove(lua_State *L) {
  const char *e, *s = check_utf8(L, 1, &e);
  const char *cut_from = s, *cut_to = e;
  lua_Integer from = luaL_checkinteger(L, 2);
  lua_Integer to = luaL_optinteger(L, 3, -1);
  if (u_posrange(&cut_from, &cut_to, from, to)) {
    luaL_Buffer out;
    luaL_buffinit(L, &out);
    luaL_addlstring(&out, s, cut_from - s);     /* part before the cut */
    luaL_addlstring(&out, cut_to, e - cut_to);  /* part after the cut */
    luaL_pushresult(&out);
  }
  else
    lua_settop(L, 1);
  return 1;
}
|
499
|
-
|
500
|
-
/*
 * Moves `cur` by `offset` steps inside [s, e) -- forward with utf8_next()
 * for offset >= 0, backward with utf8_prev() for offset < 0 -- and pushes
 * two integers: the 1-based byte position reached and the value decoded
 * there.
 *
 * Returns 2 on success, or 0 (nothing pushed) when a string boundary is hit
 * before all steps were taken.  Note that for offset >= 0 a `cur` that is
 * already at or past `e` also yields 0.
 */
static int push_offset(lua_State *L, const char *s, const char *e,
                       const char *cur, lua_Integer offset) {
  unsigned int ch;
  if (offset < 0) {
    for (;;) {
      if (!(s < cur)) break;      /* reached the start of the string */
      if (offset++ >= 0) break;   /* all backward steps taken */
      cur = utf8_prev(s, cur);
    }
    if (offset < 0) return 0;
  }
  else {
    for (;;) {
      if (!(cur < e)) break;      /* reached the end of the string */
      if (offset-- <= 0) break;   /* all forward steps taken */
      cur = utf8_next(cur, e);
    }
    if (offset >= 0) return 0;
  }
  utf8_decode(cur, e, &ch);
  lua_pushinteger(L, cur-s+1);
  lua_pushinteger(L, ch);
  return 2;
}
|
518
|
-
|
519
|
-
/*
 * utf8.charpos(s [, charpos])
 * utf8.charpos(s, bytepos, offset)
 *
 * Two-argument form: `charpos` (default 1) is a character index; positive
 * values count from the start, negative values from the end.
 * Three-argument form: argument 2 is a byte position (run through
 * byterelat()) and argument 3 a character offset relative to it.
 *
 * Returns whatever push_offset() returns: two integers (byte position and
 * decoded value) on success, or no values.  A byte position outside
 * [0, #s + 1] also returns no values.
 */
static int Lutf8_charpos(lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  const char *cur = s;
  lua_Integer pos, offset;
  if (lua_isnoneornil(L, 3)) {
    offset = luaL_optinteger(L, 2, 1);
    if (offset > 0) --offset;           /* 1-based index -> steps from start */
    else if (offset < 0) cur = s+len;   /* count backward from the end */
    return push_offset(L, s, s+len, cur, offset);
  }
  pos = byterelat(luaL_optinteger(L, 2, 1), len);
  offset = luaL_checkinteger(L, 3);
  /* byterelat() is defined outside this chunk; should it already clamp its
   * result this test never fires.  Without a bound here, `cur` could be
   * moved outside the string and push_offset() would walk foreign memory
   * when stepping backward. */
  if (pos < 0 || pos > (lua_Integer)len + 1)
    return 0;
  if (pos != 0) cur += pos-1;
  return push_offset(L, s, s+len, cur, offset);
}
|
534
|
-
|
535
|
-
/*
 * utf8.next(s [, bytepos [, offset]])
 *
 * Without `bytepos`, starts at the beginning of `s` with a default offset
 * of 0.  With `bytepos` (run through byterelat()), starts there with a
 * default offset of 1.  An explicit argument 3 overrides the default offset.
 *
 * Returns whatever push_offset() returns: two integers (byte position and
 * decoded value) on success, or no values.  A byte position outside
 * [0, #s + 1] also returns no values.
 */
static int Lutf8_next(lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  const char *cur = s;
  lua_Integer offset = 0;
  int in_range = 1;
  if (!lua_isnoneornil(L, 2)) {
    lua_Integer pos = byterelat(luaL_checkinteger(L, 2), len);
    /* byterelat() is defined outside this chunk; should it already clamp
     * its result this test never fires.  Without a bound here, `cur` could
     * be moved outside the string and push_offset() would walk foreign
     * memory when stepping backward. */
    if (pos < 0 || pos > (lua_Integer)len + 1)
      in_range = 0;
    else if (pos != 0)
      cur += pos-1;
    offset = 1;
  }
  /* argument 3 is still validated before the range result is reported */
  offset = luaL_optinteger(L, 3, offset);
  if (!in_range)
    return 0;
  return push_offset(L, s, s+len, cur, offset);
}
|
548
|
-
|
549
|
-
/*
 * utf8.width(s_or_code [, ambi_is_double [, default_width]])
 *
 * For a number, pushes utf8_width() of that value; for a string, pushes the
 * sum of utf8_width() over every decoded value.  Argument 2 is negated and
 * handed to utf8_width() as its second parameter.  Wherever utf8_width()
 * yields 0, `default_width` (argument 3, default 0) is used instead.
 * Any other argument type raises a Lua error.  Returns one value.
 */
static int Lutf8_width(lua_State *L) {
  int t = lua_type(L, 1);
  int ambi_is_single = !lua_toboolean(L, 2);
  int default_width = luaL_optinteger(L, 3, 0);
  switch (t) {
    case LUA_TNUMBER: {
      size_t w = utf8_width(lua_tointeger(L, 1), ambi_is_single);
      if (w == 0) w = default_width;
      lua_pushinteger(L, (lua_Integer)w);
      break;
    }
    case LUA_TSTRING: {
      const char *stop, *cur = to_utf8(L, 1, &stop);
      size_t total = 0;
      while (cur < stop) {
        unsigned int code;
        size_t w;
        cur += utf8_decode(cur, stop, &code);
        w = utf8_width(code, ambi_is_single);
        total += w == 0 ? default_width : w;
      }
      lua_pushinteger(L, (lua_Integer)total);
      break;
    }
    default:
      return luaL_error(L, "number/string expected, got %s", luaL_typename(L, 1));
  }
  return 1;
}
|
574
|
-
|
575
|
-
/*
 * utf8.widthindex(s, width [, ambi_is_double [, default_width]])
 *
 * Walks `s`, subtracting each decoded value's utf8_width() (or
 * `default_width` when that is 0) from `width`.  As soon as the remainder
 * drops to 0 or below, pushes three integers: the 1-based index of the
 * current character, the remainder before this character was subtracted,
 * and this character's width.  If the string ends first, pushes a single
 * integer: one more than the number of characters walked.
 */
static int Lutf8_widthindex(lua_State *L) {
  const char *stop, *cur = check_utf8(L, 1, &stop);
  int width = luaL_checkinteger(L, 2);
  int ambi_is_single = !lua_toboolean(L, 3);
  int default_width = luaL_optinteger(L, 4, 0);
  size_t idx;
  for (idx = 1; cur < stop; ++idx) {
    unsigned int code;
    size_t chwidth;
    cur += utf8_decode(cur, stop, &code);
    chwidth = utf8_width(code, ambi_is_single);
    if (chwidth == 0) chwidth = default_width;
    width -= chwidth;
    if (width <= 0) {
      lua_pushinteger(L, idx);
      lua_pushinteger(L, width + chwidth);
      lua_pushinteger(L, chwidth);
      return 3;
    }
  }
  lua_pushinteger(L, (lua_Integer)idx);
  return 1;
}
|
599
|
-
|
600
|
-
/*
 * utf8.ncasecmp(a, b)
 *
 * Compares two strings value by value after mapping each decoded value
 * through utf8_tofold().  Pushes -1, 0 or 1.  When one string is exhausted
 * while the other still has input, the exhausted one orders first.
 * Returns one value.
 */
static int Lutf8_ncasecmp(lua_State *L) {
  const char *e1, *s1 = check_utf8(L, 1, &e1);
  const char *e2, *s2 = check_utf8(L, 2, &e2);
  int order = 0;
  while (order == 0 && (s1 < e1 || s2 < e2)) {
    if (s1 == e1)
      order = -1;            /* first string ended early */
    else if (s2 == e2)
      order = 1;             /* second string ended early */
    else {
      unsigned int a = 0, b = 0;
      s1 += utf8_decode(s1, e1, &a);
      s2 += utf8_decode(s2, e2, &b);
      a = utf8_tofold(a);
      b = utf8_tofold(b);
      if (a != b)
        order = a > b ? 1 : -1;
    }
  }
  lua_pushinteger(L, order);
  return 1;
}
|
623
|
-
|
624
|
-
|
625
|
-
/* utf8 pattern matching implementation */
|
626
|
-
|
627
|
-
#ifndef LUA_MAXCAPTURES
|
628
|
-
# define LUA_MAXCAPTURES 32
|
629
|
-
#endif /* LUA_MAXCAPTURES */
|
630
|
-
|
631
|
-
#define CAP_UNFINISHED (-1)
|
632
|
-
#define CAP_POSITION (-2)
|
633
|
-
|
634
|
-
|
635
|
-
typedef struct utf8MatchState {
|
636
|
-
int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
|
637
|
-
const char *src_init; /* init of source string */
|
638
|
-
const char *src_end; /* end ('\0') of source string */
|
639
|
-
const char *p_end; /* end ('\0') of pattern */
|
640
|
-
lua_State *L;
|
641
|
-
int level; /* total number of captures (finished or unfinished) */
|
642
|
-
struct {
|
643
|
-
const char *init;
|
644
|
-
ptrdiff_t len;
|
645
|
-
} capture[LUA_MAXCAPTURES];
|
646
|
-
} utf8MatchState;
|
647
|
-
|
648
|
-
/* recursive function */
|
649
|
-
static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p);
|
650
|
-
|
651
|
-
/* maximum recursion depth for 'match' */
|
652
|
-
#if !defined(MAXCCALLS)
|
653
|
-
#define MAXCCALLS 200
|
654
|
-
#endif
|
655
|
-
|
656
|
-
#define L_ESC '%'
|
657
|
-
#define SPECIALS "^$*+?.([%-"
|
658
|
-
|
659
|
-
/*
 * Converts `l` to a capture index by subtracting '1' and validates it.
 *
 * Returns the index when it refers to an existing capture that is not
 * CAP_UNFINISHED; otherwise raises a Lua error.
 */
static int utf8_check_capture (utf8MatchState *ms, int l) {
  int idx = l - '1';
  if (idx >= 0 && idx < ms->level && ms->capture[idx].len != CAP_UNFINISHED)
    return idx;
  return luaL_error(ms->L, "invalid capture index %%%d", idx + 1);
}
|
665
|
-
|
666
|
-
/*
 * Returns the index of the highest-numbered capture whose length is still
 * CAP_UNFINISHED; raises a Lua error when there is none.
 */
static int utf8_capture_to_close (utf8MatchState *ms) {
  int idx = ms->level - 1;
  while (idx >= 0) {
    if (ms->capture[idx].len == CAP_UNFINISHED)
      return idx;
    --idx;
  }
  return luaL_error(ms->L, "invalid pattern capture");
}
|
672
|
-
|
673
|
-
/*
 * Returns the position just past the single pattern item that starts at
 * `p`: an escaped item ('%' plus what follows), a bracket set "[...]", or
 * one plain item.  Raises a Lua error when the pattern ends right after '%'
 * or inside a bracket set.
 */
static const char *utf8_classend (utf8MatchState *ms, const char *p) {
  unsigned int lead;
  p += utf8_decode(p, ms->p_end, &lead);

  if (lead == L_ESC) {
    if (p == ms->p_end)
      luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
    return utf8_next(p, ms->p_end);
  }

  if (lead == '[') {
    if (*p == '^') p++;
    do {  /* look for a `]' */
      char c;
      if (p == ms->p_end)
        luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
      c = *p;
      p++;
      if (c == L_ESC && p < ms->p_end)
        p++;  /* skip escapes (e.g. `%]') */
    } while (*p != ']');
    return p+1;
  }

  return p;
}
|
697
|
-
|
698
|
-
/*
 * Tests `c` against the class letter `cl` (the item that followed '%').
 *
 * The letter is looked up case-insensitively via utf8_tolower(); when `cl`
 * itself is not lower case according to utf8_islower(), the result of the
 * class test is inverted.  When `cl` is not one of the known class letters,
 * the result is simply whether `cl` equals `c`.
 */
static int utf8_match_class (unsigned int c, unsigned int cl) {
  int hit;
  switch (utf8_tolower(cl)) {
    case 'a': hit = utf8_isalpha(c);  break;
    case 'c': hit = utf8_iscntrl(c);  break;
    case 'd': hit = utf8_isdigit(c);  break;
    case 'g': hit = utf8_isgraph(c);  break;
    case 'l': hit = utf8_islower(c);  break;
    case 'p': hit = utf8_ispunct(c);  break;
    case 's': hit = utf8_isspace(c);  break;
    case 'u': hit = utf8_isupper(c);  break;
    case 'w': hit = utf8_isalnum(c);  break;
    case 'x': hit = utf8_isxdigit(c); break;
    case 'z': hit = (c == 0);         break;  /* deprecated option */
    default:
      return (cl == c);
  }
  if (utf8_islower(cl))
    return hit;
  return !hit;
}
|
716
|
-
|
717
|
-
static int utf8_matchbracketclass (unsigned int c, const char *p, const char *ec) {
|
718
|
-
int sig = 1;
|
719
|
-
assert(*p == '[');
|
720
|
-
if (*++p == '^') {
|
721
|
-
sig = 0;
|
722
|
-
p++; /* skip the `^' */
|
723
|
-
}
|
724
|
-
while (p < ec) {
|
725
|
-
unsigned int ch;
|
726
|
-
p += utf8_decode(p, ec, &ch);
|
727
|
-
if (ch == L_ESC) {
|
728
|
-
p += utf8_decode(p, ec, &ch);
|
729
|
-
if (utf8_match_class(c, ch))
|
730
|
-
return sig;
|
731
|
-
}
|
732
|
-
else {
|
733
|
-
unsigned int next;
|
734
|
-
const char *np = p + utf8_decode(p, ec, &next);
|
735
|
-
if (next == '-' && np < ec) {
|
736
|
-
p = np + utf8_decode(np, ec, &next);
|
737
|
-
if (ch <= c && c <= next)
|
738
|
-
return sig;
|
739
|
-
}
|
740
|
-
else if (ch == c) return sig;
|
741
|
-
}
|
742
|
-
}
|
743
|
-
return !sig;
|
744
|
-
}
|
745
|
-
|
746
|
-
/*
 * Tests whether the subject value at `s` matches the single pattern item
 * at `p`; `ep` is the end of that item as returned by utf8_classend().
 * Returns 0 at the end of the subject.
 */
static int utf8_singlematch (utf8MatchState *ms, const char *s, const char *p,
                             const char *ep) {
  unsigned int ch, pch;
  if (s >= ms->src_end)
    return 0;
  utf8_decode(s, ms->src_end, &ch);
  p += utf8_decode(p, ms->p_end, &pch);
  if (pch == '.')
    return 1;  /* matches any char */
  if (pch == L_ESC) {
    utf8_decode(p, ms->p_end, &pch);
    return utf8_match_class(ch, pch);
  }
  if (pch == '[')
    return utf8_matchbracketclass(ch, p-1, ep-1);
  return pch == ch;
}
|
763
|
-
|
764
|
-
/*
 * Handles the balanced-match item: *p points at the two delimiter values
 * that follow "%b" and is advanced past both of them.
 *
 * Returns the subject position just past the balanced span that starts at
 * `s`, or NULL when `s` is at the end of the subject, does not start with
 * the opening delimiter, or the subject ends before the span is closed.
 * Raises a Lua error when the pattern ends before the second delimiter.
 */
static const char *utf8_matchbalance (utf8MatchState *ms, const char *s,
                                      const char **p) {
  unsigned int ch, begin, end;
  *p += utf8_decode(*p, ms->p_end, &begin);
  if (*p >= ms->p_end)
    luaL_error(ms->L, "malformed pattern "
               "(missing arguments to " LUA_QL("%%b") ")");
  *p += utf8_decode(*p, ms->p_end, &end);
  /* Nothing left in the subject: fail instead of decoding at src_end
   * (utf8_singlematch applies the same guard). */
  if (s >= ms->src_end) return NULL;
  s += utf8_decode(s, ms->src_end, &ch);
  if (ch != begin) return NULL;
  else {
    int cont = 1;  /* nesting depth of still-open delimiters */
    while (s < ms->src_end) {
      s += utf8_decode(s, ms->src_end, &ch);
      if (ch == end) {
        if (--cont == 0) return s;
      }
      else if (ch == begin) cont++;
    }
  }
  return NULL;  /* string ends out of balance */
}
|
786
|
-
|
787
|
-
/*
 * Greedy repetition: consumes as many consecutive matches of the item
 * [p, ep) as possible starting at `s`, then tries to match the rest of the
 * pattern (from ep+1), giving back one repetition at a time on failure.
 * Returns the end of the overall match or NULL.
 */
static const char *utf8_max_expand (utf8MatchState *ms, const char *s,
                                    const char *p, const char *ep) {
  const char *reach = s;  /* end of the repetitions taken so far */
  while (utf8_singlematch(ms, reach, p, ep))
    reach = utf8_next(reach, ms->src_end);
  /* keeps trying to match with the maximum repetitions */
  for (; s <= reach; reach = utf8_prev(s, reach)) {
    const char *rest = utf8_match(ms, reach, ep+1);
    if (rest != NULL)
      return rest;
    if (s == reach)
      break;  /* no repetition left to give back */
  }
  return NULL;
}
|
802
|
-
|
803
|
-
/*
 * Lazy repetition: tries to match the rest of the pattern (from ep+1)
 * first, and only consumes one more match of the item [p, ep) when that
 * fails.  Returns the end of the overall match or NULL.
 */
static const char *utf8_min_expand (utf8MatchState *ms, const char *s,
                                    const char *p, const char *ep) {
  const char *rest;
  while ((rest = utf8_match(ms, s, ep+1)) == NULL) {
    if (!utf8_singlematch(ms, s, p, ep))
      return NULL;
    s = utf8_next(s, ms->src_end);  /* try with one more repetition */
  }
  return rest;
}
|
814
|
-
|
815
|
-
/*
 * Opens a new capture at `s` with initial length `what` (one of the CAP_*
 * values) and continues matching at `p`.  The capture is discarded again
 * if the remaining match fails.  Raises a Lua error when the capture array
 * is full.
 */
static const char *utf8_start_capture (utf8MatchState *ms, const char *s,
                                       const char *p, int what) {
  const char *rest;
  int slot = ms->level;
  if (slot >= LUA_MAXCAPTURES)
    luaL_error(ms->L, "too many captures");
  ms->capture[slot].init = s;
  ms->capture[slot].len = what;
  ms->level = slot+1;
  rest = utf8_match(ms, s, p);
  if (rest == NULL)  /* match failed? */
    ms->level--;     /* undo capture */
  return rest;
}
|
827
|
-
|
828
|
-
/*
 * Closes the innermost unfinished capture at `s` and continues matching at
 * `p`.  The capture is re-opened if the remaining match fails.
 */
static const char *utf8_end_capture (utf8MatchState *ms, const char *s,
                                     const char *p) {
  int slot = utf8_capture_to_close(ms);
  const char *rest;
  ms->capture[slot].len = s - ms->capture[slot].init;  /* close capture */
  rest = utf8_match(ms, s, p);
  if (rest == NULL)                                    /* match failed? */
    ms->capture[slot].len = CAP_UNFINISHED;            /* undo capture */
  return rest;
}
|
837
|
-
|
838
|
-
/*
 * Matches a back-reference at `s`.  `l` is handed to utf8_check_capture(),
 * which turns it into a validated capture index (or raises a Lua error).
 * Returns the position just past the repeated text when the subject at `s`
 * is byte-identical to that capture, NULL otherwise.
 */
static const char *utf8_match_capture (utf8MatchState *ms, const char *s, int l) {
  size_t len;
  l = utf8_check_capture(ms, l);
  len = ms->capture[l].len;
  if ((size_t)(ms->src_end-s) < len)
    return NULL;  /* not enough subject left */
  if (memcmp(ms->capture[l].init, s, len) != 0)
    return NULL;
  return s+len;
}
|
847
|
-
|
848
|
-
/*
 * Core recursive matcher: tries to match the pattern starting at `p`
 * against the subject starting at `s`.
 *
 * Returns the subject position just past the match, or NULL when there is
 * no match.  ms->matchdepth is decremented on entry and restored on exit;
 * a Lua error is raised when the budget is exhausted.  Tail calls are
 * turned into `goto init` so that they do not consume recursion depth.
 */
static const char *utf8_match (utf8MatchState *ms, const char *s, const char *p) {
  if (ms->matchdepth-- == 0)
    luaL_error(ms->L, "pattern too complex");
  init: /* using goto's to optimize tail recursion */
  if (p != ms->p_end) {  /* end of pattern? */
    unsigned int ch;
    utf8_decode(p, ms->p_end, &ch);
    switch (ch) {
      case '(': {  /* start capture */
        if (*(p + 1) == ')')  /* position capture? */
          s = utf8_start_capture(ms, s, p + 2, CAP_POSITION);
        else
          s = utf8_start_capture(ms, s, p + 1, CAP_UNFINISHED);
        break;
      }
      case ')': {  /* end capture */
        s = utf8_end_capture(ms, s, p + 1);
        break;
      }
      case '$': {
        if ((p + 1) != ms->p_end)  /* is the `$' the last char in pattern? */
          goto dflt;  /* no; go to default */
        s = (s == ms->src_end) ? s : NULL;  /* check end of string */
        break;
      }
      case L_ESC: {  /* escaped sequence not in the format class[*+?-]? */
        const char *prev_p = p;
        /* step over '%' and the item that follows it; ch becomes that item */
        p += utf8_decode(p+1, ms->p_end, &ch) + 1;
        switch (ch) {
          case 'b': {  /* balanced string? */
            s = utf8_matchbalance(ms, s, &p);
            if (s != NULL)
              goto init;  /* return utf8_match(ms, s, p + 4); */
            /* else fail (s == NULL) */
            break;
          }
          case 'f': {  /* frontier? */
            const char *ep; unsigned int previous = 0, current = 0;
            if (*p != '[')
              luaL_error(ms->L, "missing " LUA_QL("[") " after "
                         LUA_QL("%%f") " in pattern");
            ep = utf8_classend(ms, p);  /* points to what is next */
            if (s != ms->src_init)
              utf8_decode(utf8_prev(ms->src_init, s), ms->src_end, &previous);
            if (s != ms->src_end)
              utf8_decode(s, ms->src_end, &current);
            if (!utf8_matchbracketclass(previous, p, ep - 1) &&
                 utf8_matchbracketclass(current, p, ep - 1)) {
              p = ep; goto init;  /* return utf8_match(ms, s, ep); */
            }
            s = NULL;  /* match failed */
            break;
          }
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
          case '8': case '9': {  /* capture results (%0-%9)? */
            /* Pass the digit itself: utf8_match_capture() forwards it to
             * utf8_check_capture(), which performs the `- '1'` conversion.
             * Subtracting here as well made every back-reference fail. */
            s = utf8_match_capture(ms, s, ch);
            if (s != NULL) goto init;  /* return utf8_match(ms, s, p + 2) */
            break;
          }
          default: p = prev_p; goto dflt;
        }
        break;
      }
      default: dflt: {  /* pattern class plus optional suffix */
        const char *ep = utf8_classend(ms, p);  /* points to optional suffix */
        /* does not match at least once? */
        if (!utf8_singlematch(ms, s, p, ep)) {
          if (*ep == '*' || *ep == '?' || *ep == '-') {  /* accept empty? */
            p = ep + 1; goto init;  /* return utf8_match(ms, s, ep + 1); */
          }
          else  /* '+' or no suffix */
            s = NULL;  /* fail */
        }
        else {  /* matched once */
          const char *next_s = utf8_next(s, ms->src_end);
          switch (*ep) {  /* handle optional suffix */
            case '?': {  /* optional */
              const char *res;
              const char *next_ep = utf8_next(ep, ms->p_end);
              if ((res = utf8_match(ms, next_s, next_ep)) != NULL)
                s = res;
              else {
                p = next_ep; goto init;  /* else return utf8_match(ms, s, ep + 1); */
              }
              break;
            }
            case '+':  /* 1 or more repetitions */
              s = next_s;  /* 1 match already done */
              /* go through */
            case '*':  /* 0 or more repetitions */
              s = utf8_max_expand(ms, s, p, ep);
              break;
            case '-':  /* 0 or more repetitions (minimum) */
              s = utf8_min_expand(ms, s, p, ep);
              break;
            default:  /* no suffix */
              s = next_s; p = ep; goto init;  /* return utf8_match(ms, s + 1, ep); */
          }
        }
        break;
      }
    }
  }
  ms->matchdepth++;
  return s;
}
|
955
|
-
|
956
|
-
/*
 * Byte-wise substring search over buffers that may contain '\0'.
 *
 * Looks for the l2 bytes at s2 inside the l1 bytes at s1 and returns a
 * pointer to the first occurrence, or NULL when there is none.  An empty
 * needle is found at s1.
 */
static const char *utf8_lmemfind (const char *s1, size_t l1,
                                  const char *s2, size_t l2) {
  size_t rest;    /* needle length without its first byte */
  size_t window;  /* bytes of s1 where a match may still begin */
  if (l2 == 0)
    return s1;    /* empty strings are everywhere */
  if (l2 > l1)
    return NULL;  /* avoids a negative window */
  rest = l2 - 1;
  window = l1 - rest;
  while (window > 0) {
    const char *hit = (const char *)memchr(s1, *s2, window);
    if (hit == NULL)
      break;
    /* first byte matched; compare the remainder of the needle */
    if (memcmp(hit + 1, s2 + 1, rest) == 0)
      return hit;
    /* resume the search right after this candidate */
    window -= (hit + 1) - s1;
    s1 = hit + 1;
  }
  return NULL;  /* not found */
}
|
976
|
-
|
977
|
-
/*
 * Walks from `s` towards `e` with utf8_next() until position `p` is reached
 * or passed, counting the steps taken.
 *
 * Stores the count in *pidx (when pidx is non-NULL): the number of steps if
 * the walk landed exactly on `p` or ran into `e`, one less if it overshot
 * `p`.  Returns the position where the walk stopped, so the caller can
 * compare it with `p`.
 */
static const char *utf8_get_index(const char *p, const char *s, const char *e, int *pidx) {
  int steps = 0;
  for (; s < e && s != p; s = utf8_next(s, e), ++steps) {
    if (s > p) {
      --steps;  /* went past p: it lies inside the previous character */
      break;
    }
  }
  if (pidx) *pidx = steps;
  return s;
}
|
992
|
-
|
993
|
-
/*
 * Pushes capture number `i` of the current match onto the Lua stack.
 *
 * When `i` is not an existing capture: index 0 stands for the whole match
 * [s, e), any other index raises a Lua error.  A position capture is pushed
 * as a 1-based index derived from utf8_get_index(); a regular capture is
 * pushed as a string.  An unfinished capture raises a Lua error.
 */
static void utf8_push_onecapture (utf8MatchState *ms, int i, const char *s,
                                  const char *e) {
  ptrdiff_t len;

  if (i >= ms->level) {
    if (i != 0)
      luaL_error(ms->L, "invalid capture index");
    else  /* ms->level == 0, too */
      lua_pushlstring(ms->L, s, e - s);  /* add whole match */
    return;
  }

  len = ms->capture[i].len;
  if (len == CAP_UNFINISHED)
    luaL_error(ms->L, "unfinished capture");
  if (len != CAP_POSITION) {
    lua_pushlstring(ms->L, ms->capture[i].init, len);
    return;
  }
  {
    int idx;
    utf8_get_index(ms->capture[i].init, ms->src_init, ms->src_end, &idx);
    lua_pushinteger(ms->L, idx+1);
  }
}
|
1012
|
-
|
1013
|
-
/*
 * Pushes every capture of the current match.  When the pattern defined no
 * captures and `s` is non-NULL, the whole match [s, e) is pushed instead.
 * Returns the number of values pushed.
 */
static int utf8_push_captures (utf8MatchState *ms, const char *s, const char *e) {
  int count = ms->level;
  int i = 0;
  if (count == 0 && s)
    count = 1;  /* no captures: the whole match stands in */
  luaL_checkstack(ms->L, count, "too many captures");
  while (i < count) {
    utf8_push_onecapture(ms, i, s, e);
    i++;
  }
  return count;  /* number of strings pushed */
}
|
1021
|
-
|
1022
|
-
/* check whether pattern has no special characters */
|
1023
|
-
static int nospecials (const char *p, const char * ep) {
|
1024
|
-
while (p < ep) {
|
1025
|
-
if (strpbrk(p, SPECIALS))
|
1026
|
-
return 0; /* pattern has a special character */
|
1027
|
-
p += strlen(p) + 1; /* may have more after \0 */
|
1028
|
-
}
|
1029
|
-
return 1; /* no special chars found */
|
1030
|
-
}
|
1031
|
-
|
1032
|
-
|
1033
|
-
/* utf8 pattern matching interface */
|
1034
|
-
|
1035
|
-
/*
 * Shared implementation of utf8.find (find != 0) and utf8.match (find == 0).
 *
 * Lua arguments: (s, pattern [, init [, plain]]).  `init` is a character
 * index (default 1); a negative value counts from the end of the string,
 * and anything that still lies before the first character is clamped to 1.
 * An `init` more than one past the last character yields nil.
 *
 * find: pushes the start and end character indices of the match followed by
 *       any captures.  A plain substring search is used when `plain` is
 *       true or the pattern contains no special characters.
 * match: pushes the captures (or the whole match when there are none).
 * Both push nil when nothing is found.
 */
static int find_aux (lua_State *L, int find) {
  const char *es, *s = check_utf8(L, 1, &es);
  const char *ep, *p = check_utf8(L, 2, &ep);
  lua_Integer idx = luaL_optinteger(L, 3, 1);
  const char *init;
  size_t slen = utf8_length(s, es);
  if (idx > 0 && idx > (lua_Integer)slen + 1) {  /* start after string's end? */
    lua_pushnil(L);  /* cannot find anything */
    return 1;
  }
  if (idx < 0) idx += (lua_Integer)slen + 1;
  /* An index of 0, or one still negative after the adjustment, must start
   * at the first character; otherwise the positions reported below would be
   * computed from a start index that is not a real character position. */
  if (idx < 1) idx = 1;
  init = utf8_index(s, es, idx);
  /* explicit request or no special characters? */
  if (find && (lua_toboolean(L, 4) || nospecials(p, ep))) {
    /* do a plain search */
    do {
      const char *s2 = utf8_lmemfind(init, es-init, p, ep-p);
      if (!s2) break;
      else {
        int relidx;
        const char *pch = utf8_get_index(s2, init, es, &relidx);
        if (pch == s2) {  /* byte match starts on a character boundary */
          lua_pushinteger(L, idx + relidx);
          lua_pushinteger(L, idx + relidx + utf8_length(p, ep) - 1);
          return 2;
        }
        /* match began inside a character: resume after that character */
        idx += relidx + 1;
        init = utf8_next(pch, es);
      }
    } while (init < es);
  }
  else {
    utf8MatchState ms;
    int anchor = (*p == '^');
    if (anchor) p++;  /* skip anchor character */
    ms.L = L;
    ms.matchdepth = MAXCCALLS;
    ms.src_init = s;
    ms.src_end = es;
    ms.p_end = ep;
    do {
      const char *res;
      ms.level = 0;
      assert(ms.matchdepth == MAXCCALLS);
      if ((res=utf8_match(&ms, init, p)) != NULL) {
        if (find) {
          lua_pushinteger(L, idx);  /* start */
          lua_pushinteger(L, idx + utf8_length(init, res) - 1);   /* end */
          return utf8_push_captures(&ms, NULL, 0) + 2;
        }
        else
          return utf8_push_captures(&ms, init, res);
      }
      if (init == es) break;
      idx += 1;
      init = utf8_next(init, es);
    } while (init <= es && !anchor);
  }
  lua_pushnil(L);  /* not found */
  return 1;
}
|
1096
|
-
|
1097
|
-
/* utf8.find: find_aux() in "find" mode (positions plus captures). */
static int Lutf8_find(lua_State *L) {
  return find_aux(L, 1);
}
|
1099
|
-
|
1100
|
-
/* utf8.match: find_aux() in "match" mode (captures only). */
static int Lutf8_match(lua_State *L) {
  return find_aux(L, 0);
}
|
1102
|
-
|
1103
|
-
/*
 * Iterator function behind utf8.gmatch.
 *
 * Upvalues: (1) subject string, (2) pattern, (3) byte offset at which the
 * next search starts.  Scans forward from that offset for the next match,
 * stores the new offset back in upvalue 3 and returns the captures of the
 * match; returns no values once the subject is exhausted.
 */
static int utf8_gmatch_aux (lua_State *L) {
  utf8MatchState ms;
  const char *es, *s = check_utf8(L, lua_upvalueindex(1), &es);
  const char *ep, *p = check_utf8(L, lua_upvalueindex(2), &ep);
  const char *src;
  ms.L = L;
  ms.matchdepth = MAXCCALLS;
  ms.src_init = s;
  ms.src_end = es;
  ms.p_end = ep;
  for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
       src <= ms.src_end;
       src = utf8_next(src, ms.src_end)) {
    const char *e;
    ms.level = 0;
    assert(ms.matchdepth == MAXCCALLS);
    if ((e = utf8_match(&ms, src, p)) != NULL) {
      lua_Integer newstart = e-s;
      if (e == src) {  /* empty match? go at least one position */
        /* Step over a whole character, as the loop increment above does;
         * a single byte could leave the next search starting inside a
         * multi-byte sequence.  At the very end a plain increment moves
         * the offset past src_end so that the next call terminates. */
        if (e < ms.src_end)
          newstart = utf8_next(e, ms.src_end) - s;
        else
          newstart++;
      }
      lua_pushinteger(L, newstart);
      lua_replace(L, lua_upvalueindex(3));
      return utf8_push_captures(&ms, src, e);
    }
    if (src == ms.src_end) break;
  }
  return 0;  /* not found */
}
|
1130
|
-
|
1131
|
-
/*
 * utf8.gmatch(s, pattern)
 *
 * Checks that both arguments are strings and returns a closure over
 * utf8_gmatch_aux whose three upvalues are the subject, the pattern and a
 * start offset of 0.
 */
static int Lutf8_gmatch(lua_State *L) {
  int arg;
  for (arg = 1; arg <= 2; ++arg)
    luaL_checkstring(L, arg);
  lua_settop(L, 2);        /* drop any extra arguments */
  lua_pushinteger(L, 0);   /* initial search offset */
  lua_pushcclosure(L, utf8_gmatch_aux, 3);
  return 1;
}
|
1139
|
-
|
1140
|
-
/*
 * Appends the replacement string (Lua argument 3) to `b` for the match
 * [s, e), expanding '%' sequences: "%0" is the whole match, "%1".."%9" the
 * corresponding capture, and "%%" a literal '%'.  '%' followed by any other
 * non-digit raises a Lua error.
 */
static void utf8_add_s (utf8MatchState *ms, luaL_Buffer *b, const char *s,
                        const char *e) {
  const char *repl_end, *repl = to_utf8(ms->L, 3, &repl_end);
  while (repl < repl_end) {
    unsigned int ch;
    repl += utf8_decode(repl, repl_end, &ch);
    if (ch != L_ESC) {
      add_utf8char(b, ch);
      continue;
    }
    repl += utf8_decode(repl, repl_end, &ch);  /* skip ESC */
    if (utf8_isdigit(ch)) {
      if (ch == '0')
        luaL_addlstring(b, s, e-s);
      else {
        utf8_push_onecapture(ms, ch-'1', s, e);
        luaL_addvalue(b);  /* add capture to accumulated result */
      }
    }
    else {
      if (ch != L_ESC)
        luaL_error(ms->L, "invalid use of " LUA_QL("%c")
                   " in replacement string", L_ESC);
      add_utf8char(b, ch);
    }
  }
}
|
1165
|
-
|
1166
|
-
/*
 * Appends the replacement for the match [s, e) to `b`, according to the
 * type `tr` of Lua argument 3:
 *   function  called with the captures; its single result is used
 *   table     indexed with the first capture; the looked-up value is used
 *   otherwise argument 3 is treated as a replacement string (utf8_add_s)
 * For functions and tables, a nil/false result keeps the original text and
 * a result that is not a string or number raises a Lua error.
 */
static void utf8_add_value (utf8MatchState *ms, luaL_Buffer *b, const char *s,
                            const char *e, int tr) {
  lua_State *L = ms->L;
  if (tr == LUA_TFUNCTION) {
    int nargs;
    lua_pushvalue(L, 3);
    nargs = utf8_push_captures(ms, s, e);
    lua_call(L, nargs, 1);
  }
  else if (tr == LUA_TTABLE) {
    utf8_push_onecapture(ms, 0, s, e);
    lua_gettable(L, 3);
  }
  else {  /* LUA_TNUMBER or LUA_TSTRING */
    utf8_add_s(ms, b, s, e);
    return;
  }
  if (lua_toboolean(L, -1)) {
    if (!lua_isstring(L, -1))
      luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
  }
  else {  /* nil or false? */
    lua_pop(L, 1);
    lua_pushlstring(L, s, e - s);  /* keep original text */
  }
  luaL_addvalue(b);  /* add result to accumulator */
}
|
1195
|
-
|
1196
|
-
/*
 * utf8.gsub(s, pattern, repl [, n])
 *
 * Replaces matches of `pattern` in `s` using `repl` (string, number,
 * function or table; see utf8_add_value) and pushes the resulting string
 * followed by the number of substitutions made.  At most `n` substitutions
 * are performed; the default is the byte length of `s` plus one.  A leading
 * '^' in the pattern restricts the search to a single attempt at the start.
 */
static int Lutf8_gsub(lua_State *L) {
  const char *src_end, *src = check_utf8(L, 1, &src_end);
  const char *pat_end, *pat = check_utf8(L, 2, &pat_end);
  int repl_type = lua_type(L, 3);
  lua_Integer limit = luaL_optinteger(L, 4, (src_end-src)+1);
  int anchored = (*pat == '^');
  lua_Integer count = 0;
  utf8MatchState state;
  luaL_Buffer out;
  luaL_argcheck(L, repl_type == LUA_TNUMBER || repl_type == LUA_TSTRING ||
                   repl_type == LUA_TFUNCTION || repl_type == LUA_TTABLE, 3,
                      "string/function/table expected");
  luaL_buffinit(L, &out);
  if (anchored) pat++;  /* skip anchor character */
  state.L = L;
  state.matchdepth = MAXCCALLS;
  state.src_init = src;
  state.src_end = src_end;
  state.p_end = pat_end;
  while (count < limit) {
    const char *stop;
    state.level = 0;
    assert(state.matchdepth == MAXCCALLS);
    stop = utf8_match(&state, src, pat);
    if (stop != NULL) {
      count++;
      utf8_add_value(&state, &out, src, stop, repl_type);
    }
    if (stop != NULL && stop > src) {  /* non empty match? */
      src = stop;                      /* skip it */
    }
    else if (src < src_end) {
      /* no match, or an empty one: copy one decoded value and move on */
      unsigned int ch;
      src += utf8_decode(src, src_end, &ch);
      add_utf8char(&out, ch);
    }
    else {
      break;
    }
    if (anchored) break;
  }
  luaL_addlstring(&out, src, src_end-src);  /* unprocessed remainder */
  luaL_pushresult(&out);
  lua_pushinteger(L, count);  /* number of substitutions */
  return 2;
}
|
1239
|
-
|
1240
|
-
|
1241
|
-
/* lua module import interface */
|
1242
|
-
|
1243
|
-
/*
 * Module entry point: registers the library functions under the name
 * "utf8" with luaL_register() and returns the resulting value.
 */
LUALIB_API int luaopen_utf8(lua_State *L) {
  luaL_Reg libs[] = {
    { "len",        Lutf8_len },
    { "sub",        Lutf8_sub },
    { "reverse",    Lutf8_reverse },
    { "lower",      Lutf8_lower },
    { "upper",      Lutf8_upper },
    { "title",      Lutf8_title },
    { "fold",       Lutf8_fold },
    { "byte",       Lutf8_byte },
    { "char",       Lutf8_char },
    { "escape",     Lutf8_escape },
    { "insert",     Lutf8_insert },
    { "remove",     Lutf8_remove },
    { "charpos",    Lutf8_charpos },
    { "next",       Lutf8_next },
    { "width",      Lutf8_width },
    { "widthindex", Lutf8_widthindex },
    { "ncasecmp",   Lutf8_ncasecmp },
    { "find",       Lutf8_find },
    { "gmatch",     Lutf8_gmatch },
    { "gsub",       Lutf8_gsub },
    { "match",      Lutf8_match },
    { NULL, NULL }  /* sentinel */
  };

  luaL_register(L, "utf8", libs);

  return 1;
}
|