@zappdev/cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (209) hide show
  1. package/README.md +55 -0
  2. package/dist/zapp-cli.js +9471 -0
  3. package/native/src/app/app.zc +490 -0
  4. package/native/src/event/event.zc +24 -0
  5. package/native/src/event/events.zc +70 -0
  6. package/native/src/platform/darwin/backend.zc +923 -0
  7. package/native/src/platform/darwin/backend_bootstrap.zc +9 -0
  8. package/native/src/platform/darwin/bootstrap.zc +9 -0
  9. package/native/src/platform/darwin/engine_jsc.zc +86 -0
  10. package/native/src/platform/darwin/engine_qjs.zc +92 -0
  11. package/native/src/platform/darwin/platform.zc +156 -0
  12. package/native/src/platform/darwin/webview.zc +550 -0
  13. package/native/src/platform/darwin/webview_bootstrap.zc +9 -0
  14. package/native/src/platform/darwin/window.zc +1223 -0
  15. package/native/src/platform/darwin/worker/common.zc +223 -0
  16. package/native/src/platform/darwin/worker/core/base64_core.zc +29 -0
  17. package/native/src/platform/darwin/worker/core/crypto_core.zc +19 -0
  18. package/native/src/platform/darwin/worker/core/encoding_core.zc +32 -0
  19. package/native/src/platform/darwin/worker/core/fetch_core.zc +145 -0
  20. package/native/src/platform/darwin/worker/core/url_core.zc +69 -0
  21. package/native/src/platform/darwin/worker/core/websocket_core.zc +179 -0
  22. package/native/src/platform/darwin/worker/dispatch.zc +55 -0
  23. package/native/src/platform/darwin/worker/jsc/base64_jsc.zc +39 -0
  24. package/native/src/platform/darwin/worker/jsc/crypto_jsc.zc +49 -0
  25. package/native/src/platform/darwin/worker/jsc/encoding_jsc.zc +86 -0
  26. package/native/src/platform/darwin/worker/jsc/fetch_jsc.zc +149 -0
  27. package/native/src/platform/darwin/worker/jsc/url_jsc.zc +54 -0
  28. package/native/src/platform/darwin/worker/jsc/websocket_jsc.zc +127 -0
  29. package/native/src/platform/darwin/worker/jsc.zc +670 -0
  30. package/native/src/platform/darwin/worker/mod.zc +30 -0
  31. package/native/src/platform/darwin/worker/qjs/fetch_qjs.zc +233 -0
  32. package/native/src/platform/darwin/worker/qjs/qjs_macros.zc +23 -0
  33. package/native/src/platform/darwin/worker/qjs/websocket_qjs.zc +223 -0
  34. package/native/src/platform/darwin/worker/qjs.zc +1053 -0
  35. package/native/src/platform/darwin/worker/timers.zc +149 -0
  36. package/native/src/platform/darwin/worker/timers_qjs.zc +209 -0
  37. package/native/src/platform/platform.zc +64 -0
  38. package/native/src/platform/shared/log.zc +156 -0
  39. package/native/src/platform/shared/worker/qjs/base64_qjs.zc +38 -0
  40. package/native/src/platform/shared/worker/qjs/crypto_qjs.zc +44 -0
  41. package/native/src/platform/shared/worker/qjs/encoding_qjs.zc +95 -0
  42. package/native/src/platform/shared/worker/qjs/url_qjs.zc +65 -0
  43. package/native/src/platform/shared/worker_registry.zc +206 -0
  44. package/native/src/platform/window.zc +446 -0
  45. package/native/src/platform/windows/backend.zc +452 -0
  46. package/native/src/platform/windows/backend_bootstrap.zc +9 -0
  47. package/native/src/platform/windows/bootstrap.zc +9 -0
  48. package/native/src/platform/windows/engine_qjs.zc +60 -0
  49. package/native/src/platform/windows/platform.zc +387 -0
  50. package/native/src/platform/windows/webview.zc +1175 -0
  51. package/native/src/platform/windows/webview_bootstrap.zc +9 -0
  52. package/native/src/platform/windows/window.zc +1271 -0
  53. package/native/src/platform/windows/worker/common.zc +409 -0
  54. package/native/src/platform/windows/worker/core/base64_core.zc +52 -0
  55. package/native/src/platform/windows/worker/core/crypto_core.zc +34 -0
  56. package/native/src/platform/windows/worker/core/encoding_core.zc +60 -0
  57. package/native/src/platform/windows/worker/core/fetch_core.zc +274 -0
  58. package/native/src/platform/windows/worker/core/url_core.zc +216 -0
  59. package/native/src/platform/windows/worker/core/websocket_core.zc +343 -0
  60. package/native/src/platform/windows/worker/dispatch.zc +34 -0
  61. package/native/src/platform/windows/worker/mod.zc +46 -0
  62. package/native/src/platform/windows/worker/qjs/fetch_qjs.zc +255 -0
  63. package/native/src/platform/windows/worker/qjs/websocket_qjs.zc +263 -0
  64. package/native/src/platform/windows/worker/qjs.zc +1049 -0
  65. package/native/src/platform/windows/worker/timers_qjs.zc +288 -0
  66. package/native/src/platform/worker.zc +8 -0
  67. package/native/src/service/service.zc +228 -0
  68. package/native/vendor/quickjs-ng/.gitattributes +4 -0
  69. package/native/vendor/quickjs-ng/.github/dependabot.yml +7 -0
  70. package/native/vendor/quickjs-ng/.github/workflows/ci.yml +812 -0
  71. package/native/vendor/quickjs-ng/.github/workflows/docs.yml +49 -0
  72. package/native/vendor/quickjs-ng/.github/workflows/release.yml +162 -0
  73. package/native/vendor/quickjs-ng/.github/workflows/test-docs.yml +23 -0
  74. package/native/vendor/quickjs-ng/.github/workflows/tsan.yml +32 -0
  75. package/native/vendor/quickjs-ng/.github/workflows/valgrind.yml +33 -0
  76. package/native/vendor/quickjs-ng/.gitmodules +5 -0
  77. package/native/vendor/quickjs-ng/CMakeLists.txt +553 -0
  78. package/native/vendor/quickjs-ng/LICENSE +24 -0
  79. package/native/vendor/quickjs-ng/Makefile +149 -0
  80. package/native/vendor/quickjs-ng/amalgam.js +53 -0
  81. package/native/vendor/quickjs-ng/api-test.c +927 -0
  82. package/native/vendor/quickjs-ng/builtin-array-fromasync.h +119 -0
  83. package/native/vendor/quickjs-ng/builtin-array-fromasync.js +36 -0
  84. package/native/vendor/quickjs-ng/builtin-iterator-zip-keyed.h +332 -0
  85. package/native/vendor/quickjs-ng/builtin-iterator-zip-keyed.js +194 -0
  86. package/native/vendor/quickjs-ng/builtin-iterator-zip.h +337 -0
  87. package/native/vendor/quickjs-ng/builtin-iterator-zip.js +210 -0
  88. package/native/vendor/quickjs-ng/ctest.c +17 -0
  89. package/native/vendor/quickjs-ng/cutils.h +2013 -0
  90. package/native/vendor/quickjs-ng/cxxtest.cc +2 -0
  91. package/native/vendor/quickjs-ng/dtoa.c +1619 -0
  92. package/native/vendor/quickjs-ng/dtoa.h +87 -0
  93. package/native/vendor/quickjs-ng/examples/fib.c +67 -0
  94. package/native/vendor/quickjs-ng/examples/fib_module.js +10 -0
  95. package/native/vendor/quickjs-ng/examples/hello.js +1 -0
  96. package/native/vendor/quickjs-ng/examples/hello_module.js +6 -0
  97. package/native/vendor/quickjs-ng/examples/meson.build +17 -0
  98. package/native/vendor/quickjs-ng/examples/pi_bigint.js +118 -0
  99. package/native/vendor/quickjs-ng/examples/point.c +154 -0
  100. package/native/vendor/quickjs-ng/examples/test_fib.js +8 -0
  101. package/native/vendor/quickjs-ng/examples/test_point.js +43 -0
  102. package/native/vendor/quickjs-ng/fuzz.c +51 -0
  103. package/native/vendor/quickjs-ng/gen/function_source.c +81 -0
  104. package/native/vendor/quickjs-ng/gen/hello.c +53 -0
  105. package/native/vendor/quickjs-ng/gen/hello_module.c +106 -0
  106. package/native/vendor/quickjs-ng/gen/repl.c +3053 -0
  107. package/native/vendor/quickjs-ng/gen/standalone.c +324 -0
  108. package/native/vendor/quickjs-ng/gen/test_fib.c +81 -0
  109. package/native/vendor/quickjs-ng/libregexp-opcode.h +58 -0
  110. package/native/vendor/quickjs-ng/libregexp.c +2687 -0
  111. package/native/vendor/quickjs-ng/libregexp.h +98 -0
  112. package/native/vendor/quickjs-ng/libunicode-table.h +4707 -0
  113. package/native/vendor/quickjs-ng/libunicode.c +1746 -0
  114. package/native/vendor/quickjs-ng/libunicode.h +126 -0
  115. package/native/vendor/quickjs-ng/list.h +107 -0
  116. package/native/vendor/quickjs-ng/lre-test.c +73 -0
  117. package/native/vendor/quickjs-ng/meson.build +684 -0
  118. package/native/vendor/quickjs-ng/meson_options.txt +6 -0
  119. package/native/vendor/quickjs-ng/qjs-wasi-reactor.c +208 -0
  120. package/native/vendor/quickjs-ng/qjs.c +748 -0
  121. package/native/vendor/quickjs-ng/qjsc.c +673 -0
  122. package/native/vendor/quickjs-ng/quickjs-atom.h +267 -0
  123. package/native/vendor/quickjs-ng/quickjs-c-atomics.h +54 -0
  124. package/native/vendor/quickjs-ng/quickjs-libc.c +4986 -0
  125. package/native/vendor/quickjs-ng/quickjs-libc.h +79 -0
  126. package/native/vendor/quickjs-ng/quickjs-opcode.h +369 -0
  127. package/native/vendor/quickjs-ng/quickjs.c +60259 -0
  128. package/native/vendor/quickjs-ng/quickjs.h +1419 -0
  129. package/native/vendor/quickjs-ng/repl.js +1927 -0
  130. package/native/vendor/quickjs-ng/run-test262.c +2417 -0
  131. package/native/vendor/quickjs-ng/standalone.js +129 -0
  132. package/native/vendor/quickjs-ng/tests/assert.js +49 -0
  133. package/native/vendor/quickjs-ng/tests/bug1221.js +16 -0
  134. package/native/vendor/quickjs-ng/tests/bug1296.js +12 -0
  135. package/native/vendor/quickjs-ng/tests/bug1297.js +22 -0
  136. package/native/vendor/quickjs-ng/tests/bug1301.js +21 -0
  137. package/native/vendor/quickjs-ng/tests/bug1302.js +24 -0
  138. package/native/vendor/quickjs-ng/tests/bug1305.js +26 -0
  139. package/native/vendor/quickjs-ng/tests/bug1318.js +54 -0
  140. package/native/vendor/quickjs-ng/tests/bug1352.js +8 -0
  141. package/native/vendor/quickjs-ng/tests/bug1354.js +6 -0
  142. package/native/vendor/quickjs-ng/tests/bug1355.js +58 -0
  143. package/native/vendor/quickjs-ng/tests/bug1368.js +9 -0
  144. package/native/vendor/quickjs-ng/tests/bug39/1.js +6 -0
  145. package/native/vendor/quickjs-ng/tests/bug39/2.js +6 -0
  146. package/native/vendor/quickjs-ng/tests/bug39/3.js +7 -0
  147. package/native/vendor/quickjs-ng/tests/bug488-upstream.js +7 -0
  148. package/native/vendor/quickjs-ng/tests/bug633/0.js +7 -0
  149. package/native/vendor/quickjs-ng/tests/bug633/1.js +4 -0
  150. package/native/vendor/quickjs-ng/tests/bug633/2.js +4 -0
  151. package/native/vendor/quickjs-ng/tests/bug633/3.js +4 -0
  152. package/native/vendor/quickjs-ng/tests/bug645/0.js +4 -0
  153. package/native/vendor/quickjs-ng/tests/bug645/1.js +9 -0
  154. package/native/vendor/quickjs-ng/tests/bug645/2.js +7 -0
  155. package/native/vendor/quickjs-ng/tests/bug648.js +13 -0
  156. package/native/vendor/quickjs-ng/tests/bug652.js +4 -0
  157. package/native/vendor/quickjs-ng/tests/bug741.js +19 -0
  158. package/native/vendor/quickjs-ng/tests/bug775.js +7 -0
  159. package/native/vendor/quickjs-ng/tests/bug776.js +7 -0
  160. package/native/vendor/quickjs-ng/tests/bug832.js +2 -0
  161. package/native/vendor/quickjs-ng/tests/bug858.js +26 -0
  162. package/native/vendor/quickjs-ng/tests/bug904.js +6 -0
  163. package/native/vendor/quickjs-ng/tests/bug988.js +7 -0
  164. package/native/vendor/quickjs-ng/tests/bug999.js +3 -0
  165. package/native/vendor/quickjs-ng/tests/destructured-export.js +8 -0
  166. package/native/vendor/quickjs-ng/tests/detect_module/0.js +1 -0
  167. package/native/vendor/quickjs-ng/tests/detect_module/1.js +2 -0
  168. package/native/vendor/quickjs-ng/tests/detect_module/2.js +1 -0
  169. package/native/vendor/quickjs-ng/tests/detect_module/3.js +8 -0
  170. package/native/vendor/quickjs-ng/tests/detect_module/4.js +3 -0
  171. package/native/vendor/quickjs-ng/tests/empty.js +0 -0
  172. package/native/vendor/quickjs-ng/tests/fixture_cyclic_import.js +2 -0
  173. package/native/vendor/quickjs-ng/tests/fixture_string_exports.js +12 -0
  174. package/native/vendor/quickjs-ng/tests/function_source.js +14 -0
  175. package/native/vendor/quickjs-ng/tests/microbench.js +1267 -0
  176. package/native/vendor/quickjs-ng/tests/null_or_undefined.js +38 -0
  177. package/native/vendor/quickjs-ng/tests/str-pad-leak.js +5 -0
  178. package/native/vendor/quickjs-ng/tests/test_bigint.js +107 -0
  179. package/native/vendor/quickjs-ng/tests/test_bjson.js +366 -0
  180. package/native/vendor/quickjs-ng/tests/test_builtin.js +1314 -0
  181. package/native/vendor/quickjs-ng/tests/test_closure.js +220 -0
  182. package/native/vendor/quickjs-ng/tests/test_cyclic_import.js +12 -0
  183. package/native/vendor/quickjs-ng/tests/test_domexception.js +35 -0
  184. package/native/vendor/quickjs-ng/tests/test_language.js +755 -0
  185. package/native/vendor/quickjs-ng/tests/test_loop.js +367 -0
  186. package/native/vendor/quickjs-ng/tests/test_queue_microtask.js +39 -0
  187. package/native/vendor/quickjs-ng/tests/test_std.js +340 -0
  188. package/native/vendor/quickjs-ng/tests/test_string_exports.js +25 -0
  189. package/native/vendor/quickjs-ng/tests/test_worker.js +43 -0
  190. package/native/vendor/quickjs-ng/tests/test_worker_module.js +30 -0
  191. package/native/vendor/quickjs-ng/tests.conf +14 -0
  192. package/native/vendor/quickjs-ng/unicode_download.sh +19 -0
  193. package/native/vendor/quickjs-ng/unicode_gen.c +3108 -0
  194. package/native/vendor/quickjs-ng/unicode_gen_def.h +310 -0
  195. package/native/vendor/quickjs-ng/update-version.sh +32 -0
  196. package/native/vendor/webview2/include/WebView2.h +60636 -0
  197. package/native/vendor/webview2/include/WebView2EnvironmentOptions.h +406 -0
  198. package/package.json +33 -0
  199. package/src/backend.ts +139 -0
  200. package/src/build-config.ts +87 -0
  201. package/src/build.ts +276 -0
  202. package/src/common.ts +195 -0
  203. package/src/config.ts +89 -0
  204. package/src/dev.ts +164 -0
  205. package/src/generate.ts +200 -0
  206. package/src/icons.ts +116 -0
  207. package/src/init.ts +190 -0
  208. package/src/package.ts +150 -0
  209. package/src/zapp-cli.ts +263 -0
@@ -0,0 +1,3108 @@
1
+ /*
2
+ * Generation of Unicode tables
3
+ *
4
+ * Copyright (c) 2017-2018 Fabrice Bellard
5
+ * Copyright (c) 2017-2018 Charlie Gordon
6
+ *
7
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ * of this software and associated documentation files (the "Software"), to deal
9
+ * in the Software without restriction, including without limitation the rights
10
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ * copies of the Software, and to permit persons to whom the Software is
12
+ * furnished to do so, subject to the following conditions:
13
+ *
14
+ * The above copyright notice and this permission notice shall be included in
15
+ * all copies or substantial portions of the Software.
16
+ *
17
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ * THE SOFTWARE.
24
+ */
25
+ #include <stdlib.h>
26
+ #include <stdio.h>
27
+ #include <stdarg.h>
28
+ #include <inttypes.h>
29
+ #include <string.h>
30
+ #include <assert.h>
31
+ #include <ctype.h>
32
+ #include <time.h>
33
+
34
+ #include "cutils.h"
35
+
36
+ /* define it to be able to test unicode.c */
37
+ //#define USE_TEST
38
+ /* profile tests */
39
+ //#define PROFILE
40
+
41
+ //#define DUMP_CASE_CONV_TABLE
42
+ //#define DUMP_TABLE_SIZE
43
+ //#define DUMP_CC_TABLE
44
+ //#define DUMP_DECOMP_TABLE
45
+ //#define DUMP_CASE_FOLDING_SPECIAL_CASES
46
+
47
+ /* Ideas:
48
+ - Generalize run length encoding + index for all tables
49
+ - remove redundant tables for ID_start, ID_continue, Case_Ignorable, Cased
50
+
51
+ Case conversion:
52
+ - use a single entry for consecutive U/LF runs
53
+ - allow EXT runs of length > 1
54
+
55
+ Decomposition:
56
+ - Greek lower case (+1f10/1f10) ?
57
+ - allow holes in B runs
58
+ - suppress more upper / lower case redundancy
59
+ */
60
+
61
+ #ifdef USE_TEST
62
+ #include "libunicode.c"
63
+ #endif
64
+
65
+ #define CHARCODE_MAX 0x10ffff
66
+ #define CC_LEN_MAX 3
67
+
68
/*
 * Allocate `size` bytes of zero-initialized memory.
 *
 * Returns the new buffer, or NULL when the allocation fails.  The caller
 * owns the buffer and releases it with free().
 */
void *mallocz(size_t size)
{
    /* calloc() zero-fills and reports failure with NULL; the previous
       malloc() + memset() pair wrote through the pointer without checking
       it, which is undefined behavior when the allocation fails. */
    return calloc(1, size);
}
75
+
76
/*
 * Locate the n-th (0-based) ';'-separated field of the string `p`.
 *
 * Returns a pointer to the first character of that field (which may be the
 * terminating ';' or '\0' when the field is empty), or NULL when the string
 * contains fewer than `n` separators.  For n <= 0 the input pointer is
 * returned unchanged.
 */
const char *get_field(const char *p, int n)
{
    int remaining = n;

    while (remaining-- > 0) {
        /* each skipped field must be terminated by a ';' */
        const char *sep = strchr(p, ';');
        if (sep == NULL)
            return NULL;
        p = sep + 1;
    }
    return p;
}
88
+
89
/*
 * Copy the n-th (0-based) ';'-separated field of `p` into `buf`.
 *
 * At most buf_size - 1 characters are stored; any excess is dropped and the
 * result is always NUL-terminated.  When the line has fewer than `n`
 * separators the result is the empty string.  Nothing is written when
 * buf_size is 0.
 *
 * Returns `buf`.
 */
const char *get_field_buf(char *buf, size_t buf_size, const char *p, int n)
{
    char *q = buf;

    /* no room even for the terminator: buf_size - 1 would wrap around */
    if (buf_size == 0)
        return buf;

    p = get_field(p, n);
    /* get_field() returns NULL for a missing field; treat it as empty
       instead of dereferencing the NULL pointer */
    if (p != NULL) {
        while (*p != ';' && *p != '\0') {
            if ((size_t)(q - buf) < buf_size - 1)
                *q++ = *p;
            p++;
        }
    }
    *q = '\0';
    return buf;
}
102
+
103
/*
 * Append the value `c` to a growable int array.
 *
 * *pbuf is the array (may be NULL when *psize is 0), *psize its capacity in
 * elements and *plen the number of elements in use.  When the array is full
 * it is grown to max(len + 1, size * 3 / 2) elements; all three values are
 * updated in place.  The process exits with status 1 if the array cannot be
 * grown.
 */
void add_char(int **pbuf, int *psize, int *plen, int c)
{
    int *buf = *pbuf;
    int size = *psize;
    int len = *plen;

    if (len >= size) {
        int grown = size * 3 / 2;
        int new_size = (len + 1 > grown) ? len + 1 : grown;
        /* keep the old pointer until realloc() is known to have succeeded */
        int *new_buf = realloc(buf, sizeof(buf[0]) * new_size);
        if (!new_buf) {
            perror("realloc");
            exit(1);
        }
        buf = new_buf;
        *pbuf = buf;
        *psize = new_size;
    }
    buf[len++] = c;
    *plen = len;
}
119
+
120
/*
 * Parse the n-th (0-based) ';'-separated field of `str` as a sequence of
 * whitespace-separated hexadecimal numbers.
 *
 * Returns an array built with add_char() holding the parsed values and
 * stores its length in *plen.  Returns NULL with *plen == 0 when the field
 * does not exist or contains no number.  Parsing stops at the first
 * character that is neither whitespace nor a hexadecimal digit.
 */
int *get_field_str(int *plen, const char *str, int n)
{
    const char *p = get_field(str, n);
    int *buf = NULL;
    int len = 0;
    int size = 0;

    if (!p) {
        *plen = 0;
        return NULL;
    }
    for(;;) {
        char *end;

        /* <ctype.h> functions require a value representable as
           unsigned char; a plain (possibly negative) char is undefined */
        while (isspace((unsigned char)*p))
            p++;
        if (!isxdigit((unsigned char)*p))
            break;
        add_char(&buf, &size, &len, (int)strtoul(p, &end, 16));
        p = end;
    }
    *plen = len;
    return buf;
}
142
+
143
/*
 * Read one line from `f` into `buf` (at most buf_size - 1 characters, as
 * done by fgets()) and strip a single trailing '\n' if present.
 *
 * Returns `buf`, or NULL when fgets() reports end of file or an error.
 */
char *get_line(char *buf, int buf_size, FILE *f)
{
    char *last;

    if (fgets(buf, buf_size, f) == NULL)
        return NULL;

    if (buf[0] != '\0') {
        /* only the final character can be the line terminator */
        last = buf + strlen(buf) - 1;
        if (*last == '\n')
            *last = '\0';
    }
    return buf;
}
153
+
154
/* UNICODE_GENERAL_CATEGORY is defined around the three inclusions of
   unicode_gen_def.h below; presumably the header uses it to select its
   general-category DEF(id, str) list -- confirm against the header. */
+ #define UNICODE_GENERAL_CATEGORY
155
+
156
/* One GCAT_<id> enumerator per DEF(id, str) entry, in header order.
   GCAT_COUNT follows the last entry, so it equals the number of entries. */
+ typedef enum {
157
+ #define DEF(id, str) GCAT_ ## id,
158
+ #include "unicode_gen_def.h"
159
+ #undef DEF
160
+ GCAT_COUNT,
161
+ } UnicodeGCEnum1;
162
+
163
/* The `id` of each entry as a string literal (#id), in the same order as
   the enum above.  parse_unicode_data() searches this table with
   find_name() to translate field 2 of each line into an index. */
+ static const char *unicode_gc_name[] = {
164
+ #define DEF(id, str) #id,
165
+ #include "unicode_gen_def.h"
166
+ #undef DEF
167
+ };
168
+
169
/* The `str` argument of each entry, in the same order as the enum above. */
+ static const char *unicode_gc_short_name[] = {
170
+ #define DEF(id, str) str,
171
+ #include "unicode_gen_def.h"
172
+ #undef DEF
173
+ };
174
+
175
+ #undef UNICODE_GENERAL_CATEGORY
176
+
177
/* UNICODE_SCRIPT is defined around the three inclusions of
   unicode_gen_def.h below; presumably the header uses it to select its
   script DEF(id, str) list -- confirm against the header. */
+ #define UNICODE_SCRIPT
178
+
179
/* One SCRIPT_<id> enumerator per DEF(id, str) entry, in header order.
   SCRIPT_COUNT follows the last entry, so it equals the number of entries. */
+ typedef enum {
180
+ #define DEF(id, str) SCRIPT_ ## id,
181
+ #include "unicode_gen_def.h"
182
+ #undef DEF
183
+ SCRIPT_COUNT,
184
+ } UnicodeScriptEnum1;
185
+
186
/* The `id` of each entry as a string literal (#id), in enum order.
   parse_scripts() searches this table with find_name() and stores the
   resulting index in CCInfo.script. */
+ static const char *unicode_script_name[] = {
187
+ #define DEF(id, str) #id,
188
+ #include "unicode_gen_def.h"
189
+ #undef DEF
190
+ };
191
+
192
/* The `str` argument of each entry, in enum order.
   parse_script_extensions() searches this table with find_name(), which
   treats ',' inside an entry as a separator between alternative names.
   Unlike the neighbouring tables this one has external linkage. */
+ const char *unicode_script_short_name[] = {
193
+ #define DEF(id, str) str,
194
+ #include "unicode_gen_def.h"
195
+ #undef DEF
196
+ };
197
+
198
+ #undef UNICODE_SCRIPT
199
+
200
/* UNICODE_PROP_LIST is defined around the three inclusions of
   unicode_gen_def.h below; presumably the header uses it to select its
   property DEF(id, str) list -- confirm against the header. */
+ #define UNICODE_PROP_LIST
201
+
202
/* One PROP_<id> enumerator per DEF(id, str) entry, in header order.
   PROP_COUNT follows the last entry, so it equals the number of entries.
   The enumerator values are the bit indices used by get_prop()/set_prop(). */
+ typedef enum {
203
+ #define DEF(id, str) PROP_ ## id,
204
+ #include "unicode_gen_def.h"
205
+ #undef DEF
206
+ PROP_COUNT,
207
+ } UnicodePropEnum1;
208
+
209
/* The `id` of each entry as a string literal (#id), in enum order.
   parse_derived_core_properties() and parse_prop_list() search this table
   with find_name() to translate a property name into its bit index. */
+ static const char *unicode_prop_name[] = {
210
+ #define DEF(id, str) #id,
211
+ #include "unicode_gen_def.h"
212
+ #undef DEF
213
+ };
214
+
215
/* The `str` argument of each entry, in enum order. */
+ static const char *unicode_prop_short_name[] = {
216
+ #define DEF(id, str) str,
217
+ #include "unicode_gen_def.h"
218
+ #undef DEF
219
+ };
220
+
221
+ #undef UNICODE_PROP_LIST
222
+
223
/* Per-code-point record filled in by the parse_*() functions. */
+ typedef struct {
224
+ /* case conv */
225
+ uint8_t u_len; /* number of valid entries in u_data */
226
+ uint8_t l_len; /* number of valid entries in l_data */
227
+ uint8_t f_len; /* number of valid entries in f_data */
228
+ int u_data[CC_LEN_MAX]; /* to upper case */
229
+ int l_data[CC_LEN_MAX]; /* to lower case */
230
+ int f_data[CC_LEN_MAX]; /* to case folding */
231
+
232
+ uint8_t combining_class; /* set from field 3 by parse_unicode_data() */
233
+ uint8_t is_compat:1; /* set when the decomposition field starts with a '<...>' tag */
234
+ uint8_t is_excluded:1; /* set by parse_composition_exclusions() */
235
+ uint8_t general_category; /* index into unicode_gc_name[] */
236
+ uint8_t script; /* index into unicode_script_name[] */
237
+ uint8_t script_ext_len; /* number of entries in script_ext */
238
+ uint8_t *script_ext; /* malloc'ed indices into unicode_script_short_name[] */
239
+ uint32_t prop_bitmap_tab[3]; /* property idx is bit (idx & 0x1f) of word (idx >> 5) */
240
+ /* decomposition */
241
+ int decomp_len; /* number of entries in decomp_data */
242
+ int *decomp_data; /* array grown with add_char() by parse_unicode_data() */
243
+ } CCInfo;
244
+
245
/* Table of records indexed by code point; the parsers accept indices up to
   CHARCODE_MAX.  Its allocation is not visible in this part of the file. */
+ CCInfo *unicode_db;
246
+
247
/*
 * Look up `name` in a table of names.
 *
 * Each of the `tab_len` entries of `tab` is a string holding one or more
 * alternative names separated by ','.  The comparison is exact and
 * case-sensitive.
 *
 * Returns the index of the first entry containing `name`, or -1 when no
 * entry matches.
 */
int find_name(const char **tab, int tab_len, const char *name)
{
    const size_t wanted = strlen(name);
    int idx;

    for (idx = 0; idx < tab_len; idx++) {
        const char *alias = tab[idx];

        for (;;) {
            /* length of the current alternative, up to ',' or the end */
            size_t alias_len = strcspn(alias, ",");

            if (alias_len == wanted && memcmp(alias, name, wanted) == 0)
                return idx;
            if (alias[alias_len] == '\0')
                break;
            alias += alias_len + 1;
        }
    }
    return -1;
}
270
+
271
+ static int get_prop(uint32_t c, int prop_idx)
272
+ {
273
+ return (unicode_db[c].prop_bitmap_tab[prop_idx >> 5] >> (prop_idx & 0x1f)) & 1;
274
+ }
275
+
276
+ static void set_prop(uint32_t c, int prop_idx, int val)
277
+ {
278
+ uint32_t mask;
279
+ mask = 1U << (prop_idx & 0x1f);
280
+ if (val)
281
+ unicode_db[c].prop_bitmap_tab[prop_idx >> 5] |= mask;
282
+ else
283
+ unicode_db[c].prop_bitmap_tab[prop_idx >> 5] &= ~mask;
284
+ }
285
+
286
+ void parse_unicode_data(const char *filename)
287
+ {
288
+ FILE *f;
289
+ char line[1024];
290
+ char buf1[256];
291
+ const char *p;
292
+ int code, lc, uc, last_code;
293
+ CCInfo *ci, *tab = unicode_db;
294
+
295
+ f = fopen(filename, "rb");
296
+ if (!f) {
297
+ perror(filename);
298
+ exit(1);
299
+ }
300
+
301
+ last_code = 0;
302
+ for(;;) {
303
+ if (!get_line(line, sizeof(line), f))
304
+ break;
305
+ p = line;
306
+ while (isspace(*p))
307
+ p++;
308
+ if (*p == '#')
309
+ continue;
310
+
311
+ p = get_field(line, 0);
312
+ if (!p)
313
+ continue;
314
+ code = strtoul(p, NULL, 16);
315
+ lc = 0;
316
+ uc = 0;
317
+
318
+ p = get_field(line, 12);
319
+ if (p && *p != ';') {
320
+ uc = strtoul(p, NULL, 16);
321
+ }
322
+
323
+ p = get_field(line, 13);
324
+ if (p && *p != ';') {
325
+ lc = strtoul(p, NULL, 16);
326
+ }
327
+ ci = &tab[code];
328
+ if (uc > 0 || lc > 0) {
329
+ assert(code <= CHARCODE_MAX);
330
+ if (uc > 0) {
331
+ assert(ci->u_len == 0);
332
+ ci->u_len = 1;
333
+ ci->u_data[0] = uc;
334
+ }
335
+ if (lc > 0) {
336
+ assert(ci->l_len == 0);
337
+ ci->l_len = 1;
338
+ ci->l_data[0] = lc;
339
+ }
340
+ }
341
+
342
+ {
343
+ int i;
344
+ get_field_buf(buf1, sizeof(buf1), line, 2);
345
+ i = find_name(unicode_gc_name, countof(unicode_gc_name), buf1);
346
+ if (i < 0) {
347
+ fprintf(stderr, "General category '%s' not found\n",
348
+ buf1);
349
+ exit(1);
350
+ }
351
+ ci->general_category = i;
352
+ }
353
+
354
+ p = get_field(line, 3);
355
+ if (p && *p != ';' && *p != '\0') {
356
+ int cc;
357
+ cc = strtoul(p, NULL, 0);
358
+ if (cc != 0) {
359
+ assert(code <= CHARCODE_MAX);
360
+ ci->combining_class = cc;
361
+ // printf("%05x: %d\n", code, ci->combining_class);
362
+ }
363
+ }
364
+
365
+ p = get_field(line, 5);
366
+ if (p && *p != ';' && *p != '\0') {
367
+ int size;
368
+ assert(code <= CHARCODE_MAX);
369
+ ci->is_compat = 0;
370
+ if (*p == '<') {
371
+ while (*p != '\0' && *p != '>')
372
+ p++;
373
+ if (*p == '>')
374
+ p++;
375
+ ci->is_compat = 1;
376
+ }
377
+ size = 0;
378
+ for(;;) {
379
+ while (isspace(*p))
380
+ p++;
381
+ if (!isxdigit(*p))
382
+ break;
383
+ add_char(&ci->decomp_data, &size, &ci->decomp_len, strtoul(p, (char **)&p, 16));
384
+ }
385
+ }
386
+
387
+ p = get_field(line, 9);
388
+ if (p && *p == 'Y') {
389
+ set_prop(code, PROP_Bidi_Mirrored, 1);
390
+ }
391
+
392
+ /* handle ranges */
393
+ get_field_buf(buf1, sizeof(buf1), line, 1);
394
+ if (strstr(buf1, " Last>")) {
395
+ int i;
396
+ // printf("range: 0x%x-%0x\n", last_code, code);
397
+ assert(ci->decomp_len == 0);
398
+ assert(ci->script_ext_len == 0);
399
+ for(i = last_code + 1; i < code; i++) {
400
+ unicode_db[i] = *ci;
401
+ }
402
+ }
403
+ last_code = code;
404
+ }
405
+
406
+ fclose(f);
407
+ }
408
+
409
+ void parse_special_casing(CCInfo *tab, const char *filename)
410
+ {
411
+ FILE *f;
412
+ char line[1024];
413
+ const char *p;
414
+ int code;
415
+ CCInfo *ci;
416
+
417
+ f = fopen(filename, "rb");
418
+ if (!f) {
419
+ perror(filename);
420
+ exit(1);
421
+ }
422
+
423
+ for(;;) {
424
+ if (!get_line(line, sizeof(line), f))
425
+ break;
426
+ p = line;
427
+ while (isspace(*p))
428
+ p++;
429
+ if (*p == '#')
430
+ continue;
431
+
432
+ p = get_field(line, 0);
433
+ if (!p)
434
+ continue;
435
+ code = strtoul(p, NULL, 16);
436
+ assert(code <= CHARCODE_MAX);
437
+ ci = &tab[code];
438
+
439
+ p = get_field(line, 4);
440
+ if (p) {
441
+ /* locale dependent casing */
442
+ while (isspace(*p))
443
+ p++;
444
+ if (*p != '#' && *p != '\0')
445
+ continue;
446
+ }
447
+
448
+
449
+ p = get_field(line, 1);
450
+ if (p && *p != ';') {
451
+ ci->l_len = 0;
452
+ for(;;) {
453
+ while (isspace(*p))
454
+ p++;
455
+ if (*p == ';')
456
+ break;
457
+ assert(ci->l_len < CC_LEN_MAX);
458
+ ci->l_data[ci->l_len++] = strtoul(p, (char **)&p, 16);
459
+ }
460
+
461
+ if (ci->l_len == 1 && ci->l_data[0] == code)
462
+ ci->l_len = 0;
463
+ }
464
+
465
+ p = get_field(line, 3);
466
+ if (p && *p != ';') {
467
+ ci->u_len = 0;
468
+ for(;;) {
469
+ while (isspace(*p))
470
+ p++;
471
+ if (*p == ';')
472
+ break;
473
+ assert(ci->u_len < CC_LEN_MAX);
474
+ ci->u_data[ci->u_len++] = strtoul(p, (char **)&p, 16);
475
+ }
476
+
477
+ if (ci->u_len == 1 && ci->u_data[0] == code)
478
+ ci->u_len = 0;
479
+ }
480
+ }
481
+
482
+ fclose(f);
483
+ }
484
+
485
+ void parse_case_folding(CCInfo *tab, const char *filename)
486
+ {
487
+ FILE *f;
488
+ char line[1024];
489
+ const char *p;
490
+ int code, status;
491
+ CCInfo *ci;
492
+
493
+ f = fopen(filename, "rb");
494
+ if (!f) {
495
+ perror(filename);
496
+ exit(1);
497
+ }
498
+
499
+ for(;;) {
500
+ if (!get_line(line, sizeof(line), f))
501
+ break;
502
+ p = line;
503
+ while (isspace(*p))
504
+ p++;
505
+ if (*p == '#')
506
+ continue;
507
+
508
+ p = get_field(line, 0);
509
+ if (!p)
510
+ continue;
511
+ code = strtoul(p, NULL, 16);
512
+ assert(code <= CHARCODE_MAX);
513
+ ci = &tab[code];
514
+
515
+ p = get_field(line, 1);
516
+ if (!p)
517
+ continue;
518
+ /* locale dependent casing */
519
+ while (isspace(*p))
520
+ p++;
521
+ status = *p;
522
+ if (status != 'C' && status != 'S' && status != 'F')
523
+ continue;
524
+
525
+ p = get_field(line, 2);
526
+ assert(p != NULL);
527
+ if (status == 'S') {
528
+ /* we always select the simple case folding and assume it
529
+ * comes after the full case folding case */
530
+ assert(ci->f_len >= 2);
531
+ ci->f_len = 0;
532
+ } else {
533
+ assert(ci->f_len == 0);
534
+ }
535
+ for(;;) {
536
+ while (isspace(*p))
537
+ p++;
538
+ if (*p == ';')
539
+ break;
540
+ assert(ci->l_len < CC_LEN_MAX);
541
+ ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16);
542
+ }
543
+ }
544
+
545
+ fclose(f);
546
+ }
547
+
548
+ void parse_composition_exclusions(const char *filename)
549
+ {
550
+ FILE *f;
551
+ char line[4096], *p;
552
+ uint32_t c0;
553
+
554
+ f = fopen(filename, "rb");
555
+ if (!f) {
556
+ perror(filename);
557
+ exit(1);
558
+ }
559
+
560
+ for(;;) {
561
+ if (!get_line(line, sizeof(line), f))
562
+ break;
563
+ p = line;
564
+ while (isspace(*p))
565
+ p++;
566
+ if (*p == '#' || *p == '@' || *p == '\0')
567
+ continue;
568
+ c0 = strtoul(p, (char **)&p, 16);
569
+ assert(c0 > 0 && c0 <= CHARCODE_MAX);
570
+ unicode_db[c0].is_excluded = true;
571
+ }
572
+ fclose(f);
573
+ }
574
+
575
+ void parse_derived_core_properties(const char *filename)
576
+ {
577
+ FILE *f;
578
+ char line[4096], *p, buf[256], *q;
579
+ uint32_t c0, c1, c;
580
+ int i;
581
+
582
+ f = fopen(filename, "rb");
583
+ if (!f) {
584
+ perror(filename);
585
+ exit(1);
586
+ }
587
+
588
+ for(;;) {
589
+ if (!get_line(line, sizeof(line), f))
590
+ break;
591
+ p = line;
592
+ while (isspace(*p))
593
+ p++;
594
+ if (*p == '#' || *p == '@' || *p == '\0')
595
+ continue;
596
+ c0 = strtoul(p, (char **)&p, 16);
597
+ if (*p == '.' && p[1] == '.') {
598
+ p += 2;
599
+ c1 = strtoul(p, (char **)&p, 16);
600
+ } else {
601
+ c1 = c0;
602
+ }
603
+ assert(c1 <= CHARCODE_MAX);
604
+ p += strspn(p, " \t");
605
+ if (*p == ';') {
606
+ p++;
607
+ p += strspn(p, " \t");
608
+ q = buf;
609
+ static const char ignore[] = "\t #;"; // includes \0
610
+ while (!memchr(ignore, *p, sizeof(ignore))) {
611
+ if ((q - buf) < sizeof(buf) - 1)
612
+ *q++ = *p;
613
+ p++;
614
+ }
615
+ *q = '\0';
616
+ i = find_name(unicode_prop_name,
617
+ countof(unicode_prop_name), buf);
618
+ if (i < 0) {
619
+ if (!strcmp(buf, "Grapheme_Link"))
620
+ goto next;
621
+ fprintf(stderr, "Property not found: %s\n", buf);
622
+ exit(1);
623
+ }
624
+ for(c = c0; c <= c1; c++) {
625
+ set_prop(c, i, 1);
626
+ }
627
+ next: ;
628
+ }
629
+ }
630
+ fclose(f);
631
+ }
632
+
633
+ void parse_derived_norm_properties(const char *filename)
634
+ {
635
+ FILE *f;
636
+ char line[4096], *p, buf[256], *q;
637
+ uint32_t c0, c1, c;
638
+
639
+ f = fopen(filename, "rb");
640
+ if (!f) {
641
+ perror(filename);
642
+ exit(1);
643
+ }
644
+
645
+ for(;;) {
646
+ if (!get_line(line, sizeof(line), f))
647
+ break;
648
+ p = line;
649
+ while (isspace(*p))
650
+ p++;
651
+ if (*p == '#' || *p == '@' || *p == '\0')
652
+ continue;
653
+ c0 = strtoul(p, (char **)&p, 16);
654
+ if (*p == '.' && p[1] == '.') {
655
+ p += 2;
656
+ c1 = strtoul(p, (char **)&p, 16);
657
+ } else {
658
+ c1 = c0;
659
+ }
660
+ assert(c1 <= CHARCODE_MAX);
661
+ p += strspn(p, " \t");
662
+ if (*p == ';') {
663
+ p++;
664
+ p += strspn(p, " \t");
665
+ q = buf;
666
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
667
+ if ((q - buf) < sizeof(buf) - 1)
668
+ *q++ = *p;
669
+ p++;
670
+ }
671
+ *q = '\0';
672
+ if (!strcmp(buf, "Changes_When_NFKC_Casefolded")) {
673
+ for(c = c0; c <= c1; c++) {
674
+ set_prop(c, PROP_Changes_When_NFKC_Casefolded, 1);
675
+ }
676
+ }
677
+ }
678
+ }
679
+ fclose(f);
680
+ }
681
+
682
+ void parse_prop_list(const char *filename)
683
+ {
684
+ FILE *f;
685
+ char line[4096], *p, buf[256], *q;
686
+ uint32_t c0, c1, c;
687
+ int i;
688
+
689
+ f = fopen(filename, "rb");
690
+ if (!f) {
691
+ perror(filename);
692
+ exit(1);
693
+ }
694
+
695
+ for(;;) {
696
+ if (!get_line(line, sizeof(line), f))
697
+ break;
698
+ p = line;
699
+ while (isspace(*p))
700
+ p++;
701
+ if (*p == '#' || *p == '@' || *p == '\0')
702
+ continue;
703
+ c0 = strtoul(p, (char **)&p, 16);
704
+ if (*p == '.' && p[1] == '.') {
705
+ p += 2;
706
+ c1 = strtoul(p, (char **)&p, 16);
707
+ } else {
708
+ c1 = c0;
709
+ }
710
+ assert(c1 <= CHARCODE_MAX);
711
+ p += strspn(p, " \t");
712
+ if (*p == ';') {
713
+ p++;
714
+ p += strspn(p, " \t");
715
+ q = buf;
716
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
717
+ if ((q - buf) < sizeof(buf) - 1)
718
+ *q++ = *p;
719
+ p++;
720
+ }
721
+ *q = '\0';
722
+ i = find_name(unicode_prop_name,
723
+ countof(unicode_prop_name), buf);
724
+ if (i < 0) {
725
+ fprintf(stderr, "Property not found: %s\n", buf);
726
+ exit(1);
727
+ }
728
+ for(c = c0; c <= c1; c++) {
729
+ set_prop(c, i, 1);
730
+ }
731
+ }
732
+ }
733
+ fclose(f);
734
+ }
735
+
736
+ void parse_scripts(const char *filename)
737
+ {
738
+ FILE *f;
739
+ char line[4096], *p, buf[256], *q;
740
+ uint32_t c0, c1, c;
741
+ int i;
742
+
743
+ f = fopen(filename, "rb");
744
+ if (!f) {
745
+ perror(filename);
746
+ exit(1);
747
+ }
748
+
749
+ for(;;) {
750
+ if (!get_line(line, sizeof(line), f))
751
+ break;
752
+ p = line;
753
+ while (isspace(*p))
754
+ p++;
755
+ if (*p == '#' || *p == '@' || *p == '\0')
756
+ continue;
757
+ c0 = strtoul(p, (char **)&p, 16);
758
+ if (*p == '.' && p[1] == '.') {
759
+ p += 2;
760
+ c1 = strtoul(p, (char **)&p, 16);
761
+ } else {
762
+ c1 = c0;
763
+ }
764
+ assert(c1 <= CHARCODE_MAX);
765
+ p += strspn(p, " \t");
766
+ if (*p == ';') {
767
+ p++;
768
+ p += strspn(p, " \t");
769
+ q = buf;
770
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
771
+ if ((q - buf) < sizeof(buf) - 1)
772
+ *q++ = *p;
773
+ p++;
774
+ }
775
+ *q = '\0';
776
+ i = find_name(unicode_script_name,
777
+ countof(unicode_script_name), buf);
778
+ if (i < 0) {
779
+ fprintf(stderr, "Unknown script: '%s'\n", buf);
780
+ exit(1);
781
+ }
782
+ for(c = c0; c <= c1; c++)
783
+ unicode_db[c].script = i;
784
+ }
785
+ }
786
+ fclose(f);
787
+ }
788
+
789
+ void parse_script_extensions(const char *filename)
790
+ {
791
+ FILE *f;
792
+ char line[4096], *p, buf[256], *q;
793
+ uint32_t c0, c1, c;
794
+ int i;
795
+ uint8_t script_ext[255];
796
+ int script_ext_len;
797
+
798
+ f = fopen(filename, "rb");
799
+ if (!f) {
800
+ perror(filename);
801
+ exit(1);
802
+ }
803
+
804
+ for(;;) {
805
+ if (!get_line(line, sizeof(line), f))
806
+ break;
807
+ p = line;
808
+ while (isspace(*p))
809
+ p++;
810
+ if (*p == '#' || *p == '@' || *p == '\0')
811
+ continue;
812
+ c0 = strtoul(p, (char **)&p, 16);
813
+ if (*p == '.' && p[1] == '.') {
814
+ p += 2;
815
+ c1 = strtoul(p, (char **)&p, 16);
816
+ } else {
817
+ c1 = c0;
818
+ }
819
+ assert(c1 <= CHARCODE_MAX);
820
+ p += strspn(p, " \t");
821
+ script_ext_len = 0;
822
+ if (*p == ';') {
823
+ p++;
824
+ for(;;) {
825
+ p += strspn(p, " \t");
826
+ q = buf;
827
+ while (*p != '\0' && *p != ' ' && *p != '#' && *p != '\t') {
828
+ if ((q - buf) < sizeof(buf) - 1)
829
+ *q++ = *p;
830
+ p++;
831
+ }
832
+ *q = '\0';
833
+ if (buf[0] == '\0')
834
+ break;
835
+ i = find_name(unicode_script_short_name,
836
+ countof(unicode_script_short_name), buf);
837
+ if (i < 0) {
838
+ fprintf(stderr, "Script not found: %s\n", buf);
839
+ exit(1);
840
+ }
841
+ assert(script_ext_len < sizeof(script_ext));
842
+ script_ext[script_ext_len++] = i;
843
+ }
844
+ for(c = c0; c <= c1; c++) {
845
+ CCInfo *ci = &unicode_db[c];
846
+ ci->script_ext_len = script_ext_len;
847
+ ci->script_ext = malloc(sizeof(ci->script_ext[0]) * script_ext_len);
848
+ for(i = 0; i < script_ext_len; i++)
849
+ ci->script_ext[i] = script_ext[i];
850
+ }
851
+ }
852
+ }
853
+ fclose(f);
854
+ }
855
+
856
+ void dump_cc_info(CCInfo *ci, int i)
857
+ {
858
+ int j;
859
+ printf("%05x:", i);
860
+ if (ci->u_len != 0) {
861
+ printf(" U:");
862
+ for(j = 0; j < ci->u_len; j++)
863
+ printf(" %05x", ci->u_data[j]);
864
+ }
865
+ if (ci->l_len != 0) {
866
+ printf(" L:");
867
+ for(j = 0; j < ci->l_len; j++)
868
+ printf(" %05x", ci->l_data[j]);
869
+ }
870
+ if (ci->f_len != 0) {
871
+ printf(" F:");
872
+ for(j = 0; j < ci->f_len; j++)
873
+ printf(" %05x", ci->f_data[j]);
874
+ }
875
+ printf("\n");
876
+ }
877
+
878
+ void dump_unicode_data(CCInfo *tab)
879
+ {
880
+ int i;
881
+ CCInfo *ci;
882
+ for(i = 0; i <= CHARCODE_MAX; i++) {
883
+ ci = &tab[i];
884
+ if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) {
885
+ dump_cc_info(ci, i);
886
+ }
887
+ }
888
+ }
889
+
890
+ bool is_complicated_case(const CCInfo *ci)
891
+ {
892
+ return (ci->u_len > 1 || ci->l_len > 1 ||
893
+ (ci->u_len > 0 && ci->l_len > 0) ||
894
+ (ci->f_len != ci->l_len) ||
895
+ (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0));
896
+ }
897
+
898
+ #ifndef USE_TEST
899
/* Run types stored in the 'type' field of a TableEntry. The numeric
   values index run_type_str[]. */
enum {
    RUN_TYPE_U            = 0,
    RUN_TYPE_L            = 1,
    RUN_TYPE_UF           = 2,
    RUN_TYPE_LF           = 3,
    RUN_TYPE_UL           = 4,
    RUN_TYPE_LSU          = 5,
    RUN_TYPE_U2L_399_EXT2 = 6,
    RUN_TYPE_UF_D20       = 7,
    RUN_TYPE_UF_D1_EXT    = 8,
    RUN_TYPE_U_EXT        = 9,
    RUN_TYPE_LF_EXT       = 10,
    RUN_TYPE_UF_EXT2      = 11,
    RUN_TYPE_LF_EXT2      = 12,
    RUN_TYPE_UF_EXT3      = 13,
};
915
+ #endif
916
+
917
/* Printable name of each run type, in the order of the RUN_TYPE_x
   enumeration above (used by dump_case_conv_table1()). */
const char *run_type_str[] = {
    [0]  = "U",
    [1]  = "L",
    [2]  = "UF",
    [3]  = "LF",
    [4]  = "UL",
    [5]  = "LSU",
    [6]  = "U2L_399_EXT2",
    [7]  = "UF_D20",
    [8]  = "UF_D1_EXT",
    [9]  = "U_EXT",
    [10] = "LF_EXT",
    [11] = "UF_EXT2",
    [12] = "LF_EXT2",
    [13] = "UF_EXT3",
};
933
+
934
+ /* One run of the case conversion table, filled by find_run_type() and build_conv_table(). */ typedef struct {
935
+ int code; /* first code point of the run */
936
+ int len; /* number of consecutive code points covered */
937
+ int type; /* RUN_TYPE_x value */
938
+ int data; /* base target code point for the non-EXT run types */
939
+ int ext_len; /* number of valid entries in ext_data[] */
940
+ int ext_data[3]; /* code points that are stored through the ext_data table */
941
+ int data_index; /* 'data' coming from the table */
942
+ } TableEntry;
943
+
944
+ static int simple_to_lower(CCInfo *tab, int c)
945
+ {
946
+ if (tab[c].l_len != 1)
947
+ return c;
948
+ return tab[c].l_data[0];
949
+ }
950
+
951
+ /* code (17), len (7), type (4) */
952
+
953
+ void find_run_type(TableEntry *te, CCInfo *tab, int code)
954
+ {
955
+ int is_lower, len;
956
+ CCInfo *ci, *ci1, *ci2;
957
+
958
+ ci = &tab[code];
959
+ ci1 = &tab[code + 1];
960
+ ci2 = &tab[code + 2];
961
+ te->code = code;
962
+
963
+ if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
964
+ ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
965
+ ci->u_len == 0 &&
966
+
967
+ ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
968
+ ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
969
+ ci1->u_len == 1 && ci1->u_data[0] == code &&
970
+
971
+ ci2->l_len == 0 &&
972
+ ci2->f_len == 0 &&
973
+ ci2->u_len == 1 && ci2->u_data[0] == code) {
974
+ te->len = 3;
975
+ te->data = 0;
976
+ te->type = RUN_TYPE_LSU;
977
+ return;
978
+ }
979
+
980
+ if (is_complicated_case(ci)) {
981
+ len = 1;
982
+ while (code + len <= CHARCODE_MAX) {
983
+ ci1 = &tab[code + len];
984
+ if (ci1->u_len != 1 ||
985
+ ci1->u_data[0] != ci->u_data[0] + len ||
986
+ ci1->l_len != 0 ||
987
+ ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
988
+ break;
989
+ len++;
990
+ }
991
+ if (len > 1) {
992
+ te->len = len;
993
+ te->type = RUN_TYPE_UF;
994
+ te->data = ci->u_data[0];
995
+ return;
996
+ }
997
+
998
+ if (ci->l_len == 0 &&
999
+ ci->u_len == 2 && ci->u_data[1] == 0x399 &&
1000
+ ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
1001
+ ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
1002
+ len = 1;
1003
+ while (code + len <= CHARCODE_MAX) {
1004
+ ci1 = &tab[code + len];
1005
+ if (!(ci1->u_len == 2 &&
1006
+ ci1->u_data[1] == ci->u_data[1] &&
1007
+ ci1->u_data[0] == ci->u_data[0] + len &&
1008
+ ci1->f_len == 2 &&
1009
+ ci1->f_data[1] == ci->f_data[1] &&
1010
+ ci1->f_data[0] == ci->f_data[0] + len &&
1011
+ ci1->l_len == 0))
1012
+ break;
1013
+ len++;
1014
+ }
1015
+ te->len = len;
1016
+ te->type = RUN_TYPE_UF_EXT2;
1017
+ te->ext_data[0] = ci->u_data[0];
1018
+ te->ext_data[1] = ci->u_data[1];
1019
+ te->ext_len = 2;
1020
+ return;
1021
+ }
1022
+
1023
+ if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
1024
+ ci->l_len == 1 &&
1025
+ ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
1026
+ len = 1;
1027
+ while (code + len <= CHARCODE_MAX) {
1028
+ ci1 = &tab[code + len];
1029
+ if (!(ci1->u_len == 2 &&
1030
+ ci1->u_data[1] == 0x399 &&
1031
+ ci1->u_data[0] == ci->u_data[0] + len &&
1032
+ ci1->l_len == 1 &&
1033
+ ci1->l_data[0] == ci->l_data[0] + len &&
1034
+ ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
1035
+ break;
1036
+ len++;
1037
+ }
1038
+ te->len = len;
1039
+ te->type = RUN_TYPE_U2L_399_EXT2;
1040
+ te->ext_data[0] = ci->u_data[0];
1041
+ te->ext_data[1] = ci->l_data[0];
1042
+ te->ext_len = 2;
1043
+ return;
1044
+ }
1045
+
1046
+ if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
1047
+ len = 1;
1048
+ while (code + len <= CHARCODE_MAX) {
1049
+ ci1 = &tab[code + len];
1050
+ if (!(ci1->l_len == 1 &&
1051
+ ci1->l_data[0] == ci->l_data[0] + len &&
1052
+ ci1->u_len == 0 && ci1->f_len == 0))
1053
+ break;
1054
+ len++;
1055
+ }
1056
+ te->len = len;
1057
+ te->type = RUN_TYPE_L;
1058
+ te->data = ci->l_data[0];
1059
+ return;
1060
+ }
1061
+
1062
+ if (ci->l_len == 0 &&
1063
+ ci->u_len == 1 &&
1064
+ ci->u_data[0] < 0x1000 &&
1065
+ ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
1066
+ te->len = 1;
1067
+ te->type = RUN_TYPE_UF_D20;
1068
+ te->data = ci->u_data[0];
1069
+ } else if (ci->l_len == 0 &&
1070
+ ci->u_len == 1 &&
1071
+ ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
1072
+ te->len = 1;
1073
+ te->type = RUN_TYPE_UF_D1_EXT;
1074
+ te->ext_data[0] = ci->u_data[0];
1075
+ te->ext_len = 1;
1076
+ } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
1077
+ ci->l_data[0] == ci->f_data[0] &&
1078
+ ci->l_data[1] == ci->f_data[1]) {
1079
+ te->len = 1;
1080
+ te->type = RUN_TYPE_LF_EXT2;
1081
+ te->ext_data[0] = ci->l_data[0];
1082
+ te->ext_data[1] = ci->l_data[1];
1083
+ te->ext_len = 2;
1084
+ } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
1085
+ ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
1086
+ ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
1087
+ te->len = 1;
1088
+ te->type = RUN_TYPE_UF_EXT2;
1089
+ te->ext_data[0] = ci->u_data[0];
1090
+ te->ext_data[1] = ci->u_data[1];
1091
+ te->ext_len = 2;
1092
+ } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
1093
+ ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
1094
+ ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
1095
+ ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
1096
+ te->len = 1;
1097
+ te->type = RUN_TYPE_UF_EXT3;
1098
+ te->ext_data[0] = ci->u_data[0];
1099
+ te->ext_data[1] = ci->u_data[1];
1100
+ te->ext_data[2] = ci->u_data[2];
1101
+ te->ext_len = 3;
1102
+ } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 1) {
1103
+ // U+FB05 LATIN SMALL LIGATURE LONG S T
1104
+ assert(code == 0xFB05);
1105
+ te->len = 1;
1106
+ te->type = RUN_TYPE_UF_EXT2;
1107
+ te->ext_data[0] = ci->u_data[0];
1108
+ te->ext_data[1] = ci->u_data[1];
1109
+ te->ext_len = 2;
1110
+ } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 1) {
1111
+ // U+1FD3 GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA or
1112
+ // U+1FE3 GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
1113
+ assert(code == 0x1FD3 || code == 0x1FE3);
1114
+ te->len = 1;
1115
+ te->type = RUN_TYPE_UF_EXT3;
1116
+ te->ext_data[0] = ci->u_data[0];
1117
+ te->ext_data[1] = ci->u_data[1];
1118
+ te->ext_data[2] = ci->u_data[2];
1119
+ te->ext_len = 3;
1120
+ } else {
1121
+ printf("unsupported encoding case:\n");
1122
+ dump_cc_info(ci, code);
1123
+ abort();
1124
+ }
1125
+ } else {
1126
+ /* look for a run of identical conversions */
1127
+ len = 0;
1128
+ for(;;) {
1129
+ if (code >= CHARCODE_MAX || len >= 126)
1130
+ break;
1131
+ ci = &tab[code + len];
1132
+ ci1 = &tab[code + len + 1];
1133
+ if (is_complicated_case(ci) || is_complicated_case(ci1)) {
1134
+ break;
1135
+ }
1136
+ if (ci->l_len != 1 || ci->l_data[0] != code + len + 1)
1137
+ break;
1138
+ if (ci1->u_len != 1 || ci1->u_data[0] != code + len)
1139
+ break;
1140
+ len += 2;
1141
+ }
1142
+ if (len > 0) {
1143
+ te->len = len;
1144
+ te->type = RUN_TYPE_UL;
1145
+ te->data = 0;
1146
+ return;
1147
+ }
1148
+
1149
+ ci = &tab[code];
1150
+ is_lower = ci->l_len > 0;
1151
+ len = 1;
1152
+ while (code + len <= CHARCODE_MAX) {
1153
+ ci1 = &tab[code + len];
1154
+ if (is_complicated_case(ci1))
1155
+ break;
1156
+ if (is_lower) {
1157
+ if (ci1->l_len != 1 ||
1158
+ ci1->l_data[0] != ci->l_data[0] + len)
1159
+ break;
1160
+ } else {
1161
+ if (ci1->u_len != 1 ||
1162
+ ci1->u_data[0] != ci->u_data[0] + len)
1163
+ break;
1164
+ }
1165
+ len++;
1166
+ }
1167
+ te->len = len;
1168
+ if (is_lower) {
1169
+ te->type = RUN_TYPE_LF;
1170
+ te->data = ci->l_data[0];
1171
+ } else {
1172
+ te->type = RUN_TYPE_U;
1173
+ te->data = ci->u_data[0];
1174
+ }
1175
+ }
1176
+ }
1177
+
1178
+ TableEntry conv_table[1000]; /* runs produced by build_conv_table() */
1179
+ int conv_table_len; /* number of valid entries in conv_table[] */
1180
+ int ext_data[1000]; /* distinct values appended by find_ext_data_index() */
1181
+ int ext_data_len; /* number of valid entries in ext_data[] */
1182
+
1183
+ void dump_case_conv_table1(void)
1184
+ {
1185
+ int i, j;
1186
+ const TableEntry *te;
1187
+
1188
+ for(i = 0; i < conv_table_len; i++) {
1189
+ te = &conv_table[i];
1190
+ printf("%05x %02x %-10s %05x",
1191
+ te->code, te->len, run_type_str[te->type], te->data);
1192
+ for(j = 0; j < te->ext_len; j++) {
1193
+ printf(" %05x", te->ext_data[j]);
1194
+ }
1195
+ printf("\n");
1196
+ }
1197
+ printf("table_len=%d ext_len=%d\n", conv_table_len, ext_data_len);
1198
+ }
1199
+
1200
+ int find_data_index(const TableEntry *conv_table, int len, int data)
1201
+ {
1202
+ int i;
1203
+ const TableEntry *te;
1204
+ for(i = 0; i < len; i++) {
1205
+ te = &conv_table[i];
1206
+ if (te->code == data)
1207
+ return i;
1208
+ }
1209
+ return -1;
1210
+ }
1211
+
1212
+ int find_ext_data_index(int data)
1213
+ {
1214
+ int i;
1215
+ for(i = 0; i < ext_data_len; i++) {
1216
+ if (ext_data[i] == data)
1217
+ return i;
1218
+ }
1219
+ assert(ext_data_len < countof(ext_data));
1220
+ ext_data[ext_data_len++] = data;
1221
+ return ext_data_len - 1;
1222
+ }
1223
+
1224
+ void build_conv_table(CCInfo *tab)
1225
+ {
1226
+ int code, i, j;
1227
+ CCInfo *ci;
1228
+ TableEntry *te;
1229
+
1230
+ te = conv_table;
1231
+ for(code = 0; code <= CHARCODE_MAX; code++) {
1232
+ ci = &tab[code];
1233
+ if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
1234
+ continue;
1235
+ assert(te - conv_table < countof(conv_table));
1236
+ find_run_type(te, tab, code);
1237
+ assert(te->len <= 127);
1238
+ code += te->len - 1;
1239
+ te++;
1240
+ }
1241
+ conv_table_len = te - conv_table;
1242
+
1243
+ /* find the data index */
1244
+ for(i = 0; i < conv_table_len; i++) {
1245
+ int data_index;
1246
+ te = &conv_table[i];
1247
+
1248
+ switch(te->type) {
1249
+ case RUN_TYPE_U:
1250
+ case RUN_TYPE_L:
1251
+ case RUN_TYPE_UF:
1252
+ case RUN_TYPE_LF:
1253
+ data_index = find_data_index(conv_table, conv_table_len, te->data);
1254
+ if (data_index < 0) {
1255
+ switch(te->type) {
1256
+ case RUN_TYPE_U:
1257
+ te->type = RUN_TYPE_U_EXT;
1258
+ te->ext_len = 1;
1259
+ te->ext_data[0] = te->data;
1260
+ break;
1261
+ case RUN_TYPE_LF:
1262
+ te->type = RUN_TYPE_LF_EXT;
1263
+ te->ext_len = 1;
1264
+ te->ext_data[0] = te->data;
1265
+ break;
1266
+ default:
1267
+ printf("%05x: index not found\n", te->code);
1268
+ exit(1);
1269
+ }
1270
+ } else {
1271
+ te->data_index = data_index;
1272
+ }
1273
+ break;
1274
+ case RUN_TYPE_UF_D20:
1275
+ te->data_index = te->data;
1276
+ break;
1277
+ }
1278
+ }
1279
+
1280
+ /* find the data index for ext_data */
1281
+ for(i = 0; i < conv_table_len; i++) {
1282
+ te = &conv_table[i];
1283
+ if (te->type == RUN_TYPE_UF_EXT3) {
1284
+ int p, v;
1285
+ v = 0;
1286
+ for(j = 0; j < 3; j++) {
1287
+ p = find_ext_data_index(te->ext_data[j]);
1288
+ assert(p < 16);
1289
+ v = (v << 4) | p;
1290
+ }
1291
+ te->data_index = v;
1292
+ }
1293
+ }
1294
+
1295
+ for(i = 0; i < conv_table_len; i++) {
1296
+ te = &conv_table[i];
1297
+ if (te->type == RUN_TYPE_LF_EXT2 ||
1298
+ te->type == RUN_TYPE_UF_EXT2 ||
1299
+ te->type == RUN_TYPE_U2L_399_EXT2) {
1300
+ int p, v;
1301
+ v = 0;
1302
+ for(j = 0; j < 2; j++) {
1303
+ p = find_ext_data_index(te->ext_data[j]);
1304
+ assert(p < 64);
1305
+ v = (v << 6) | p;
1306
+ }
1307
+ te->data_index = v;
1308
+ }
1309
+ }
1310
+
1311
+ for(i = 0; i < conv_table_len; i++) {
1312
+ te = &conv_table[i];
1313
+ if (te->type == RUN_TYPE_UF_D1_EXT ||
1314
+ te->type == RUN_TYPE_U_EXT ||
1315
+ te->type == RUN_TYPE_LF_EXT) {
1316
+ te->data_index = find_ext_data_index(te->ext_data[0]);
1317
+ }
1318
+ }
1319
+ #ifdef DUMP_CASE_CONV_TABLE
1320
+ dump_case_conv_table1();
1321
+ #endif
1322
+ }
1323
+
1324
+ void dump_case_conv_table(FILE *f)
1325
+ {
1326
+ int i;
1327
+ uint32_t v;
1328
+ const TableEntry *te;
1329
+
1330
+ fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len);
1331
+ for(i = 0; i < conv_table_len; i++) {
1332
+ if (i % 4 == 0)
1333
+ fprintf(f, "\n ");
1334
+ te = &conv_table[i];
1335
+ v = te->code << (32 - 17);
1336
+ v |= te->len << (32 - 17 - 7);
1337
+ v |= te->type << (32 - 17 - 7 - 4);
1338
+ v |= te->data_index >> 8;
1339
+ fprintf(f, " 0x%08x,", v);
1340
+ }
1341
+ fprintf(f, "\n};\n\n");
1342
+
1343
+ fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len);
1344
+ for(i = 0; i < conv_table_len; i++) {
1345
+ if (i % 8 == 0)
1346
+ fprintf(f, "\n ");
1347
+ te = &conv_table[i];
1348
+ fprintf(f, " 0x%02x,", te->data_index & 0xff);
1349
+ }
1350
+ fprintf(f, "\n};\n\n");
1351
+
1352
+ fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len);
1353
+ for(i = 0; i < ext_data_len; i++) {
1354
+ if (i % 8 == 0)
1355
+ fprintf(f, "\n ");
1356
+ fprintf(f, " 0x%04x,", ext_data[i]);
1357
+ }
1358
+ fprintf(f, "\n};\n\n");
1359
+ }
1360
+
1361
+
1362
+ static CCInfo *global_tab;
1363
+
1364
+ static int sp_cc_cmp(const void *p1, const void *p2)
1365
+ {
1366
+ CCInfo *c1 = &global_tab[*(const int *)p1];
1367
+ CCInfo *c2 = &global_tab[*(const int *)p2];
1368
+ if (c1->f_len < c2->f_len) {
1369
+ return -1;
1370
+ } else if (c2->f_len < c1->f_len) {
1371
+ return 1;
1372
+ } else {
1373
+ return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len);
1374
+ }
1375
+ }
1376
+
1377
+ /* dump the case special cases (multi character results which are
1378
+ identical and need specific handling in lre_canonicalize() */
1379
+ void dump_case_folding_special_cases(CCInfo *tab)
1380
+ {
1381
+ int i, len, j;
1382
+ int *perm;
1383
+
1384
+ perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
1385
+ for(i = 0; i <= CHARCODE_MAX; i++)
1386
+ perm[i] = i;
1387
+ global_tab = tab;
1388
+ qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
1389
+ for(i = 0; i <= CHARCODE_MAX;) {
1390
+ if (tab[perm[i]].f_len <= 1) {
1391
+ i++;
1392
+ } else {
1393
+ len = 1;
1394
+ while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
1395
+ len++;
1396
+
1397
+ if (len > 1) {
1398
+ for(j = i; j < i + len; j++)
1399
+ dump_cc_info(&tab[perm[j]], perm[j]);
1400
+ }
1401
+ i += len;
1402
+ }
1403
+ }
1404
+ free(perm);
1405
+ global_tab = NULL;
1406
+ }
1407
+
1408
+
1409
/* Compare the first 'n' ints of two arrays: 0 when they are all equal,
   -1 at the first difference. Nothing is compared when n <= 0. */
int tabcmp(const int *tab1, const int *tab2, int n)
{
    int remaining;

    for (remaining = n; remaining > 0; remaining--) {
        if (*tab1++ != *tab2++)
            return -1;
    }
    return 0;
}
1418
+
1419
/* Print "<str>=" followed by the 'len' values of 'buf' as 5 digit
   hexadecimal numbers, then a newline. */
void dump_str(const char *str, const int *buf, int len)
{
    const int *cur = buf;
    int left = len;

    printf("%s=", str);
    while (left-- > 0)
        printf(" %05x", *cur++);
    printf("\n");
}
1427
+
1428
+ void compute_internal_props(void)
1429
+ {
1430
+ int i;
1431
+ bool has_ul;
1432
+
1433
+ for(i = 0; i <= CHARCODE_MAX; i++) {
1434
+ CCInfo *ci = &unicode_db[i];
1435
+ has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
1436
+ if (has_ul) {
1437
+ assert(get_prop(i, PROP_Cased));
1438
+ } else {
1439
+ set_prop(i, PROP_Cased1, get_prop(i, PROP_Cased));
1440
+ }
1441
+ set_prop(i, PROP_ID_Continue1,
1442
+ get_prop(i, PROP_ID_Continue) & (get_prop(i, PROP_ID_Start) ^ 1));
1443
+ set_prop(i, PROP_XID_Start1,
1444
+ get_prop(i, PROP_ID_Start) ^ get_prop(i, PROP_XID_Start));
1445
+ set_prop(i, PROP_XID_Continue1,
1446
+ get_prop(i, PROP_ID_Continue) ^ get_prop(i, PROP_XID_Continue));
1447
+ set_prop(i, PROP_Changes_When_Titlecased1,
1448
+ get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
1449
+ set_prop(i, PROP_Changes_When_Casefolded1,
1450
+ get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
1451
+ /* XXX: reduce table size (438 bytes) */
1452
+ set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
1453
+ get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
1454
+ }
1455
+ }
1456
+
1457
/* Write 'tab' (len bytes) to 'f' as the C definition of a
   "static const uint8_t cname[len]" array, 8 values per line. */
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    int pos = 0;

    fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
    while (pos < len) {
        if ((pos & 7) == 0)     /* start a new line every 8 values */
            fprintf(f, "\n ");
        fprintf(f, " 0x%02x,", tab[pos]);
        pos++;
    }
    fprintf(f, "\n};\n\n");
}
1468
+
1469
+ #define PROP_BLOCK_LEN 32
1470
+
1471
+ void build_prop_table(FILE *f, int prop_index, bool add_index)
1472
+ {
1473
+ int i, j, n, v, offset, code;
1474
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1475
+ DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
1476
+ DynBuf dbuf2_s, *dbuf2 = &dbuf2_s;
1477
+ const uint32_t *buf;
1478
+ int buf_len, block_end_pos, bit;
1479
+ char cname[128];
1480
+
1481
+ dbuf_init(dbuf1);
1482
+
1483
+ for(i = 0; i <= CHARCODE_MAX;) {
1484
+ v = get_prop(i, prop_index);
1485
+ j = i + 1;
1486
+ while (j <= CHARCODE_MAX && get_prop(j, prop_index) == v) {
1487
+ j++;
1488
+ }
1489
+ n = j - i;
1490
+ if (j == (CHARCODE_MAX + 1) && v == 0)
1491
+ break; /* no need to encode last zero run */
1492
+ //printf("%05x: %d %d\n", i, n, v);
1493
+ dbuf_put_u32(dbuf1, n - 1);
1494
+ i += n;
1495
+ }
1496
+
1497
+ dbuf_init(dbuf);
1498
+ dbuf_init(dbuf2);
1499
+ buf = (uint32_t *)dbuf1->buf;
1500
+ buf_len = dbuf1->size / sizeof(buf[0]);
1501
+
1502
+ /* the first value is assumed to be 0 */
1503
+ assert(get_prop(0, prop_index) == 0);
1504
+
1505
+ block_end_pos = PROP_BLOCK_LEN;
1506
+ i = 0;
1507
+ code = 0;
1508
+ bit = 0;
1509
+ while (i < buf_len) {
1510
+ if (add_index && dbuf->size >= block_end_pos && bit == 0) {
1511
+ offset = (dbuf->size - block_end_pos);
1512
+ /* XXX: offset could be larger in case of runs of small
1513
+ lengths. Could add code to change the encoding to
1514
+ prevent it at the expense of one byte loss */
1515
+ assert(offset <= 7);
1516
+ v = code | (offset << 21);
1517
+ dbuf_putc(dbuf2, v);
1518
+ dbuf_putc(dbuf2, v >> 8);
1519
+ dbuf_putc(dbuf2, v >> 16);
1520
+ block_end_pos += PROP_BLOCK_LEN;
1521
+ }
1522
+
1523
+ v = buf[i];
1524
+ code += v + 1;
1525
+ bit ^= 1;
1526
+ if (v < 8 && (i + 1) < buf_len && buf[i + 1] < 8) {
1527
+ code += buf[i + 1] + 1;
1528
+ bit ^= 1;
1529
+ dbuf_putc(dbuf, (v << 3) | buf[i + 1]);
1530
+ i += 2;
1531
+ } else if (v < 128) {
1532
+ dbuf_putc(dbuf, 0x80 + v);
1533
+ i++;
1534
+ } else if (v < (1 << 13)) {
1535
+ dbuf_putc(dbuf, 0x40 + (v >> 8));
1536
+ dbuf_putc(dbuf, v);
1537
+ i++;
1538
+ } else {
1539
+ assert(v < (1 << 21));
1540
+ dbuf_putc(dbuf, 0x60 + (v >> 16));
1541
+ dbuf_putc(dbuf, v >> 8);
1542
+ dbuf_putc(dbuf, v);
1543
+ i++;
1544
+ }
1545
+ }
1546
+
1547
+ if (add_index) {
1548
+ /* last index entry */
1549
+ v = code;
1550
+ dbuf_putc(dbuf2, v);
1551
+ dbuf_putc(dbuf2, v >> 8);
1552
+ dbuf_putc(dbuf2, v >> 16);
1553
+ }
1554
+
1555
+ #ifdef DUMP_TABLE_SIZE
1556
+ printf("prop %s: length=%d bytes\n", unicode_prop_name[prop_index],
1557
+ (int)(dbuf->size + dbuf2->size));
1558
+ #endif
1559
+ snprintf(cname, sizeof(cname), "unicode_prop_%s_table", unicode_prop_name[prop_index]);
1560
+ dump_byte_table(f, cname, dbuf->buf, dbuf->size);
1561
+ if (add_index) {
1562
+ snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
1563
+ dump_byte_table(f, cname, dbuf2->buf, dbuf2->size);
1564
+ }
1565
+
1566
+ dbuf_free(dbuf);
1567
+ dbuf_free(dbuf1);
1568
+ dbuf_free(dbuf2);
1569
+ }
1570
+
1571
+ void build_flags_tables(FILE *f)
1572
+ {
1573
+ build_prop_table(f, PROP_Cased1, true);
1574
+ build_prop_table(f, PROP_Case_Ignorable, true);
1575
+ build_prop_table(f, PROP_ID_Start, true);
1576
+ build_prop_table(f, PROP_ID_Continue1, true);
1577
+ build_prop_table(f, PROP_White_Space, true);
1578
+ }
1579
+
1580
/*
 * Write to 'f' the definition of a "static const char cname[]" made of
 * 'len' NUL separated strings. Entry i is tab_name[i], followed by
 * ",<short name>" when tab_short_name[i] is not empty. The "\0"
 * separators are aligned on the longest entry.
 */
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
                     const char **tab_short_name)
{
    int idx, width, widest = 0;

    /* width of the longest "name[,short]" entry, for the alignment */
    for (idx = 0; idx < len; idx++) {
        width = strlen(tab_name[idx]);
        if (tab_short_name[idx][0] != '\0')
            width += 1 + strlen(tab_short_name[idx]);
        if (width > widest)
            widest = width;
    }

    /* generate a sequence of strings terminated by an empty string */
    fprintf(f, "static const char %s[] =\n", cname);
    idx = 0;
    while (idx < len) {
        const char *short_name = tab_short_name[idx];

        fprintf(f, " \"");
        /* fprintf() returns the number of characters written */
        width = fprintf(f, "%s", tab_name[idx]);
        if (short_name[0] != '\0')
            width += fprintf(f, ",%s", short_name);
        fprintf(f, "\"%*s\"\\0\"\n", 1 + widest - width, "");
        idx++;
    }
    fprintf(f, ";\n\n");
}
1607
+
1608
+ void build_general_category_table(FILE *f)
1609
+ {
1610
+ int i, v, j, n, n1;
1611
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1612
+ int cw_count, cw_len_count[4], cw_start;
1613
+
1614
+ fprintf(f, "typedef enum {\n");
1615
+ for(i = 0; i < GCAT_COUNT; i++)
1616
+ fprintf(f, " UNICODE_GC_%s,\n", unicode_gc_name[i]);
1617
+ fprintf(f, " UNICODE_GC_COUNT,\n");
1618
+ fprintf(f, "} UnicodeGCEnum;\n\n");
1619
+
1620
+ dump_name_table(f, "unicode_gc_name_table",
1621
+ unicode_gc_name, GCAT_COUNT,
1622
+ unicode_gc_short_name);
1623
+
1624
+
1625
+ dbuf_init(dbuf);
1626
+ cw_count = 0;
1627
+ for(i = 0; i < 4; i++)
1628
+ cw_len_count[i] = 0;
1629
+ for(i = 0; i <= CHARCODE_MAX;) {
1630
+ v = unicode_db[i].general_category;
1631
+ j = i + 1;
1632
+ while (j <= CHARCODE_MAX && unicode_db[j].general_category == v)
1633
+ j++;
1634
+ n = j - i;
1635
+ /* compress Lu/Ll runs */
1636
+ if (v == GCAT_Lu) {
1637
+ n1 = 1;
1638
+ while ((i + n1) <= CHARCODE_MAX && unicode_db[i + n1].general_category == (v + (n1 & 1))) {
1639
+ n1++;
1640
+ }
1641
+ if (n1 > n) {
1642
+ v = 31;
1643
+ n = n1;
1644
+ }
1645
+ }
1646
+ // printf("%05x %05x %d\n", i, n, v);
1647
+ cw_count++;
1648
+ n--;
1649
+ cw_start = dbuf->size;
1650
+ if (n < 7) {
1651
+ dbuf_putc(dbuf, (n << 5) | v);
1652
+ } else if (n < 7 + 128) {
1653
+ n1 = n - 7;
1654
+ assert(n1 < 128);
1655
+ dbuf_putc(dbuf, (0xf << 5) | v);
1656
+ dbuf_putc(dbuf, n1);
1657
+ } else if (n < 7 + 128 + (1 << 14)) {
1658
+ n1 = n - (7 + 128);
1659
+ assert(n1 < (1 << 14));
1660
+ dbuf_putc(dbuf, (0xf << 5) | v);
1661
+ dbuf_putc(dbuf, (n1 >> 8) + 128);
1662
+ dbuf_putc(dbuf, n1);
1663
+ } else {
1664
+ n1 = n - (7 + 128 + (1 << 14));
1665
+ assert(n1 < (1 << 22));
1666
+ dbuf_putc(dbuf, (0xf << 5) | v);
1667
+ dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1668
+ dbuf_putc(dbuf, n1 >> 8);
1669
+ dbuf_putc(dbuf, n1);
1670
+ }
1671
+ cw_len_count[dbuf->size - cw_start - 1]++;
1672
+ i += n + 1;
1673
+ }
1674
+ #ifdef DUMP_TABLE_SIZE
1675
+ printf("general category: %d entries [",
1676
+ cw_count);
1677
+ for(i = 0; i < 4; i++)
1678
+ printf(" %d", cw_len_count[i]);
1679
+ printf(" ], length=%d bytes\n", (int)dbuf->size);
1680
+ #endif
1681
+
1682
+ dump_byte_table(f, "unicode_gc_table", dbuf->buf, dbuf->size);
1683
+
1684
+ dbuf_free(dbuf);
1685
+ }
1686
+
1687
+ void build_script_table(FILE *f)
1688
+ {
1689
+ int i, v, j, n, n1, type;
1690
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1691
+ int cw_count, cw_len_count[4], cw_start;
1692
+
1693
+ fprintf(f, "typedef enum {\n");
1694
+ for(i = 0; i < SCRIPT_COUNT; i++)
1695
+ fprintf(f, " UNICODE_SCRIPT_%s,\n", unicode_script_name[i]);
1696
+ fprintf(f, " UNICODE_SCRIPT_COUNT,\n");
1697
+ fprintf(f, "} UnicodeScriptEnum;\n\n");
1698
+
1699
+ i = 1;
1700
+ dump_name_table(f, "unicode_script_name_table",
1701
+ unicode_script_name + i, SCRIPT_COUNT - i,
1702
+ unicode_script_short_name + i);
1703
+
1704
+ dbuf_init(dbuf);
1705
+ cw_count = 0;
1706
+ for(i = 0; i < 4; i++)
1707
+ cw_len_count[i] = 0;
1708
+ for(i = 0; i <= CHARCODE_MAX;) {
1709
+ v = unicode_db[i].script;
1710
+ j = i + 1;
1711
+ while (j <= CHARCODE_MAX && unicode_db[j].script == v)
1712
+ j++;
1713
+ n = j - i;
1714
+ if (v == 0 && j == (CHARCODE_MAX + 1))
1715
+ break;
1716
+ // printf("%05x %05x %d\n", i, n, v);
1717
+ cw_count++;
1718
+ n--;
1719
+ cw_start = dbuf->size;
1720
+ if (v == 0)
1721
+ type = 0;
1722
+ else
1723
+ type = 1;
1724
+ if (n < 96) {
1725
+ dbuf_putc(dbuf, n | (type << 7));
1726
+ } else if (n < 96 + (1 << 12)) {
1727
+ n1 = n - 96;
1728
+ assert(n1 < (1 << 12));
1729
+ dbuf_putc(dbuf, ((n1 >> 8) + 96) | (type << 7));
1730
+ dbuf_putc(dbuf, n1);
1731
+ } else {
1732
+ n1 = n - (96 + (1 << 12));
1733
+ assert(n1 < (1 << 20));
1734
+ dbuf_putc(dbuf, ((n1 >> 16) + 112) | (type << 7));
1735
+ dbuf_putc(dbuf, n1 >> 8);
1736
+ dbuf_putc(dbuf, n1);
1737
+ }
1738
+ if (type != 0)
1739
+ dbuf_putc(dbuf, v);
1740
+
1741
+ cw_len_count[dbuf->size - cw_start - 1]++;
1742
+ i += n + 1;
1743
+ }
1744
+ #if defined(DUMP_TABLE_SIZE)
1745
+ printf("script: %d entries [",
1746
+ cw_count);
1747
+ for(i = 0; i < 4; i++)
1748
+ printf(" %d", cw_len_count[i]);
1749
+ printf(" ], length=%d bytes\n", (int)dbuf->size);
1750
+ #endif
1751
+
1752
+ dump_byte_table(f, "unicode_script_table", dbuf->buf, dbuf->size);
1753
+
1754
+ dbuf_free(dbuf);
1755
+ }
1756
+
1757
+ void build_script_ext_table(FILE *f)
1758
+ {
1759
+ int i, j, n, n1, script_ext_len;
1760
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1761
+ int cw_count;
1762
+
1763
+ dbuf_init(dbuf);
1764
+ cw_count = 0;
1765
+ for(i = 0; i <= CHARCODE_MAX;) {
1766
+ script_ext_len = unicode_db[i].script_ext_len;
1767
+ j = i + 1;
1768
+ while (j <= CHARCODE_MAX &&
1769
+ unicode_db[j].script_ext_len == script_ext_len &&
1770
+ !memcmp(unicode_db[j].script_ext, unicode_db[i].script_ext,
1771
+ script_ext_len)) {
1772
+ j++;
1773
+ }
1774
+ n = j - i;
1775
+ cw_count++;
1776
+ n--;
1777
+ if (n < 128) {
1778
+ dbuf_putc(dbuf, n);
1779
+ } else if (n < 128 + (1 << 14)) {
1780
+ n1 = n - 128;
1781
+ assert(n1 < (1 << 14));
1782
+ dbuf_putc(dbuf, (n1 >> 8) + 128);
1783
+ dbuf_putc(dbuf, n1);
1784
+ } else {
1785
+ n1 = n - (128 + (1 << 14));
1786
+ assert(n1 < (1 << 22));
1787
+ dbuf_putc(dbuf, (n1 >> 16) + 128 + 64);
1788
+ dbuf_putc(dbuf, n1 >> 8);
1789
+ dbuf_putc(dbuf, n1);
1790
+ }
1791
+ dbuf_putc(dbuf, script_ext_len);
1792
+ for(j = 0; j < script_ext_len; j++)
1793
+ dbuf_putc(dbuf, unicode_db[i].script_ext[j]);
1794
+ i += n + 1;
1795
+ }
1796
+ #ifdef DUMP_TABLE_SIZE
1797
+ printf("script_ext: %d entries",
1798
+ cw_count);
1799
+ printf(", length=%d bytes\n", (int)dbuf->size);
1800
+ #endif
1801
+
1802
+ dump_byte_table(f, "unicode_script_ext_table", dbuf->buf, dbuf->size);
1803
+
1804
+ dbuf_free(dbuf);
1805
+ }
1806
+
1807
+ /* the following properties are synthetized so no table is necessary */
1808
+ #define PROP_TABLE_COUNT PROP_ASCII
1809
+
1810
+ void build_prop_list_table(FILE *f)
1811
+ {
1812
+ int i;
1813
+
1814
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1815
+ if (i == PROP_ID_Start ||
1816
+ i == PROP_Case_Ignorable ||
1817
+ i == PROP_ID_Continue1 ||
1818
+ i == PROP_White_Space) {
1819
+ /* already generated */
1820
+ } else {
1821
+ build_prop_table(f, i, false);
1822
+ }
1823
+ }
1824
+
1825
+ fprintf(f, "typedef enum {\n");
1826
+ for(i = 0; i < PROP_COUNT; i++)
1827
+ fprintf(f, " UNICODE_PROP_%s,\n", unicode_prop_name[i]);
1828
+ fprintf(f, " UNICODE_PROP_COUNT,\n");
1829
+ fprintf(f, "} UnicodePropertyEnum;\n\n");
1830
+
1831
+ i = PROP_ASCII_Hex_Digit;
1832
+ dump_name_table(f, "unicode_prop_name_table",
1833
+ unicode_prop_name + i, PROP_XID_Start - i + 1,
1834
+ unicode_prop_short_name + i);
1835
+
1836
+ fprintf(f, "static const uint8_t * const unicode_prop_table[] = {\n");
1837
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1838
+ fprintf(f, " unicode_prop_%s_table,\n", unicode_prop_name[i]);
1839
+ }
1840
+ fprintf(f, "};\n\n");
1841
+
1842
+ fprintf(f, "static const uint16_t unicode_prop_len_table[] = {\n");
1843
+ for(i = 0; i < PROP_TABLE_COUNT; i++) {
1844
+ fprintf(f, " countof(unicode_prop_%s_table),\n", unicode_prop_name[i]);
1845
+ }
1846
+ fprintf(f, "};\n\n");
1847
+ }
1848
+
1849
+ #ifdef USE_TEST
1850
+ int check_conv(uint32_t *res, uint32_t c, int conv_type)
1851
+ {
1852
+ return lre_case_conv(res, c, conv_type);
1853
+ }
1854
+
1855
+ void check_case_conv(void)
1856
+ {
1857
+ CCInfo *tab = unicode_db;
1858
+ uint32_t res[3];
1859
+ int l, error;
1860
+ CCInfo ci_s, *ci1, *ci = &ci_s;
1861
+ int code;
1862
+
1863
+ for(code = 0; code <= CHARCODE_MAX; code++) {
1864
+ ci1 = &tab[code];
1865
+ *ci = *ci1;
1866
+ if (ci->l_len == 0) {
1867
+ ci->l_len = 1;
1868
+ ci->l_data[0] = code;
1869
+ }
1870
+ if (ci->u_len == 0) {
1871
+ ci->u_len = 1;
1872
+ ci->u_data[0] = code;
1873
+ }
1874
+ if (ci->f_len == 0) {
1875
+ ci->f_len = 1;
1876
+ ci->f_data[0] = code;
1877
+ }
1878
+
1879
+ error = 0;
1880
+ l = check_conv(res, code, 0);
1881
+ if (l != ci->u_len || tabcmp((int *)res, ci->u_data, l)) {
1882
+ printf("ERROR: L\n");
1883
+ error++;
1884
+ }
1885
+ l = check_conv(res, code, 1);
1886
+ if (l != ci->l_len || tabcmp((int *)res, ci->l_data, l)) {
1887
+ printf("ERROR: U\n");
1888
+ error++;
1889
+ }
1890
+ l = check_conv(res, code, 2);
1891
+ if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
1892
+ printf("ERROR: F\n");
1893
+ error++;
1894
+ }
1895
+ if (error) {
1896
+ dump_cc_info(ci, code);
1897
+ exit(1);
1898
+ }
1899
+ }
1900
+ }
1901
+
1902
+ #ifdef PROFILE
1903
/* Reading of the monotonic clock in nanoseconds (PROFILE timings only). */
static int64_t get_time_ns(void)
{
    struct timespec now;
    int64_t ns;

    clock_gettime(CLOCK_MONOTONIC, &now);
    ns = (int64_t)now.tv_sec * 1000000000;
    ns += now.tv_nsec;
    return ns;
}
1909
+ #endif
1910
+
1911
+
1912
+ void check_flags(void)
1913
+ {
1914
+ int c;
1915
+ bool flag_ref, flag;
1916
+ for(c = 0; c <= CHARCODE_MAX; c++) {
1917
+ flag_ref = get_prop(c, PROP_Cased);
1918
+ flag = lre_is_cased(c);
1919
+ if (flag != flag_ref) {
1920
+ printf("ERROR: c=%05x cased=%d ref=%d\n",
1921
+ c, flag, flag_ref);
1922
+ exit(1);
1923
+ }
1924
+
1925
+ flag_ref = get_prop(c, PROP_Case_Ignorable);
1926
+ flag = lre_is_case_ignorable(c);
1927
+ if (flag != flag_ref) {
1928
+ printf("ERROR: c=%05x case_ignorable=%d ref=%d\n",
1929
+ c, flag, flag_ref);
1930
+ exit(1);
1931
+ }
1932
+
1933
+ flag_ref = get_prop(c, PROP_ID_Start);
1934
+ flag = lre_is_id_start(c);
1935
+ if (flag != flag_ref) {
1936
+ printf("ERROR: c=%05x id_start=%d ref=%d\n",
1937
+ c, flag, flag_ref);
1938
+ exit(1);
1939
+ }
1940
+
1941
+ flag_ref = get_prop(c, PROP_ID_Continue);
1942
+ flag = lre_is_id_continue(c);
1943
+ if (flag != flag_ref) {
1944
+ printf("ERROR: c=%05x id_cont=%d ref=%d\n",
1945
+ c, flag, flag_ref);
1946
+ exit(1);
1947
+ }
1948
+ }
1949
+ #ifdef PROFILE
1950
+ {
1951
+ int64_t ti, count;
1952
+ ti = get_time_ns();
1953
+ count = 0;
1954
+ for(c = 0x20; c <= 0xffff; c++) {
1955
+ flag_ref = get_prop(c, PROP_ID_Start);
1956
+ flag = lre_is_id_start(c);
1957
+ assert(flag == flag_ref);
1958
+ count++;
1959
+ }
1960
+ ti = get_time_ns() - ti;
1961
+ printf("flags time=%0.1f ns/char\n",
1962
+ (double)ti / count);
1963
+ }
1964
+ #endif
1965
+ }
1966
+
1967
+ #endif
1968
+
1969
+ #define CC_BLOCK_LEN 32
1970
+
1971
+ void build_cc_table(FILE *f)
1972
+ {
1973
+ int i, cc, n, cc_table_len, type, n1;
1974
+ DynBuf dbuf_s, *dbuf = &dbuf_s;
1975
+ DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
1976
+ int cw_len_tab[3], cw_start, block_end_pos;
1977
+ uint32_t v;
1978
+
1979
+ dbuf_init(dbuf);
1980
+ dbuf_init(dbuf1);
1981
+ cc_table_len = 0;
1982
+ for(i = 0; i < countof(cw_len_tab); i++)
1983
+ cw_len_tab[i] = 0;
1984
+ block_end_pos = CC_BLOCK_LEN;
1985
+ for(i = 0; i <= CHARCODE_MAX;) {
1986
+ cc = unicode_db[i].combining_class;
1987
+ assert(cc <= 255);
1988
+ /* check increasing values */
1989
+ n = 1;
1990
+ while ((i + n) <= CHARCODE_MAX &&
1991
+ unicode_db[i + n].combining_class == (cc + n))
1992
+ n++;
1993
+ if (n >= 2) {
1994
+ type = 1;
1995
+ } else {
1996
+ type = 0;
1997
+ n = 1;
1998
+ while ((i + n) <= CHARCODE_MAX &&
1999
+ unicode_db[i + n].combining_class == cc)
2000
+ n++;
2001
+ }
2002
+ /* no need to encode the last run */
2003
+ if (cc == 0 && (i + n - 1) == CHARCODE_MAX)
2004
+ break;
2005
+ #ifdef DUMP_CC_TABLE
2006
+ printf("%05x %6d %d %d\n", i, n, type, cc);
2007
+ #endif
2008
+ if (type == 0) {
2009
+ if (cc == 0)
2010
+ type = 2;
2011
+ else if (cc == 230)
2012
+ type = 3;
2013
+ }
2014
+ n1 = n - 1;
2015
+
2016
+ /* add an entry to the index if necessary */
2017
+ if (dbuf->size >= block_end_pos) {
2018
+ v = i | ((dbuf->size - block_end_pos) << 21);
2019
+ dbuf_putc(dbuf1, v);
2020
+ dbuf_putc(dbuf1, v >> 8);
2021
+ dbuf_putc(dbuf1, v >> 16);
2022
+ block_end_pos += CC_BLOCK_LEN;
2023
+ }
2024
+ cw_start = dbuf->size;
2025
+ if (n1 < 48) {
2026
+ dbuf_putc(dbuf, n1 | (type << 6));
2027
+ } else if (n1 < 48 + (1 << 11)) {
2028
+ n1 -= 48;
2029
+ dbuf_putc(dbuf, ((n1 >> 8) + 48) | (type << 6));
2030
+ dbuf_putc(dbuf, n1);
2031
+ } else {
2032
+ n1 -= 48 + (1 << 11);
2033
+ assert(n1 < (1 << 20));
2034
+ dbuf_putc(dbuf, ((n1 >> 16) + 56) | (type << 6));
2035
+ dbuf_putc(dbuf, n1 >> 8);
2036
+ dbuf_putc(dbuf, n1);
2037
+ }
2038
+ cw_len_tab[dbuf->size - cw_start - 1]++;
2039
+ if (type == 0 || type == 1)
2040
+ dbuf_putc(dbuf, cc);
2041
+ cc_table_len++;
2042
+ i += n;
2043
+ }
2044
+
2045
+ /* last index entry */
2046
+ v = i;
2047
+ dbuf_putc(dbuf1, v);
2048
+ dbuf_putc(dbuf1, v >> 8);
2049
+ dbuf_putc(dbuf1, v >> 16);
2050
+
2051
+ dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
2052
+ dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
2053
+
2054
+ #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
2055
+ printf("CC table: size=%d (%d entries) [",
2056
+ (int)(dbuf->size + dbuf1->size),
2057
+ cc_table_len);
2058
+ for(i = 0; i < countof(cw_len_tab); i++)
2059
+ printf(" %d", cw_len_tab[i]);
2060
+ printf(" ]\n");
2061
+ #endif
2062
+ dbuf_free(dbuf);
2063
+ dbuf_free(dbuf1);
2064
+ }
2065
+
2066
+ /* maximum length of decomposition: 18 chars (1), then 8 */
2067
#ifndef USE_TEST
/* Encoding of one run of the decomposition table.  The value is written
   to the generated tables and the generator compares types with '<=', so
   the order of the enumerators is significant. */
typedef enum {
    /* 16 bit char */
    DECOMP_TYPE_C1,
    /* 16 bit char table */
    DECOMP_TYPE_L1, DECOMP_TYPE_L2, DECOMP_TYPE_L3, DECOMP_TYPE_L4,
    DECOMP_TYPE_L5, /* XXX: not used */
    DECOMP_TYPE_L6, /* XXX: could remove */
    DECOMP_TYPE_L7, /* XXX: could remove */
    /* 18 bit char table */
    DECOMP_TYPE_LL1, DECOMP_TYPE_LL2,
    /* 8 bit char table */
    DECOMP_TYPE_S1, DECOMP_TYPE_S2, DECOMP_TYPE_S3, DECOMP_TYPE_S4,
    DECOMP_TYPE_S5,
    /* increment 16 bit char value */
    DECOMP_TYPE_I1,
    DECOMP_TYPE_I2_0, DECOMP_TYPE_I2_1,
    DECOMP_TYPE_I3_1, DECOMP_TYPE_I3_2,
    DECOMP_TYPE_I4_1, DECOMP_TYPE_I4_2,
    /* 16 bit base + 8 bit offset */
    DECOMP_TYPE_B1, DECOMP_TYPE_B2, DECOMP_TYPE_B3, DECOMP_TYPE_B4,
    DECOMP_TYPE_B5, DECOMP_TYPE_B6, DECOMP_TYPE_B7, DECOMP_TYPE_B8,
    DECOMP_TYPE_B18,
    /* remaining encodings, see add_decomp_data() for their layout */
    DECOMP_TYPE_LS2,
    DECOMP_TYPE_PAT3,
    DECOMP_TYPE_S2_UL,
    DECOMP_TYPE_LS2_UL,
} DecompTypeEnum;
#endif
2106
+
2107
/* Printable name of each decomposition type, indexed by the type value
   (printed by the DUMP_DECOMP_TABLE output). */
const char *decomp_type_str[] = {
    "C1",
    "L1", "L2", "L3", "L4", "L5", "L6", "L7",
    "LL1", "LL2",
    "S1", "S2", "S3", "S4", "S5",
    "I1", "I2_0", "I2_1", "I3_1", "I3_2", "I4_1", "I4_2",
    "B1", "B2", "B3", "B4", "B5", "B6", "B7", "B8", "B18",
    "LS2", "PAT3", "S2_UL", "LS2_UL",
};
2144
+
2145
+ const int decomp_incr_tab[4][4] = {
2146
+ { DECOMP_TYPE_I1, 0, -1 },
2147
+ { DECOMP_TYPE_I2_0, 0, 1, -1 },
2148
+ { DECOMP_TYPE_I3_1, 1, 2, -1 },
2149
+ { DECOMP_TYPE_I4_1, 1, 2, -1 },
2150
+ };
2151
+
2152
+ /*
2153
+ entry size:
2154
+ type bits
2155
+ code 18
2156
+ len 7
2157
+ compat 1
2158
+ type 5
2159
+ index 16
2160
+ total 47
2161
+ */
2162
+
2163
+ typedef struct { /* one run of consecutive code points sharing one encoding */
2163
+ int code; /* first code point of the run */
2164
+ uint8_t len; /* number of code points covered; 0 means no run starts here */
2165
+ uint8_t type; /* DECOMP_TYPE_x value selecting the data layout */
2166
+ uint8_t c_len; /* decomposition length of each code point of the run */
2167
+ uint16_t c_min; /* base char of the DECOMP_TYPE_Bx encodings */
2168
+ uint16_t data_index; /* offset of the run data; the decomposed char itself for DECOMP_TYPE_C1 */
2169
+ int cost; /* size in bytes from this entry to the end */
2170
+ } DecompEntry;
2172
+
2173
+ int get_decomp_run_size(const DecompEntry *de)
2174
+ {
2175
+ int s;
2176
+ s = 6;
2177
+ if (de->type <= DECOMP_TYPE_C1) {
2178
+ /* nothing more */
2179
+ } else if (de->type <= DECOMP_TYPE_L7) {
2180
+ s += de->len * de->c_len * 2;
2181
+ } else if (de->type <= DECOMP_TYPE_LL2) {
2182
+ /* 18 bits per char */
2183
+ s += (de->len * de->c_len * 18 + 7) / 8;
2184
+ } else if (de->type <= DECOMP_TYPE_S5) {
2185
+ s += de->len * de->c_len;
2186
+ } else if (de->type <= DECOMP_TYPE_I4_2) {
2187
+ s += de->c_len * 2;
2188
+ } else if (de->type <= DECOMP_TYPE_B18) {
2189
+ s += 2 + de->len * de->c_len;
2190
+ } else if (de->type <= DECOMP_TYPE_LS2) {
2191
+ s += de->len * 3;
2192
+ } else if (de->type <= DECOMP_TYPE_PAT3) {
2193
+ s += 4 + de->len * 2;
2194
+ } else if (de->type <= DECOMP_TYPE_S2_UL) {
2195
+ s += de->len;
2196
+ } else if (de->type <= DECOMP_TYPE_LS2_UL) {
2197
+ s += (de->len / 2) * 3;
2198
+ } else {
2199
+ abort();
2200
+ }
2201
+ return s;
2202
+ }
2203
+
2204
/* Isolated chars which also have a short code. */
static const uint16_t unicode_short_table[2] = { 0x2044, 0x2215 };

/* Map a code point to its 8 bit short code: values below 0x80 are kept
   as is, 0x300..0x34f map to 0x80..0xcf and the entries of
   unicode_short_table follow from 0xd0.
   return -1 if not found */
int get_short_code(int c)
{
    int k, entry_count;

    if (c < 0x80)
        return c;
    if (c >= 0x300 && c < 0x350)
        return c - 0x300 + 0x80;

    entry_count = (int)(sizeof(unicode_short_table) /
                        sizeof(unicode_short_table[0]));
    k = 0;
    while (k < entry_count) {
        if (unicode_short_table[k] == c)
            return 0x80 + 0x50 + k;
        k++;
    }
    return -1;
}
2222
+
2223
/* True when get_short_code() has a code for 'code'. */
static bool is_short(int code)
{
    int short_code = get_short_code(code);

    return !(short_code < 0);
}
2227
+
2228
/* True when each of the first 'len' entries of 'tab' satisfies
   is_short() (also true when len <= 0). */
static bool is_short_tab(const int *tab, int len)
{
    int k = 0;

    /* stop on the first entry without a short code */
    while (k < len && is_short(tab[k]))
        k++;
    return k >= len;
}
2237
+
2238
/* True when none of the first 'len' entries of 'tab' is above 0xffff
   (also true when len <= 0). */
static bool is_16bit(const int *tab, int len)
{
    int k = 0;

    /* stop on the first entry which does not fit in 16 bits */
    while (k < len && tab[k] <= 0xffff)
        k++;
    return k >= len;
}
2247
+
2248
/* Simplified lower case mapping used to detect upper/lower pairs: add
   0x20 for values below 0x100 (Latin1) and for 0x410..0x42f (Cyrillic),
   add 1 otherwise. */
static uint32_t to_lower_simple(uint32_t c)
{
    int is_latin1 = (c < 0x100);
    int is_cyrillic = (c >= 0x410 && c <= 0x42f);
    uint32_t delta = (is_latin1 || is_cyrillic) ? 0x20 : 1;

    return c + delta;
}
2257
+
2258
+ /* select best encoding with dynamic programming.
   Computes the cheapest run starting at code point 'i' and stores it in
   tab_de[i]. Each candidate encoding is tried with every admissible run
   length 'n'; its cost is its own size (get_decomp_run_size()) plus
   tab_de[i + n].cost, so the entries after 'i' must already be filled.
   When 'i' has no decomposition, only the cost of the next entry is
   copied and the other fields of tab_de[i] are left unchanged. */
2258
+ void find_decomp_run(DecompEntry *tab_de, int i)
2259
+ {
2260
+ DecompEntry de_s, *de = &de_s;
2261
+ CCInfo *ci, *ci1, *ci2;
2262
+ int l, j, n, len_max;
2263
+
2264
+ ci = &unicode_db[i];
2265
+ l = ci->decomp_len;
2266
+ if (l == 0) { /* no decomposition: no run starts here */
2267
+ tab_de[i].cost = tab_de[i + 1].cost;
2268
+ return;
2269
+ }
2270
+
2271
+ /* the offset for the compose table has only 6 bits, so we must
2272
+ limit if it can be used by the compose table */
2273
+ if (!ci->is_compat && !ci->is_excluded && l == 2)
2274
+ len_max = 64;
2275
+ else
2276
+ len_max = 127;
2277
+
2278
+ tab_de[i].cost = 0x7fffffff; /* initial value: any cheaper candidate replaces it */
2279
+
2280
+ if (!is_16bit(ci->decomp_data, l)) { /* a char above 0xffff: only the LL types are tried */
2281
+ assert(l <= 2);
2282
+
2283
+ n = 1;
2284
+ for(;;) {
2285
+ de->code = i;
2286
+ de->len = n;
2287
+ de->type = DECOMP_TYPE_LL1 + l - 1;
2288
+ de->c_len = l;
2289
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2290
+ if (de->cost < tab_de[i].cost) {
2291
+ tab_de[i] = *de;
2292
+ }
2293
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2294
+ break;
2295
+ ci1 = &unicode_db[i + n];
2296
+ /* Note: we accept a hole */
2297
+ if (!(ci1->decomp_len == 0 ||
2298
+ (ci1->decomp_len == l &&
2299
+ ci1->is_compat == ci->is_compat)))
2300
+ break;
2301
+ n++;
2302
+ }
2303
+ return;
2304
+ }
2305
+
2306
+ if (l <= 7) { /* C1 (one char, run of one) or L1..L7: table of 16 bit chars */
2307
+ n = 1;
2308
+ for(;;) {
2309
+ de->code = i;
2310
+ de->len = n;
2311
+ if (l == 1 && n == 1) {
2312
+ de->type = DECOMP_TYPE_C1;
2313
+ } else {
2314
+ assert(l <= 8);
2315
+ de->type = DECOMP_TYPE_L1 + l - 1;
2316
+ }
2317
+ de->c_len = l;
2318
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2319
+ if (de->cost < tab_de[i].cost) {
2320
+ tab_de[i] = *de;
2321
+ }
2322
+
2323
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2324
+ break;
2325
+ ci1 = &unicode_db[i + n];
2326
+ /* Note: we accept a hole */
2327
+ if (!(ci1->decomp_len == 0 ||
2328
+ (ci1->decomp_len == l &&
2329
+ ci1->is_compat == ci->is_compat &&
2330
+ is_16bit(ci1->decomp_data, l))))
2331
+ break;
2332
+ n++;
2333
+ }
2334
+ }
2335
+
2336
+ if (l <= 8 || l == 18) { /* B types: 16 bit base (c_min) + one 8 bit offset per char */
2337
+ int c_min, c_max, c;
2338
+ c_min = c_max = -1;
2339
+ n = 1;
2340
+ for(;;) {
2341
+ ci1 = &unicode_db[i + n - 1]; /* last entry of the candidate run: extend the char range with it */
2342
+ for(j = 0; j < l; j++) {
2343
+ c = ci1->decomp_data[j];
2344
+ if (c == 0x20) {
2345
+ /* we accept space for Arabic */
2346
+ } else if (c_min == -1) {
2347
+ c_min = c_max = c;
2348
+ } else {
2349
+ c_min = min_int(c_min, c);
2350
+ c_max = max_int(c_max, c);
2351
+ }
2352
+ }
2353
+ if ((c_max - c_min) > 254) /* add_decomp_data() stores offsets 0..254 and uses 0xff for the space */
2354
+ break;
2355
+ de->code = i;
2356
+ de->len = n;
2357
+ if (l == 18)
2358
+ de->type = DECOMP_TYPE_B18;
2359
+ else
2360
+ de->type = DECOMP_TYPE_B1 + l - 1;
2361
+ de->c_len = l;
2362
+ de->c_min = c_min;
2363
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2364
+ if (de->cost < tab_de[i].cost) {
2365
+ tab_de[i] = *de;
2366
+ }
2367
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2368
+ break;
2369
+ ci1 = &unicode_db[i + n];
2370
+ if (!(ci1->decomp_len == l &&
2371
+ ci1->is_compat == ci->is_compat))
2372
+ break;
2373
+ n++;
2374
+ }
2375
+ }
2376
+
2377
+ /* find an ascii run */
2378
+ if (l <= 5 && is_short_tab(ci->decomp_data, l)) {
2379
+ n = 1;
2380
+ for(;;) {
2381
+ de->code = i;
2382
+ de->len = n;
2383
+ de->type = DECOMP_TYPE_S1 + l - 1;
2384
+ de->c_len = l;
2385
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2386
+ if (de->cost < tab_de[i].cost) {
2387
+ tab_de[i] = *de;
2388
+ }
2389
+
2390
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2391
+ break;
2392
+ ci1 = &unicode_db[i + n];
2393
+ /* Note: we accept a hole */
2394
+ if (!(ci1->decomp_len == 0 ||
2395
+ (ci1->decomp_len == l &&
2396
+ ci1->is_compat == ci->is_compat &&
2397
+ is_short_tab(ci1->decomp_data, l))))
2398
+ break;
2399
+ n++;
2400
+ }
2401
+ }
2402
+
2403
+ /* check if a single char is increasing */
2404
+ if (l <= 4) {
2405
+ int idx1, idx;
2406
+
2407
+ for(idx1 = 1; (idx = decomp_incr_tab[l - 1][idx1]) >= 0; idx1++) { /* idx: position of the char that increases */
2408
+ n = 1;
2409
+ for(;;) {
2410
+ de->code = i;
2411
+ de->len = n;
2412
+ de->type = decomp_incr_tab[l - 1][0] + idx1 - 1;
2413
+ de->c_len = l;
2414
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2415
+ if (de->cost < tab_de[i].cost) {
2416
+ tab_de[i] = *de;
2417
+ }
2418
+
2419
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2420
+ break;
2421
+ ci1 = &unicode_db[i + n];
2422
+ if (!(ci1->decomp_len == l &&
2423
+ ci1->is_compat == ci->is_compat))
2424
+ goto next1;
2425
+ for(j = 0; j < l; j++) {
2426
+ if (j == idx) {
2427
+ if (ci1->decomp_data[j] != ci->decomp_data[j] + n)
2428
+ goto next1;
2429
+ } else {
2430
+ if (ci1->decomp_data[j] != ci->decomp_data[j])
2431
+ goto next1;
2432
+ }
2433
+ }
2434
+ n++;
2435
+ }
2436
+ next1: ;
2437
+ }
2438
+ }
2439
+
2440
+ if (l == 3) { /* PAT3: the first and last chars are the same for the whole run */
2441
+ n = 1;
2442
+ for(;;) {
2443
+ de->code = i;
2444
+ de->len = n;
2445
+ de->type = DECOMP_TYPE_PAT3;
2446
+ de->c_len = l;
2447
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2448
+ if (de->cost < tab_de[i].cost) {
2449
+ tab_de[i] = *de;
2450
+ }
2451
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2452
+ break;
2453
+ ci1 = &unicode_db[i + n];
2454
+ if (!(ci1->decomp_len == l &&
2455
+ ci1->is_compat == ci->is_compat &&
2456
+ ci1->decomp_data[1] <= 0xffff &&
2457
+ ci1->decomp_data[0] == ci->decomp_data[0] &&
2458
+ ci1->decomp_data[l - 1] == ci->decomp_data[l - 1]))
2459
+ break;
2460
+ n++;
2461
+ }
2462
+ }
2463
+
2464
+ if (l == 2 && is_short(ci->decomp_data[1])) { /* LS2: 16 bit char followed by a short code */
2465
+ n = 1;
2466
+ for(;;) {
2467
+ de->code = i;
2468
+ de->len = n;
2469
+ de->type = DECOMP_TYPE_LS2;
2470
+ de->c_len = l;
2471
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2472
+ if (de->cost < tab_de[i].cost) {
2473
+ tab_de[i] = *de;
2474
+ }
2475
+ if (!((i + n) <= CHARCODE_MAX && n < len_max))
2476
+ break;
2477
+ ci1 = &unicode_db[i + n];
2478
+ if (!(ci1->decomp_len == 0 ||
2479
+ (ci1->decomp_len == l &&
2480
+ ci1->is_compat == ci->is_compat &&
2481
+ ci1->decomp_data[0] <= 0xffff &&
2482
+ is_short(ci1->decomp_data[1]))))
2483
+ break;
2484
+ n++;
2485
+ }
2486
+ }
2487
+
2488
+ if (l == 2) { /* S2_UL / LS2_UL: pairs whose second entry is to_lower_simple() of the first */
2489
+ bool is_16bit; /* local flag; hides the is_16bit() function inside this block */
2490
+
2491
+ n = 0;
2492
+ is_16bit = false;
2493
+ for(;;) {
2494
+ if (!((i + n + 1) <= CHARCODE_MAX && n + 2 <= len_max))
2495
+ break;
2496
+ ci1 = &unicode_db[i + n];
2497
+ if (!(ci1->decomp_len == l &&
2498
+ ci1->is_compat == ci->is_compat &&
2499
+ is_short(ci1->decomp_data[1])))
2500
+ break;
2501
+ if (!is_16bit && !is_short(ci1->decomp_data[0])) /* a first char without short code selects LS2_UL */
2502
+ is_16bit = true;
2503
+ ci2 = &unicode_db[i + n + 1];
2504
+ if (!(ci2->decomp_len == l &&
2505
+ ci2->is_compat == ci->is_compat &&
2506
+ ci2->decomp_data[0] == to_lower_simple(ci1->decomp_data[0]) &&
2507
+ ci2->decomp_data[1] == ci1->decomp_data[1]))
2508
+ break;
2509
+ n += 2;
2510
+ de->code = i;
2511
+ de->len = n;
2512
+ de->type = DECOMP_TYPE_S2_UL + is_16bit;
2513
+ de->c_len = l;
2514
+ de->cost = get_decomp_run_size(de) + tab_de[i + n].cost;
2515
+ if (de->cost < tab_de[i].cost) {
2516
+ tab_de[i] = *de;
2517
+ }
2518
+ }
2519
+ }
2520
+ }
2522
+
2523
/* Store the 16 bit value 'c' at data_buf[*pidx], low byte first, and
   advance *pidx by two. */
void put16(uint8_t *data_buf, int *pidx, uint16_t c)
{
    uint8_t *out = data_buf + *pidx;

    out[0] = (uint8_t)(c & 0xff);
    out[1] = (uint8_t)(c >> 8);
    *pidx += 2;
}
2531
+ /* Append the data bytes of the run 'de' to data_buf at offset *pidx and
   advance *pidx past them. de->data_index is set to the start offset,
   except for DECOMP_TYPE_C1 where it receives the decomposed char itself
   and no data byte is written. The layout depends on de->type; 16 bit
   values are stored low byte first. No bound check is done on data_buf. */
2531
+ void add_decomp_data(uint8_t *data_buf, int *pidx, DecompEntry *de)
2532
+ {
2533
+ int i, j, idx, c;
2534
+ CCInfo *ci;
2535
+
2536
+ idx = *pidx;
2537
+ de->data_index = idx;
2538
+ if (de->type <= DECOMP_TYPE_C1) {
2539
+ ci = &unicode_db[de->code];
2540
+ assert(ci->decomp_len == 1);
2541
+ de->data_index = ci->decomp_data[0];
2542
+ } else if (de->type <= DECOMP_TYPE_L7) { /* 16 bit chars; a hole is stored as zeros */
2543
+ for(i = 0; i < de->len; i++) {
2544
+ ci = &unicode_db[de->code + i];
2545
+ for(j = 0; j < de->c_len; j++) {
2546
+ if (ci->decomp_len == 0)
2547
+ c = 0;
2548
+ else
2549
+ c = ci->decomp_data[j];
2550
+ put16(data_buf, &idx, c);
2551
+ }
2552
+ }
2553
+ } else if (de->type <= DECOMP_TYPE_LL2) { /* low 16 bits of every char first, then the upper bits, 2 per char, 4 chars per byte */
2554
+ int n, p, k;
2555
+ n = (de->len * de->c_len * 18 + 7) / 8; /* total size in bytes */
2556
+ p = de->len * de->c_len * 2; /* offset of the packed upper bits */
2557
+ memset(data_buf + idx, 0, n);
2558
+ k = 0;
2559
+ for(i = 0; i < de->len; i++) {
2560
+ ci = &unicode_db[de->code + i];
2561
+ for(j = 0; j < de->c_len; j++) {
2562
+ if (ci->decomp_len == 0)
2563
+ c = 0;
2564
+ else
2565
+ c = ci->decomp_data[j];
2566
+ data_buf[idx + k * 2] = c;
2567
+ data_buf[idx + k * 2 + 1] = c >> 8;
2568
+ data_buf[idx + p + (k / 4)] |= (c >> 16) << ((k % 4) * 2);
2569
+ k++;
2570
+ }
2571
+ }
2572
+ idx += n;
2573
+ } else if (de->type <= DECOMP_TYPE_S5) { /* one short code byte per char */
2574
+ for(i = 0; i < de->len; i++) {
2575
+ ci = &unicode_db[de->code + i];
2576
+ for(j = 0; j < de->c_len; j++) {
2577
+ if (ci->decomp_len == 0)
2578
+ c = 0;
2579
+ else
2580
+ c = ci->decomp_data[j];
2581
+ c = get_short_code(c);
2582
+ assert(c >= 0);
2583
+ data_buf[idx++] = c;
2584
+ }
2585
+ }
2586
+ } else if (de->type <= DECOMP_TYPE_I4_2) { /* only the chars of the first entry are stored */
2587
+ ci = &unicode_db[de->code];
2588
+ assert(ci->decomp_len == de->c_len);
2589
+ for(j = 0; j < de->c_len; j++)
2590
+ put16(data_buf, &idx, ci->decomp_data[j]);
2591
+ } else if (de->type <= DECOMP_TYPE_B18) { /* 16 bit base (c_min), then one offset byte per char */
2592
+ c = de->c_min;
2593
+ data_buf[idx++] = c;
2594
+ data_buf[idx++] = c >> 8;
2595
+ for(i = 0; i < de->len; i++) {
2596
+ ci = &unicode_db[de->code + i];
2597
+ for(j = 0; j < de->c_len; j++) {
2598
+ assert(ci->decomp_len == de->c_len);
2599
+ c = ci->decomp_data[j];
2600
+ if (c == 0x20) {
2601
+ c = 0xff; /* reserved offset value standing for the space */
2602
+ } else {
2603
+ c -= de->c_min;
2604
+ assert((uint32_t)c <= 254);
2605
+ }
2606
+ data_buf[idx++] = c;
2607
+ }
2608
+ }
2609
+ } else if (de->type <= DECOMP_TYPE_LS2) { /* per entry: 16 bit first char + short code of the second char */
2610
+ assert(de->c_len == 2);
2611
+ for(i = 0; i < de->len; i++) {
2612
+ ci = &unicode_db[de->code + i];
2613
+ if (ci->decomp_len == 0)
2614
+ c = 0;
2615
+ else
2616
+ c = ci->decomp_data[0];
2617
+ put16(data_buf, &idx, c);
2618
+
2619
+ if (ci->decomp_len == 0)
2620
+ c = 0;
2621
+ else
2622
+ c = ci->decomp_data[1];
2623
+ c = get_short_code(c);
2624
+ assert(c >= 0);
2625
+ data_buf[idx++] = c;
2626
+ }
2627
+ } else if (de->type <= DECOMP_TYPE_PAT3) { /* first and last chars once, then the middle char of each entry */
2628
+ ci = &unicode_db[de->code];
2629
+ assert(ci->decomp_len == 3);
2630
+ put16(data_buf, &idx, ci->decomp_data[0]);
2631
+ put16(data_buf, &idx, ci->decomp_data[2]);
2632
+ for(i = 0; i < de->len; i++) {
2633
+ ci = &unicode_db[de->code + i];
2634
+ assert(ci->decomp_len == 3);
2635
+ put16(data_buf, &idx, ci->decomp_data[1]);
2636
+ }
2637
+ } else if (de->type <= DECOMP_TYPE_S2_UL) { /* first entry of each pair only: two short codes */
2638
+ for(i = 0; i < de->len; i += 2) {
2639
+ ci = &unicode_db[de->code + i];
2640
+ c = ci->decomp_data[0];
2641
+ c = get_short_code(c);
2642
+ assert(c >= 0);
2643
+ data_buf[idx++] = c;
2644
+ c = ci->decomp_data[1];
2645
+ c = get_short_code(c);
2646
+ assert(c >= 0);
2647
+ data_buf[idx++] = c;
2648
+ }
2649
+ } else if (de->type <= DECOMP_TYPE_LS2_UL) { /* first entry of each pair only: 16 bit char + short code */
2650
+ for(i = 0; i < de->len; i += 2) {
2651
+ ci = &unicode_db[de->code + i];
2652
+ c = ci->decomp_data[0];
2653
+ put16(data_buf, &idx, c);
2654
+ c = ci->decomp_data[1];
2655
+ c = get_short_code(c);
2656
+ assert(c >= 0);
2657
+ data_buf[idx++] = c;
2658
+ }
2659
+ } else {
2660
+ abort();
2661
+ }
2662
+ *pidx = idx;
2663
+ }
2665
+
2666
+ void build_compose_table(FILE *f, const DecompEntry *tab_de);
2667
+
2668
+ void build_decompose_table(FILE *f)
2669
+ {
2670
+ int i, array_len, code_max, data_len, count;
2671
+ DecompEntry *tab_de, de_s, *de = &de_s;
2672
+ uint8_t *data_buf;
2673
+
2674
+ code_max = CHARCODE_MAX;
2675
+
2676
+ tab_de = mallocz((code_max + 2) * sizeof(*tab_de));
2677
+
2678
+ for(i = code_max; i >= 0; i--) {
2679
+ find_decomp_run(tab_de, i);
2680
+ }
2681
+
2682
+ /* build the data buffer */
2683
+ data_buf = malloc(100000);
2684
+ data_len = 0;
2685
+ array_len = 0;
2686
+ for(i = 0; i <= code_max; i++) {
2687
+ de = &tab_de[i];
2688
+ if (de->len != 0) {
2689
+ add_decomp_data(data_buf, &data_len, de);
2690
+ i += de->len - 1;
2691
+ array_len++;
2692
+ }
2693
+ }
2694
+
2695
+ #ifdef DUMP_DECOMP_TABLE
2696
+ /* dump */
2697
+ {
2698
+ int size, size1;
2699
+
2700
+ printf("START LEN TYPE L C SIZE\n");
2701
+ size = 0;
2702
+ for(i = 0; i <= code_max; i++) {
2703
+ de = &tab_de[i];
2704
+ if (de->len != 0) {
2705
+ size1 = get_decomp_run_size(de);
2706
+ printf("%05x %3d %6s %2d %1d %4d\n", i, de->len,
2707
+ decomp_type_str[de->type], de->c_len,
2708
+ unicode_db[i].is_compat, size1);
2709
+ i += de->len - 1;
2710
+ size += size1;
2711
+ }
2712
+ }
2713
+
2714
+ printf("array_len=%d estimated size=%d bytes actual=%d bytes\n",
2715
+ array_len, size, array_len * 6 + data_len);
2716
+ }
2717
+ #endif
2718
+
2719
+ fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {",
2720
+ array_len);
2721
+ count = 0;
2722
+ for(i = 0; i <= code_max; i++) {
2723
+ de = &tab_de[i];
2724
+ if (de->len != 0) {
2725
+ uint32_t v;
2726
+ if (count++ % 4 == 0)
2727
+ fprintf(f, "\n ");
2728
+ v = (de->code << (32 - 18)) |
2729
+ (de->len << (32 - 18 - 7)) |
2730
+ (de->type << (32 - 18 - 7 - 6)) |
2731
+ unicode_db[de->code].is_compat;
2732
+ fprintf(f, " 0x%08x,", v);
2733
+ i += de->len - 1;
2734
+ }
2735
+ }
2736
+ fprintf(f, "\n};\n\n");
2737
+
2738
+ fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {",
2739
+ array_len);
2740
+ count = 0;
2741
+ for(i = 0; i <= code_max; i++) {
2742
+ de = &tab_de[i];
2743
+ if (de->len != 0) {
2744
+ if (count++ % 8 == 0)
2745
+ fprintf(f, "\n ");
2746
+ fprintf(f, " 0x%04x,", de->data_index);
2747
+ i += de->len - 1;
2748
+ }
2749
+ }
2750
+ fprintf(f, "\n};\n\n");
2751
+
2752
+ fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {",
2753
+ data_len);
2754
+ for(i = 0; i < data_len; i++) {
2755
+ if (i % 8 == 0)
2756
+ fprintf(f, "\n ");
2757
+ fprintf(f, " 0x%02x,", data_buf[i]);
2758
+ }
2759
+ fprintf(f, "\n};\n\n");
2760
+
2761
+ build_compose_table(f, tab_de);
2762
+
2763
+ free(data_buf);
2764
+
2765
+ free(tab_de);
2766
+ }
2767
+
2768
/* One canonical composition: the pair c[0], c[1] composes to p. */
typedef struct {
    uint32_t c[2];
    uint32_t p;
} ComposeEntry;

#define COMPOSE_LEN_MAX 10000

/* qsort() comparison of two ComposeEntry: order by c[0], then by c[1];
   'p' is not compared. */
static int ce_cmp(const void *p1, const void *p2)
{
    const ComposeEntry *a = p1;
    const ComposeEntry *b = p2;

    if (a->c[0] != b->c[0])
        return (a->c[0] < b->c[0]) ? -1 : 1;
    if (a->c[1] != b->c[1])
        return (a->c[1] < b->c[1]) ? -1 : 1;
    return 0;
}
2789
+
2790
+
2791
+ static int get_decomp_pos(const DecompEntry *tab_de, int c)
2792
+ {
2793
+ int i, v, k;
2794
+ const DecompEntry *de;
2795
+
2796
+ k = 0;
2797
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2798
+ de = &tab_de[i];
2799
+ if (de->len != 0) {
2800
+ if (c >= de->code && c < de->code + de->len) {
2801
+ v = c - de->code;
2802
+ assert(v < 64);
2803
+ v |= k << 6;
2804
+ assert(v < 65536);
2805
+ return v;
2806
+ }
2807
+ i += de->len - 1;
2808
+ k++;
2809
+ }
2810
+ }
2811
+ return -1;
2812
+ }
2813
+
2814
+ void build_compose_table(FILE *f, const DecompEntry *tab_de)
2815
+ {
2816
+ int i, v, tab_ce_len;
2817
+ ComposeEntry *ce, *tab_ce;
2818
+
2819
+ tab_ce = malloc(sizeof(*tab_ce) * COMPOSE_LEN_MAX);
2820
+ tab_ce_len = 0;
2821
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2822
+ CCInfo *ci = &unicode_db[i];
2823
+ if (ci->decomp_len == 2 && !ci->is_compat &&
2824
+ !ci->is_excluded) {
2825
+ assert(tab_ce_len < COMPOSE_LEN_MAX);
2826
+ ce = &tab_ce[tab_ce_len++];
2827
+ ce->c[0] = ci->decomp_data[0];
2828
+ ce->c[1] = ci->decomp_data[1];
2829
+ ce->p = i;
2830
+ }
2831
+ }
2832
+ qsort(tab_ce, tab_ce_len, sizeof(*tab_ce), ce_cmp);
2833
+
2834
+ fprintf(f, "static const uint16_t unicode_comp_table[%u] = {",
2835
+ tab_ce_len);
2836
+ for(i = 0; i < tab_ce_len; i++) {
2837
+ if (i % 8 == 0)
2838
+ fprintf(f, "\n ");
2839
+ v = get_decomp_pos(tab_de, tab_ce[i].p);
2840
+ if (v < 0) {
2841
+ printf("ERROR: entry for c=%04x not found\n",
2842
+ tab_ce[i].p);
2843
+ exit(1);
2844
+ }
2845
+ fprintf(f, " 0x%04x,", v);
2846
+ }
2847
+ fprintf(f, "\n};\n\n");
2848
+
2849
+ free(tab_ce);
2850
+ }
2851
+
2852
+ #ifdef USE_TEST
2853
+ void check_decompose_table(void)
2854
+ {
2855
+ int c;
2856
+ CCInfo *ci;
2857
+ int res[UNICODE_DECOMP_LEN_MAX], *ref;
2858
+ int len, ref_len, is_compat;
2859
+
2860
+ for(is_compat = 0; is_compat <= 1; is_compat++) {
2861
+ for(c = 0; c < CHARCODE_MAX; c++) {
2862
+ ci = &unicode_db[c];
2863
+ ref_len = ci->decomp_len;
2864
+ ref = ci->decomp_data;
2865
+ if (!is_compat && ci->is_compat) {
2866
+ ref_len = 0;
2867
+ }
2868
+ len = unicode_decomp_char((uint32_t *)res, c, is_compat);
2869
+ if (len != ref_len ||
2870
+ tabcmp(res, ref, ref_len) != 0) {
2871
+ printf("ERROR c=%05x compat=%d\n", c, is_compat);
2872
+ dump_str("res", res, len);
2873
+ dump_str("ref", ref, ref_len);
2874
+ exit(1);
2875
+ }
2876
+ }
2877
+ }
2878
+ }
2879
+
2880
+ void check_compose_table(void)
2881
+ {
2882
+ int i, p;
2883
+ /* XXX: we don't test all the cases */
2884
+
2885
+ for(i = 0; i <= CHARCODE_MAX; i++) {
2886
+ CCInfo *ci = &unicode_db[i];
2887
+ if (ci->decomp_len == 2 && !ci->is_compat &&
2888
+ !ci->is_excluded) {
2889
+ p = unicode_compose_pair(ci->decomp_data[0], ci->decomp_data[1]);
2890
+ if (p != i) {
2891
+ printf("ERROR compose: c=%05x %05x -> %05x ref=%05x\n",
2892
+ ci->decomp_data[0], ci->decomp_data[1], p, i);
2893
+ exit(1);
2894
+ }
2895
+ }
2896
+ }
2897
+
2898
+
2899
+
2900
+ }
2901
+
2902
+ #endif
2903
+
2904
+
2905
+
2906
+ #ifdef USE_TEST
2907
+
2908
/* Compare the result buf1/len1 with the reference buf2/len2.  When they
   differ, print the test number 'num', the label 'msg', the input and
   both strings, then exit with status 1. */
void check_str(const char *msg, int num, const int *in_buf, int in_len,
               const int *buf1, int len1,
               const int *buf2, int len2)
{
    /* same length and same content: nothing to report */
    if (len1 == len2 && tabcmp(buf1, buf2, len1) == 0)
        return;

    printf("%d: ERROR %s:\n", num, msg);
    dump_str(" in", in_buf, in_len);
    dump_str("res", buf1, len1);
    dump_str("ref", buf2, len2);
    exit(1);
}
2920
+
2921
+ void check_cc_table(void)
2922
+ {
2923
+ int cc, cc_ref, c;
2924
+
2925
+ for(c = 0; c <= CHARCODE_MAX; c++) {
2926
+ cc_ref = unicode_db[c].combining_class;
2927
+ cc = unicode_get_cc(c);
2928
+ if (cc != cc_ref) {
2929
+ printf("ERROR: c=%04x cc=%d cc_ref=%d\n",
2930
+ c, cc, cc_ref);
2931
+ exit(1);
2932
+ }
2933
+ }
2934
+ #ifdef PROFILE
2935
+ {
2936
+ int64_t ti, count;
2937
+
2938
+ ti = get_time_ns();
2939
+ count = 0;
2940
+ /* only do it on meaningful chars */
2941
+ for(c = 0x20; c <= 0xffff; c++) {
2942
+ cc_ref = unicode_db[c].combining_class;
2943
+ cc = unicode_get_cc(c);
2944
+ count++;
2945
+ }
2946
+ ti = get_time_ns() - ti;
2947
+ printf("cc time=%0.1f ns/char\n",
2948
+ (double)ti / count);
2949
+ }
2950
+ #endif
2951
+ }
2952
+
2953
+ void normalization_test(const char *filename)
2954
+ {
2955
+ FILE *f;
2956
+ char line[4096], *p;
2957
+ int *in_str, *nfc_str, *nfd_str, *nfkc_str, *nfkd_str;
2958
+ int in_len, nfc_len, nfd_len, nfkc_len, nfkd_len;
2959
+ int *buf, buf_len, pos;
2960
+
2961
+ f = fopen(filename, "rb");
2962
+ if (!f) {
2963
+ perror(filename);
2964
+ exit(1);
2965
+ }
2966
+ pos = 0;
2967
+ for(;;) {
2968
+ if (!get_line(line, sizeof(line), f))
2969
+ break;
2970
+ pos++;
2971
+ p = line;
2972
+ while (isspace(*p))
2973
+ p++;
2974
+ if (*p == '#' || *p == '@')
2975
+ continue;
2976
+ in_str = get_field_str(&in_len, p, 0);
2977
+ nfc_str = get_field_str(&nfc_len, p, 1);
2978
+ nfd_str = get_field_str(&nfd_len, p, 2);
2979
+ nfkc_str = get_field_str(&nfkc_len, p, 3);
2980
+ nfkd_str = get_field_str(&nfkd_len, p, 4);
2981
+
2982
+ // dump_str("in", in_str, in_len);
2983
+
2984
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFD, NULL, NULL);
2985
+ check_str("nfd", pos, in_str, in_len, buf, buf_len, nfd_str, nfd_len);
2986
+ free(buf);
2987
+
2988
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKD, NULL, NULL);
2989
+ check_str("nfkd", pos, in_str, in_len, buf, buf_len, nfkd_str, nfkd_len);
2990
+ free(buf);
2991
+
2992
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFC, NULL, NULL);
2993
+ check_str("nfc", pos, in_str, in_len, buf, buf_len, nfc_str, nfc_len);
2994
+ free(buf);
2995
+
2996
+ buf_len = unicode_normalize((uint32_t **)&buf, (uint32_t *)in_str, in_len, UNICODE_NFKC, NULL, NULL);
2997
+ check_str("nfkc", pos, in_str, in_len, buf, buf_len, nfkc_str, nfkc_len);
2998
+ free(buf);
2999
+
3000
+ free(in_str);
3001
+ free(nfc_str);
3002
+ free(nfd_str);
3003
+ free(nfkc_str);
3004
+ free(nfkd_str);
3005
+ }
3006
+ fclose(f);
3007
+ }
3008
+ #endif
3009
+
3010
+ int main(int argc, char **argv)
3011
+ {
3012
+ const char *unicode_db_path, *outfilename;
3013
+ char filename[1024];
3014
+
3015
+ if (argc < 2) {
3016
+ printf("usage: %s unicode_db_path [output_file]\n"
3017
+ "\n"
3018
+ "If no output_file is given, a self test is done using the current unicode library\n",
3019
+ argv[0]);
3020
+ exit(1);
3021
+ }
3022
+ unicode_db_path = argv[1];
3023
+ outfilename = NULL;
3024
+ if (argc >= 3)
3025
+ outfilename = argv[2];
3026
+
3027
+ unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
3028
+
3029
+ snprintf(filename, sizeof(filename), "%s/UnicodeData.txt", unicode_db_path);
3030
+
3031
+ parse_unicode_data(filename);
3032
+
3033
+ snprintf(filename, sizeof(filename), "%s/SpecialCasing.txt", unicode_db_path);
3034
+ parse_special_casing(unicode_db, filename);
3035
+
3036
+ snprintf(filename, sizeof(filename), "%s/CaseFolding.txt", unicode_db_path);
3037
+ parse_case_folding(unicode_db, filename);
3038
+
3039
+ snprintf(filename, sizeof(filename), "%s/CompositionExclusions.txt", unicode_db_path);
3040
+ parse_composition_exclusions(filename);
3041
+
3042
+ snprintf(filename, sizeof(filename), "%s/DerivedCoreProperties.txt", unicode_db_path);
3043
+ parse_derived_core_properties(filename);
3044
+
3045
+ snprintf(filename, sizeof(filename), "%s/DerivedNormalizationProps.txt", unicode_db_path);
3046
+ parse_derived_norm_properties(filename);
3047
+
3048
+ snprintf(filename, sizeof(filename), "%s/PropList.txt", unicode_db_path);
3049
+ parse_prop_list(filename);
3050
+
3051
+ snprintf(filename, sizeof(filename), "%s/Scripts.txt", unicode_db_path);
3052
+ parse_scripts(filename);
3053
+
3054
+ snprintf(filename, sizeof(filename), "%s/ScriptExtensions.txt",
3055
+ unicode_db_path);
3056
+ parse_script_extensions(filename);
3057
+
3058
+ snprintf(filename, sizeof(filename), "%s/emoji-data.txt",
3059
+ unicode_db_path);
3060
+ parse_prop_list(filename);
3061
+
3062
+ // dump_unicode_data(unicode_db);
3063
+ build_conv_table(unicode_db);
3064
+
3065
+ #ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
3066
+ dump_case_folding_special_cases(unicode_db);
3067
+ #endif
3068
+
3069
+ if (!outfilename) {
3070
+ #ifdef USE_TEST
3071
+ check_case_conv();
3072
+ check_flags();
3073
+ check_decompose_table();
3074
+ check_compose_table();
3075
+ check_cc_table();
3076
+ snprintf(filename, sizeof(filename), "%s/NormalizationTest.txt", unicode_db_path);
3077
+ normalization_test(filename);
3078
+ #else
3079
+ fprintf(stderr, "Tests are not compiled\n");
3080
+ exit(1);
3081
+ #endif
3082
+ } else
3083
+ {
3084
+ FILE *fo = fopen(outfilename, "wb");
3085
+
3086
+ if (!fo) {
3087
+ perror(outfilename);
3088
+ exit(1);
3089
+ }
3090
+ fprintf(fo,
3091
+ "/* Compressed unicode tables */\n"
3092
+ "/* Automatically generated file - do not edit */\n"
3093
+ "\n"
3094
+ "#include <stdint.h>\n"
3095
+ "\n");
3096
+ dump_case_conv_table(fo);
3097
+ compute_internal_props();
3098
+ build_flags_tables(fo);
3099
+ build_cc_table(fo);
3100
+ build_decompose_table(fo);
3101
+ build_general_category_table(fo);
3102
+ build_script_table(fo);
3103
+ build_script_ext_table(fo);
3104
+ build_prop_list_table(fo);
3105
+ fclose(fo);
3106
+ }
3107
+ return 0;
3108
+ }