gosu 0.13.2 → 0.13.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,699 +0,0 @@
1
- /*
2
- * Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
3
- * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
4
- *
5
- * Permission is hereby granted, free of charge, to any person obtaining a
6
- * copy of this software and associated documentation files (the "Software"),
7
- * to deal in the Software without restriction, including without limitation
8
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
- * and/or sell copies of the Software, and to permit persons to whom the
10
- * Software is furnished to do so, subject to the following conditions:
11
- *
12
- * The above copyright notice and this permission notice shall be included in
13
- * all copies or substantial portions of the Software.
14
- *
15
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21
- * DEALINGS IN THE SOFTWARE.
22
- */
23
-
24
-
25
- /**
26
- * @mainpage
27
- *
28
- * utf8proc is a free/open-source (MIT/expat licensed) C library
29
- * providing Unicode normalization, case-folding, and other operations
30
- * for strings in the UTF-8 encoding, supporting Unicode version
31
- * 9.0.0. See the utf8proc home page (http://julialang.org/utf8proc/)
32
- * for downloads and other information, or the source code on github
33
- * (https://github.com/JuliaLang/utf8proc).
34
- *
35
- * For the utf8proc API documentation, see: @ref utf8proc.h
36
- *
37
- * The features of utf8proc include:
38
- *
39
- * - Transformation of strings (@ref utf8proc_map) to:
40
- * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
41
- * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
42
- * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
43
- * - case-folding (@ref UTF8PROC_CASEFOLD)
44
- * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
45
- * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
46
- * - Character-width computation: @ref utf8proc_charwidth
47
- * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
48
- * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
49
- */
50
-
51
- /** @file */
52
-
53
- #ifndef UTF8PROC_H
54
- #define UTF8PROC_H
55
-
56
- /** @name API version
57
- *
58
- * The utf8proc API version MAJOR.MINOR.PATCH, following
59
- * semantic-versioning rules (http://semver.org) based on API
60
- * compatibility.
61
- *
62
- * This is also returned at runtime by @ref utf8proc_version; however, the
63
- * runtime version may append a string like "-dev" to the version number
64
- * for prerelease versions.
65
- *
66
- * @note The shared-library version number in the Makefile
67
- * (and CMakeLists.txt, and MANIFEST) may be different,
68
- * being based on ABI compatibility rather than API compatibility.
69
- */
70
- /** @{ */
71
- /** The MAJOR version number (increased when backwards API compatibility is broken). */
72
- #define UTF8PROC_VERSION_MAJOR 2
73
- /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
74
- #define UTF8PROC_VERSION_MINOR 1
75
- /** The PATCH version (increased for fixes that do not change the API). */
76
- #define UTF8PROC_VERSION_PATCH 0
77
- /** @} */
78
-
79
- #include <stdlib.h>
80
-
81
- #if defined(_MSC_VER) && _MSC_VER < 1800
82
- // MSVC prior to 2013 lacked stdbool.h and inttypes.h
83
- typedef signed char utf8proc_int8_t;
84
- typedef unsigned char utf8proc_uint8_t;
85
- typedef short utf8proc_int16_t;
86
- typedef unsigned short utf8proc_uint16_t;
87
- typedef int utf8proc_int32_t;
88
- typedef unsigned int utf8proc_uint32_t;
89
- # ifdef _WIN64
90
- typedef __int64 utf8proc_ssize_t;
91
- typedef unsigned __int64 utf8proc_size_t;
92
- # else
93
- typedef int utf8proc_ssize_t;
94
- typedef unsigned int utf8proc_size_t;
95
- # endif
96
- # ifndef __cplusplus
97
- // emulate C99 bool
98
- typedef unsigned char utf8proc_bool;
99
- # ifndef __bool_true_false_are_defined
100
- # define false 0
101
- # define true 1
102
- # define __bool_true_false_are_defined 1
103
- # endif
104
- # else
105
- typedef bool utf8proc_bool;
106
- # endif
107
- #else
108
- # include <stddef.h>
109
- # include <stdbool.h>
110
- # include <inttypes.h>
111
- typedef int8_t utf8proc_int8_t;
112
- typedef uint8_t utf8proc_uint8_t;
113
- typedef int16_t utf8proc_int16_t;
114
- typedef uint16_t utf8proc_uint16_t;
115
- typedef int32_t utf8proc_int32_t;
116
- typedef uint32_t utf8proc_uint32_t;
117
- typedef size_t utf8proc_size_t;
118
- typedef ptrdiff_t utf8proc_ssize_t;
119
- typedef bool utf8proc_bool;
120
- #endif
121
- #include <limits.h>
122
-
123
- #ifdef _WIN32
124
- # ifdef UTF8PROC_EXPORTS
125
- # define UTF8PROC_DLLEXPORT __declspec(dllexport)
126
- # else
127
- # define UTF8PROC_DLLEXPORT __declspec(dllimport)
128
- # endif
129
- #elif __GNUC__ >= 4
130
- # define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
131
- #else
132
- # define UTF8PROC_DLLEXPORT
133
- #endif
134
-
135
- #ifdef __cplusplus
136
- extern "C" {
137
- #endif
138
-
139
- #ifndef SSIZE_MAX
140
- #define SSIZE_MAX ((size_t)SIZE_MAX/2)
141
- #endif
142
-
143
- #ifndef UINT16_MAX
144
- # define UINT16_MAX 65535U
145
- #endif
146
-
147
- /**
148
- * Option flags used by several functions in the library.
149
- */
150
- typedef enum {
151
- /** The given UTF-8 input is NULL terminated. */
152
- UTF8PROC_NULLTERM = (1<<0),
153
- /** Unicode Versioning Stability has to be respected. */
154
- UTF8PROC_STABLE = (1<<1),
155
- /** Compatibility decomposition (i.e. formatting information is lost). */
156
- UTF8PROC_COMPAT = (1<<2),
157
- /** Return a result with composed characters. */
158
- UTF8PROC_COMPOSE = (1<<3),
159
- /** Return a result with decomposed characters. */
160
- UTF8PROC_DECOMPOSE = (1<<4),
161
- /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
162
- UTF8PROC_IGNORE = (1<<5),
163
- /** Return an error, if the input contains unassigned codepoints. */
164
- UTF8PROC_REJECTNA = (1<<6),
165
- /**
166
- * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a
167
- * line break, and should be converted to the codepoint for line
168
- * separation (LS).
169
- */
170
- UTF8PROC_NLF2LS = (1<<7),
171
- /**
172
- * Indicating that NLF-sequences are representing a paragraph break, and
173
- * should be converted to the codepoint for paragraph separation
174
- * (PS).
175
- */
176
- UTF8PROC_NLF2PS = (1<<8),
177
- /** Indicating that the meaning of NLF-sequences is unknown. */
178
- UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
179
- /** Strips and/or converts control characters.
180
- *
181
- * NLF-sequences are transformed into space, except if one of the
182
- * NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF)
183
- * are treated as a NLF-sequence in this case. All other control
184
- * characters are simply removed.
185
- */
186
- UTF8PROC_STRIPCC = (1<<9),
187
- /**
188
- * Performs unicode case folding, to be able to do a case-insensitive
189
- * string comparison.
190
- */
191
- UTF8PROC_CASEFOLD = (1<<10),
192
- /**
193
- * Inserts 0xFF bytes at the beginning of each sequence which is
194
- * representing a single grapheme cluster (see UAX#29).
195
- */
196
- UTF8PROC_CHARBOUND = (1<<11),
197
- /** Lumps certain characters together.
198
- *
199
- * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.
200
- *
201
- * If NLF2LF is set, this includes a transformation of paragraph and
202
- * line separators to ASCII line-feed (LF).
203
- */
204
- UTF8PROC_LUMP = (1<<12),
205
- /** Strips all character markings.
206
- *
207
- * This includes non-spacing, spacing and enclosing (i.e. accents).
208
- * @note This option works only with @ref UTF8PROC_COMPOSE or
209
- * @ref UTF8PROC_DECOMPOSE
210
- */
211
- UTF8PROC_STRIPMARK = (1<<13),
212
- } utf8proc_option_t;
213
-
214
- /** @name Error codes
215
- * Error codes being returned by almost all functions.
216
- */
217
- /** @{ */
218
- /** Memory could not be allocated. */
219
- #define UTF8PROC_ERROR_NOMEM -1
220
- /** The given string is too long to be processed. */
221
- #define UTF8PROC_ERROR_OVERFLOW -2
222
- /** The given string is not a legal UTF-8 string. */
223
- #define UTF8PROC_ERROR_INVALIDUTF8 -3
224
- /** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
225
- #define UTF8PROC_ERROR_NOTASSIGNED -4
226
- /** Invalid options have been used. */
227
- #define UTF8PROC_ERROR_INVALIDOPTS -5
228
- /** @} */
229
-
230
- /* @name Types */
231
-
232
- /** Holds the value of a property. */
233
- typedef utf8proc_int16_t utf8proc_propval_t;
234
-
235
- /** Struct containing information about a codepoint. */
236
- typedef struct utf8proc_property_struct {
237
- /**
238
- * Unicode category.
239
- * @see utf8proc_category_t.
240
- */
241
- utf8proc_propval_t category;
242
- utf8proc_propval_t combining_class;
243
- /**
244
- * Bidirectional class.
245
- * @see utf8proc_bidi_class_t.
246
- */
247
- utf8proc_propval_t bidi_class;
248
- /**
249
- * @anchor Decomposition type.
250
- * @see utf8proc_decomp_type_t.
251
- */
252
- utf8proc_propval_t decomp_type;
253
- utf8proc_uint16_t decomp_seqindex;
254
- utf8proc_uint16_t casefold_seqindex;
255
- utf8proc_uint16_t uppercase_seqindex;
256
- utf8proc_uint16_t lowercase_seqindex;
257
- utf8proc_uint16_t titlecase_seqindex;
258
- utf8proc_uint16_t comb_index;
259
- unsigned bidi_mirrored:1;
260
- unsigned comp_exclusion:1;
261
- /**
262
- * Can this codepoint be ignored?
263
- *
264
- * Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is
265
- * passed as an option.
266
- */
267
- unsigned ignorable:1;
268
- unsigned control_boundary:1;
269
- /** The width of the codepoint. */
270
- unsigned charwidth:2;
271
- unsigned pad:2;
272
- /**
273
- * Boundclass.
274
- * @see utf8proc_boundclass_t.
275
- */
276
- unsigned boundclass:8;
277
- } utf8proc_property_t;
278
-
279
- /** Unicode categories. */
280
- typedef enum {
281
- UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
282
- UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */
283
- UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */
284
- UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */
285
- UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */
286
- UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */
287
- UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */
288
- UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */
289
- UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */
290
- UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */
291
- UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */
292
- UTF8PROC_CATEGORY_NO = 11, /**< Number, other */
293
- UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */
294
- UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */
295
- UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */
296
- UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */
297
- UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */
298
- UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */
299
- UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */
300
- UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */
301
- UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */
302
- UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */
303
- UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */
304
- UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */
305
- UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */
306
- UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */
307
- UTF8PROC_CATEGORY_CC = 26, /**< Other, control */
308
- UTF8PROC_CATEGORY_CF = 27, /**< Other, format */
309
- UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */
310
- UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */
311
- } utf8proc_category_t;
312
-
313
- /** Bidirectional character classes. */
314
- typedef enum {
315
- UTF8PROC_BIDI_CLASS_L = 1, /**< Left-to-Right */
316
- UTF8PROC_BIDI_CLASS_LRE = 2, /**< Left-to-Right Embedding */
317
- UTF8PROC_BIDI_CLASS_LRO = 3, /**< Left-to-Right Override */
318
- UTF8PROC_BIDI_CLASS_R = 4, /**< Right-to-Left */
319
- UTF8PROC_BIDI_CLASS_AL = 5, /**< Right-to-Left Arabic */
320
- UTF8PROC_BIDI_CLASS_RLE = 6, /**< Right-to-Left Embedding */
321
- UTF8PROC_BIDI_CLASS_RLO = 7, /**< Right-to-Left Override */
322
- UTF8PROC_BIDI_CLASS_PDF = 8, /**< Pop Directional Format */
323
- UTF8PROC_BIDI_CLASS_EN = 9, /**< European Number */
324
- UTF8PROC_BIDI_CLASS_ES = 10, /**< European Separator */
325
- UTF8PROC_BIDI_CLASS_ET = 11, /**< European Number Terminator */
326
- UTF8PROC_BIDI_CLASS_AN = 12, /**< Arabic Number */
327
- UTF8PROC_BIDI_CLASS_CS = 13, /**< Common Number Separator */
328
- UTF8PROC_BIDI_CLASS_NSM = 14, /**< Nonspacing Mark */
329
- UTF8PROC_BIDI_CLASS_BN = 15, /**< Boundary Neutral */
330
- UTF8PROC_BIDI_CLASS_B = 16, /**< Paragraph Separator */
331
- UTF8PROC_BIDI_CLASS_S = 17, /**< Segment Separator */
332
- UTF8PROC_BIDI_CLASS_WS = 18, /**< Whitespace */
333
- UTF8PROC_BIDI_CLASS_ON = 19, /**< Other Neutrals */
334
- UTF8PROC_BIDI_CLASS_LRI = 20, /**< Left-to-Right Isolate */
335
- UTF8PROC_BIDI_CLASS_RLI = 21, /**< Right-to-Left Isolate */
336
- UTF8PROC_BIDI_CLASS_FSI = 22, /**< First Strong Isolate */
337
- UTF8PROC_BIDI_CLASS_PDI = 23, /**< Pop Directional Isolate */
338
- } utf8proc_bidi_class_t;
339
-
340
- /** Decomposition type. */
341
- typedef enum {
342
- UTF8PROC_DECOMP_TYPE_FONT = 1, /**< Font */
343
- UTF8PROC_DECOMP_TYPE_NOBREAK = 2, /**< Nobreak */
344
- UTF8PROC_DECOMP_TYPE_INITIAL = 3, /**< Initial */
345
- UTF8PROC_DECOMP_TYPE_MEDIAL = 4, /**< Medial */
346
- UTF8PROC_DECOMP_TYPE_FINAL = 5, /**< Final */
347
- UTF8PROC_DECOMP_TYPE_ISOLATED = 6, /**< Isolated */
348
- UTF8PROC_DECOMP_TYPE_CIRCLE = 7, /**< Circle */
349
- UTF8PROC_DECOMP_TYPE_SUPER = 8, /**< Super */
350
- UTF8PROC_DECOMP_TYPE_SUB = 9, /**< Sub */
351
- UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */
352
- UTF8PROC_DECOMP_TYPE_WIDE = 11, /**< Wide */
353
- UTF8PROC_DECOMP_TYPE_NARROW = 12, /**< Narrow */
354
- UTF8PROC_DECOMP_TYPE_SMALL = 13, /**< Small */
355
- UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */
356
- UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
357
- UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
358
- } utf8proc_decomp_type_t;
359
-
360
- /** Boundclass property. (TR29) */
361
- typedef enum {
362
- UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
363
- UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
364
- UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */
365
- UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */
366
- UTF8PROC_BOUNDCLASS_CONTROL = 4, /**< Control */
367
- UTF8PROC_BOUNDCLASS_EXTEND = 5, /**< Extend */
368
- UTF8PROC_BOUNDCLASS_L = 6, /**< L */
369
- UTF8PROC_BOUNDCLASS_V = 7, /**< V */
370
- UTF8PROC_BOUNDCLASS_T = 8, /**< T */
371
- UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */
372
- UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
373
- UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
374
- UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
375
- UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
376
- UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
377
- UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
378
- UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
379
- UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
380
- UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
381
- } utf8proc_boundclass_t;
382
-
383
- /**
384
- * Function pointer type passed to @ref utf8proc_map_custom and
385
- * @ref utf8proc_decompose_custom, which is used to specify a user-defined
386
- * mapping of codepoints to be applied in conjunction with other mappings.
387
- */
388
- typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
389
-
390
- /**
391
- * Array containing the byte lengths of a UTF-8 encoded codepoint based
392
- * on the first byte.
393
- */
394
- UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
395
-
396
- /**
397
- * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
398
- * (http://semver.org format), possibly with a "-dev" suffix for
399
- * development versions.
400
- */
401
- UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
402
-
403
- /**
404
- * Returns an informative error string for the given utf8proc error code
405
- * (e.g. the error codes returned by @ref utf8proc_map).
406
- */
407
- UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
408
-
409
- /**
410
- * Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
411
- * The maximum number of bytes read is `strlen`, unless `strlen` is
412
- * negative (in which case up to 4 bytes are read).
413
- *
414
- * If a valid codepoint could be read, it is stored in the variable
415
- * pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
416
- * In case of success, the number of bytes read is returned; otherwise, a
417
- * negative error code is returned.
418
- */
419
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
420
-
421
- /**
422
- * Check if a codepoint is valid (regardless of whether it has been
423
- * assigned a value by the current Unicode standard).
424
- *
425
- * @return 1 if the given `codepoint` is valid and otherwise return 0.
426
- */
427
- UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
428
-
429
- /**
430
- * Encodes the codepoint as an UTF-8 string in the byte array pointed
431
- * to by `dst`. This array must be at least 4 bytes long.
432
- *
433
- * In case of success the number of bytes written is returned, and
434
- * otherwise 0 is returned.
435
- *
436
- * This function does not check whether `codepoint` is valid Unicode.
437
- */
438
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
439
-
440
- /**
441
- * Look up the properties for a given codepoint.
442
- *
443
- * @param codepoint The Unicode codepoint.
444
- *
445
- * @returns
446
- * A pointer to a (constant) struct containing information about
447
- * the codepoint.
448
- * @par
449
- * If the codepoint is unassigned or invalid, a pointer to a special struct is
450
- * returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
451
- */
452
- UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
453
-
454
- /** Decompose a codepoint into an array of codepoints.
455
- *
456
- * @param codepoint the codepoint.
457
- * @param dst the destination buffer.
458
- * @param bufsize the size of the destination buffer.
459
- * @param options one or more of the following flags:
460
- * - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned
461
- * - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints
462
- * - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding
463
- * - @ref UTF8PROC_COMPAT - replace certain codepoints with their
464
- * compatibility decomposition
465
- * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
466
- * - @ref UTF8PROC_LUMP - lump certain different codepoints together
467
- * - @ref UTF8PROC_STRIPMARK - remove all character marks
468
- * @param last_boundclass
469
- * Pointer to an integer variable containing
470
- * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
471
- * option is used. Otherwise, this parameter is ignored.
472
- *
473
- * @return
474
- * In case of success, the number of codepoints written is returned; in case
475
- * of an error, a negative error code is returned (@ref utf8proc_errmsg).
476
- * @par
477
- * If the number of written codepoints would be bigger than `bufsize`, the
478
- * required buffer size is returned, while the buffer will be overwritten with
479
- * undefined data.
480
- */
481
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
482
- utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
483
- utf8proc_option_t options, int *last_boundclass
484
- );
485
-
486
- /**
487
- * The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8
488
- * string and orders the decomposed sequences correctly.
489
- *
490
- * If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing
491
- * will be stopped, when a NULL byte is encountered, otherwise `strlen`
492
- * bytes are processed. The result (in the form of 32-bit unicode
493
- * codepoints) is written into the buffer being pointed to by
494
- * `buffer` (which must contain at least `bufsize` entries). In case of
495
- * success, the number of codepoints written is returned; in case of an
496
- * error, a negative error code is returned (@ref utf8proc_errmsg).
497
- * See @ref utf8proc_decompose_custom to supply additional transformations.
498
- *
499
- * If the number of written codepoints would be bigger than `bufsize`, the
500
- * required buffer size is returned, while the buffer will be overwritten with
501
- * undefined data.
502
- */
503
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
504
- const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
505
- utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
506
- );
507
-
508
- /**
509
- * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
510
- * that is called on each codepoint in `str` before any other transformations
511
- * (along with a `custom_data` pointer that is passed through to `custom_func`).
512
- * The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
513
- */
514
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
515
- const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
516
- utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
517
- utf8proc_custom_func custom_func, void *custom_data
518
- );
519
-
520
- /**
521
- * Normalizes the sequence of `length` codepoints pointed to by `buffer`
522
- * in-place (i.e., the result is also stored in `buffer`).
523
- *
524
- * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
525
- * @param length the length (in codepoints) of the buffer.
526
- * @param options a bitwise or (`|`) of one or more of the following flags:
527
- * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
528
- * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
529
- * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
530
- * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
531
- * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
532
- * codepoints
533
- * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
534
- * the unicode versioning stability
535
- *
536
- * @return
537
- * In case of success, the length (in codepoints) of the normalized UTF-32 string is
538
- * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
539
- *
540
- * @warning The entries of the array pointed to by `buffer` have to be in the
541
- * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
542
- */
543
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
544
-
545
- /**
546
- * Reencodes the sequence of `length` codepoints pointed to by `buffer`
547
- * as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
548
- * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
549
- *
550
- * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
551
- * @param length the length (in codepoints) of the buffer.
552
- * @param options a bitwise or (`|`) of one or more of the following flags:
553
- * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
554
- * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
555
- * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
556
- * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
557
- * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
558
- * codepoints
559
- * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
560
- * the unicode versioning stability
561
- * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
562
- *
563
- * @return
564
- * In case of success, the length (in bytes) of the resulting nul-terminated
565
- * UTF-8 string is returned; otherwise, a negative error code is returned
566
- * (@ref utf8proc_errmsg).
567
- *
568
- * @warning The amount of free space pointed to by `buffer` must
569
- * exceed the amount of the input data by one byte, and the
570
- * entries of the array pointed to by `buffer` have to be in the
571
- * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
572
- */
573
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
574
-
575
- /**
576
- * Given a pair of consecutive codepoints, return whether a grapheme break is
577
- * permitted between them (as defined by the extended grapheme clusters in UAX#29).
578
- *
579
- * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
580
- * state to break graphemes. This state can be passed in as a pointer
581
- * in the `state` argument and should initially be set to 0. If the
582
- * state is not passed in (i.e. a null pointer is passed), UAX#29 rules
583
- * GB10/12/13 which require this state will not be applied, essentially
584
- * matching the rules in Unicode 8.0.0.
585
- *
586
- * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
587
- * be called IN ORDER on ALL potential breaks in a string.
588
- */
589
- UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
590
- utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
591
-
592
- /**
593
- * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
594
- * Unicode 9 additions to the algorithm. Supported for legacy reasons.
595
- */
596
- UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
597
- utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
598
-
599
-
600
- /**
601
- * Given a codepoint `c`, return the codepoint of the corresponding
602
- * lower-case character, if any; otherwise (if there is no lower-case
603
- * variant, or if `c` is not a valid codepoint) return `c`.
604
- */
605
- UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
606
-
607
- /**
608
- * Given a codepoint `c`, return the codepoint of the corresponding
609
- * upper-case character, if any; otherwise (if there is no upper-case
610
- * variant, or if `c` is not a valid codepoint) return `c`.
611
- */
612
- UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
613
-
614
- /**
615
- * Given a codepoint `c`, return the codepoint of the corresponding
616
- * title-case character, if any; otherwise (if there is no title-case
617
- * variant, or if `c` is not a valid codepoint) return `c`.
618
- */
619
- UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
620
-
621
- /**
622
- * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
623
- * except that a width of 0 is returned for non-printable codepoints
624
- * instead of -1 as in `wcwidth`.
625
- *
626
- * @note
627
- * If you want to check for particular types of non-printable characters,
628
- * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
629
- UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
630
-
631
- /**
632
- * Return the Unicode category for the codepoint (one of the
633
- * @ref utf8proc_category_t constants.)
634
- */
635
- UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
636
-
637
- /**
638
- * Return the two-letter (nul-terminated) Unicode category string for
639
- * the codepoint (e.g. `"Lu"` or `"Co"`).
640
- */
641
- UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
642
-
643
- /**
644
- * Maps the given UTF-8 string pointed to by `str` to a new UTF-8
645
- * string, allocated dynamically by `malloc` and returned via `dstptr`.
646
- *
647
- * If the @ref UTF8PROC_NULLTERM flag in the `options` field is set,
648
- * the length is determined by a NULL terminator, otherwise the
649
- * parameter `strlen` is evaluated to determine the string length, but
650
- * in any case the result will be NULL terminated (though it might
651
- * contain NULL characters within the string if `str` contained NULL
652
- * characters). Other flags in the `options` field are passed to the
653
- * functions defined above, and regarded as described. See also
654
- * @ref utf8proc_map_custom to supply a custom codepoint transformation.
655
- *
656
- * In case of success the length of the new string is returned,
657
- * otherwise a negative error code is returned.
658
- *
659
- * @note The memory of the new UTF-8 string will have been allocated
660
- * with `malloc`, and should therefore be deallocated with `free`.
661
- */
662
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
663
- const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
664
- );
665
-
666
- /**
667
- * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
668
- * that is called on each codepoint in `str` before any other transformations
669
- * (along with a `custom_data` pointer that is passed through to `custom_func`).
670
- * The `custom_func` argument is ignored if it is `NULL`.
671
- */
672
- UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
673
- const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
674
- utf8proc_custom_func custom_func, void *custom_data
675
- );
676
-
677
- /** @name Unicode normalization
678
- *
679
- * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
680
- * normalized version of the null-terminated string `str`. These
681
- * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
682
- * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
683
- */
684
- /** @{ */
685
- /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
686
- UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
687
- /** NFC normalization (@ref UTF8PROC_COMPOSE). */
688
- UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
689
- /** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
690
- UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
691
- /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
692
- UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
693
- /** @} */
694
-
695
- #ifdef __cplusplus
696
- }
697
- #endif
698
-
699
- #endif