u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,68 @@
1
+ #include <ruby.h>
2
+ #include <assert.h>
3
+ #include <stdbool.h>
4
+ #include <stddef.h>
5
+ #include <stdint.h>
6
+
7
+ #include "u.h"
8
+
9
+ #include "private.h"
10
+
11
+ #include "data/case-folding.h"
12
+ #include "output.h"
13
+ #include "u_locale.h"
14
+
15
+ static inline void
16
+ foldcase(uint32_t c, struct output *output)
17
+ {
18
+ size_t i;
19
+ if (unicode_table_lookup(casefold_table, c, &i))
20
+ output_zstring(output, casefold_table[i].data);
21
+ else
22
+ output_char(output, u_char_downcase(c));
23
+ }
24
+
25
/* Decode the character at ‘p’, append its case-folded form to ‘output’ and
 * return the position that u_decode() reports as following it. */
static inline const char *
foldcase_step(const char *p, const char *end, struct output *output)
{
	const char *next;
	uint32_t c = u_decode(&next, p, end);

	foldcase(c, output);
	return next;
}
32
+
33
+ #define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
34
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((uint32_t)0x0131)
35
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
36
+ #define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
37
+
38
+ static inline const char *
39
+ foldcase_step_turkic(const char *p, const char *end, struct output *output)
40
+ {
41
+ const char *q;
42
+ uint32_t c = u_decode(&q, p, end);
43
+ if (c == LATIN_CAPITAL_LETTER_I)
44
+ output_char(output, LATIN_SMALL_LETTER_DOTLESS_I);
45
+ else if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
46
+ output_char(output, LATIN_SMALL_LETTER_I);
47
+ else
48
+ foldcase(c, output);
49
+ return q;
50
+ }
51
+
52
+ size_t
53
+ u_foldcase(char *result, size_t m, const char *string, size_t n,
54
+ const char *locale)
55
+ {
56
+ assert(string != NULL);
57
+ assert(result != NULL || m == 0);
58
+ enum locale l = _u_locale_from_string(locale);
59
+ const char *end = string + n;
60
+ struct output output = OUTPUT_INIT(result, m);
61
+ if (l == LOCALE_TURKIC)
62
+ for (const char *p = string; p < end; )
63
+ p = foldcase_step_turkic(p, end, &output);
64
+ else
65
+ for (const char *p = string; p < end; )
66
+ p = foldcase_step(p, end, &output);
67
+ return output_finalize(&output);
68
+ }
@@ -0,0 +1,57 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+ /* Grapheme-cluster boundary automaton used by u_grapheme_clusters().
 * ROW() builds one table row with a designated initializer per
 * U_GRAPHEME_BREAK_* value, so cells are addressed by property name rather
 * than by argument position.  Each cell holds the next state in its low four
 * bits; K() additionally sets bit 4, which u_grapheme_clusters() reads as
 * "do not start a new cluster before this character".  The trailing comment
 * on each row names the state(s) that the row stands for. */
9
+ #define ROW(other, cr, lf, control, l, v, lv, lvt, t, regional_indicator, \
10
+ prepend, extend, spacingmark) \
11
+ { [U_GRAPHEME_BREAK_OTHER] = other, \
12
+ [U_GRAPHEME_BREAK_CR] = cr, \
13
+ [U_GRAPHEME_BREAK_LF] = lf, \
14
+ [U_GRAPHEME_BREAK_CONTROL] = control, \
15
+ [U_GRAPHEME_BREAK_L] = l, \
16
+ [U_GRAPHEME_BREAK_V] = v, \
17
+ [U_GRAPHEME_BREAK_LV] = lv, \
18
+ [U_GRAPHEME_BREAK_LVT] = lvt, \
19
+ [U_GRAPHEME_BREAK_T] = t, \
20
+ [U_GRAPHEME_BREAK_REGIONAL_INDICATOR] = regional_indicator, \
21
+ [U_GRAPHEME_BREAK_EXTEND] = extend, \
22
+ [U_GRAPHEME_BREAK_SPACINGMARK] = spacingmark, \
23
+ [U_GRAPHEME_BREAK_PREPEND] = prepend }
24
+ #define K(s) (s | (1 << 4))
25
+ static const uint8_t gb_dfa[][U_GRAPHEME_BREAK_V + 1] = {
26
+ ROW( 0 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 , 6 , 7 ,K(0),K(0)), // Other | Extend | SpacingMark
27
+ ROW( 0 , 1 ,K(2), 2 , 3 , 4 , 4 , 5 , 5 , 6 , 7 , 0 , 0 ), // CR
28
+ ROW( 0 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 , 6 , 7 , 0 , 0 ), // LF | Control
29
+ ROW( 0 , 1 , 2 , 2 ,K(3),K(4),K(4),K(5), 5 , 6 , 7 ,K(0),K(0)), // L
30
+ ROW( 0 , 1 , 2 , 2 , 3 ,K(4), 4 , 5 ,K(5), 6 , 7 ,K(0),K(0)), // V | LV
31
+ ROW( 0 , 1 , 2 , 2 , 3 , 4 , 4 , 5 ,K(5), 6 , 7 ,K(0),K(0)), // LVT | T
32
+ ROW( 0 , 1 , 2 , 2 , 3 , 4 , 4 , 5 , 5 ,K(6), 7 ,K(0),K(0)), // Regional_Indicator
33
+ ROW(K(0), 1 , 2 , 2 ,K(3),K(4),K(4),K(5),K(5),K(6),K(0),K(0),K(0)), // Prepend
34
+ };
35
+ #undef K
36
+
37
+ void
38
+ u_grapheme_clusters(const char *string, size_t n, u_substring_fn fn, void *closure)
39
+ {
40
+ const char *p = string;
41
+ const char *q = p;
42
+ const char *end = p + n;
43
+ uint8_t state = 2;
44
+ while (q < end) {
45
+ const char *r;
46
+ uint32_t c = u_decode(&r, q, end);
47
+ state = gb_dfa[state & 0xf][u_char_grapheme_break(c)];
48
+ if (state >> 4 != 1) {
49
+ if (p < q)
50
+ fn(p, q - p, closure);
51
+ p = q;
52
+ }
53
+ q = r;
54
+ }
55
+ if (p < q)
56
+ fn(p, q - p, closure);
57
+ }
@@ -0,0 +1,27 @@
1
+ #include <assert.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include <stdbool.h>
5
+
6
+ #include "u.h"
7
+ #include "private.h"
8
+
9
+
10
/* {{{1
 * Report whether the NUL-terminated string ‘str’ starts with the
 * NUL-terminated string ‘prefix’.  An empty ‘prefix’ matches any string.
 */
bool
u_has_prefix(const char *str, const char *prefix)
{
	assert(str != NULL);
	assert(prefix != NULL);

	for (;; str++, prefix++) {
		if (*prefix == '\0')
			return true;
		if (*str == '\0' || *str != *prefix)
			return false;
	}
}
@@ -0,0 +1,93 @@
1
+ #include <assert.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+ #include <string.h>
5
+
6
+ #include "u.h"
7
+ #include "private.h"
8
+
9
+ /* {{{1
10
+ * Retrieve the offset/index of ‘needle’ in ‘haystack’ which is of size
11
+ * ‘haystack_len’.
12
+ */
13
+ static U_PURE size_t
14
+ str_index_n(const char *haystack, const char *needle, size_t haystack_n)
15
+ {
16
+ assert(haystack != NULL);
17
+ assert(needle != NULL);
18
+
19
+ size_t needle_n = strlen(needle);
20
+
21
+ if (needle_n == 0)
22
+ return 0;
23
+
24
+ if (haystack_n < needle_n)
25
+ return -1;
26
+
27
+ const char *end = haystack + haystack_n - needle_n;
28
+ for (const char *p = haystack; *p != '\0' && p <= end; p++) {
29
+ size_t i;
30
+
31
+ for (i = 0; i < needle_n; i++) {
32
+ if (p[i] != needle[i])
33
+ break;
34
+ }
35
+
36
+ if (i == needle_n)
37
+ return p - haystack;
38
+ }
39
+
40
+ return -1;
41
+ }
42
+
43
+
44
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘c’ in the
 * NUL-terminated string ‘str’, or (size_t)-1 if it doesn't exist.
 */
size_t
u_char_index(const char *str, uint32_t c)
{
	char encoded[7];

	/* Encode ‘c’ and terminate it so that strstr() can search for it. */
	encoded[u_char_to_u(c, encoded)] = '\0';

	const char *hit = strstr(str, encoded);
	if (hit == NULL)
		return -1;
	return hit - str;
}
57
+
58
+
59
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘c’ in ‘str’, or
 * (size_t)-1 if it doesn't exist, going over at most ‘n’ bytes in ‘str’.
 */
size_t
u_char_index_n(const char *str, uint32_t c, size_t n)
{
	char encoded[7];

	/* Encode ‘c’ as a NUL-terminated needle for str_index_n(). */
	encoded[u_char_to_u(c, encoded)] = '\0';

	const size_t offset = str_index_n(str, encoded, n);
	return offset;
}
72
+
73
+
74
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or (size_t)-1 if it doesn't exist.  Both arguments must be
 * NUL-terminated strings.  An empty ‘needle’ is found at offset 0.
 */
size_t
u_index(const char *haystack, const char *needle)
{
	assert(haystack != NULL);
	assert(needle != NULL);

	/* strstr() returns NULL when there is no match; subtracting
	 * ‘haystack’ from NULL is undefined and does not yield -1. */
	const char *match = strstr(haystack, needle);
	if (match == NULL)
		return (size_t)-1;
	return (size_t)(match - haystack);
}
83
+
84
+
85
/* {{{1
 * Retrieve the byte offset of the left-most occurrence of ‘needle’ in
 * ‘haystack’, or (size_t)-1 if it doesn't exist, going over at most ‘n’
 * bytes in ‘haystack’.
 */
size_t
u_index_n(const char *haystack, const char *needle, size_t n)
{
	const size_t offset = str_index_n(haystack, needle, n);
	return offset;
}
@@ -0,0 +1,33 @@
1
+ #include <stddef.h>
2
+ #include <stdbool.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
/* Report whether every byte examined in ‘string’ is at most 127.  The
 * P_WITHIN_STR() macro decides, from ‘use_n’, how far the scan extends;
 * presumably up to ‘end’ when ‘use_n’ is set and up to the terminating NUL
 * otherwise (see its definition in private.h to confirm). */
static bool
u_is_ascii_only_impl(const char *string, size_t n, bool use_n)
{
	const char *end = string + n;

	for (const char *p = string; P_WITHIN_STR(p, end, use_n); p++) {
		/* Inspect the byte as unsigned without discarding const. */
		if (*(const unsigned char *)p > 127)
			return false;
	}

	return true;
}
22
+
23
/* Report whether ‘string’ contains only bytes in the range 0–127.  The
 * length argument passed on is 0 and ‘use_n’ is false. */
bool
u_is_ascii_only(const char *string)
{
	const bool only_ascii = u_is_ascii_only_impl(string, 0, false);
	return only_ascii;
}
28
+
29
/* Report whether ‘string’ contains only bytes in the range 0–127, passing
 * ‘n’ as the length with ‘use_n’ set to true. */
bool
u_is_ascii_only_n(const char *string, size_t n)
{
	const bool only_ascii = u_is_ascii_only_impl(string, n, true);
	return only_ascii;
}
@@ -0,0 +1,40 @@
1
+ #include <locale.h>
2
+ #include <stdbool.h>
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+
6
+ #include "u.h"
7
+ #include "u_locale.h"
8
+ #include "private.h"
9
+
10
+ /* {{{1
11
+ * Retrieve the locale type from the environment (LC_CTYPE).
12
+ */
13
+ enum locale
14
+ _u_locale(void)
15
+ {
16
+ const char *locale = setlocale(LC_CTYPE, NULL);
17
+ return locale == NULL ? LOCALE_NORMAL : _u_locale_from_string(locale);
18
+ }
19
+
20
+ enum locale
21
+ _u_locale_from_string(const char *locale)
22
+ {
23
+ if (locale == NULL)
24
+ return _u_locale();
25
+
26
+ if (locale[0] == '\0')
27
+ return LOCALE_NORMAL;
28
+
29
+ if ((locale[0] == 'a' && locale[1] == 'z') ||
30
+ (locale[0] == 't' && locale[1] == 'r'))
31
+ return LOCALE_TURKIC;
32
+
33
+ if (locale[0] == 'l' && locale[1] == 't')
34
+ return LOCALE_LITHUANIAN;
35
+
36
+ if (locale[0] == 'n' && locale[1] == 'l')
37
+ return LOCALE_DUTCH;
38
+
39
+ return LOCALE_NORMAL;
40
+ }
@@ -0,0 +1,14 @@
1
+ /* {{{1
2
+ * enum locale: The locale classes distinguished by this library, as
3
+ * returned by _u_locale() and _u_locale_from_string().
4
+ */
5
+ enum locale {
6
+ LOCALE_NORMAL, /* any locale name not matched below */
7
+ LOCALE_TURKIC, /* locale names beginning with "az" or "tr" */
8
+ LOCALE_LITHUANIAN, /* locale names beginning with "lt" */
9
+ LOCALE_DUTCH, /* locale names beginning with "nl" */
10
+ };
11
+
12
+
13
+ enum locale _u_locale(void); /* classifies the current LC_CTYPE locale */
14
+ enum locale _u_locale_from_string(const char *locale); /* NULL falls back to _u_locale() */
@@ -0,0 +1,20 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ #include <assert.h>
7
+
8
+ #include <string.h>
9
+ #include "output.h"
10
+
11
+ size_t
12
+ u_mirror(char *result, size_t m, const char *u, size_t n)
13
+ {
14
+ assert(u != NULL);
15
+ assert(result != NULL || m == 0);
16
+ struct output o = OUTPUT_INIT(result, m);
17
+ for (const char *p = u, *end = u + n; p < end; )
18
+ output_char(&o, u_char_mirror(u_decode(&p, p, end)));
19
+ return output_finalize(&o);
20
+ }
@@ -0,0 +1,16 @@
1
+ #include <stdint.h>
2
+ #include <stdbool.h>
3
+ #include <string.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
/* {{{1
 * Retrieve the number of bytes in the NUL-terminated UTF-8 string ‘str’,
 * not counting the terminator.
 */
size_t
u_n_bytes(const char *str)
{
	const size_t bytes = strlen(str);
	return bytes;
}
@@ -0,0 +1,43 @@
1
+ #include <assert.h>
2
+ #include <stdbool.h>
3
+ #include <stddef.h>
4
+ #include <stdint.h>
5
+ #include <string.h>
6
+
7
+ #include "u.h"
8
+ #include "private.h"
9
+
10
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in the
 * NUL-terminated string ‘str’.
 */
size_t
u_n_chars(const char *str)
{
	assert(str != NULL);

	const char *const end = str + strlen(str);
	const char *p = str;
	size_t count = 0;

	while (*p != '\0') {
		u_decode(&p, p, end);
		count++;
	}
	return count;
}
22
+
23
+
24
/* {{{1
 * Retrieve the number of UTF-8 encoded Unicode characters in ‘str’,
 * examining ‘n’ bytes.  ‘str’ may be NULL only when ‘n’ is 0.
 */
size_t
u_n_chars_n(const char *str, size_t n)
{
	assert(str != NULL || n == 0);
	if (n == 0)
		return 0;

	const char *const end = str + n;
	size_t count = 0;

	for (const char *p = str; p < end; count++)
		u_decode(&p, p, end);

	return count;
}
@@ -0,0 +1,232 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ #include "data/constants.h"
7
+ #include "data/decompose.h"
8
+ #include "data/compose.h"
9
+ #include "private.h"
10
+
11
+ #include <string.h>
12
+ #include "output.h"
13
+ /* Constants for the arithmetic Hangul syllable handling in decompose() and
 * compose_hangul().  NOTE(review): names and values appear to follow the
 * Hangul syllable algorithm of the Unicode Standard; confirm against it. */
14
+ enum {
15
+ SBase = 0xac00, /* first code point that decompose() splits arithmetically */
16
+ LBase = 0x1100, /* base added to the first output of a split syllable */
17
+ VBase = 0x1161, /* base added to the second output of a split syllable */
18
+ TBase = 0x11a7, /* base for the optional third output; TBase itself means "none" */
19
+ LCount = 19,
20
+ VCount = 21,
21
+ TCount = 28,
22
+ NCount = (VCount * TCount),
23
+ SCount = (LCount * NCount),
24
+ SLast = (SBase + SCount - 1) /* last code point that decompose() splits arithmetically */
25
+ };
26
+
27
+ static const char *
28
+ compatible(size_t i)
29
+ {
30
+ uint16_t j = decomp_table[i].compat_offset;
31
+ return &decomp_expansion_string[j == UNICODE_NOT_PRESENT_OFFSET ?
32
+ decomp_table[i].canon_offset :
33
+ j];
34
+ }
35
+
36
+ static const char *
37
+ canonical(size_t i)
38
+ {
39
+ uint16_t j = decomp_table[i].canon_offset;
40
+ return j == UNICODE_NOT_PRESENT_OFFSET ? NULL : &decomp_expansion_string[j];
41
+ }
42
+
43
+ static void
44
+ decompose(const char *u, const char *end, enum u_normalization_form form,
45
+ struct output *o)
46
+ {
47
+ const char *(*decompose)(size_t) =
48
+ (form == U_NORMALIZATION_FORM_KC ||
49
+ form == U_NORMALIZATION_FORM_KD) ? compatible : canonical;
50
+ for (const char *p = u; p < end; ) {
51
+ uint32_t c = u_decode(&p, p, end);
52
+ if (SBase <= c && c <= SLast) {
53
+ int SIndex = c - SBase;
54
+ output_char(o, LBase + SIndex / NCount);
55
+ output_char(o, VBase + (SIndex % NCount) / TCount);
56
+ uint32_t T = TBase + SIndex % TCount;
57
+ if (T != TBase)
58
+ output_char(o, T);
59
+ } else {
60
+ size_t i;
61
+ const char *d;
62
+ if (unicode_table_lookup(decomp_table, c, &i) &&
63
+ (d = decompose(i)) != NULL)
64
+ output_zstring(o, d);
65
+ else
66
+ output_char(o, c);
67
+ }
68
+ }
69
+ }
70
+ /* Move the character occupying [p, q) backwards within the buffer starting
 * at ‘begin’, past every directly preceding character whose canonical
 * combining class is greater than ‘ccc’.  The skipped characters are
 * shifted forward by the moved character's byte length, so the total
 * length of the buffer is unchanged. */
71
+ static inline void
72
+ canonical_swap(char *begin, char *p, char *q,
73
+ enum u_canonical_combining_class ccc)
74
+ {
75
+ char *r = p; /* insertion point; walks backwards from p */
76
+ char *s; /* set by u_decode_r(); presumably the start of the character ending at r (confirm in u.h) */
77
+ while (begin < r &&
78
+ u_char_canonical_combining_class(u_decode_r((const char **)&s,
79
+ begin, r)) > ccc)
80
+ r = s;
81
+ char buf[U_CHAR_MAX_BYTE_LENGTH]; /* holds the bytes of the character being moved */
82
+ size_t n = q - p;
83
+ memcpy(buf, p, n); /* save the character before it is overwritten */
84
+ memmove(r + n, r, p - r); /* shift [r, p) forward by n bytes */
85
+ memcpy(r, buf, n); /* drop the saved character into the gap */
86
+ }
87
+
88
+ static inline bool
89
+ canonical_reorder(char *begin, char *end)
90
+ {
91
+ bool swapped = false;
92
+ char *p;
93
+ uint32_t c = u_decode((const char **)&p, begin, end);
94
+ enum u_canonical_combining_class pcc = u_char_canonical_combining_class(c);
95
+ while (p < end) {
96
+ char *q;
97
+ enum u_canonical_combining_class cc =
98
+ u_char_canonical_combining_class(u_decode((const char **)&q, p, end));
99
+ if (cc != 0 && pcc > cc) {
100
+ canonical_swap(begin, p, q, cc);
101
+ swapped = true;
102
+ } else
103
+ pcc = cc;
104
+ p = q;
105
+ }
106
+ return swapped;
107
+ }
108
+
109
/* Repeat canonical_reorder() over the ‘n’ bytes at ‘begin’ until a pass
 * moves nothing. */
static void
canonical_order(char *begin, size_t n)
{
	char *const end = begin + n;
	bool again;

	do {
		again = canonical_reorder(begin, end);
	} while (again);
}
116
+
117
+ static inline bool
118
+ compose_hangul(uint32_t a, uint32_t b, uint32_t *result)
119
+ {
120
+ int LIndex = a - LBase;
121
+ if (0 <= LIndex && LIndex < LCount) {
122
+ int VIndex = b - VBase;
123
+ if (0 <= VIndex && VIndex < VCount) {
124
+ *result = SBase + (LIndex * VCount + VIndex) * TCount;
125
+ return true;
126
+ }
127
+ }
128
+
129
+ int SIndex = a - SBase;
130
+ if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0) {
131
+ int TIndex = b - TBase;
132
+ if (0 < TIndex && TIndex < TCount) {
133
+ *result = a + TIndex;
134
+ return true;
135
+ }
136
+ }
137
+
138
+ return false;
139
+ }
140
+
141
+ static inline uint16_t
142
+ compose_index(uint32_t c)
143
+ {
144
+ unsigned int page = c >> 8;
145
+ if (page > COMPOSE_TABLE_LAST)
146
+ return 0;
147
+ int16_t i = compose_table[page];
148
+ if (i >= UNICODE_MAX_TABLE_INDEX)
149
+ return i - UNICODE_MAX_TABLE_INDEX;
150
+ return compose_data[i][c & 0xff];
151
+ }
152
+
153
+ static inline bool
154
+ compose_2(uint32_t a, uint32_t b, uint32_t *result)
155
+ {
156
+ if (compose_hangul(a, b, result))
157
+ return true;
158
+
159
+ uint16_t i = compose_index(a);
160
+ if (COMPOSE_FIRST_SINGLE_START <= i && i < COMPOSE_SECOND_START) {
161
+ if (b != compose_first_single[i - COMPOSE_FIRST_SINGLE_START][0])
162
+ return false;
163
+ *result = compose_first_single[i - COMPOSE_FIRST_SINGLE_START][1];
164
+ return true;
165
+ }
166
+
167
+ uint16_t j = compose_index(b);
168
+ if (COMPOSE_SECOND_SINGLE_START <= j) {
169
+ if (a != compose_second_single[j - COMPOSE_SECOND_SINGLE_START][0])
170
+ return false;
171
+ *result = compose_second_single[j - COMPOSE_SECOND_SINGLE_START][1];
172
+ return true;
173
+ }
174
+
175
+ if (COMPOSE_FIRST_START <= i && i < COMPOSE_FIRST_SINGLE_START &&
176
+ COMPOSE_SECOND_START <= j && j < COMPOSE_SECOND_SINGLE_START) {
177
+ uint32_t r = compose_array[i - COMPOSE_FIRST_START][j - COMPOSE_SECOND_START];
178
+ if (r != 0) {
179
+ *result = r;
180
+ return true;
181
+ }
182
+ }
183
+
184
+ return false;
185
+ }
186
+ /* Compose the ‘n’ bytes at ‘begin’ in place and return the new byte
 * length.  Only forms C and KC compose; for any other form ‘n’ is returned
 * and the buffer is untouched.  The loop reads characters at ‘p’ and
 * rewrites the buffer behind itself at ‘t’, while ‘s’/‘sc’ track the
 * character that later ones are composed onto. */
187
+ static inline size_t
188
+ compose(char *begin, size_t n, enum u_normalization_form form)
189
+ {
190
+ if (form != U_NORMALIZATION_FORM_C && form != U_NORMALIZATION_FORM_KC)
191
+ return n;
192
+ int pcc = -1; /* combining class of the last character kept after ‘s’; -1 when there is none */
193
+ char *t; /* write position */
194
+ char *s = begin; /* start of the character currently being composed onto */
195
+ char *end = begin + n;
196
+ uint32_t sc = u_decode((const char **)&t, s, end); /* code point stored at ‘s’ */
197
+ for (char *p = t, *q; p < end; p = q) {
198
+ uint32_t c = u_decode((const char **)&q, p, end);
199
+ int cc = u_char_canonical_combining_class(c);
200
+ uint32_t sc2;
201
+ if (pcc < cc && compose_2(sc, c, &sc2)) {
202
+ char *r = u_next(s); /* presumably the byte just past the character at ‘s’ (confirm in u.h) */
203
+ ptrdiff_t k = u_char_to_u(sc2, NULL) - (r - s); /* change in encoded length; assumes a NULL buffer only measures (confirm) */
204
+ memmove(r + k, r, t - r); /* shift the characters kept after ‘s’ to make room or close the gap */
205
+ u_char_to_u(sc2, s); /* overwrite ‘s’ with the composed character */
206
+ sc = sc2;
207
+ t += k;
208
+ } else if (cc == 0) {
209
+ pcc = -1; /* ‘c’ becomes the new character to compose onto */
210
+ s = t;
211
+ sc = c;
212
+ t += u_char_to_u(c, t);
213
+ } else {
214
+ pcc = cc; /* keep ‘c’ as is and remember its class */
215
+ t += u_char_to_u(c, t);
216
+ }
217
+ }
218
+ return t - begin;
219
+ }
220
+
221
+ size_t
222
+ u_normalize(char *result, size_t m, const char *u, size_t n,
223
+ enum u_normalization_form form)
224
+ {
225
+ struct output o = OUTPUT_INIT(result, m);
226
+ decompose(u, u + n, form, &o);
227
+ if (o.m > o.n && o.n > 0) {
228
+ canonical_order(o.result, o.n);
229
+ o.n = compose(o.result, o.n, form);
230
+ }
231
+ return output_finalize(&o);
232
+ }