u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,132 @@
1
+ #include "extconf.h"
2
+ #include <assert.h>
3
+ #include <errno.h>
4
+ #define __USE_XOPEN2K8 1
5
+ #include <locale.h>
6
+ #ifdef HAVE_XLOCALE_H
7
+ # include <xlocale.h>
8
+ #endif
9
+ #define __USE_XOPEN2K 1
10
+ #include <langinfo.h>
11
+ #include <stdbool.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+ #ifndef HAVE_STRXFRM_L
16
+ static inline size_t
17
+ strxfrm_l(char *restrict s1, const char *restrict s2, size_t n,
18
+ UNUSED(locale_t loc))
19
+ {
20
+ return strxfrm(s1, s2, n);
21
+ }
22
+ #endif
23
+ #ifndef HAVE_NL_LANGINFO_L
24
+ static inline char *
25
+ nl_langinfo_l(nl_item item, UNUSED(locale_t loc))
26
+ {
27
+ return nl_langinfo(item);
28
+ }
29
+ #endif
30
+
31
+ #include "u.h"
32
+ #include "private.h"
33
+
34
+ static inline const char *
35
+ codeset(locale_t locale)
36
+ {
37
+ return locale == NULL ?
38
+ nl_langinfo(CODESET) :
39
+ nl_langinfo_l(CODESET, locale);
40
+ }
41
+
42
+ static inline size_t
43
+ transform(char *result, const char *string, size_t n, locale_t locale)
44
+ {
45
+ return locale == NULL ?
46
+ strxfrm(result, string, n) :
47
+ strxfrm_l(result, string, n, locale);
48
+ }
49
+
50
+ static size_t
51
+ ckey(char *result, size_t m, const char *string, size_t n, locale_t locale)
52
+ {
53
+ char saved_sentinel = string[n];
54
+ ((char *)string)[n] = '\0';
55
+ size_t l = 0;
56
+ const char *p = string;
57
+ const char *end = string + n + 1;
58
+ while (true) {
59
+ errno = 0;
60
+ size_t k = m > l ?
61
+ transform(result + l, p, m - l, locale) :
62
+ transform(NULL, p, 0, locale);
63
+ if (errno != 0)
64
+ break;
65
+ l += k;
66
+ p += strlen(p) + 1;
67
+ if (p == end)
68
+ break;
69
+ if (m > l)
70
+ result[l] = '\0';
71
+ l++;
72
+ }
73
+ ((char *)string)[n] = saved_sentinel;
74
+ return l;
75
+ }
76
+
77
+ static size_t
78
+ recode_ckey(char *result, size_t m, const char *string, size_t n,
79
+ locale_t locale, const char *cs)
80
+ {
81
+ char buf[2048];
82
+ errno = 0;
83
+ size_t n_recoded = u_recode(buf, sizeof(buf), string, n, cs);
84
+ if (errno != 0)
85
+ return 0;
86
+ if (n_recoded < sizeof(buf))
87
+ return ckey(result, m, buf, n_recoded, locale);
88
+ char *recoded = malloc(n_recoded + 1);
89
+ if (recoded == NULL)
90
+ return 0;
91
+ u_recode(recoded, n_recoded + 1, string, n, cs);
92
+ size_t n_key = ckey(result, m, recoded, n_recoded, locale);
93
+ free(recoded);
94
+ return n_key;
95
+ }
96
+
97
+ size_t
98
+ u_collation_key(char *result, size_t m, const char *string, size_t n,
99
+ const char *locale)
100
+ {
101
+ assert(string != NULL);
102
+ assert(result != NULL || m == 0);
103
+ locale_t l = NULL;
104
+ if (locale != NULL)
105
+ l = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, locale, NULL);
106
+ const char *cs = codeset(l);
107
+ size_t r = strcmp(cs, "UTF-8") != 0 ?
108
+ recode_ckey(result, m, string, n, l, cs) :
109
+ ckey(result, m, string, n, l);
110
+ if (l != NULL)
111
+ freelocale(l);
112
+ return r;
113
+ }
114
+
115
+ size_t
116
+ u_normalized_collation_key(char *result, size_t m, const char *string, size_t n,
117
+ const char *locale)
118
+ {
119
+ assert(string != NULL);
120
+ assert(result != NULL || m == 0);
121
+ char buf[2048];
122
+ size_t n_normalized = u_normalize(buf, sizeof(buf), string, n, U_NORMALIZATION_FORM_KC);
123
+ if (n_normalized < sizeof(buf))
124
+ return u_collation_key(result, m, buf, n_normalized, locale);
125
+ char *normalized = malloc(n_normalized + 1);
126
+ if (normalized == NULL)
127
+ return 0;
128
+ u_normalize(normalized, n_normalized + 1, string, n, U_NORMALIZATION_FORM_KC);
129
+ size_t n_key = u_collation_key(result, m, normalized, n_normalized, locale);
130
+ free(normalized);
131
+ return n_key;
132
+ }
@@ -0,0 +1,156 @@
1
+ #include <assert.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include <stdbool.h>
5
+
6
+ #include "u.h"
7
+ #include "private.h"
8
+
9
+
10
+ // The dfa table and decode function is © 2008–2010 Björn Höhrmann
11
+ // <bjoern@hoehrmann.de>. See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
12
+ // for details.
13
+
14
+ enum { // Automaton states that the decoding and validation loops below test for.
15
+ ACCEPT = 0, // Initial state; reached again once a complete sequence has been consumed.
16
+ REJECT = 12 // Reached when the bytes seen so far cannot form a valid sequence.
17
+ };
18
+
19
+ static const uint8_t dfa[] = {
20
+ // The first part of the table (indices 0–255) maps bytes to character classes to
21
+ // reduce the size of the transition table and create bitmasks: decode() masks the first byte of a sequence with ‹0xff >> class›.
22
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
24
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
27
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
28
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
29
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
30
+
31
+ // The second part (from index 256 on) is a transition table that maps a combination
32
+ // of a state of the automaton and a character class to a state; it is indexed as ‹256 + state + class›, each state being a multiple of 12.
33
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
34
+ 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
35
+ 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
36
+ 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
37
+ 12,36,12,12,12,12,12,12,12,12,12,12,
38
+ };
39
+
40
+ // This reversal of ‹dfa› is © 2013 Nikolai Weibull.
41
+ static const uint8_t dfa_r[] = {
42
+ // The first part of the table (indices 0–255) maps bytes to character classes to
43
+ // reduce the size of the transition table and create bitmasks; its entries are the same as those of the first part of ‹dfa›.
44
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
45
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
47
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
49
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
50
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
51
+ 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
52
+
53
+ // The second part (from index 256 on) is a transition table that maps a combination
54
+ // of a state of the automaton and a character class to a state; decode_r(), which is fed bytes from last to first, indexes it as ‹256 + state + class›.
55
+ 0,24,12,12,12,12,12,24,12,24,12,12, 12,12,12,12,12,12,12,12,12,12,12,12,
56
+ 12,36, 0,12,12,12,12,48,12,36,12,12, 12,60,12, 0, 0,12,12,72,12,72,12,12,
57
+ 12,60,12, 0,12,12,12,72,12,72, 0,12, 12,12,12,12,12, 0, 0,12,12,12,12,12,
58
+ 12,12,12,12,12,12, 0,12,12,12,12, 0,
59
+ };
60
+
61
+ static inline uint32_t
62
+ decode(uint32_t *state, uint32_t *c, uint32_t b)
63
+ {
64
+ uint32_t type = dfa[b];
65
+ *c = *state != ACCEPT ? (*c << 6) | (b & 0x3f) : (0xff >> type) & b;
66
+ *state = dfa[256 + *state + type];
67
+ return *state;
68
+ }
69
+
70
+ #define REPLACEMENT_CHARACTER ((uint32_t)0xfffd)
71
+
72
+ uint32_t
73
+ u_decode(const char **q, const char *u, const char *end)
74
+ {
75
+ assert(u < end);
76
+ uint32_t c, state = ACCEPT;
77
+ const unsigned char *p;
78
+ for (p = (const unsigned char *)u; p < (const unsigned char *)end; p++)
79
+ switch (decode(&state, &c, *p)) {
80
+ case ACCEPT:
81
+ *q = (const char *)p + 1;
82
+ return c;
83
+ case REJECT:
84
+ *q = (const char *)p + 1;
85
+ return REPLACEMENT_CHARACTER;
86
+ }
87
+ *q = (const char *)p;
88
+ return REPLACEMENT_CHARACTER;
89
+ }
90
+
91
// Decodes the first character of the ‹n› bytes at ‹u› with u_decode(),
// stores it in ‹*result› and returns the number of bytes that u_decode()
// consumed, converted to int.
int
u_decode_n(uint32_t *result, const char *u, size_t n)
{
	const char *next;
	const uint32_t c = u_decode(&next, u, u + n);

	*result = c;
	return (int)(next - u);
}
98
+
99
+ static inline uint32_t
100
+ decode_r(uint32_t *state, uint32_t *c, uint32_t b, int i)
101
+ {
102
+ uint32_t type = dfa_r[b];
103
+ *state = dfa_r[256 + *state + type];
104
+ *c |= (*state != ACCEPT ? b & 0x3f : (0xff >> type) & b) << (6*i);
105
+ return *state;
106
+ }
107
+
108
+ uint32_t
109
+ u_decode_r(const char **p, const char *begin, const char *u)
110
+ {
111
+ assert(begin < u);
112
+ uint32_t c = 0, state = ACCEPT;
113
+ int i = 0;
114
+ const unsigned char *q;
115
+ for (q = (const unsigned char *)u - 1; (const unsigned char *)begin <= q; q--, i++)
116
+ switch (decode_r(&state, &c, *q, i)) {
117
+ case ACCEPT:
118
+ *p = (const char *)q;
119
+ return c;
120
+ case REJECT:
121
+ *p = (const char *)q;
122
+ return REPLACEMENT_CHARACTER;
123
+ }
124
+ *p = (const char *)begin;
125
+ return REPLACEMENT_CHARACTER;
126
+ }
127
+
128
+ static inline uint32_t
129
+ validate(uint32_t *state, uint32_t b)
130
+ {
131
+ uint32_t type = dfa[b];
132
+ return *state = dfa[256 + *state + type];
133
+ }
134
+
135
+ bool
136
+ u_valid(const char *u, size_t n, const char **end)
137
+ {
138
+ uint32_t state = ACCEPT;
139
+ const unsigned char *p = (const unsigned char *)u;
140
+ const unsigned char *o = p;
141
+ const unsigned char *q = p + n;
142
+ for ( ; p < q; p++)
143
+ switch (validate(&state, *p)) {
144
+ case ACCEPT:
145
+ o = p;
146
+ break;
147
+ case REJECT:
148
+ reject:
149
+ if (end != NULL)
150
+ *end = (const char *)o;
151
+ return false;
152
+ }
153
+ if (p == q)
154
+ return state == ACCEPT;
155
+ goto reject;
156
+ }
@@ -0,0 +1,201 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ #include <assert.h>
7
+
8
+ #include <string.h>
9
+ #include "output.h"
10
+
11
+ #include "data/constants.h"
12
+ #include "attributes.h"
13
+ #include "u_locale.h"
14
+ #include "titled.h"
15
+ #include "case.h"
16
+ #include "private.h"
17
+
18
+ #define LATIN_CAPITAL_LETTER_I ((uint32_t)0x0049)
19
+ #define LATIN_CAPITAL_LETTER_J ((uint32_t)0x004a)
20
+ #define LATIN_SMALL_LETTER_I ((uint32_t)0x0069)
21
+ #define LATIN_CAPITAL_LETTER_I_WITH_GRAVE ((uint32_t)0x00cc)
22
+ #define LATIN_CAPITAL_LETTER_I_WITH_ACUTE ((uint32_t)0x00cd)
23
+ #define LATIN_CAPITAL_LETTER_I_WITH_TILDE ((uint32_t)0x0128)
24
+ #define LATIN_CAPITAL_LETTER_I_WITH_OGONEK ((uint32_t)0x012e)
25
+ #define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE ((uint32_t)0x0130)
26
+ #define LATIN_SMALL_LETTER_DOTLESS_I ((uint32_t)0x0131)
27
+ #define COMBINING_GRAVE_ACCENT ((uint32_t)0x0300)
28
+ #define COMBINING_ACUTE_ACCENT ((uint32_t)0x0301)
29
+ #define COMBINING_TILDE ((uint32_t)0x0303)
30
+ #define COMBINING_DOT_ABOVE ((uint32_t)0x0307)
31
+ #define GREEK_CAPITAL_LETTER_SIGMA ((uint32_t)0x03a3)
32
+ #define GREEK_SMALL_LETTER_FINAL_SIGMA ((uint32_t)0x03c2)
33
+ #define GREEK_SMALL_LETTER_SIGMA ((uint32_t)0x03c3)
34
+
35
+
36
// Decides whether the character occupying [‹p›, ‹q›) inside
// [‹string›, ‹end›) should be lowercased to the final form of sigma.
//
// The answer is false if the character is first in ‹string›, or if the
// first character after it that is not case-ignorable is cased.  Otherwise
// the answer is whether the first character before it that is not
// case-ignorable is cased (false if there is no such character).
static inline bool
is_final_sigma(const char *string, const char *p, const char *q, const char *end)
{
	if (p == string)
		return false;

	// Look ahead, past anything case-ignorable.
	while (q < end) {
		const uint32_t following = u_decode(&q, q, end);
		if (!u_char_iscaseignorable(following)) {
			if (u_char_iscased(following))
				return false;
			break;
		}
	}

	// Look behind, past anything case-ignorable.
	while (string < p) {
		const uint32_t preceding = u_decode_r(&p, string, p);
		if (!u_char_iscaseignorable(preceding)) {
			if (u_char_iscased(preceding))
				return true;
			return false;
		}
	}
	return false;
}
60
+
61
+ static inline bool
62
+ has_more_above(const char *q, const char *end)
63
+ {
64
+ while (q < end) {
65
+ switch (u_char_canonical_combining_class(u_decode(&q, q, end))) {
66
+ case U_CANONICAL_COMBINING_CLASS_ABOVE:
67
+ return true;
68
+ case U_CANONICAL_COMBINING_CLASS_NOT_REORDERED:
69
+ return false;
70
+ default:
71
+ break;
72
+ }
73
+ }
74
+ return false;
75
+ }
76
+
77
+ static bool
78
+ downcase_lithuanian_i(uint32_t c, uint32_t combiner, struct output *output)
79
+ {
80
+ output_char(output, c);
81
+ output_char(output, COMBINING_DOT_ABOVE);
82
+ if (combiner != '\0')
83
+ output_char(output, combiner);
84
+ return true;
85
+ }
86
+
87
+ static inline bool
88
+ downcase_lithuanian(uint32_t c, const char *q, const char *end,
89
+ struct output *output)
90
+ {
91
+ switch (c) {
92
+ case LATIN_CAPITAL_LETTER_I:
93
+ case LATIN_CAPITAL_LETTER_J:
94
+ case LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
95
+ if (!has_more_above(q, end))
96
+ return false;
97
+ return downcase_lithuanian_i(u_char_downcase(c), '\0', output);
98
+ case LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
99
+ return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
100
+ COMBINING_GRAVE_ACCENT, output);
101
+ case LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
102
+ return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
103
+ COMBINING_ACUTE_ACCENT, output);
104
+ case LATIN_CAPITAL_LETTER_I_WITH_TILDE:
105
+ return downcase_lithuanian_i(LATIN_SMALL_LETTER_I,
106
+ COMBINING_TILDE, output);
107
+ default:
108
+ return false;
109
+ }
110
+ }
111
+
112
+ static inline bool
113
+ is_before_dot(const char *q, const char *end)
114
+ {
115
+ while (q < end) {
116
+ uint32_t c = u_decode(&q, q, end);
117
+ if (c == COMBINING_DOT_ABOVE)
118
+ return true;
119
+ switch (u_char_canonical_combining_class(c)) {
120
+ case U_CANONICAL_COMBINING_CLASS_ABOVE:
121
+ case U_CANONICAL_COMBINING_CLASS_NOT_REORDERED:
122
+ return false;
123
+ default:
124
+ break;
125
+ }
126
+ }
127
+ return false;
128
+ }
129
+
130
+ static bool
131
+ is_i(uint32_t c)
132
+ {
133
+ return c == LATIN_CAPITAL_LETTER_I;
134
+ }
135
+
136
+ static inline bool
137
+ downcase_turkic(uint32_t c,
138
+ const char *string, const char *p, const char *q, const char *end,
139
+ struct output *output)
140
+ {
141
+ switch (c) {
142
+ case LATIN_CAPITAL_LETTER_I:
143
+ output_char(output,
144
+ is_before_dot(q, end) ?
145
+ LATIN_SMALL_LETTER_I :
146
+ LATIN_SMALL_LETTER_DOTLESS_I);
147
+ return true;
148
+ case LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE:
149
+ case_simple(LATIN_CAPITAL_LETTER_I,
150
+ U_GENERAL_CATEGORY_LETTER_UPPERCASE,
151
+ false, false, output);
152
+ return true;
153
+ case COMBINING_DOT_ABOVE:
154
+ if (!is_after(string, p, is_i))
155
+ output_char(output, COMBINING_DOT_ABOVE);
156
+ return true;
157
+ default:
158
+ return false;
159
+ }
160
+ }
161
+
162
+ const char *
163
+ _u_downcase_step(const char *string, const char *p, const char *end,
164
+ enum locale locale, struct output *output)
165
+ {
166
+ const char *q;
167
+ uint32_t c = u_decode(&q, p, end);
168
+ enum u_general_category gc;
169
+ if (c == GREEK_CAPITAL_LETTER_SIGMA)
170
+ output_char(output,
171
+ is_final_sigma(string, p, q, end) ?
172
+ GREEK_SMALL_LETTER_FINAL_SIGMA :
173
+ GREEK_SMALL_LETTER_SIGMA);
174
+ else if (locale == LOCALE_LITHUANIAN &&
175
+ downcase_lithuanian(c, q, end, output))
176
+ ;
177
+ else if (locale == LOCALE_TURKIC &&
178
+ downcase_turkic(c, string, p, q, end, output))
179
+ ;
180
+ else if (IS(gc = u_char_general_category(c),
181
+ OR(U_GENERAL_CATEGORY_LETTER_UPPERCASE,
182
+ OR(U_GENERAL_CATEGORY_LETTER_TITLECASE, 0))))
183
+ case_simple(c, gc, false, false, output);
184
+ else
185
+ output_string(output, p, q - p);
186
+ return q;
187
+ }
188
+
189
+ size_t
190
+ u_downcase(char *result, size_t m, const char *u, size_t n,
191
+ const char *locale)
192
+ {
193
+ assert(u != NULL);
194
+ assert(result != NULL || m == 0);
195
+ enum locale l = _u_locale_from_string(locale);
196
+ const char *end = u + n;
197
+ struct output o = OUTPUT_INIT(result, m);
198
+ for (const char *p = u; p < end; )
199
+ p = _u_downcase_step(u, p, end, l, &o);
200
+ return output_finalize(&o);
201
+ }