u 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (316) hide show
  1. checksums.yaml +7 -0
  2. data/build/ext/u/data/attributes.rb +39 -0
  3. data/build/ext/u/data/bidi-mirroring.rb +27 -0
  4. data/build/ext/u/data/canonical-combining-class.rb +15 -0
  5. data/build/ext/u/data/case-folding.rb +39 -0
  6. data/build/ext/u/data/cased.rb +19 -0
  7. data/build/ext/u/data/compose.rb +304 -0
  8. data/build/ext/u/data/constants.rb +31 -0
  9. data/build/ext/u/data/decompose.rb +85 -0
  10. data/build/ext/u/data/general-category.rb +61 -0
  11. data/build/ext/u/data/grapheme-word-break.rb +15 -0
  12. data/build/ext/u/data/marshalled.rb +5 -0
  13. data/build/ext/u/data/script.rb +91 -0
  14. data/build/ext/u/data/soft-dotted.rb +17 -0
  15. data/build/ext/u/data/title-table.rb +30 -0
  16. data/build/ext/u/data/wide.rb +17 -0
  17. data/build/lib/u/build.rb +8 -0
  18. data/build/lib/u/build/data.rb +16 -0
  19. data/build/lib/u/build/data/bidimirroring.rb +26 -0
  20. data/build/lib/u/build/data/break.rb +14 -0
  21. data/build/lib/u/build/data/casefolding.rb +77 -0
  22. data/build/lib/u/build/data/compositionexclusions.rb +14 -0
  23. data/build/lib/u/build/data/derivedeastasianwidth.rb +15 -0
  24. data/build/lib/u/build/data/file.rb +88 -0
  25. data/build/lib/u/build/data/linebreak.rb +14 -0
  26. data/build/lib/u/build/data/proplist.rb +18 -0
  27. data/build/lib/u/build/data/scripts.rb +22 -0
  28. data/build/lib/u/build/data/specialcasing.rb +106 -0
  29. data/build/lib/u/build/data/unicode.rb +41 -0
  30. data/build/lib/u/build/data/unicode/entry.rb +27 -0
  31. data/build/lib/u/build/data/unicode/entry/decomposition.rb +29 -0
  32. data/build/lib/u/build/data/unicode/points.rb +32 -0
  33. data/build/lib/u/build/header.rb +11 -0
  34. data/build/lib/u/build/header/table.rb +19 -0
  35. data/build/lib/u/build/header/table/row.rb +64 -0
  36. data/build/lib/u/build/header/tables.rb +6 -0
  37. data/build/lib/u/build/header/tables/intervals.rb +50 -0
  38. data/build/lib/u/build/header/tables/split.rb +20 -0
  39. data/build/lib/u/build/header/tables/split/data.rb +29 -0
  40. data/build/lib/u/build/header/tables/split/part1.rb +28 -0
  41. data/build/lib/u/build/header/tables/split/part2.rb +13 -0
  42. data/build/lib/u/build/header/tables/split/row.rb +34 -0
  43. data/build/lib/u/build/header/tables/split/rows.rb +22 -0
  44. data/build/test/unit/break.rb +45 -0
  45. data/build/test/unit/case.rb +178 -0
  46. data/build/test/unit/foldcase.rb +44 -0
  47. data/build/test/unit/normalize.rb +81 -0
  48. data/ext/u/attributes.c +62 -0
  49. data/ext/u/attributes.h +5 -0
  50. data/ext/u/case.h +41 -0
  51. data/ext/u/data/attributes.h +3070 -0
  52. data/ext/u/data/bidi-mirroring.h +373 -0
  53. data/ext/u/data/canonical-combining-class.h +2157 -0
  54. data/ext/u/data/case-folding.h +171 -0
  55. data/ext/u/data/cased.h +42 -0
  56. data/ext/u/data/compose.h +1714 -0
  57. data/ext/u/data/constants.h +17 -0
  58. data/ext/u/data/decompose.h +9356 -0
  59. data/ext/u/data/general-category.h +28959 -0
  60. data/ext/u/data/grapheme-break.h +13201 -0
  61. data/ext/u/data/line-break.h +26501 -0
  62. data/ext/u/data/normalization-quick-check.h +3002 -0
  63. data/ext/u/data/script.h +2928 -0
  64. data/ext/u/data/soft-dotted.h +55 -0
  65. data/ext/u/data/title-table.h +41 -0
  66. data/ext/u/data/types.h +11117 -0
  67. data/ext/u/data/wide-cjk.h +197 -0
  68. data/ext/u/data/wide.h +59 -0
  69. data/ext/u/data/word-break.h +10001 -0
  70. data/ext/u/depend +281 -0
  71. data/ext/u/extconf.rb +158 -0
  72. data/ext/u/output.h +51 -0
  73. data/ext/{encoding/character/utf-8 → u}/private.c +11 -15
  74. data/ext/u/private.h +58 -0
  75. data/ext/u/rb_includes.h +10 -0
  76. data/ext/u/rb_private.c +98 -0
  77. data/ext/u/rb_private.h +67 -0
  78. data/ext/u/rb_u.c +251 -0
  79. data/ext/u/rb_u_buffer.c +443 -0
  80. data/ext/u/rb_u_buffer.h +24 -0
  81. data/ext/u/rb_u_re.c +43 -0
  82. data/ext/u/rb_u_re.h +15 -0
  83. data/ext/u/rb_u_string.c +478 -0
  84. data/ext/u/rb_u_string.h +173 -0
  85. data/ext/u/rb_u_string_alnum.c +10 -0
  86. data/ext/u/rb_u_string_alpha.c +10 -0
  87. data/ext/u/rb_u_string_aref.c +142 -0
  88. data/ext/u/rb_u_string_ascii_only.c +13 -0
  89. data/ext/u/rb_u_string_assigned.c +10 -0
  90. data/ext/u/rb_u_string_b.c +18 -0
  91. data/ext/u/rb_u_string_bytesize.c +10 -0
  92. data/ext/u/rb_u_string_byteslice.c +103 -0
  93. data/ext/u/rb_u_string_canonical_combining_class.c +33 -0
  94. data/ext/u/rb_u_string_case_ignorable.c +25 -0
  95. data/ext/u/rb_u_string_casecmp.c +61 -0
  96. data/ext/u/rb_u_string_cased.c +17 -0
  97. data/ext/u/rb_u_string_chomp.c +107 -0
  98. data/ext/u/rb_u_string_chop.c +33 -0
  99. data/ext/u/rb_u_string_chr.c +9 -0
  100. data/ext/u/rb_u_string_cntrl.c +10 -0
  101. data/ext/u/rb_u_string_collate.c +46 -0
  102. data/ext/u/rb_u_string_collation_key.c +18 -0
  103. data/ext/u/rb_u_string_count.c +38 -0
  104. data/ext/u/rb_u_string_defined.c +10 -0
  105. data/ext/u/rb_u_string_delete.c +62 -0
  106. data/ext/u/rb_u_string_digit.c +10 -0
  107. data/ext/u/rb_u_string_downcase.c +13 -0
  108. data/ext/u/rb_u_string_dump.c +153 -0
  109. data/ext/u/rb_u_string_each_byte.c +46 -0
  110. data/ext/u/rb_u_string_each_char.c +49 -0
  111. data/ext/u/rb_u_string_each_codepoint.c +45 -0
  112. data/ext/u/rb_u_string_each_grapheme_cluster.c +36 -0
  113. data/ext/u/rb_u_string_each_line.c +142 -0
  114. data/ext/u/rb_u_string_each_word.c +34 -0
  115. data/ext/u/rb_u_string_empty.c +11 -0
  116. data/ext/u/rb_u_string_end_with.c +31 -0
  117. data/ext/u/rb_u_string_eql.c +30 -0
  118. data/ext/u/rb_u_string_equal.c +33 -0
  119. data/ext/u/rb_u_string_foldcase.c +12 -0
  120. data/ext/u/rb_u_string_folded.c +13 -0
  121. data/ext/u/rb_u_string_format.c +1745 -0
  122. data/ext/u/rb_u_string_general_category.c +109 -0
  123. data/ext/u/rb_u_string_getbyte.c +21 -0
  124. data/ext/u/rb_u_string_graph.c +21 -0
  125. data/ext/u/rb_u_string_grapheme_break.c +61 -0
  126. data/ext/u/rb_u_string_gsub.c +164 -0
  127. data/ext/u/rb_u_string_hash.c +10 -0
  128. data/ext/u/rb_u_string_hex.c +9 -0
  129. data/ext/u/rb_u_string_include.c +10 -0
  130. data/ext/u/rb_u_string_index.c +110 -0
  131. data/ext/u/rb_u_string_inspect.c +189 -0
  132. data/ext/u/rb_u_string_internal_tr.c +148 -0
  133. data/ext/u/rb_u_string_internal_tr.h +29 -0
  134. data/ext/u/rb_u_string_justify.c +169 -0
  135. data/ext/u/rb_u_string_length.c +10 -0
  136. data/ext/u/rb_u_string_line_break.c +115 -0
  137. data/ext/u/rb_u_string_lower.c +13 -0
  138. data/ext/u/rb_u_string_lstrip.c +24 -0
  139. data/ext/u/rb_u_string_match.c +65 -0
  140. data/ext/u/rb_u_string_mirror.c +16 -0
  141. data/ext/u/rb_u_string_newline.c +21 -0
  142. data/ext/u/rb_u_string_normalize.c +70 -0
  143. data/ext/u/rb_u_string_normalized.c +28 -0
  144. data/ext/u/rb_u_string_oct.c +11 -0
  145. data/ext/u/rb_u_string_ord.c +14 -0
  146. data/ext/u/rb_u_string_partition.c +80 -0
  147. data/ext/u/rb_u_string_plus.c +33 -0
  148. data/ext/u/rb_u_string_print.c +10 -0
  149. data/ext/u/rb_u_string_punct.c +10 -0
  150. data/ext/u/rb_u_string_reverse.c +13 -0
  151. data/ext/u/rb_u_string_rindex.c +104 -0
  152. data/ext/u/rb_u_string_rpartition.c +81 -0
  153. data/ext/u/rb_u_string_rstrip.c +29 -0
  154. data/ext/u/rb_u_string_scan.c +109 -0
  155. data/ext/u/rb_u_string_script.c +253 -0
  156. data/ext/u/rb_u_string_soft_dotted.c +13 -0
  157. data/ext/u/rb_u_string_space.c +24 -0
  158. data/ext/u/rb_u_string_split.c +245 -0
  159. data/ext/u/rb_u_string_squeeze.c +75 -0
  160. data/ext/u/rb_u_string_start_with.c +31 -0
  161. data/ext/u/rb_u_string_strip.c +36 -0
  162. data/ext/u/rb_u_string_sub.c +147 -0
  163. data/ext/u/rb_u_string_times.c +35 -0
  164. data/ext/u/rb_u_string_title.c +10 -0
  165. data/ext/u/rb_u_string_titlecase.c +13 -0
  166. data/ext/u/rb_u_string_to_i.c +45 -0
  167. data/ext/u/rb_u_string_to_inum.c +364 -0
  168. data/ext/u/rb_u_string_to_inum.h +1 -0
  169. data/ext/u/rb_u_string_to_str.c +17 -0
  170. data/ext/u/rb_u_string_to_sym.c +12 -0
  171. data/ext/u/rb_u_string_tr.c +290 -0
  172. data/ext/u/rb_u_string_upcase.c +12 -0
  173. data/ext/u/rb_u_string_upper.c +13 -0
  174. data/ext/u/rb_u_string_valid.c +10 -0
  175. data/ext/u/rb_u_string_valid_encoding.c +12 -0
  176. data/ext/u/rb_u_string_wide.c +21 -0
  177. data/ext/u/rb_u_string_wide_cjk.c +21 -0
  178. data/ext/u/rb_u_string_width.c +19 -0
  179. data/ext/u/rb_u_string_word_break.c +63 -0
  180. data/ext/u/rb_u_string_xdigit.c +22 -0
  181. data/ext/u/rb_u_string_zero_width.c +16 -0
  182. data/ext/u/titled.c +55 -0
  183. data/ext/u/titled.h +1 -0
  184. data/ext/u/u.c +23 -0
  185. data/ext/u/u.h +458 -0
  186. data/ext/u/u_char_canonical_combining_class.c +31 -0
  187. data/ext/u/u_char_digit_value.c +21 -0
  188. data/ext/u/u_char_downcase.c +27 -0
  189. data/ext/u/u_char_general_category.c +31 -0
  190. data/ext/u/u_char_grapheme_break.c +28 -0
  191. data/ext/u/u_char_isalnum.c +24 -0
  192. data/ext/u/u_char_isalpha.c +21 -0
  193. data/ext/u/u_char_isassigned.c +16 -0
  194. data/ext/u/u_char_iscased.c +22 -0
  195. data/ext/u/u_char_iscaseignorable.c +29 -0
  196. data/ext/u/u_char_iscntrl.c +17 -0
  197. data/ext/u/u_char_isdefined.c +15 -0
  198. data/ext/u/u_char_isdigit.c +16 -0
  199. data/ext/u/u_char_isgraph.c +22 -0
  200. data/ext/u/u_char_islower.c +16 -0
  201. data/ext/u/u_char_isnewline.c +24 -0
  202. data/ext/u/u_char_isprint.c +21 -0
  203. data/ext/u/u_char_ispunct.c +27 -0
  204. data/ext/u/u_char_issoftdotted.c +18 -0
  205. data/ext/u/u_char_isspace.c +28 -0
  206. data/ext/u/u_char_isupper.c +16 -0
  207. data/ext/u/u_char_isvalid.c +18 -0
  208. data/ext/u/u_char_iswide.c +18 -0
  209. data/ext/u/u_char_iswide_cjk.c +22 -0
  210. data/ext/u/u_char_isxdigit.c +27 -0
  211. data/ext/u/u_char_iszerowidth.c +29 -0
  212. data/ext/u/u_char_line_break.c +29 -0
  213. data/ext/u/u_char_mirror.c +16 -0
  214. data/ext/u/u_char_normalized.c +23 -0
  215. data/ext/u/u_char_script.c +41 -0
  216. data/ext/u/u_char_to_u.c +48 -0
  217. data/ext/u/u_char_upcase.c +24 -0
  218. data/ext/u/u_char_width.c +12 -0
  219. data/ext/u/u_char_word_break.c +28 -0
  220. data/ext/u/u_char_xdigit_value.c +31 -0
  221. data/ext/u/u_collate.c +83 -0
  222. data/ext/u/u_collation_key.c +132 -0
  223. data/ext/u/u_decode.c +156 -0
  224. data/ext/u/u_downcase.c +201 -0
  225. data/ext/u/u_foldcase.c +68 -0
  226. data/ext/u/u_grapheme_clusters.c +57 -0
  227. data/ext/u/u_has_prefix.c +27 -0
  228. data/ext/u/u_index.c +93 -0
  229. data/ext/u/u_is_ascii_only.c +33 -0
  230. data/ext/u/u_locale.c +40 -0
  231. data/ext/u/u_locale.h +14 -0
  232. data/ext/u/u_mirror.c +20 -0
  233. data/ext/u/u_n_bytes.c +16 -0
  234. data/ext/u/u_n_chars.c +43 -0
  235. data/ext/u/u_normalize.c +232 -0
  236. data/ext/u/u_normalized.c +28 -0
  237. data/ext/u/u_offset_to_pointer.c +62 -0
  238. data/ext/u/u_pointer_to_offset.c +23 -0
  239. data/ext/u/u_recode.c +73 -0
  240. data/ext/u/u_reverse.c +21 -0
  241. data/ext/u/u_rindex.c +132 -0
  242. data/ext/u/u_titlecase.c +68 -0
  243. data/ext/u/u_upcase.c +89 -0
  244. data/ext/u/u_width.c +35 -0
  245. data/ext/u/u_words.c +82 -0
  246. data/ext/u/yield.h +27 -0
  247. data/lib/u-1.0.rb +20 -0
  248. data/lib/u-1.0/buffer.rb +10 -0
  249. data/lib/u-1.0/string.rb +9 -0
  250. data/lib/u-1.0/version.rb +287 -0
  251. data/test/unit/case.rb +2080 -0
  252. data/test/unit/foldcase.rb +1136 -0
  253. data/test/unit/graphemebreak.rb +407 -0
  254. data/test/unit/normalize.rb +367545 -0
  255. data/test/unit/u-1.0.rb +10 -0
  256. data/test/unit/u-1.0/buffer.rb +52 -0
  257. data/test/unit/u-1.0/string.rb +1439 -0
  258. data/test/unit/{u.rb → u-1.0/version.rb} +0 -1
  259. data/test/unit/wordbreak.rb +1083 -0
  260. metadata +603 -148
  261. data/README +0 -38
  262. data/Rakefile +0 -64
  263. data/ext/encoding/character/utf-8/break.c +0 -25
  264. data/ext/encoding/character/utf-8/data/break.h +0 -22931
  265. data/ext/encoding/character/utf-8/data/character-tables.h +0 -14358
  266. data/ext/encoding/character/utf-8/data/compose.h +0 -1607
  267. data/ext/encoding/character/utf-8/data/decompose.h +0 -10926
  268. data/ext/encoding/character/utf-8/data/generate-unicode-data.rb +0 -1070
  269. data/ext/encoding/character/utf-8/decompose.c +0 -444
  270. data/ext/encoding/character/utf-8/depend +0 -65
  271. data/ext/encoding/character/utf-8/extconf.rb +0 -67
  272. data/ext/encoding/character/utf-8/private.h +0 -51
  273. data/ext/encoding/character/utf-8/properties.c +0 -1056
  274. data/ext/encoding/character/utf-8/rb_includes.h +0 -19
  275. data/ext/encoding/character/utf-8/rb_methods.h +0 -49
  276. data/ext/encoding/character/utf-8/rb_private.h +0 -52
  277. data/ext/encoding/character/utf-8/rb_utf_aref.c +0 -111
  278. data/ext/encoding/character/utf-8/rb_utf_aset.c +0 -105
  279. data/ext/encoding/character/utf-8/rb_utf_casecmp.c +0 -24
  280. data/ext/encoding/character/utf-8/rb_utf_chomp.c +0 -114
  281. data/ext/encoding/character/utf-8/rb_utf_chop.c +0 -44
  282. data/ext/encoding/character/utf-8/rb_utf_collate.c +0 -13
  283. data/ext/encoding/character/utf-8/rb_utf_count.c +0 -30
  284. data/ext/encoding/character/utf-8/rb_utf_delete.c +0 -60
  285. data/ext/encoding/character/utf-8/rb_utf_downcase.c +0 -13
  286. data/ext/encoding/character/utf-8/rb_utf_each_char.c +0 -27
  287. data/ext/encoding/character/utf-8/rb_utf_foldcase.c +0 -13
  288. data/ext/encoding/character/utf-8/rb_utf_hex.c +0 -14
  289. data/ext/encoding/character/utf-8/rb_utf_index.c +0 -50
  290. data/ext/encoding/character/utf-8/rb_utf_insert.c +0 -48
  291. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.c +0 -332
  292. data/ext/encoding/character/utf-8/rb_utf_internal_bignum.h +0 -12
  293. data/ext/encoding/character/utf-8/rb_utf_internal_tr.c +0 -142
  294. data/ext/encoding/character/utf-8/rb_utf_internal_tr.h +0 -41
  295. data/ext/encoding/character/utf-8/rb_utf_justify.c +0 -96
  296. data/ext/encoding/character/utf-8/rb_utf_length.c +0 -14
  297. data/ext/encoding/character/utf-8/rb_utf_lstrip.c +0 -41
  298. data/ext/encoding/character/utf-8/rb_utf_normalize.c +0 -51
  299. data/ext/encoding/character/utf-8/rb_utf_oct.c +0 -14
  300. data/ext/encoding/character/utf-8/rb_utf_reverse.c +0 -13
  301. data/ext/encoding/character/utf-8/rb_utf_rindex.c +0 -88
  302. data/ext/encoding/character/utf-8/rb_utf_rstrip.c +0 -51
  303. data/ext/encoding/character/utf-8/rb_utf_squeeze.c +0 -70
  304. data/ext/encoding/character/utf-8/rb_utf_strip.c +0 -27
  305. data/ext/encoding/character/utf-8/rb_utf_to_i.c +0 -25
  306. data/ext/encoding/character/utf-8/rb_utf_tr.c +0 -250
  307. data/ext/encoding/character/utf-8/rb_utf_upcase.c +0 -13
  308. data/ext/encoding/character/utf-8/tables.h +0 -38
  309. data/ext/encoding/character/utf-8/unicode.c +0 -319
  310. data/ext/encoding/character/utf-8/unicode.h +0 -216
  311. data/ext/encoding/character/utf-8/utf.c +0 -1334
  312. data/lib/encoding/character/utf-8.rb +0 -201
  313. data/lib/u.rb +0 -16
  314. data/lib/u/string.rb +0 -185
  315. data/lib/u/version.rb +0 -5
  316. data/test/unit/u/string.rb +0 -91
@@ -0,0 +1,28 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
+ /* {{{1
10
+ * Determine whether ‘c’ is some form of whitespace, such as space, tab or a
11
+ * line separator (newline, carriage return, etc.).
12
+ */
13
+ bool
14
+ u_char_isspace(uint32_t c)
15
+ {
16
+ switch (c) {
17
+ case '\t':
18
+ case '\n':
19
+ case '\r':
20
+ case '\f':
21
+ return true;
22
+ default:
23
+ return IS(u_char_general_category(c),
24
+ OR(U_GENERAL_CATEGORY_SEPARATOR_SPACE,
25
+ OR(U_GENERAL_CATEGORY_SEPARATOR_LINE,
26
+ OR(U_GENERAL_CATEGORY_SEPARATOR_PARAGRAPH, 0)))) ? true : false;
27
+ }
28
+ }
@@ -0,0 +1,16 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
+ /* {{{1
10
+ * Determine whether ‘c’ is an uppeercase letter, such as A, B, or C
11
+ */
12
+ bool
13
+ u_char_isupper(uint32_t c)
14
+ {
15
+ return u_char_general_category(c) == U_GENERAL_CATEGORY_LETTER_UPPERCASE;
16
+ }
@@ -0,0 +1,18 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+
7
+
8
/* {{{1
 * Check whether ‘c’ is a valid Unicode character.
 *
 * Returns false if ‘c’
 *
 *   - lies at or beyond 0x110000,
 *   - is a surrogate (0xd800 through 0xdfff),
 *   - lies in the range 0xfdd0 through 0xfdef, or
 *   - has 0xfffe or 0xffff as its low 16 bits,
 *
 * and true otherwise.
 */
bool
u_char_isvalid(uint32_t c)
{
        return c < 0x110000 &&
                /* Clearing the low 11 bits maps every surrogate to 0xd800.
                 * The mask is exactly 32 bits wide, matching ‘c’. */
                (c & 0xfffff800) != 0xd800 &&
                (c < 0xfdd0 || c > 0xfdef) &&
                (c & 0xfffe) != 0xfffe;
}
@@ -0,0 +1,18 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+ #include <stdlib.h>
5
+
6
+ #include "u.h"
7
+
8
+ #include "private.h"
9
+
10
+ #include "data/wide.h"
11
+
12
+ /* Returns true if C is typically rendered a double-width cell on a terminal. */
13
+ bool
14
+ u_char_iswide(uint32_t c)
15
+ {
16
+ return bsearch(&c, wide, lengthof(wide), sizeof(wide[0]),
17
+ u_char_interval_compare) != NULL;
18
+ }
@@ -0,0 +1,22 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+ #include <stdlib.h>
5
+
6
+ #include "u.h"
7
+
8
+ #include "private.h"
9
+
10
+ #include "data/wide-cjk.h"
11
+
12
+ /* Returns true if C is typically rendered a double-width cell on a terminal
13
+ * under legacy East Asian locales.
14
+ *
15
+ * See http://www.unicode.org/reports/tr11/ for more details. */
16
+ bool
17
+ u_char_iswide_cjk(uint32_t c)
18
+ {
19
+ return u_char_iswide(c) ||
20
+ bsearch(&c, wide, lengthof(wide), sizeof(wide[0]),
21
+ u_char_interval_compare) != NULL;
22
+ }
@@ -0,0 +1,27 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
+ #define FULLWIDTH_A ((uint32_t)0xff21)
10
+ #define FULLWIDTH_F ((uint32_t)0xff26)
11
+ #define FULLWIDTH_a ((uint32_t)0xff41)
12
+ #define FULLWIDTH_f ((uint32_t)0xff46)
13
+
14
+
15
+ /* {{{1
16
+ * Determine whether ‘c’ is a hexadecimal digit, such as 0, 1, ..., 9, a, b,
17
+ * ..., f, or A, B, ..., F.
18
+ */
19
+ bool
20
+ u_char_isxdigit(uint32_t c)
21
+ {
22
+ return ((c >= 'a' && c <= 'f') ||
23
+ (c >= 'A' && c <= 'F') ||
24
+ (c >= FULLWIDTH_a && c <= FULLWIDTH_f) ||
25
+ (c >= FULLWIDTH_A && c <= FULLWIDTH_F) ||
26
+ (u_char_general_category(c) == U_GENERAL_CATEGORY_NUMBER_DECIMAL));
27
+ }
@@ -0,0 +1,29 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
+ #define SOFT_HYPHEN ((uint32_t)0x00ad)
10
+ #define ZERO_WIDTH_SPACE ((uint32_t)0x200b)
11
+
12
+
13
+ bool
14
+ u_char_iszerowidth(uint32_t c)
15
+ {
16
+ if (UNLIKELY(c == SOFT_HYPHEN))
17
+ return false;
18
+
19
+ if (UNLIKELY(IS(u_char_general_category(c),
20
+ OR(U_GENERAL_CATEGORY_MARK_NON_SPACING,
21
+ OR(U_GENERAL_CATEGORY_MARK_ENCLOSING,
22
+ OR(U_GENERAL_CATEGORY_OTHER_FORMAT, 0))))))
23
+ return true;
24
+
25
+ if (UNLIKELY((0x1160 <= c && c < 0x1200) || c == ZERO_WIDTH_SPACE))
26
+ return true;
27
+
28
+ return false;
29
+ }
@@ -0,0 +1,29 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "data/constants.h"
7
+ #include "data/line-break.h"
8
+ #include "private.h"
9
+
10
+ /* Figure out what break type the Unicode character ‘c’ possesses, if any.
11
+ * This information is used for finding word and line boundaries, which is
12
+ * useful when displaying Unicode text on screen. */
13
+ enum u_line_break
14
+ u_char_line_break(uint32_t c)
15
+ {
16
+ int16_t index;
17
+
18
+ if (c <= UNICODE_LAST_CHAR_PART1)
19
+ index = line_break_property_table_part1[c >> 8];
20
+ else if (UNICODE_FIRST_CHAR_PART2 <= c && c <= UNICODE_LAST_CHAR)
21
+ index = line_break_property_table_part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8];
22
+ else
23
+ return U_LINE_BREAK_UNKNOWN;
24
+
25
+ if (index >= UNICODE_MAX_TABLE_INDEX)
26
+ return index - UNICODE_MAX_TABLE_INDEX;
27
+
28
+ return line_break_property_data[index][c & 0xff];
29
+ }
@@ -0,0 +1,16 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ #include "data/bidi-mirroring.h"
7
+ #include "private.h"
8
+
9
+ uint32_t
10
+ u_char_mirror(uint32_t c)
11
+ {
12
+ size_t i;
13
+ return unicode_table_lookup(bidi_mirroring_table, c, &i) ?
14
+ bidi_mirroring_table[i].mirrored_ch :
15
+ c;
16
+ }
@@ -0,0 +1,23 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+ #include "u.h"
5
+
6
+ #include "data/constants.h"
7
+ #include "data/normalization-quick-check.h"
8
+
9
+ enum u_normalized
10
+ u_char_normalized(uint32_t c, enum u_normalization_form form)
11
+ {
12
+ int16_t i;
13
+ if (c <= UNICODE_LAST_CHAR_PART1)
14
+ i = normalization_quick_check_table_part1[c >> 8];
15
+ else if (UNICODE_FIRST_CHAR_PART2 <= c && c <= UNICODE_LAST_CHAR)
16
+ i = normalization_quick_check_table_part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8];
17
+ else
18
+ return U_NORMALIZED_YES;
19
+
20
+ return (i >= UNICODE_MAX_TABLE_INDEX ?
21
+ i - UNICODE_MAX_TABLE_INDEX :
22
+ normalization_quick_check_data[i][c & 0xff]) & (((1 << 2) - 1) << (2 * form));
23
+ }
@@ -0,0 +1,41 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+
7
+ #include "private.h"
8
+
9
+ #include "data/script.h"
10
+
11
+
12
+ static inline enum u_script
13
+ u_char_script_bsearch(uint32_t c)
14
+ {
15
+ int begin = 0;
16
+ int end = lengthof(script_table) - 1;
17
+ static int cached_middle = lengthof(script_table) / 2;
18
+ int middle = cached_middle;
19
+
20
+ do {
21
+ uint32_t probe = script_table[middle].start;
22
+ if (c < probe)
23
+ end = middle - 1;
24
+ else if (c >= probe + script_table[middle].chars)
25
+ begin = middle + 1;
26
+ else
27
+ return script_table[cached_middle = middle].script;
28
+
29
+ middle = binary_search_middle_of(begin, end);
30
+ } while (begin <= end);
31
+
32
+ return U_SCRIPT_UNKNOWN;
33
+ }
34
+
35
+ enum u_script
36
+ u_char_script(uint32_t c)
37
+ {
38
+ return c < EASY_SCRIPTS_RANGE ?
39
+ script_easy_table[c] :
40
+ u_char_script_bsearch(c);
41
+ }
@@ -0,0 +1,48 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+
7
+
8
+ /* {{{1
9
+ * Turn an Unicode character (UTF-32) into an UTF-8 character sequence and
10
+ * store it in ‘result’, returning the length of the stored sequence.
11
+ */
12
+ int
13
+ u_char_to_u_n(uint32_t c, char *result, size_t n)
14
+ {
15
+ if (c < 0x80) {
16
+ if (n > 0)
17
+ result[0] = c;
18
+ return 1;
19
+ }
20
+
21
+ size_t m;
22
+ if (c < 0x800)
23
+ m = 2;
24
+ else if (c < 0x10000) {
25
+ if (0xd800 <= c && c < 0xe000)
26
+ return 0;
27
+ m = 3;
28
+ } else if (c < U_N_CODEPOINTS)
29
+ m = 4;
30
+ else
31
+ return 0;
32
+
33
+ if (result != NULL && n >= m) {
34
+ switch (m) {
35
+ case 4: result[3] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x10000;
36
+ case 3: result[2] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0x800;
37
+ case 2: result[1] = 0x80 | (c & 0x3f); c = c >> 6; c |= 0xc0;
38
+ result[0] = c;
39
+ }
40
+ }
41
+ return (int)m;
42
+ }
43
+
44
/* Convenience form of u_char_to_u_n() for callers that don't pass a buffer
 * size: the callee is told that ‘result’ has room for 4 bytes, so that much
 * space must actually be available in it. */
int
u_char_to_u(uint32_t c, char *result)
{
        const size_t capacity = 4;

        return u_char_to_u_n(c, result, capacity);
}
@@ -0,0 +1,24 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+ #include "data/constants.h"
9
+ #include "attributes.h"
10
+ #include "titled.h"
11
+
12
+
13
+ uint32_t
14
+ u_char_upcase(uint32_t c)
15
+ {
16
+ switch (u_char_general_category(c)) {
17
+ case U_GENERAL_CATEGORY_LETTER_LOWERCASE:
18
+ return _u_special_case_table_lookup(c);
19
+ case U_GENERAL_CATEGORY_LETTER_TITLECASE:
20
+ return _u_titlecase_table_lookup(c, true);
21
+ default:
22
+ return c;
23
+ }
24
+ }
@@ -0,0 +1,12 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+
7
+
8
/* Return the number of cells ‘c’ occupies: 2 if u_char_iswide() accepts it,
 * otherwise 0 if u_char_iszerowidth() accepts it, and 1 in all other cases.
 * Wideness is tested first and therefore takes precedence. */
size_t
u_char_width(uint32_t c)
{
        if (u_char_iswide(c))
                return 2;

        if (u_char_iszerowidth(c))
                return 0;

        return 1;
}
@@ -0,0 +1,28 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+ #include "data/constants.h"
9
+ #include "data/word-break.h"
10
+
11
+
12
+ enum u_word_break
13
+ u_char_word_break(uint32_t c)
14
+ {
15
+ int16_t index;
16
+
17
+ if (c <= UNICODE_LAST_CHAR_PART1)
18
+ index = word_break_property_table_part1[c >> 8];
19
+ else if (UNICODE_FIRST_CHAR_PART2 <= c && c <= UNICODE_LAST_CHAR)
20
+ index = word_break_property_table_part2[(c - UNICODE_FIRST_CHAR_PART2) >> 8];
21
+ else
22
+ return U_WORD_BREAK_OTHER;
23
+
24
+ if (index >= UNICODE_MAX_TABLE_INDEX)
25
+ return index - UNICODE_MAX_TABLE_INDEX;
26
+
27
+ return word_break_property_data[index][c & 0xff];
28
+ }
@@ -0,0 +1,31 @@
1
+ #include <stddef.h>
2
+ #include <stdint.h>
3
+ #include <stdbool.h>
4
+
5
+ #include "u.h"
6
+ #include "private.h"
7
+
8
+
9
/* First and last fullwidth forms of the hexadecimal letters. */
#define FULLWIDTH_A ((uint32_t)0xff21)
#define FULLWIDTH_F ((uint32_t)0xff26)
#define FULLWIDTH_a ((uint32_t)0xff41)
#define FULLWIDTH_f ((uint32_t)0xff46)


/* {{{1
 * Return the numeric value of ‘c’ if it's a hexadecimal digit.
 *
 * The letters a–f and A–F, in both their ASCII and fullwidth forms, map to
 * 10 through 15.  Every other character is handed to u_char_digit_value(),
 * whose result is returned as is (presumably -1 for characters that aren't
 * digits – see that function).
 */
int
u_char_xdigit_value(uint32_t c)
{
        if ('a' <= c && c <= 'f')
                return c - 'a' + 10;

        if ('A' <= c && c <= 'F')
                return c - 'A' + 10;

        if (FULLWIDTH_a <= c && c <= FULLWIDTH_f)
                return c - FULLWIDTH_a + 10;

        if (FULLWIDTH_A <= c && c <= FULLWIDTH_F)
                return c - FULLWIDTH_A + 10;

        return u_char_digit_value(c);
}
@@ -0,0 +1,83 @@
1
+ #include "extconf.h"
2
+ #include <assert.h>
3
+ #include <errno.h>
4
+ #include <stdbool.h>
5
+ #include <stdint.h>
6
+ #include <stdlib.h>
7
+ #include <string.h>
8
+
9
+ #include "u.h"
10
+ #include "private.h"
11
+
12
/* Order two byte sequences: bytewise over their common length first, with
 * the shorter sequence sorting first if that prefix is identical.  Returns
 * the memcmp() result if the prefixes differ, otherwise -1, 0 or 1. */
static int
compare(const char *a, size_t a_n, const char *b, size_t b_n)
{
        size_t shared = a_n < b_n ? a_n : b_n;
        int order = memcmp(a, b, shared);

        if (order != 0)
                return order;

        /* Equal prefixes: let the lengths decide. */
        return (a_n > b_n) - (a_n < b_n);
}
24
+
25
/* Compute the collation key of ‘string’ (of ‘n’ bytes) for ‘locale’.
 *
 * The key is first generated into the caller-supplied ‘buf’ of ‘m’ bytes.
 * If it fits, ‘*result’ is set to ‘buf’.  Otherwise a buffer of the required
 * size is allocated, the key is generated anew into it and ‘*result’ is set
 * to that buffer, which the caller must free().
 *
 * Returns the length of the key.  Failure is reported through errno: it's
 * reset to 0 on entry and is non-zero on return if, and only if, the key
 * couldn't be produced, in which case 0 is returned and ‘*result’ is left
 * untouched. */
static size_t
ckey(char **result, char *buf, size_t m,
     const char *string, size_t n,
     const char *locale)
{
        errno = 0;
        size_t key_n = u_collation_key(buf, m, string, n, locale);
        if (errno != 0)
                return 0;
        if (key_n < m) {
                *result = buf;
                return key_n;
        }

        char *key = malloc(key_n + 1);
        if (key == NULL) {
                /* ISO C doesn't require malloc() to set errno, but errno is
                 * the only way the caller can tell that we failed. */
                if (errno == 0)
                        errno = ENOMEM;
                return 0;
        }

        /* A successful library call is allowed to leave errno non-zero, so
         * start the second attempt from a clean slate. */
        errno = 0;
        key_n = u_collation_key(key, key_n + 1, string, n, locale);
        if (errno != 0) {
                free(key);
                return 0;
        }
        *result = key;
        return key_n;
}
49
+
50
/* Compare ‘a’ (of ‘a_n’ bytes) to ‘b’ (of ‘b_n’ bytes) by comparing their
 * collation keys for ‘locale’, returning a value less than, equal to or
 * greater than zero.
 *
 * Keys are built in 2048-byte stack buffers when they fit; ckey() allocates
 * larger ones, which are freed before returning.
 *
 * If a key can't be generated, errno is set to the error that ckey() left
 * behind (that of ‘a’ if both fail) and the result is 1 if only the key for
 * ‘a’ failed, -1 if only the key for ‘b’ failed and a plain bytewise
 * comparison of the two strings if both failed. */
int
u_collate(const char *a, size_t a_n, const char *b, size_t b_n,
          const char *locale)
{
        char a_stack[2048];
        char *a_key = NULL;
        size_t a_key_n = ckey(&a_key, a_stack, sizeof(a_stack), a, a_n, locale);
        int a_failure = errno;

        char b_stack[2048];
        char *b_key = NULL;
        size_t b_key_n = ckey(&b_key, b_stack, sizeof(b_stack), b, b_n, locale);
        int b_failure = errno;

        int order;
        if (a_failure == 0 && b_failure == 0) {
                order = compare(a_key, a_key_n, b_key, b_key_n);
        } else if (a_failure != 0 && b_failure != 0) {
                order = compare(a, a_n, b, b_n);
                errno = a_failure;
        } else if (a_failure != 0) {
                order = 1;
                errno = a_failure;
        } else {
                order = -1;
                errno = b_failure;
        }

        /* A key that lives in one of the stack buffers mustn't be freed; a
         * key that was never produced is still NULL, which free() accepts. */
        if (a_key != a_stack)
                free(a_key);
        if (b_key != b_stack)
                free(b_key);

        return order;
}